In [75]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.12.2-cp38-cp38-manylinux1_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 15.7 MB/s eta 0:00:01
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 115.3 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2


In [2]:
!pip install linearmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting linearmodels
  Downloading linearmodels-4.24-cp38-cp38-manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 21.3 MB/s eta 0:00:01
[?25hCollecting mypy-extensions>=0.4
  Downloading mypy_extensions-0.4.3-py2.py3-none-any.whl (4.5 kB)
Collecting Cython>=0.29.21
  Downloading Cython-0.29.23-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 109.1 MB/s eta 0:00:01
Collecting pyhdfe>=0.1
  Downloading pyhdfe-0.1.0-py3-none-any.whl (18 kB)
Collecting property-cached>=1.6.3
  Downloading property_cached-1.6.4-py2.py3-none-any.whl (7.8 kB)
Installing collected packages: mypy-extensions, Cython, pyhdfe, property-cached, linearmodels
Successfully installed Cython-0.29.23 linearmodels-4.24 mypy-extensions-0.4.3 property-cached-1.6.4 pyhdfe-0.1.0


In [1]:
import numpy as np
import pandas as pd

from collections import Counter
import math
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit

pd.set_option('display.max_columns', None)

In [2]:
type_map = {
    'decision_date' : 'str',
    'filing_date' : 'str',
    'Court Name' : 'category',
    'Party of Appointing President' : 'category',
    'CIRCUIT' : 'category',
    'JURIS' : 'category',
    'NOS' : 'category',
    'ORIGIN' : 'category',
    'RESIDENC' : 'category',
    'CLASSACT' : 'category',
    'DEMANDED' : 'float64',
    'TERMDATE' : 'str',
    'DISP' : 'category',
    'PROCPROG' : 'category',
    'NOJ' : 'category',
    'AMTREC' : 'category',
    'JUDGMENT' : 'category',
    'TAPEYEAR' : 'float64',
    'district' : 'category',
    'office' : 'category',
    'county' : 'category',
    'TRCLACT' : 'category',
    'PROSE' : 'category',
    'arbit' : 'category',
    'transoff' : 'category',
    'trmarb' : 'category',
    'ifp' : 'category',
    'statuscd' : 'category'
}

cols_to_drop = [
    'decision_date',
#     'filing_date',
    'TERMDATE',
    'TAPEYEAR'
]

df = pd.read_csv(
    '/scratch/ayl316/ttml_mr_data/processed_data/cases.csv.zip', 
    dtype = type_map, 
    parse_dates = ['decision_date', 'filing_date', 'TERMDATE']
).rename(columns = {
    'Court Name' : 'court_name',
    'Party of Appointing President' : 'party'
}).drop(columns = cols_to_drop)


df['filing_year'] = pd.DatetimeIndex(df['filing_date']).year
df['filing_year'] = df['filing_year'].astype(str).astype('category')
df = df.drop(columns = ['filing_date'])

for col in ['party', 'TRCLACT', 'PROSE', 'arbit', 'transoff', 'trmarb', 'ifp', 'statuscd', 'filing_year']:
    if not '-8' in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('-8')
    df[col] = df[col].fillna('-8')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219459 entries, 0 to 1219458
Data columns (total 25 columns):
 #   Column       Non-Null Count    Dtype   
---  ------       --------------    -----   
 0   court_name   1219459 non-null  category
 1   party        1219459 non-null  category
 2   CIRCUIT      1219459 non-null  category
 3   JURIS        1219459 non-null  category
 4   NOS          1219459 non-null  category
 5   ORIGIN       1219459 non-null  category
 6   RESIDENC     1219459 non-null  category
 7   CLASSACT     1219459 non-null  category
 8   DEMANDED     1219459 non-null  float64 
 9   DISP         1219459 non-null  category
 10  PROCPROG     1219459 non-null  category
 11  NOJ          1219459 non-null  category
 12  AMTREC       1219459 non-null  category
 13  JUDGMENT     1219459 non-null  category
 14  district     1219459 non-null  category
 15  office       1219459 non-null  category
 16  county       1219459 non-null  category
 17  TRCLACT      1219459 non-nu

In [4]:
df.nunique().sort_values(ascending = False)

county         2888
DEMANDED       1391
AMTREC          968
court_name      130
NOS             115
district         95
filing_year      45
RESIDENC         36
DISP             22
PROCPROG         14
CIRCUIT          12
transoff         12
office           11
arbit            11
ORIGIN           11
NOJ               9
JUDGMENT          7
trmarb            6
CLASSACT          6
PROSE             6
party             6
JURIS             5
TRCLACT           4
statuscd          2
ifp               2
dtype: int64

In [5]:
def conditional_entropy(x,y):
    # entropy of x given y
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x,y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y/p_xy)
    return entropy

def theil_u(x,y):
    s_xy = conditional_entropy(x,y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x

In [6]:
def cat_heat_map(df, cols):
    theilu = pd.DataFrame(index = cols, columns = cols)

    for i in range(0, len(cols)):
        for j in range(0, len(cols)):
            if i == j:
                theilu.loc[cols[i], cols[j]] = 1.0
            
            u = theil_u(df[cols[i]].tolist(),df[cols[j]].tolist())
            theilu.loc[cols[i], cols[j]] = u
    
    theilu.fillna(value = np.nan, inplace = True)
    plt.figure(figsize = (12, 12))
    sns.heatmap(theilu, annot = True, fmt = '.2f')
    plt.show()


In [7]:
feature_cols = [
#     'NOJ',
#     'JUDGMENT',
#     'PROSE',
#     'trmarb',
    'CLASSACT',
    'JURIS',
#     'TRCLACT',
#     'ifp', (too many nulls)
#     'statuscd',
#     'PROCPROG',
#     'CIRCUIT',
#     'transoff',
    'ORIGIN',
#     'arbit', (too many nulls)
    'office',
#     'court_name',
    'NOS',
    'district',
#     'TAPEYEAR',
    'RESIDENC',
#     'DISP',
    'filing_year'
]

target_col = 'party'

# cat_heat_map(df, feature_cols)

In [29]:
df.head()

Unnamed: 0,court_name,party,CIRCUIT,JURIS,NOS,ORIGIN,RESIDENC,CLASSACT,DEMANDED,DISP,PROCPROG,NOJ,AMTREC,JUDGMENT,district,office,county,TRCLACT,PROSE,arbit,transoff,trmarb,ifp,statuscd,filing_year
0,U.S. District Court for the District of Maryland,Democratic,4.0,2.0,510.0,1.0,-8.0,-8.0,0.0,14.0,2.0,0.0,0.0,0.0,16,8,24001,-8.0,1.0,-8,-8.0,-8,-8,L,2016.0
1,U.S. District Court for the District of Maryland,Democratic,4.0,2.0,510.0,1.0,-8.0,-8.0,0.0,14.0,2.0,0.0,0.0,0.0,16,8,24001,-8.0,1.0,-8,-8.0,-8,-8,L,2016.0
2,U.S. District Court for the District of Maryland,Democratic,4.0,2.0,510.0,1.0,-8.0,-8.0,0.0,14.0,2.0,0.0,0.0,0.0,16,8,24001,-8.0,1.0,-8,-8.0,-8,-8,L,2016.0
3,U.S. District Court for the District of Maryland,Democratic,4.0,2.0,510.0,1.0,-8.0,-8.0,0.0,14.0,2.0,0.0,0.0,0.0,16,8,24001,-8.0,1.0,-8,-8.0,-8,-8,L,2016.0
4,U.S. District Court for the District of Maryland,Democratic,4.0,2.0,510.0,1.0,-8.0,-8.0,0.0,14.0,2.0,0.0,0.0,0.0,16,8,24001,-8.0,1.0,-8,-8.0,-8,-8,L,2016.0


In [30]:
df[feature_cols]

Unnamed: 0,CLASSACT,JURIS,ORIGIN,office,NOS,district,RESIDENC,filing_year
0,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0
1,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0
2,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0
3,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0
4,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0
...,...,...,...,...,...,...,...,...
1219454,-8.0,4.0,2.0,4,110.0,65,15.0,2016.0
1219455,-8.0,4.0,2.0,4,110.0,65,15.0,2016.0
1219456,-8.0,3.0,1.0,2,442.0,15,-8.0,2015.0
1219457,-8.0,3.0,1.0,2,442.0,15,-8.0,2015.0


In [8]:


df = df[(df[target_col] == 'Republican') | (df[target_col] == 'Democratic')]



# df = df.sample(n = 50000)

for col in feature_cols:
    if df[col].dtype.name == 'category':
        df[col] = df[col].cat.remove_unused_categories()

X = df[feature_cols]
y = df[target_col]


y = y.cat.add_categories(['1', '0'])
y[y == 'Democratic'] = '1'
y[y == 'Republican'] = '0'
y = y.cat.remove_unused_categories()

In [9]:
enc = OneHotEncoder(drop = 'first')
enc.fit(X)
X_ohe = enc.transform(X).toarray()

scaler = StandardScaler().fit(X_ohe)
X_scaled = pd.DataFrame(
    scaler.transform(X_ohe),
    columns = enc.get_feature_names(feature_cols)
)

X_scaled = sm.add_constant(X_scaled)

In [12]:
X_scaled.shape

(1187426, 315)

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [None]:
# vif = calc_vif(X_scaled)

In [None]:
# vif.to_csv('/scratch/ns4486/vif.csv', index = False)

In [None]:
# vif[(vif['VIF'] > 2.5)]

In [14]:
# clf = LogisticRegression(random_state = 0).fit(X_ohe, y)


# log_reg = sm.Logit(list(y.astype(float)), X_ohe).fit(method='lbfgs')

log_reg = Logit(list(y.astype(float)), X_scaled).fit(method = 'lbfgs', maxiters = 50000)
pvalues = log_reg.pvalues



In [41]:
# print(log_reg.summary())

In [15]:
pvalues[pvalues > 0.05]

JURIS_4.0             0.994871
JURIS_5.0             0.060516
ORIGIN_1.0            0.985344
ORIGIN_2.0            0.955858
ORIGIN_3.0            0.780763
                        ...   
filing_year_2013.0    0.617296
filing_year_2014.0    0.850135
filing_year_2015.0    0.824434
filing_year_2016.0    0.565058
filing_year_nan       0.986340
Length: 167, dtype: float64

In [16]:
pvalues[pvalues < 0.05]

const            0.000000e+00
CLASSACT_-9.0    0.000000e+00
CLASSACT_0.0     0.000000e+00
CLASSACT_1.0     1.613063e-26
CLASSACT_2.0     3.814168e-34
                     ...     
district_88      8.852705e-10
district_89      1.110512e-07
district_90      4.968332e-11
district_93      2.992181e-07
RESIDENC_56.0    2.939484e-02
Length: 148, dtype: float64

In [21]:
lin_reg = sm.OLS(list(y.astype(float)), X_scaled).fit()
lin_pvalues = lin_reg.pvalues

In [24]:
lin_pvalues[lin_pvalues < 0.05]

const                 0.000000e+00
CLASSACT_-9.0         0.000000e+00
CLASSACT_0.0          0.000000e+00
CLASSACT_1.0          6.405430e-27
CLASSACT_2.0          3.172465e-36
                          ...     
RESIDENC_56.0         3.138280e-03
RESIDENC_62.0         1.615250e-02
filing_year_1975.0    3.862496e-02
filing_year_1977.0    3.443217e-08
filing_year_1983.0    9.445985e-03
Length: 154, dtype: float64

In [42]:
sig_map = {}

for col in list(lin_pvalues[lin_pvalues < 0.05].index):
    if '_' in col:
        col_name = col.split('_')[0]
        col_value = col.split('_')[1]
        
        if 'filing_year' in col:
            col_name = 'filing_year'
            col_value = col.split('_')[2]
        
        if col_name in sig_map.keys():
            sig_map[col_name].append(col_value)
        else:
            sig_map[col_name] = [col_value]

In [43]:
feature_cols

['CLASSACT',
 'JURIS',
 'ORIGIN',
 'office',
 'NOS',
 'district',
 'RESIDENC',
 'filing_year']

In [44]:
sig_map.keys()

dict_keys(['CLASSACT', 'JURIS', 'ORIGIN', 'office', 'NOS', 'district', 'RESIDENC', 'filing_year'])

In [46]:
sub_cols = list(lin_pvalues[lin_pvalues < 0.05].index)
sub_lin_reg = sm.OLS(list(y.astype(float)), X_scaled[sub_cols]).fit()
sub_lin_pvalues = sub_lin_reg.pvalues

In [47]:
sub_lin_reg.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.08
Method:,Least Squares,F-statistic:,671.7
Date:,"Fri, 23 Apr 2021",Prob (F-statistic):,0.0
Time:,08:25:40,Log-Likelihood:,-787270.0
No. Observations:,1187426,AIC:,1575000.0
Df Residuals:,1187272,BIC:,1577000.0
Df Model:,153,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3979,0.000,923.392,0.000,0.397,0.399
CLASSACT_-9.0,0.0192,0.000,43.112,0.000,0.018,0.020
CLASSACT_0.0,0.0438,0.000,93.407,0.000,0.043,0.045
CLASSACT_1.0,0.0043,0.000,9.750,0.000,0.003,0.005
CLASSACT_2.0,0.0052,0.000,11.944,0.000,0.004,0.006
CLASSACT_3.0,0.0024,0.000,5.637,0.000,0.002,0.003
JURIS_2.0,-0.0034,0.001,-4.237,0.000,-0.005,-0.002
JURIS_3.0,0.0060,0.001,8.263,0.000,0.005,0.007
JURIS_5.0,-0.0012,0.000,-2.406,0.016,-0.002,-0.000

0,1,2,3
Omnibus:,6072123.368,Durbin-Watson:,0.263
Prob(Omnibus):,0.0,Jarque-Bera (JB):,145403.94
Skew:,0.337,Prob(JB):,0.0
Kurtosis:,1.424,Cond. No.,13.4


In [50]:
sub_lin_pvalues[sub_lin_pvalues < 0.05]

const                 0.000000e+00
CLASSACT_-9.0         0.000000e+00
CLASSACT_0.0          0.000000e+00
CLASSACT_1.0          1.843106e-22
CLASSACT_2.0          6.992007e-33
                          ...     
district_94           7.552159e-03
RESIDENC_56.0         9.291784e-21
RESIDENC_62.0         6.607237e-03
filing_year_1977.0    1.440133e-11
filing_year_1983.0    7.810277e-10
Length: 149, dtype: float64

In [25]:
lin_reg.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.083
Model:,OLS,Adj. R-squared:,0.083
Method:,Least Squares,F-statistic:,342.8
Date:,"Thu, 22 Apr 2021",Prob (F-statistic):,0.0
Time:,14:49:14,Log-Likelihood:,-785020.0
No. Observations:,1187426,AIC:,1571000.0
Df Residuals:,1187111,BIC:,1574000.0
Df Model:,314,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3979,0.000,925.080,0.000,0.397,0.399
CLASSACT_-9.0,0.0194,0.000,41.920,0.000,0.018,0.020
CLASSACT_0.0,0.0453,0.001,74.820,0.000,0.044,0.047
CLASSACT_1.0,0.0047,0.000,10.743,0.000,0.004,0.006
CLASSACT_2.0,0.0055,0.000,12.568,0.000,0.005,0.006
CLASSACT_3.0,0.0026,0.000,6.025,0.000,0.002,0.003
JURIS_2.0,-0.0054,0.001,-3.733,0.000,-0.008,-0.003
JURIS_3.0,0.0037,0.002,2.336,0.019,0.001,0.007
JURIS_4.0,-0.0539,0.058,-0.936,0.349,-0.167,0.059

0,1,2,3
Omnibus:,6187852.298,Durbin-Watson:,0.263
Prob(Omnibus):,0.0,Jarque-Bera (JB):,143244.541
Skew:,0.333,Prob(JB):,0.0
Kurtosis:,1.435,Cond. No.,364.0


In [27]:
A = np.identity(len(lin_reg.params))
A = A[1:, :]

print(lin_reg.f_test(A))

<F test: F=array([[342.83862608]]), p=0.0, df_denom=1.19e+06, df_num=314>


In [32]:
print(lin_reg.wald_test(A))

<F test: F=array([[342.83862608]]), p=0.0, df_denom=1.19e+06, df_num=314>


In [None]:

# X has the following features
# Class Action Suit
# Jurisdiction
# Case origin
# office
# Nature of Suit
# district
# Diversity Residence
# Filing Year


enc = OneHotEncoder(drop = 'first')
enc.fit(X)
X_ohe = enc.transform(X).toarray()

scaler = StandardScaler().fit(X_ohe)
X_scaled = pd.DataFrame(
    scaler.transform(X_ohe),
    columns = enc.get_feature_names(feature_cols)
)

X_scaled = sm.add_constant(X_scaled)


lin_reg = sm.OLS(list(y.astype(float)), X_scaled).fit()
lin_pvalues = lin_reg.pvalues



A = np.identity(len(lin_reg.params))
A = A[1:, :]

print(lin_reg.f_test(A))