In [191]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

### Read data

In [192]:
raw = pd.read_csv('cp5_data_with_sa_score.csv', lineterminator='\n')
# raw.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
raw.head()

Unnamed: 0,crid,category,allegation_name,allegation_category_id,officer_race,subject_race,cr_text,cleaned_text,token_len,score
0,1059251,Use Of Force,Excessive Force / On Duty - Injury,98,White,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party alleges that he went to...,93,-5.85
1,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
2,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
3,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
4,1059214,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,Asian/Pacific,Black,Initial / Intake Allegation 1: when he called ...,1: when he called regarding a noise\r disturba...,35,-7.09


### Run linear regression on category, officer race, and subject race

In [193]:
# Encode categorical variables
df = raw.copy(deep=True)
df.loc[df['allegation_category_id']==98,'allegation_category_id'] = 1
df.loc[df['allegation_category_id']==204,'allegation_category_id'] = 0
df.loc[df['officer_race']!='White','officer_race'] = 'Other'
df.loc[df['subject_race']!='Black','subject_race'] = 'Other'
df.head()

Unnamed: 0,crid,category,allegation_name,allegation_category_id,officer_race,subject_race,cr_text,cleaned_text,token_len,score
0,1059251,Use Of Force,Excessive Force / On Duty - Injury,1,White,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party alleges that he went to...,93,-5.85
1,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,0,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
2,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,0,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
3,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,0,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
4,1059214,Operation/Personnel Violations,Inadequate / Failure To Provide Service,0,Other,Black,Initial / Intake Allegation 1: when he called ...,1: when he called regarding a noise\r disturba...,35,-7.09


In [194]:
# One hot coding
df_reg = pd.concat((
    df[['allegation_category_id', 'score']],
    pd.get_dummies(df['officer_race'], drop_first=True), 
    pd.get_dummies(df['subject_race'], drop_first=True)), axis=1)
df_reg.head()

Unnamed: 0,allegation_category_id,score,White,Other
0,1,-5.85,1,0
1,0,-5.75,1,0
2,0,-5.75,1,0
3,0,-5.75,1,0
4,0,-7.09,0,0


In [195]:
Y = df_reg['allegation_category_id']
X = df_reg.loc[:, df_reg.columns != 'allegation_category_id']
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
print(results.summary())

                              OLS Regression Results                              
Dep. Variable:     allegation_category_id   R-squared:                       0.009
Model:                                OLS   Adj. R-squared:                  0.008
Method:                     Least Squares   F-statistic:                     5.732
Date:                    Wed, 01 Dec 2021   Prob (F-statistic):           0.000667
Time:                            02:26:10   Log-Likelihood:                -625.71
No. Observations:                    1874   AIC:                             1259.
Df Residuals:                        1870   BIC:                             1282.
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)


In [196]:
# Method 2 for one hot coding
# df_reg2 = df.copy(deep=True)
# df_reg2['officer_race'] = df_reg2['officer_race'].astype('category').cat.codes
# df_reg2['subject_race'] = df_reg2['subject_race'].astype('category').cat.codes
# f = 'allegation_category_id~C(officer_race)+C(subject_race)+score'
# model_cat = smf.ols(formula=f, data=df_reg2).fit()
# model_cat.summary()

### Run linear regression on category, and cross-race

In [197]:
raw.head()

Unnamed: 0,crid,category,allegation_name,allegation_category_id,officer_race,subject_race,cr_text,cleaned_text,token_len,score
0,1059251,Use Of Force,Excessive Force / On Duty - Injury,98,White,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party alleges that he went to...,93,-5.85
1,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
2,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
3,1059269,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,White,Black,"Initial / Intake Allegation 1: Battery, allege...","1: Battery, alleges that the\r uniformed offic...",32,-5.75
4,1059214,Operation/Personnel Violations,Inadequate / Failure To Provide Service,204,Asian/Pacific,Black,Initial / Intake Allegation 1: when he called ...,1: when he called regarding a noise\r disturba...,35,-7.09


In [198]:
# Encode categorical variables
df2 = raw.copy(deep=True)
df2.loc[df2['allegation_category_id']==98,'allegation_category_id'] = 1
df2.loc[df2['allegation_category_id']==204,'allegation_category_id'] = 0
df2['cross_race'] = df2['officer_race'] == df2['subject_race']
df2.loc['cross_race'] = df2['cross_race'].astype('int')
df2.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
df2.tail()

Unnamed: 0,crid,category,allegation_name,allegation_category_id,officer_race,subject_race,cr_text,cleaned_text,token_len,score,cross_race
1869,1049715.0,Use Of Force,Excessive Force / On Duty - Injury,1.0,Asian/Pacific,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party/victim alleged that the...,83.0,-7.23,0.0
1870,1049715.0,Use Of Force,Excessive Force / On Duty - Injury,1.0,Asian/Pacific,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party/victim alleged that the...,83.0,-7.23,0.0
1871,1049715.0,Use Of Force,Excessive Force / On Duty - Injury,1.0,Asian/Pacific,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party/victim alleged that the...,83.0,-7.23,0.0
1872,1049715.0,Use Of Force,Excessive Force / On Duty - Injury,1.0,Asian/Pacific,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party/victim alleged that the...,83.0,-7.23,0.0
1873,1049715.0,Use Of Force,Excessive Force / On Duty - Injury,1.0,Asian/Pacific,Black,Initial / Intake Allegation 1: The reporting p...,1: The reporting party/victim alleged that the...,83.0,-7.23,0.0


In [199]:
df2_reg = df2[['allegation_category_id','score','cross_race']].copy(deep=True)
Y = df2_reg['allegation_category_id']
X = df2_reg.loc[:, df2_reg.columns != 'allegation_category_id']
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
print(results.summary())

                              OLS Regression Results                              
Dep. Variable:     allegation_category_id   R-squared:                       0.009
Model:                                OLS   Adj. R-squared:                  0.008
Method:                     Least Squares   F-statistic:                     8.502
Date:                    Wed, 01 Dec 2021   Prob (F-statistic):           0.000211
Time:                            02:26:10   Log-Likelihood:                -625.81
No. Observations:                    1874   AIC:                             1258.
Df Residuals:                        1871   BIC:                             1274.
Df Model:                               2                                         
Covariance Type:                nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)
