In [154]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

In [155]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True) #no scientific notation
from scipy.stats import chi2_contingency

In [156]:
df = pd.read_csv(r"Geo Modified Dataset - 80 cells - width==0.1 .csv", index_col=(0,1))

# Add Interaction Terms

In [157]:
race_set = {'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'WHITE'}

for race in race_set:
    df[f"INTERACTION: D_{race} X GEO: {race} Racial Composition"] = df[f'{race} - (D_Race)'] * df[f'GEO: {race} Racial Composition']
    
    df[f"INTERACTION: D_{race} X GEO: {race} Percent of Charges that were CHANGED"] = df[f'{race} - (D_Race)'] * df[f'GEO: {race} Percent of Charges that were CHANGED']
    
    df[f"INTERACTION: D_{race} X GEO: {race} Average Speed NOT in 9,14 MPH"] = df[f'{race} - (D_Race)'] * df[f'GEO: {race} Average Speed NOT in 9,14 MPH']
    
    df[f"INTERACTION: D_{race} X D_Male"] = df[f'{race} - (D_Race)'] * df["Male"]

# Chi-Squared Test for Independence

In [158]:
# race_set = {'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'WHITE'}
speeding_bool_set = {"Speed Altered", 'Speed NOT Altered'}

In [159]:
altered = []
not_altered = []


for x in df['Speed Over Posted Limit']:
    if x==9:
        altered.append(1)
        not_altered.append(0)
    elif 10 <= x <= 14:
        altered.append(0)
        not_altered.append(1)
    else:
        altered.append(np.nan)
        not_altered.append(np.nan)

df['Speed Altered'] = altered
df['Speed NOT Altered'] = not_altered

In [160]:
contingency_table = pd.DataFrame({x:[0 for race in sorted(list(race_set))] for x in speeding_bool_set}, index=sorted(list(race_set)))

In [161]:
for col in contingency_table:
    for ind in contingency_table.index:
        temp_col = []
        for x in zip(df[col], df[f"{ind} - (D_Race)"]):
            temp_col.append(x[0]==x[1]==1)
            
        contingency_table[col].loc[ind] = sum(temp_col)

        
contingency_table

Unnamed: 0,Speed NOT Altered,Speed Altered
ASIAN,236,3626
BLACK,1049,10670
HISPANIC,726,7039
NATIVE AMERICAN,10,36
OTHER,267,3179
WHITE,1479,24843


## Chi2 Test

Returns (in order):

- The test statistic.

- The p-value of the test

- Degrees of freedom

- The expected frequencies, based on the marginal sums of the table.

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

In [162]:
chi2_result = chi2_contingency(contingency_table)

print(f"\n Test statistic == {chi2_result[0]}")
print(f"\n P-Value == {chi2_result[1]}")
print(f"\n Degrees of Freedom == {chi2_result[2]}")
print(f"\n Expected Frequencies == \n {chi2_result[3]}")

print(f"\n Difference between Actual and Expected Frequencies (Actual - Expected) == \n\n {contingency_table-chi2_result[3]}")


 Test statistic == 231.28605043787022

 P-Value == 5.66938794054964e-48

 Degrees of Freedom == 5

 Expected Frequencies == 
 [[  273.66730625  3588.33269375]
 [  830.42650489 10888.57349511]
 [  550.23993604  7214.76006396]
 [    3.2596313     42.7403687 ]
 [  244.18890143  3201.81109857]
 [ 1865.21772009 24456.78227991]]

 Difference between Actual and Expected Frequencies (Actual - Expected) == 

                  Speed NOT Altered  Speed Altered
ASIAN                   -37.667306      37.667306
BLACK                   218.573495    -218.573495
HISPANIC                175.760064    -175.760064
NATIVE AMERICAN           6.740369      -6.740369
OTHER                    22.811099     -22.811099
WHITE                  -386.217720     386.217720


# Regression

Maybe do this in R - Python implementation is not great (scipy doesn't have a regression summary / p-value; statsmodel doesn't make sense).

## Set up

In [163]:
regression_df = df[(9<=df['Speed Over Posted Limit']) & (df['Speed Over Posted Limit']<=14)] # all observations where 9 <= speed <= 14 

#replace np.nan with 0
for x in list(zip(regression_df.isnull().sum(), regression_df.columns)):
    if x[0]!=0:
        regression_df[x[1]] = regression_df[x[1]].fillna(value=0)
    
y = regression_df['Speed Altered']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Deletions

#### Delete non-boolean or non-float columns

In [164]:
for col in regression_df.columns:
    if regression_df[col].dtype not in ('bool', 'float64'):
        del regression_df[col]    

#### Delete Multicollinearity-Causing Columns

In [165]:
multicollinear_cols = ['GEO: OTHER Racial Composition',
'OTHER - (D_Race)',
'Headquarters and Special Operations - (D_SubAgency)',
'ESERO - (D_Violation Type)',
'Number of writeups'
                      ]

for col in multicollinear_cols:
    try:
        del regression_df[col]
    except Exception as e:
        print(f"Failed to delete {col}, exception: {e}")

Failed to delete OTHER - (D_Race), exception: 'OTHER - (D_Race)'
Failed to delete Headquarters and Special Operations - (D_SubAgency), exception: 'Headquarters and Special Operations - (D_SubAgency)'
Failed to delete ESERO - (D_Violation Type), exception: 'ESERO - (D_Violation Type)'
Failed to delete Number of writeups, exception: 'Number of writeups'


#### Other Deletions

In [166]:
misc_del_list = set(
['Speed Altered',
'Speed NOT Altered',
    
'Citation - (D_Violation Type)',
'Warning - (D_Violation Type)',
            
'Citation - (D_Search Outcome)',
'Warning - (D_Search Outcome)',            
])

for col in regression_df:
    if 'D_Search Outcome' in col:
        misc_del_list.add(col)

for col in misc_del_list:
    
    try:
        
        del regression_df[col]
    

    except Exception as e:
        
        print(f"failed to delete {col}, exception: {e}")

X = regression_df

failed to delete Citation - (D_Violation Type), exception: 'Citation - (D_Violation Type)'
failed to delete Citation - (D_Search Outcome), exception: 'Citation - (D_Search Outcome)'


In [167]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1) #keep this random_state for reproducibility

### Confirm that there's an even proportion of True, False in training and testing sets

In [168]:
print(f"mean(y_test)-mean(y_train) == {(abs(np.mean(y_test)-np.mean(y_train)))}")

print(f'Discrepancy in raw count == { (abs(np.mean(y_test)-np.mean(y_train)))*len(regression_df)}')

mean(y_test)-mean(y_train) == 0.00018811136192620204
Discrepancy in raw count == 9.9999999999969


## Running the Regression

Questions:

- Do I need to normalize the data before L1?
- Should we add interaction terms for the geo vars?

Thoughts:

- Results with/without L1 are very different
  * All Geo vars have the same coefficients in normal logit

### With an Intercept

In [169]:
l1_model = LogisticRegression(penalty='l1', solver='liblinear')
l1_model.fit(X, y)
l1_coefficients = l1_model.coef_.tolist()[0]
zipped_l1_coefs = list(zip([['Intercept']+list(regression_df.columns)][0], [l1_model.intercept_.tolist()[0]]+[round(x,3) for x in l1_coefficients]))



log_model = LogisticRegression(max_iter=5000) #default max_iter==100
log_model.fit(X, y)
log_coefficients = log_model.coef_.tolist()[0]
zipped_log_coefs = list(zip(['Intercept']+[list(regression_df.columns)][0], [log_model.intercept_.tolist()[0]]+[round(x,3) for x in log_coefficients]))



results_df = pd.DataFrame({'Variable': [x[0] for x in zipped_l1_coefs],
                           'L1 Coefficient': [x[1] for x in zipped_l1_coefs],
                           'Normal Logit Coefficient': [x[1] for x in zipped_log_coefs]})

results_df = results_df.sort_values(by=['L1 Coefficient'], ascending=False)


with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results_df)

                                             Variable  L1 Coefficient  Normal Logit Coefficient
0                                           Intercept       86.068477                   7.09155
1                                         Speed Limit        0.193000                   0.46500
4                                            Latitude        0.023000                   0.18200
29           GEO: ASIAN Average Speed NOT in 9,14 MPH        0.011000                  -0.00600
35           GEO: WHITE Average Speed NOT in 9,14 MPH        0.010000                  -0.00600
70  INTERACTION: D_ASIAN X GEO: ASIAN Average Spee...        0.004000                   0.00900
54  INTERACTION: D_WHITE X GEO: WHITE Average Spee...        0.002000                  -0.04000
24           GEO: OTHER Average Speed NOT in 9,14 MPH        0.002000                  -0.00600
74  INTERACTION: D_BLACK X GEO: BLACK Average Spee...        0.001000                   0.13700
41           GEO: TOTAL Average Speed NO

#### Prediction Accuracy

**NOTE: THIS WAS NOT SCORING 100% UNTIL ADDING RACExGEO AND RACExMALE INTERACTION TERMS**

(it had ~98% accuracy)

In [176]:
print(f'Percent of Stops that appeared to receive leniency == { round(100 * np.mean(df["Speed Altered"]), 3) }% \n\n')

print(f'l1_model TRAINING accuracy == {round(100 * l1_model.score(X_train, y_train), 13)}%\n')

print(f'l1_model TESTING accuracy == {round(100 * l1_model.score(X_test, y_test), 3)}% \n \n')


print(f'log_model TRAINING accuracy == {round(100 * log_model.score(X_train, y_train), 3)}%\n')

print(f'log_model TESTING accuracy == {round(100 * log_model.score(X_test, y_test), 3)}%')

Percent of Stops that appeared to receive leniency == 92.914% 


l1_model TRAINING accuracy == 100.0%

l1_model TESTING accuracy == 100.0% 
 

log_model TRAINING accuracy == 94.627%

log_model TESTING accuracy == 94.874%


#### Correlation Table

In [171]:
results_df.corr()

Unnamed: 0,L1 Coefficient,Normal Logit Coefficient
L1 Coefficient,1.0,0.879278
Normal Logit Coefficient,0.879278,1.0


### Without Intercept

Can we run this without an intercept? The intercept term is much larger than the next biggest coefficient...

In [172]:
l1_model_no_intercept = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=False)
l1_model_no_intercept.fit(X, y)

l1_coefficients_no_intercept = l1_model_no_intercept.coef_.tolist()[0]

zipped_l1_coefs = list(zip(list(regression_df.columns), [round(x,3) for x in l1_coefficients_no_intercept]))

log_model_no_intercept = LogisticRegression(max_iter=5000, fit_intercept=False) #default max_iter==100
log_model_no_intercept.fit(X, y)

log_coefficients_no_int = log_model_no_intercept.coef_.tolist()[0]

zipped_log_coefs_no_intercept = list(zip(list(regression_df.columns), [round(x,3) for x in log_coefficients_no_int]))

no_intercept_results_df = pd.DataFrame({'Variable': [x[0] for x in zipped_l1_coefs],
                           'L1 Coefficient': [x[1] for x in zipped_l1_coefs],
                           'Normal Logit Coefficient': [x[1] for x in zipped_log_coefs_no_intercept]})

no_intercept_results_df = no_intercept_results_df.sort_values(by=['L1 Coefficient'], ascending=False)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(no_intercept_results_df)

                                             Variable  L1 Coefficient  Normal Logit Coefficient
52  INTERACTION: D_WHITE X GEO: WHITE Percent of C...           4.346                     0.002
51  INTERACTION: D_WHITE X GEO: WHITE Racial Compo...           1.752                     0.004
54                      INTERACTION: D_WHITE X D_Male           0.387                     0.010
20  GEO: HISPANIC Percent of Charges that were CHA...           0.374                     0.003
14    GEO: BLACK Percent of Charges that were CHANGED           0.330                     0.003
8                                  Driver State != MD           0.272                    -0.000
25    GEO: OTHER Percent of Charges that were CHANGED           0.243                     0.003
58                      INTERACTION: D_OTHER X D_Male           0.238                     0.001
31    GEO: ASIAN Percent of Charges that were CHANGED           0.232                     0.003
68  INTERACTION: D_ASIAN X GEO: ASIAN Pe

In [174]:
print(f'Percent of Stops that appeared to receive leniency == { round(100 * np.mean(df["Speed Altered"]), 3) }% \n\n')

print(f'l1_model_no_intercept TRAINING accuracy == {round(100 * l1_model_no_intercept.score(X_train, y_train), 3)}%\n')

print(f'l1_model_no_intercept TESTING accuracy == {round(100 * l1_model_no_intercept.score(X_test, y_test), 3)}% \n \n')


print(f'log_model_no_intercept TRAINING accuracy == {round(100 * log_model_no_intercept.score(X_train, y_train), 3)}%\n')

print(f'log_model_no_intercept TESTING accuracy == {round(100 * log_model_no_intercept.score(X_test, y_test), 3)}%')

Percent of Stops that appeared to receive leniency == 92.914% 


l1_model_no_intercept TRAINING accuracy == 92.671%

l1_model_no_intercept TESTING accuracy == 92.645% 
 

log_model_no_intercept TRAINING accuracy == 91.725%

log_model_no_intercept TESTING accuracy == 91.789%


#### Correlation Table

Much lower correlation than with an intercept

In [173]:
no_intercept_results_df.corr()

Unnamed: 0,L1 Coefficient,Normal Logit Coefficient
L1 Coefficient,1.0,0.087031
Normal Logit Coefficient,0.087031,1.0


# Test that each race has the same proclivity to speed (% speeds >14 mph)

An implicit assumption is that all races have the same speeding distribution - at least within the ranges of 9-14 MPH. 

\
However, this is not necessarily the case:
\

    As the following cells show, most races seem to follow their own unique speeding distribution (for speeding above 14 MPH)

In [None]:
def avg_speeds_above_14(race):
    temp_list = []
    for x in zip(df[f"{race} - (D_Race)"], df['Speed Over Posted Limit']):
        if x[0]==1 and x[1]>14:
            temp_list.append(x[1])
            
    return np.mean(temp_list)

for race in race_set:
    print(f"Average {race} speed over 14 MPH == {avg_speeds_above_14(race)}")

## Simple Z test for different means (speeding above 14 mph), with hypothesized difference between means==0

In [None]:
def z_score_above_14(race1, race2):
    x1 = np.array([x[0] for x in zip(df['Speed Over Posted Limit'], df['Race']) if x[1]==race1 and 14 < x[0] < 100])
    x2 = np.array([x[0] for x in zip(df['Speed Over Posted Limit'], df["Race"]) if x[1]==race2 and 14 < x[0] < 100])

    xbar1 = np.mean(x1)
    xbar2 = np.mean(x2)
    
    sig1 = np.std(x1)
    sig2 = np.std(x2)
    
    n1 = len(x1)
    n2 = len(x2)
    
    z = (xbar1-xbar2)/np.sqrt((sig1**2)/n1 + (sig2**2)/n2)
    
    return abs(round(z,2))

In [None]:
z_tests_above_14_table = pd.DataFrame({race1:[z_score_above_14(race1, race2) for race2 in sorted(list(race_set))] for race1 in sorted(list(race_set))}, index=sorted(list(race_set)))


z_tests_above_14_table.style.apply(lambda x: ["background: red" if v > 1.96 else "" for v in x], axis = 1)

### Conclusion

Races probably have different speeding distributions

**We should run a test to specifically test similarity of distributions**

**Utlimately, we need to investigate whether white/asian/etc. drivers *appear* to receive more leniency because they have more stops actually at 9 mph**

# scratch