In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Per-state census table; it is left-merged onto per-state summaries by 'state' later on.
census_data = pd.read_csv('census_20.csv')

In [3]:
# Check the column types before this table is used as the right side of a merge.
census_data.dtypes

state          object
total_pop       int64
white_pop       int64
white_perc    float64
dtype: object

In [4]:
# Case-level sentencing data (the file name suggests 2018-2021, already cleaned — TODO confirm provenance).
df18to21 = pd.read_csv('df18to21_cleaned.csv')

In [5]:
# List the available columns (note the spelling 'count_convictons', which later cells rely on).
df18to21.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'sentence_length', 'year_sentenced',
       'sentence_type', 'guideline_range', 'imprisoned', 'guideline_var_pct',
       'dependents', 'count_convictons', 'race', 'disposition', 'citizen',
       'state', 'criminal_hist', 'drug_type', 'case_type', 'age', 'weapon',
       'presentence_stat', 'gender', 'crime_type', 'region', 'college',
       'white', 'perc_charged'],
      dtype='object')

In [6]:
# Drop the two 'Unnamed' columns that came in with the CSV (presumably index
# columns saved by earlier to_csv calls — TODO confirm).
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the columns are already gone.
df18to21 = df18to21.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

No discernible change in the rate of prison sentences during COVID: the imprisonment rate stays at roughly 91–92% in every year from 2018 to 2021.

In [7]:
# Mean of `imprisoned` for each sentencing year (reads as an imprisonment rate
# if the column is a 0/1 flag — TODO confirm encoding).
df18to21['imprisoned'].groupby(df18to21['year_sentenced']).mean()

year_sentenced
2018    0.911890
2019    0.919923
2020    0.914164
2021    0.915759
Name: imprisoned, dtype: float64

Fewer convictions after the onset of COVID: the number of sentenced cases drops in 2020 and again in 2021.

In [8]:
# Number of rows per sentencing year; value_counts() lists the most frequent year first.
df18to21['year_sentenced'].value_counts()

2019    77425
2018    70185
2020    56037
2021    43886
Name: year_sentenced, dtype: int64

In [9]:
# Flag cases where the defendant is 25 or older (1) versus younger (0).
# Vectorized comparison instead of a Python-level loop over every row.
# Rows with a missing age compare as False and therefore land in the 0 group,
# exactly as before.
df18to21['over25'] = df18to21['age'].ge(25).astype(int)

In [10]:
# `crime_type` codes that this analysis treats as violent offences.
# TODO: document what each code stands for; the mapping is not shown in this notebook.
violent_crimes = [
    3, 4, 13,
    19, 20, 22,
    26, 27, 28,
]

In [11]:
# 1 when the case's crime_type is one of the codes in `violent_crimes`, else 0.
# Series.isin replaces the per-row Python membership test; missing crime types
# still map to 0.
df18to21['violent_crime'] = df18to21['crime_type'].isin(violent_crimes).astype(int)

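Violent crime is less common among older defendants: about 17% of cases for those aged 25 and over, versus about 23% for those under 25.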

In [12]:
# Share of cases flagged as violent within each age group (0 = under 25, 1 = 25 and over).
df18to21.groupby('over25')['violent_crime'].agg('mean')

over25
0    0.227203
1    0.170034
Name: violent_crime, dtype: float64

In [13]:
# Median age for each value of the `gender` code.
df18to21['age'].groupby(df18to21['gender']).median()

gender
0.0    35.0
1.0    35.0
Name: age, dtype: float64

In [14]:
# Summary statistics for sentence length.
# NOTE(review): a maximum of 9996 and a standard deviation far above the mean
# suggest coded/sentinel values in this column — confirm against the data
# dictionary before relying on the mean.
df18to21['sentence_length'].describe()

count    247533.000000
mean         57.223619
std         398.033474
min           0.000000
25%           0.000000
50%          18.000000
75%          60.000000
max        9996.000000
Name: sentence_length, dtype: float64

In [15]:
# Binary target: 1 when presentence_stat equals 1, otherwise 0 (missing values
# included in the 0 group, as before).
# presumably presentence_stat == 1 means "in custody before sentencing" — TODO confirm against the codebook.
# Vectorized equality test instead of a Python-level loop over every row.
df18to21['trial_custody'] = df18to21['presentence_stat'].eq(1).astype(int)

In [16]:
# Share of cases with trial_custody == 1. Any classifier for this target should
# be compared against the accuracy of always predicting the more common class.
df18to21['trial_custody'].mean()

0.8025313796544299

In [17]:
# Predictors and target for the pre-sentence custody model.
# `presentence_stat` is deliberately absent: the target is derived from it.
X = df18to21.loc[:, [
    'dependents', 'count_convictons', 'disposition', 'citizen',
    'state', 'criminal_hist', 'drug_type', 'case_type',
    'age', 'weapon', 'gender', 'crime_type',
    'college', 'white',
]]
y = df18to21.loc[:, 'trial_custody']

In [18]:
# Columns that get min-max scaled.
numeric = [
    'count_convictons',
    'age',
]
# Columns that get one-hot encoded (the order determines the encoded column order).
categorical = [
    'dependents', 'white', 'disposition', 'citizen',
    'state', 'criminal_hist', 'drug_type', 'weapon',
    'gender', 'crime_type', 'case_type', 'college',
]

In [19]:
# Hold out a test set (train_test_split's default size), stratified on the
# target so both splits keep the same class balance; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [20]:
# Preprocessing for the custody model: min-max scale the numeric columns and
# one-hot encode the categorical ones.
# With drop='first' together with handle_unknown='ignore', a category that was
# not seen during fit is encoded as all zeros, i.e. the same as the dropped
# first category.
ctx = ColumnTransformer(
    [
        ('mms', MinMaxScaler(), numeric),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# Fit on the training split only, then apply the same transform to both splits.
ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

# Densify with .toarray(): the `.A` shorthand on sparse matrices is deprecated
# and not available in recent SciPy releases.
# The original row labels are kept so the encoded frames stay index-aligned
# with y_train / y_test.
# NOTE: X_train / X_test are overwritten with the encoded frames, so re-run the
# train_test_split cell before re-running this one.
feature_names = ctx.get_feature_names_out()
X_train = pd.DataFrame(X_train_t.toarray(), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_t.toarray(), columns=feature_names, index=X_test.index)

In [21]:
# Class shares in the test split; the larger share is the accuracy of a
# majority-class baseline, the number the model score should be compared with.
y_test.value_counts(normalize=True)

1    0.802534
0    0.197466
Name: trial_custody, dtype: float64

In [22]:
# Multinomial naive Bayes on the encoded features; fit() returns the estimator itself.
mnb = MultinomialNB().fit(X_train, y_train)

# (train accuracy, test accuracy)
(mnb.score(X_train, y_train), mnb.score(X_test, y_test))

(0.8405862676340837, 0.8398778359511344)

In [23]:
# Per-feature log-probability difference between the two classes
# (second class row minus first class row). Positive values mean the feature
# is relatively more likely under the second class.
log_probs = mnb.feature_log_prob_
imps = log_probs[1] - log_probs[0]

In [24]:
# One-column table of the log-probability differences, labelled by encoded feature name.
mnb_feat_imp = pd.Series(
    imps,
    index=X_train.columns,
    name='feature imps',
).to_frame()

In [25]:
# The 15 features with the largest positive log-probability difference.
mnb_feat_imp.sort_values('feature imps', ascending=False).iloc[:15]

Unnamed: 0,feature imps
crime_type_25,1.798787
crime_type_22,1.777588
state_New Mexico,1.681598
crime_type_17,1.627959
crime_type_19,1.445571
crime_type_26,0.997658
state_Arizona,0.894053
state_Texas,0.886408
crime_type_27,0.743805
weapon_1,0.701737


Possible context for New Mexico ranking so high among the predictors of pre-sentence custody: https://www.nbcnews.com/news/us-news/new-mexico-eliminated-cash-bail-now-one-county-locks-more-n1250257

In [26]:
# Average number of conviction counts for each value of the criminal-history flag.
df18to21['count_convictons'].groupby(df18to21['criminal_hist']).mean()

criminal_hist
0.0    1.526017
1.0    1.333087
Name: count_convictons, dtype: float64

In [27]:
# Number of cases per drug_type code, most frequent first.
df18to21['drug_type'].value_counts()

0.0     173809
6.0      31542
1.0      11810
3.0       8784
4.0       7406
77.0      6943
2.0       5455
7.0       1784
Name: drug_type, dtype: int64

 1 = Cocaine
 2 = Crack
 3 = Heroin
 4 = Marijuana
 6 = Methamphetamine
 7 = Fentanyl
77 = Other

In [28]:
# Average number of conviction counts for each drug_type code.
df18to21.groupby('drug_type')['count_convictons'].agg('mean')

drug_type
0.0     1.284933
1.0     1.415072
2.0     1.787351
3.0     1.506375
4.0     1.383878
6.0     1.397755
7.0     1.663117
77.0    2.183206
Name: count_convictons, dtype: float64

In [29]:
# Rows per state with a non-null `weapon` value, largest first.
# NOTE(review): count() counts rows, not weapon cases. If the goal is the number
# of weapon-involved cases per state, sum() of a 0/1 flag is probably what is
# wanted — confirm intent.
df18to21.groupby('state')['weapon'].count().sort_values(ascending=False)

state
Texas                   67282
California              20655
Arizona                 18052
Florida                 11971
New Mexico              11611
New York                 9193
North Carolina           6587
Missouri                 6490
Georgia                  5169
Tennessee                5130
Ohio                     5103
Pennsylvania             5022
Virginia                 4943
Illinois                 4179
Michigan                 3931
Alabama                  3667
Iowa                     3159
Indiana                  3050
South Carolina           2920
Kentucky                 2919
Oklahoma                 2908
Arkansas                 2830
Utah                     2558
Washington               2510
Louisiana                2509
Maryland                 2415
New Jersey               2212
West Virginia            1952
Mississippi              1924
South Dakota             1836
Nebraska                 1786
Colorado                 1714
Massachusetts            1700
Wisc

In [30]:
# Per-state count of non-null `weapon` rows as a two-column frame (state, weapon).
df_state_weapons = df18to21.groupby('state')['weapon'].count().reset_index()

In [31]:
# Attach the census columns to each state row; a left join keeps every state
# from the sentencing data even when the census table has no match.
# NOTE: this reassigns df_state_weapons, so re-running the cell merges the
# census columns a second time (pandas then adds _x/_y suffixes).
df_state_weapons = df_state_weapons.merge(census_data, on='state', how='left')

In [32]:
# Spot-check the merged frame: census columns should be populated for each state.
df_state_weapons.head()

Unnamed: 0,state,weapon,total_pop,white_pop,white_perc
0,Alabama,3667,3917166,2595046,0.66248
1,Alaska,612,554003,349626,0.63109
2,Arizona,18052,5541976,3549308,0.640441
3,Arkansas,2830,2312273,1683440,0.728046
4,California,20655,30827105,13443813,0.436104


In [33]:
# Integer census columns after a left merge indicate that no state went unmatched
# (unmatched rows would introduce NaN and turn these columns into floats).
df_state_weapons.dtypes

state          object
weapon          int64
total_pop       int64
white_pop       int64
white_perc    float64
dtype: object

In [34]:
# Binary target: 1 when guideline_var_pct is strictly positive, else 0
# (zero, negative and missing values all map to 0, as before).
# Vectorized comparison instead of a Python-level loop over every row.
df18to21['above_min'] = df18to21['guideline_var_pct'].gt(0).astype(int)

In [35]:
# Predictors and target for the above_min model.
# `guideline_var_pct` is deliberately absent: the target is derived from it.
X = df18to21.loc[:, [
    'dependents', 'count_convictons', 'disposition', 'citizen',
    'state', 'criminal_hist', 'drug_type', 'case_type',
    'age', 'weapon', 'gender', 'crime_type',
    'college', 'presentence_stat', 'race',
]]
y = df18to21.loc[:, 'above_min']

In [36]:
# Class shares of the target; the larger share is the majority-class baseline accuracy.
y.value_counts(normalize=True)

0    0.502345
1    0.497655
Name: above_min, dtype: float64

In [37]:
# Columns that get min-max scaled.
numeric = [
    'count_convictons',
    'age',
]
# Columns that get one-hot encoded (the order determines the encoded column order).
categorical = [
    'dependents', 'race', 'disposition', 'citizen',
    'state', 'presentence_stat', 'criminal_hist', 'drug_type',
    'weapon', 'gender', 'crime_type', 'case_type',
    'college',
]

In [38]:
# Hold out a test set (train_test_split's default size), stratified on the
# target so both splits keep the same class balance; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [39]:
# Preprocessing for the above_min model: min-max scale the numeric columns and
# one-hot encode the categorical ones.
# With drop='first' together with handle_unknown='ignore', a category that was
# not seen during fit is encoded as all zeros, i.e. the same as the dropped
# first category.
ctx = ColumnTransformer(
    [
        ('mms', MinMaxScaler(), numeric),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# Fit on the training split only, then apply the same transform to both splits.
ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

# Densify with .toarray(): the `.A` shorthand on sparse matrices is deprecated
# and not available in recent SciPy releases.
# The original row labels are kept so the encoded frames stay index-aligned
# with y_train / y_test.
# NOTE: X_train / X_test are overwritten with the encoded frames, so re-run the
# train_test_split cell before re-running this one.
feature_names = ctx.get_feature_names_out()
X_train = pd.DataFrame(X_train_t.toarray(), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_t.toarray(), columns=feature_names, index=X_test.index)

In [40]:
# Multinomial naive Bayes for the above_min target; fit() returns the estimator itself.
mnb2 = MultinomialNB().fit(X_train, y_train)

# (train accuracy, test accuracy)
(mnb2.score(X_train, y_train), mnb2.score(X_test, y_test))

(0.6721232002326972, 0.6717406761036778)

In [41]:
# Rows where `citizen` equals 0 (presumably non-citizens, going by the variable name).
df_noncit = df18to21.loc[df18to21['citizen'].eq(0)]

In [42]:
# Mean of the `white` flag per state, i.e. the share of cases coded white == 1
# if the column is 0/1 — TODO confirm encoding.
df18to21.groupby('state')['white'].agg('mean')

state
Alabama                 0.499318
Alaska                  0.568627
Arizona                 0.900676
Arkansas                0.578799
California              0.866183
Colorado                0.766044
Connecticut             0.600326
Delaware                0.503106
District of Columbia    0.242690
Florida                 0.654081
Georgia                 0.441478
Hawaii                  0.286486
Idaho                   0.892596
Illinois                0.544149
Indiana                 0.510820
Iowa                    0.702437
Kansas                  0.719276
Kentucky                0.658787
Louisiana               0.469111
Maine                   0.759289
Maryland                0.298137
Massachusetts           0.671765
Michigan                0.477741
Minnesota               0.571429
Mississippi             0.446985
Missouri                0.472881
Montana                 0.675076
Nebraska                0.597424
Nevada                  0.679152
New Hampshire           0.810109
New 

In [43]:
# Start from an empty frame; the first column assigned to it supplies the index.
df_pop_bkdwn = pd.DataFrame()

In [44]:
# Per-state mean of the `white` flag among sentenced cases; the resulting
# frame is indexed by state.
df_pop_bkdwn['white_conv'] = df18to21['white'].groupby(df18to21['state']).mean()

In [45]:
# Spot-check: one row per state, with `state` as the index at this point.
df_pop_bkdwn.head()

Unnamed: 0_level_0,white_conv
state,Unnamed: 1_level_1
Alabama,0.499318
Alaska,0.568627
Arizona,0.900676
Arkansas,0.578799
California,0.866183


In [46]:
# Attach the census columns by state. On the left side `state` is the index
# name rather than a column; merge accepts index level names in `on`, and the
# result comes back with `state` as a regular column.
# NOTE: this reassigns df_pop_bkdwn, so the cell is not safe to re-run on its own.
df_pop_bkdwn = df_pop_bkdwn.merge(census_data, on='state', how='left')

In [47]:
# Spot-check the merge: census columns should be populated for each state.
df_pop_bkdwn.head()

Unnamed: 0,state,white_conv,total_pop,white_pop,white_perc
0,Alabama,0.499318,3917166,2595046,0.66248
1,Alaska,0.568627,554003,349626,0.63109
2,Arizona,0.900676,5541976,3549308,0.640441
3,Arkansas,0.578799,2312273,1683440,0.728046
4,California,0.866183,30827105,13443813,0.436104


In [48]:
# Keep only the two shares being compared (white_conv from the sentencing data,
# white_perc from the census) by dropping the raw population counts.
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the columns are already gone.
df_pop_bkdwn = df_pop_bkdwn.drop(columns=['total_pop', 'white_pop'], errors='ignore')

In [49]:
# Final comparison table: state, white_conv (sentencing data), white_perc (census).
df_pop_bkdwn.head()

Unnamed: 0,state,white_conv,white_perc
0,Alabama,0.499318,0.66248
1,Alaska,0.568627,0.63109
2,Arizona,0.900676,0.640441
3,Arkansas,0.578799,0.728046
4,California,0.866183,0.436104
