In [214]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

In [2]:
df18to21 = pd.read_csv('df18to21_cleaned.csv')

In [3]:
df18to21 = df18to21.drop(columns = ['Unnamed: 0.1','Unnamed: 0'])

In [4]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

### Linear Regression for Percent above Guideline Min

In [5]:
X = df18to21[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender',
       'crime_type', 'college']]
y = df18to21['guideline_var_pct']


In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [7]:
numeric = ['count_convictons','age']

In [8]:
categorical = ['year_sentenced','dependents','race','disposition','citizen','state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type', 'college']

In [9]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

In [10]:
ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

In [11]:
X_test_t

<61884x105 sparse matrix of type '<class 'numpy.float64'>'
	with 485083 stored elements in Compressed Sparse Row format>

In [12]:
len(ctx.get_feature_names_out())

105

In [13]:
X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())

In [14]:
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [15]:
lr = LinearRegression()
lr.fit(X_train,y_train)
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.14712796793340943, 0.1400030014639897)

### Linear Regression for Sentence Length

In [16]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [17]:
X = df18to21[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender',
       'crime_type', 'college']]
y = df18to21['sentence_length']


In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [19]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen','state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type', 'college']

In [20]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [21]:
lr2 = LinearRegression()
lr2.fit(X_train,y_train)
lr2.score(X_train,y_train), lr2.score(X_test,y_test)

(0.14452012559627292, 0.15818674424366885)

### Linear Regression for Sentence Length 2

In [22]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [39]:
X = df18to21[['count_convictons','criminal_hist','crime_type','weapon','disposition','age']]
y = df18to21['sentence_length']


In [40]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [41]:
numeric = ['count_convictons','age']
categorical = ['criminal_hist','crime_type','weapon','disposition']

In [42]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [43]:
lr3 = LinearRegression()
lr3.fit(X_train,y_train)
lr3.score(X_train,y_train), lr3.score(X_test,y_test)

(0.14212213734847134, 0.15627837294578228)

### Linear Regression, Non-Life

In [194]:
df_nonlife = df18to21[df18to21['sentence_length'] != 9996]

In [195]:
X = df_nonlife[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state',
       'crime_type']]
y = df_nonlife['sentence_length']

In [196]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [197]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type']

In [198]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [199]:
lr5 = LinearRegression()
lr5.fit(X_train,y_train)
lr5.score(X_train,y_train), lr5.score(X_test,y_test)

(0.4593885246748831, 0.4020010719142436)

### Lasso Regression

In [206]:
lassor = Lasso()
lassor.fit(X_train,y_train)
lassor.score(X_train,y_train), lassor.score(X_test,y_test)

(0.35793864055845337, 0.3109349166277676)

In [207]:
lasso_params = {
    'alpha' : [.01,.1,1,10,100] 
}

gslasso = GridSearchCV(
    lassor,
    lasso_params,
    cv =5,
)

In [208]:
gslasso.fit(X_train,y_train)
gslasso.score(X_train,y_train), gslasso.score(X_test,y_test)

(0.4587780752787499, 0.4013491078839204)

In [211]:
gslasso.best_params_

{'alpha': 0.01}

In [212]:
lassor2 = Lasso(alpha=.001)
lassor2.fit(X_train,y_train)
lassor2.score(X_train,y_train), lassor2.score(X_test,y_test)

(0.45937262858210215, 0.4019727618961694)

### Ridge Regression

In [209]:
ridge = Ridge()
ridge.fit(X_train,y_train)
ridge.score(X_train,y_train), ridge.score(X_test,y_test)

(0.4593874110630486, 0.4019928975021062)

### Further Analysis

In [216]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [217]:
X = df_nonlife[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state',
       'crime_type','case_type','presentence_stat']]
y = df_nonlife['sentence_length']

In [218]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [None]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat']

In [219]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [220]:
lr6 = LinearRegression()
lr6.fit(X_train,y_train)
lr6.score(X_train,y_train), lr6.score(X_test,y_test)

(0.47635572079458166, 0.4166992854084739)

### Creating Additional Features

In [221]:
df_nonlife_it = df_nonlife.copy()

In [223]:
df_nonlife_it.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [224]:
df_nonlife_it['r_c'] = df_nonlife_it['race'] * df_nonlife_it['citizen']
df_nonlife_it['r_d'] = df_nonlife_it['race'] * df_nonlife_it['drug_type']
df_nonlife_it['r_w'] = df_nonlife_it['race'] * df_nonlife_it['weapon']
df_nonlife_it['r_g'] = df_nonlife_it['race'] * df_nonlife_it['gender']
df_nonlife_it['c_g'] = df_nonlife_it['citizen'] * df_nonlife_it['gender']
df_nonlife_it['r_e'] = df_nonlife_it['race'] * df_nonlife_it['college']
df_nonlife_it['r_e'] = df_nonlife_it['gender'] * df_nonlife_it['college']
df_nonlife_it['r_d'] = df_nonlife_it['race'] * df_nonlife_it['dependents']
df_nonlife_it['g_d'] = df_nonlife_it['gender'] * df_nonlife_it['dependents']
df_nonlife_it['r_y'] = df_nonlife_it['race'] * df_nonlife_it['year_sentenced']
df_nonlife_it['c_y'] = df_nonlife_it['citizen'] * df_nonlife_it['year_sentenced']
df_nonlife_it['r_ch'] = df_nonlife_it['race'] * df_nonlife_it['criminal_hist']

In [227]:
df_nonlife_it['r_dis'] = df_nonlife_it['race'] * df_nonlife_it['disposition']

In [228]:
df_nonlife_it.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged', 'r_c',
       'r_d', 'r_w', 'r_g', 'c_g', 'r_e', 'g_d', 'r_y', 'c_y', 'r_ch',
       'r_dis'],
      dtype='object')

In [229]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]

In [232]:
X = df_nonlife_it[['count_convictons','age',
              'year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]]
y = df_nonlife_it['sentence_length']

In [233]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [234]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [235]:
lr7 = LinearRegression()
lr7.fit(X_train,y_train)
lr7.score(X_train,y_train), lr7.score(X_test,y_test)

(0.4861340285174224, 0.4254602485131993)

In [241]:
lassor3 = Lasso(alpha=.001)
lassor3.fit(X_train,y_train)
lassor3.score(X_train,y_train), lassor3.score(X_test,y_test)

  model = cd_fast.enet_coordinate_descent(


(0.48613772251299114, 0.4255166024358341)

In [244]:
coefficients = lassor3.coef_
importance = np.abs(lassor3.coef_)
importance

array([4.61160358e+00, 3.35469118e+00, 2.70612888e-01, 2.32055497e+00,
       5.87760024e-01, 7.35307715e-01, 1.26468858e+01, 2.67824415e+00,
       1.32455460e+00, 0.00000000e+00, 7.74940062e-01, 5.49146228e+01,
       1.58141113e+01, 5.78792106e+01, 1.71096618e+01, 9.14245536e+00,
       4.17296218e+00, 4.78302478e+00, 1.06842819e+01, 2.12165817e+00,
       7.88504106e+00, 7.47928932e+00, 1.03205555e+01, 1.18540147e+01,
       2.97749173e+00, 7.43599839e-01, 7.25621155e+00, 5.50544171e+00,
       1.16487385e+01, 2.00261168e+01, 5.60343136e+00, 7.19269957e+00,
       2.18268048e+00, 4.96023941e+00, 6.25419372e+00, 6.61564456e+00,
       7.06010726e+00, 1.50001895e+01, 1.99373505e+00, 2.51062446e+00,
       1.19005497e+01, 4.73300932e+00, 8.21377804e+00, 6.68868375e+00,
       2.81814650e-01, 5.52082784e+00, 1.03388412e+01, 6.91124360e+00,
       6.84838311e+00, 2.18532199e+00, 2.20731713e+00, 9.33008380e+00,
       1.82451053e+00, 1.83642165e+00, 5.02563600e+00, 2.34453242e+00,
      

### CAPPING THE SENTENCE LENGTH TO 470

In [333]:
df18to21_470cap = df18to21.copy()

In [334]:
df18to21_470cap.head()

Unnamed: 0,sentence_length,year_sentenced,sentence_type,guideline_range,imprisoned,guideline_var_pct,dependents,count_convictons,race,disposition,...,case_type,age,weapon,presentence_stat,gender,crime_type,region,college,white,perc_charged
0,188,2018,1,1.0,1,0.0,1,1,1,3,...,1.0,24.0,0,1.0,0.0,10,West,0,1,0.079034
1,0,2018,1,0.0,1,0.0,1,1,1,1,...,1.0,28.0,0,1.0,0.0,17,South,0,1,0.057644
2,300,2018,1,6.0,1,0.0,1,3,1,3,...,1.0,41.0,1,1.0,0.0,10,Midwest,0,1,0.148692
3,63,2018,1,1.0,1,0.0,0,1,2,1,...,1.0,25.0,1,1.0,0.0,26,South,0,0,0.074698
4,0,2018,1,2.0,1,0.0,1,1,1,1,...,1.0,27.0,0,1.0,0.0,17,West,0,1,0.029711


In [335]:
df18to21_470cap['sentence_length'] = [i == 840 if i > 840 else i for i in df18to21_470cap['sentence_length']]

In [336]:
df18to21_470cap['sentence_length'].max()

840

In [337]:
X = df18to21_470cap[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state','sentence_type',
       'crime_type','case_type','presentence_stat','college']]
y = df18to21_470cap['sentence_length']

In [338]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college']

In [339]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [340]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [341]:
lassor4 = Lasso(alpha=.001)
lassor4.fit(X_train,y_train)
lassor4.score(X_train,y_train), lassor4.score(X_test,y_test)

(0.5277466501463421, 0.5270109735447235)

In [342]:
df18to21_470cap['r_c'] = df_nonlife_it['race'] * df18to21_470cap['citizen']
df18to21_470cap['r_d'] = df_nonlife_it['race'] * df18to21_470cap['drug_type']
df18to21_470cap['r_w'] = df_nonlife_it['race'] * df18to21_470cap['weapon']
df18to21_470cap['r_g'] = df_nonlife_it['race'] * df18to21_470cap['gender']
df18to21_470cap['c_g'] = df_nonlife_it['citizen'] * df18to21_470cap['gender']
df18to21_470cap['r_e'] = df_nonlife_it['race'] * df18to21_470cap['college']
df18to21_470cap['r_e'] = df_nonlife_it['gender'] * df18to21_470cap['college']
df18to21_470cap['r_d'] = df_nonlife_it['race'] * df18to21_470cap['dependents']
df18to21_470cap['g_d'] = df_nonlife_it['gender'] * df18to21_470cap['dependents']
df18to21_470cap['r_y'] = df_nonlife_it['race'] * df18to21_470cap['year_sentenced']
df18to21_470cap['c_y'] = df_nonlife_it['citizen'] * df18to21_470cap['year_sentenced']
df18to21_470cap['r_ch'] = df_nonlife_it['race'] * df18to21_470cap['criminal_hist']
df18to21_470cap['r_dis'] = df_nonlife_it['race'] * df18to21_470cap['disposition']

In [343]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]

In [344]:
X = df18to21_470cap[['count_convictons','age',
              'year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]]
y = df18to21_470cap['sentence_length']

In [345]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [346]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [347]:
lassor5 = Lasso(alpha=.0001)
lassor5.fit(X_train,y_train)
lassor5.score(X_train,y_train), lassor5.score(X_test,y_test)

  model = cd_fast.enet_coordinate_descent(


(0.547024243845947, 0.5504479161955145)

In [384]:
# rfr = RandomForestRegressor()
# rfr.fit(X_train,y_train)
# rfr.score(X_train,y_train), rfr.score(X_test,y_test)

TypeError: 'RandomForestRegressor' object is not callable