In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df18to21 = pd.read_csv('df18to21_cleaned.csv')

In [4]:
df18to21 = df18to21.drop(columns = ['Unnamed: 0.1','Unnamed: 0'])

In [5]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

### Linear Regression for Percent above Guideline Min

In [6]:
X = df18to21[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender',
       'crime_type', 'college']]
y = df18to21['guideline_var_pct']


In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [8]:
numeric = ['count_convictons','age']

In [9]:
categorical = ['year_sentenced','dependents','race','disposition','citizen','state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type', 'college']

In [10]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

In [11]:
ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

In [12]:
X_test_t

<61884x105 sparse matrix of type '<class 'numpy.float64'>'
	with 485083 stored elements in Compressed Sparse Row format>

In [13]:
len(ctx.get_feature_names_out())

105

In [14]:
X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())

In [15]:
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [16]:
lr = LinearRegression()
lr.fit(X_train,y_train)
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.14712796793340943, 0.1400030014639897)

### Linear Regression for Sentence Length

In [17]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [18]:
X = df18to21[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender',
       'crime_type', 'college']]
y = df18to21['sentence_length']


In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [20]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen','state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type', 'college']

In [21]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [22]:
lr2 = LinearRegression()
lr2.fit(X_train,y_train)
lr2.score(X_train,y_train), lr2.score(X_test,y_test)

(0.14452012559627292, 0.15818674424366885)

### Linear Regression for Sentence Length 2

In [23]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [24]:
X = df18to21[['count_convictons','criminal_hist','crime_type','weapon','disposition','age']]
y = df18to21['sentence_length']


In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [26]:
numeric = ['count_convictons','age']
categorical = ['criminal_hist','crime_type','weapon','disposition']

In [27]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [28]:
lr3 = LinearRegression()
lr3.fit(X_train,y_train)
lr3.score(X_train,y_train), lr3.score(X_test,y_test)

(0.14212213734847134, 0.15627837294578228)

### Linear Regression, Non-Life

In [29]:
df_nonlife = df18to21[df18to21['sentence_length'] != 9996]

In [30]:
X = df_nonlife[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state',
       'crime_type']]
y = df_nonlife['sentence_length']

In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [32]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type']

In [33]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [34]:
lr5 = LinearRegression()
lr5.fit(X_train,y_train)
lr5.score(X_train,y_train), lr5.score(X_test,y_test)

(0.4593885246748831, 0.4020010719142436)

### Lasso Regression

In [35]:
lassor = Lasso()
lassor.fit(X_train,y_train)
lassor.score(X_train,y_train), lassor.score(X_test,y_test)

(0.35793864055845337, 0.3109349166277676)

In [36]:
lasso_params = {
    'alpha' : [.01,.1,1,10,100] 
}

gslasso = GridSearchCV(
    lassor,
    lasso_params,
    cv =5,
)

In [37]:
gslasso.fit(X_train,y_train)
gslasso.score(X_train,y_train), gslasso.score(X_test,y_test)

(0.4587780752787499, 0.4013491078839204)

In [38]:
gslasso.best_params_

{'alpha': 0.01}

In [39]:
lassor2 = Lasso(alpha=.001)
lassor2.fit(X_train,y_train)
lassor2.score(X_train,y_train), lassor2.score(X_test,y_test)

(0.45937262858210215, 0.4019727618961694)

### Ridge Regression

In [40]:
ridge = Ridge()
ridge.fit(X_train,y_train)
ridge.score(X_train,y_train), ridge.score(X_test,y_test)

(0.4593874110630486, 0.4019928975021062)

### Further Analysis

In [41]:
df18to21.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [42]:
X = df_nonlife[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state',
       'crime_type','case_type','presentence_stat']]
y = df_nonlife['sentence_length']

In [43]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [44]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat']

In [45]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [46]:
lr6 = LinearRegression()
lr6.fit(X_train,y_train)
lr6.score(X_train,y_train), lr6.score(X_test,y_test)

(0.4806429251954645, 0.42051442585763554)

### Creating Additional Features

In [47]:
df_nonlife_it = df_nonlife.copy()

In [48]:
df_nonlife_it.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [49]:
df_nonlife_it['r_c'] = df_nonlife_it['race'] * df_nonlife_it['citizen']
df_nonlife_it['r_d'] = df_nonlife_it['race'] * df_nonlife_it['drug_type']
df_nonlife_it['r_w'] = df_nonlife_it['race'] * df_nonlife_it['weapon']
df_nonlife_it['r_g'] = df_nonlife_it['race'] * df_nonlife_it['gender']
df_nonlife_it['c_g'] = df_nonlife_it['citizen'] * df_nonlife_it['gender']
df_nonlife_it['r_e'] = df_nonlife_it['race'] * df_nonlife_it['college']
df_nonlife_it['r_e'] = df_nonlife_it['gender'] * df_nonlife_it['college']
df_nonlife_it['r_d'] = df_nonlife_it['race'] * df_nonlife_it['dependents']
df_nonlife_it['g_d'] = df_nonlife_it['gender'] * df_nonlife_it['dependents']
df_nonlife_it['r_y'] = df_nonlife_it['race'] * df_nonlife_it['year_sentenced']
df_nonlife_it['c_y'] = df_nonlife_it['citizen'] * df_nonlife_it['year_sentenced']
df_nonlife_it['r_ch'] = df_nonlife_it['race'] * df_nonlife_it['criminal_hist']

In [50]:
df_nonlife_it['r_dis'] = df_nonlife_it['race'] * df_nonlife_it['disposition']

In [51]:
df_nonlife_it.columns

Index(['sentence_length', 'year_sentenced', 'sentence_type', 'guideline_range',
       'imprisoned', 'guideline_var_pct', 'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged', 'r_c',
       'r_d', 'r_w', 'r_g', 'c_g', 'r_e', 'g_d', 'r_y', 'c_y', 'r_ch',
       'r_dis'],
      dtype='object')

In [52]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]

In [53]:
X = df_nonlife_it[['count_convictons','age',
              'year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college','r_c','r_d', 'r_w', 'r_g', 'c_g',
              'r_e', 'g_d', 'r_y', 'c_y', 'r_ch','r_dis'
               ]]
y = df_nonlife_it['sentence_length']

In [54]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [55]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [56]:
lr7 = LinearRegression()
lr7.fit(X_train,y_train)
lr7.score(X_train,y_train), lr7.score(X_test,y_test)

(0.4861340285174224, 0.4254602485131993)

In [57]:
lassor3 = Lasso(alpha=.001)
lassor3.fit(X_train,y_train)
lassor3.score(X_train,y_train), lassor3.score(X_test,y_test)

  model = cd_fast.enet_coordinate_descent(


(0.48613772251299114, 0.4255166024358341)

### CAPPING THE SENTENCE LENGTH TO 840 (70 Years)

In [71]:
df18to21_840cap = pd.read_csv('df18to21_cleanedH.csv')

In [72]:
df18to21_840cap['sentence_length'].describe()

count    247533.000000
mean         57.223619
std         398.033474
min           0.000000
25%           0.000000
50%          18.000000
75%          60.000000
max        9996.000000
Name: sentence_length, dtype: float64

In [73]:
df18to21_840cap['sentence_length'] = [840 if i >= 840 else i for i in df18to21_840cap['sentence_length']]

In [74]:
df18to21_840cap['sentence_length'].describe()

count    247533.000000
mean         42.800665
std          71.250386
min           0.000000
25%           0.000000
50%          18.000000
75%          60.000000
max         840.000000
Name: sentence_length, dtype: float64

In [75]:
df18to21_840cap['sentence_length'].max()

840

In [76]:
df18to21_840cap['sentence_length'].describe()

count    247533.000000
mean         42.800665
std          71.250386
min           0.000000
25%           0.000000
50%          18.000000
75%          60.000000
max         840.000000
Name: sentence_length, dtype: float64

In [77]:
X = df18to21_840cap[['year_sentenced',
       'dependents', 'count_convictons',
       'race', 'disposition', 'citizen', 'criminal_hist', 'drug_type',
       'age', 'weapon','gender','state','sentence_type',
       'crime_type','case_type','presentence_stat','college']]
y = df18to21_840cap['sentence_length']

In [78]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college']

In [79]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [80]:
ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore',drop='first'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [81]:
lassor4 = Lasso(alpha=.001)
lassor4.fit(X_train,y_train)
lassor4.score(X_train,y_train), lassor4.score(X_test,y_test)

(0.5261302604775733, 0.5364943583632693)