In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier

df_data = pd.read_csv('regression_data.csv')

In [2]:
df_data.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'sentence_length',
       'year_sentenced', 'sentence_type', 'guideline_range', 'imprisoned',
       'guideline_var_pct', 'dependents', 'count_convictons', 'race',
       'disposition', 'citizen', 'state', 'criminal_hist', 'drug_type',
       'case_type', 'age', 'weapon', 'presentence_stat', 'gender',
       'crime_type', 'region', 'college', 'white', 'perc_charged'],
      dtype='object')

In [3]:
df_data = df_data.drop(columns=['Unnamed: 0','Unnamed: 0.2','Unnamed: 0.1',])

In [4]:
X = df_data.drop(columns=['sentence_length','guideline_range','guideline_var_pct','region','white','perc_charged'])
y = df_data['sentence_length']

In [5]:
numeric = ['count_convictons','age']
categorical = ['year_sentenced','dependents','race','disposition','citizen', 'state',
              'criminal_hist', 'drug_type','weapon','gender','crime_type','case_type','presentence_stat',
              'sentence_type','college']

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

ctx = ColumnTransformer(
    [('ss',StandardScaler(),numeric),
     ('ohe',OneHotEncoder(handle_unknown='ignore'),\
     categorical)],
     verbose_feature_names_out=False,
     remainder = 'passthrough'
)

ctx.fit(X_train)
X_train_t = ctx.transform(X_train)
X_test_t = ctx.transform(X_test)

X_train = pd.DataFrame(X_train_t.A,columns=ctx.get_feature_names_out())
X_test = pd.DataFrame(X_test_t.A,columns=ctx.get_feature_names_out())

In [7]:
lasso = Lasso(alpha=.001,max_iter=10000,random_state=42)
lasso.fit(X_train,y_train)
lasso.score(X_train,y_train), lasso.score(X_test,y_test)

(0.5261431286697706, 0.5364949039384008)

In [8]:
lzip = zip(ctx.get_feature_names_out(),lasso.coef_)
lc_dtc = {a:b for a,b in lzip}

In [11]:
lc_df=pd.DataFrame(lc_dtc,index=[0])

In [12]:
lc_df

Unnamed: 0,count_convictons,age,year_sentenced_2018,year_sentenced_2019,year_sentenced_2020,year_sentenced_2021,dependents_0,dependents_1,race_0,race_1,...,presentence_stat_3.0,presentence_stat_4.0,sentence_type_0,sentence_type_1,sentence_type_2,sentence_type_3,sentence_type_4,college_0,college_1,imprisoned
0,4.058243,3.426164,2.170427,-0.004163,-1.413627,0.385815,-0.811366,1.26673e-15,0.0,-0.0,...,-0.004616,0.659203,2.015051,12.634918,-6.843359,-3.48454,0.0,1.877117,-0.0,-0.0


In [17]:
lc_dff = pd.DataFrame()
lc_dff['feature'] = [i for i in ctx.get_feature_names_out()]
lc_dff['coeffs'] = [i for i in lasso.coef_]

In [18]:
lc_dff

Unnamed: 0,feature,coeffs
0,count_convictons,4.058243
1,age,3.426164
2,year_sentenced_2018,2.170427
3,year_sentenced_2019,-0.004163
4,year_sentenced_2020,-1.413627
...,...,...
125,sentence_type_3,-3.484540
126,sentence_type_4,0.000000
127,college_0,1.877117
128,college_1,-0.000000


In [35]:
state_coeff = lc_dff[lc_dff['feature'].str.contains("state")]

In [36]:
state_coeff['feature'] = [i.split('_')[1] for i in state_coeff['feature']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_coeff['feature'] = [i.split('_')[1] for i in state_coeff['feature']]


In [37]:
state_coeff = state_coeff.rename(columns={'feature':'state'})

In [38]:
state_coeff.head()

Unnamed: 0,state,coeffs
21,Alabama,3.06653
22,Alaska,-4.649757
23,Arizona,-2.076129
24,Arkansas,6.494063
25,California,-8.494445


In [39]:
pop_breakdown = pd.read_csv('pop_breakdowns.csv')

In [41]:
pop_breakdown = pd.merge(pop_breakdown,state_coeff,how='inner',on='state')

In [43]:
pop_breakdown = pop_breakdown.rename(columns={'coeffs':'model_coeffs'})

In [45]:
pop_breakdown.to_csv('pop_breakdowns.csv')

In [22]:
crime_coeff = lc_dff[lc_dff['feature'].str.contains("crime")]

In [48]:
crime_coeff.to_csv('crime_coeffs.csv')