In [1]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.datasets import make_imbalance
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler


ImportError: cannot import name '_joblib_parallel_args' from 'sklearn.utils.fixes' (/Users/danielschlant/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py)

In [None]:
# Load 'census_20.csv' (path relative to the notebook's working directory) into `census_data`.
census_data = pd.read_csv('census_20.csv')

In [None]:
# Load 'df18to21_cleaned.csv' into `df18to21`, the frame the analysis cells below operate on.
df18to21 = pd.read_csv('df18to21_cleaned.csv')

In [None]:
# Preview the first rows of the census table.
# NOTE(review): `census_data` is not referenced again in the visible part of this notebook.
census_data.head()

In [None]:
# List the available column names before selecting features.
df18to21.columns

In [None]:
# Drop the two 'Unnamed' columns (presumably index columns written out by earlier
# to_csv calls -- TODO confirm). errors='ignore' keeps the cell idempotent: re-running it
# after the columns are already gone no longer raises KeyError.
df18to21 = df18to21.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

#### Imprisoned Rate by Year

No discernible change in the rate of prison sentences due to COVID-19.

In [None]:
# Mean of the `imprisoned` column within each sentencing year.
(
    df18to21
    .groupby('year_sentenced')['imprisoned']
    .mean()
)

Fewer convictions post-COVID.

#### Number of Sentenced by Year

In [None]:
# Number of rows per sentencing year, most frequent year first.
df18to21['year_sentenced'].value_counts()

#### Analysis: Age 25 and Over

In [None]:
# 1 for rows with age >= 25, else 0. The bound is inclusive despite the column name.
# Vectorised comparison instead of a Python-level loop; a missing age compares False
# and therefore maps to 0, exactly as before.
df18to21['over25'] = (df18to21['age'] >= 25).astype(int)

### Violent Crime Analysis

https://www.ussc.gov/guidelines/amendment/798#:~:text=%E2%80%9C%20'Crime%20of%20violence'%20includes,and%20burglary%20of%20a%20dwelling.

“‘Crime of violence’ includes murder, manslaughter, kidnapping, aggravated assault, forcible sex offenses, robbery, arson, extortion, extortionate extension of credit, and burglary of a dwelling.”

In [None]:
# `crime_type` codes treated as violent offences in this analysis.
# NOTE(review): the original list contained 26 twice. The duplicate is dropped here
# (membership tests are unaffected), but confirm against the codebook that the second 26
# was not a typo for a different code.
violent_crimes = [22, 27, 20, 19, 4, 26, 3, 12]

In [None]:
# 1 when the row's crime_type is one of the codes in `violent_crimes`, else 0.
# Series.isin replaces the per-row Python scan of the list.
df18to21['violent_crime'] = df18to21['crime_type'].isin(violent_crimes).astype(int)

Less violent crime as you get older.

In [None]:
# Share of violent-crime rows within each `over25` group (0 and 1).
(
    df18to21
    .groupby('over25')['violent_crime']
    .mean()
)

In [None]:
# Overall share of rows flagged as violent crime, for comparison with the per-group rates.
df18to21['violent_crime'].mean()

In [None]:
# Share of violent-crime rows at each age, as a two-column frame (age, violent_crime).
df_age_violent = (
    df18to21
    .groupby('age')['violent_crime']
    .mean()
    .reset_index()
)
df_age_violent.head()

In [None]:
# Violent-crime share plotted against age, one point per age value.
sns.scatterplot(data=df_age_violent, x='age', y='violent_crime')

### Classifier for whether someone was held in custody for trial

In [None]:
# Binary target: 1 where presentence_stat == 1, else 0 (missing values map to 0).
# NOTE(review): code 1 is taken to mean "held in custody" -- confirm against the codebook.
# Vectorised comparison instead of a Python-level loop.
df18to21['trial_custody'] = (df18to21['presentence_stat'] == 1).astype(int)

In [None]:
# Share of rows in the positive class (trial_custody == 1).
df18to21['trial_custody'].mean()

In [None]:
# Feature matrix and target for the trial-custody classifier.
# `presentence_stat` is deliberately absent: the target is derived from it.
X = df18to21[[
    'dependents', 'count_convictons', 'disposition', 'citizen', 'state',
    'criminal_hist', 'drug_type', 'case_type', 'age', 'weapon', 'gender',
    'crime_type', 'college', 'white',
]]
y = df18to21['trial_custody']

In [None]:
# Columns that get min-max scaled.
numeric = ['count_convictons', 'age']

# Columns that get one-hot encoded.
categorical = [
    'dependents', 'white', 'disposition', 'citizen', 'state', 'criminal_hist',
    'drug_type', 'weapon', 'gender', 'crime_type', 'case_type', 'college',
]

In [None]:
# Stratified train/test split (scikit-learn's default split size), seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Preprocessing: min-max scale the numeric columns and one-hot encode the categorical ones.
ctx = ColumnTransformer(
    [
        ('mms', MinMaxScaler(), numeric),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical),
    ],
    remainder='passthrough',
    # Always return a dense ndarray. The previous `.A` shortcut only exists on sparse
    # matrices and fails whenever the transformer output is already dense.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transformer to both splits.
X_train_t = ctx.fit_transform(X_train)
X_test_t = ctx.transform(X_test)

# Wrap the arrays in DataFrames with readable feature names. Reusing the original row
# labels keeps X_train / X_test aligned with y_train / y_test in label-based pandas
# operations.
# NOTE: this overwrites the raw X_train / X_test, so re-run the train_test_split cell
# before re-running this one.
feature_names = ctx.get_feature_names_out()
X_train = pd.DataFrame(X_train_t, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_t, columns=feature_names, index=X_test.index)

In [None]:
# Class shares in the test split; the larger share is the accuracy of always predicting
# the majority class, i.e. the baseline the classifiers below have to beat.
y_test.value_counts(normalize=True)

In [None]:
# Multinomial naive Bayes; the cell displays (train accuracy, test accuracy).
mnb = MultinomialNB().fit(X_train, y_train)
(mnb.score(X_train, y_train), mnb.score(X_test, y_test))

In [None]:
# Logistic regression with a generous iteration cap; displays (train accuracy, test accuracy).
logreg = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)
(logreg.score(X_train, y_train), logreg.score(X_test, y_test))

In [None]:
# Per-feature gap between the log-probabilities of the second and first class rows of the
# fitted naive Bayes model. Positive values mean the feature is more likely under class
# index 1 (label 1 here, since the target is 0/1).
imps = mnb.feature_log_prob_[1] - mnb.feature_log_prob_[0]

In [None]:
# One-column frame of the log-probability gaps, labelled by the transformed feature names.
mnb_feat_imp = pd.DataFrame({'feature imps': imps}, index=X_train.columns)

In [None]:
# The 15 features with the largest positive gap.
(
    mnb_feat_imp
    .sort_values(by='feature imps', ascending=False)
    .head(15)
)

https://www.nbcnews.com/news/us-news/new-mexico-eliminated-cash-bail-now-one-county-locks-more-n1250257

In [None]:
# Row count per drug_type code (see the code legend in the markdown below).
df18to21['drug_type'].value_counts()

Drug type codes:

- 1 = Cocaine
- 2 = Crack
- 3 = Heroin
- 4 = Marijuana
- 6 = Methamphetamine
- 7 = Fentanyl
- 77 = Other

### Classification: Whether the Minimum Guideline Was Exceeded

In [None]:
# Binary target: 1 where guideline_var_pct is strictly positive, else 0 (missing -> 0).
# Vectorised comparison instead of a Python-level loop.
df18to21['above_min'] = (df18to21['guideline_var_pct'] > 0).astype(int)

In [None]:
# Feature matrix and target for the above-minimum classifier. Compared with the
# trial-custody model it adds 'presentence_stat' and uses 'race' in place of 'white'.
X = df18to21[[
    'dependents', 'count_convictons', 'disposition', 'citizen', 'state',
    'criminal_hist', 'drug_type', 'case_type', 'age', 'weapon', 'gender',
    'crime_type', 'college', 'presentence_stat', 'race',
]]
y = df18to21['above_min']

In [None]:
# Class shares of the target; the larger share is the majority-class baseline accuracy.
y.value_counts(normalize=True)

In [None]:
# Columns that get min-max scaled.
numeric = ['count_convictons', 'age']

# Columns that get one-hot encoded.
categorical = [
    'dependents', 'race', 'disposition', 'citizen', 'state', 'presentence_stat',
    'criminal_hist', 'drug_type', 'weapon', 'gender', 'crime_type', 'case_type',
    'college',
]

In [None]:
# Stratified train/test split (scikit-learn's default split size), seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Preprocessing: min-max scale the numeric columns and one-hot encode the categorical ones.
ctx = ColumnTransformer(
    [
        ('mms', MinMaxScaler(), numeric),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical),
    ],
    remainder='passthrough',
    # Always return a dense ndarray. The previous `.A` shortcut only exists on sparse
    # matrices and fails whenever the transformer output is already dense.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transformer to both splits.
X_train_t = ctx.fit_transform(X_train)
X_test_t = ctx.transform(X_test)

# Wrap the arrays in DataFrames with readable feature names. Reusing the original row
# labels keeps X_train / X_test aligned with y_train / y_test in label-based pandas
# operations.
# NOTE: this overwrites the raw X_train / X_test, so re-run the train_test_split cell
# before re-running this one.
feature_names = ctx.get_feature_names_out()
X_train = pd.DataFrame(X_train_t, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_t, columns=feature_names, index=X_test.index)

In [None]:
# Multinomial naive Bayes on the above-minimum target; displays (train accuracy, test accuracy).
mnb2 = MultinomialNB().fit(X_train, y_train)
(mnb2.score(X_train, y_train), mnb2.score(X_test, y_test))

In [None]:
# Logistic regression on the above-minimum target, configured like `logreg` above:
# the default max_iter=100 is easily exhausted on a wide one-hot design matrix, and a
# fixed random_state keeps the two models set up consistently.
logreg2 = LogisticRegression(random_state=42, max_iter=10000)
logreg2.fit(X_train, y_train)
# Displays (train accuracy, test accuracy).
logreg2.score(X_train, y_train), logreg2.score(X_test, y_test)

In [None]:
# 3-nearest-neighbours classifier; displays (train accuracy, test accuracy).
knc = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
(knc.score(X_train, y_train), knc.score(X_test, y_test))