#Description

In this code, we play around with synthetic minority over-sampling technique (SMOTE) to address the class imbalance problem we identified in the baseline models. We continue to use only three features in this code.

# Load the data

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

In [5]:
df = pd.read_csv('./preprocessed_TJI_deaths.csv')

In [8]:
df_white = df[df['race']=='WHITE']
df_black = df[df['race']=='BLACK']
df_hispanic = df[df['race']=='HISPANIC']

#White baseline with SMOTE

In [26]:
from imblearn.over_sampling import SMOTE

In [27]:
from sklearn.model_selection import train_test_split

df_white_Y = df_white['Natural']
df_white_X = df_white[['sex',
                       'age_at_time_of_death',
                       'Offense',]]

X_train, X_test, y_train, y_test = train_test_split(df_white_X, df_white_Y, test_size=0.2)

Repeat the logistic regression by first trying SMOTE to address imbalanced classes

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

preprocess_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), ['age_at_time_of_death']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['Offense']),
    ('binary', OrdinalEncoder(), ['sex'])
])

Check the performance of the new logistic regression with this change

In [29]:
model_white = LogisticRegression()

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('smote', SMOTE(sampling_strategy='auto')),
    ('model', model_white)
])

cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring = 'accuracy')

print("CV-results:\n", cv_results, "\n-------------------------------")

cv = StratifiedKFold(n_splits=10)

param_grid = [
    {'model__penalty' : ['l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['lbfgs','liblinear', 'saga']}
]

gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv = cv, verbose=True, n_jobs=-1)

gridsearch.fit(X_train, y_train)

print("Best hyperparameters:", gridsearch.best_params_, "\n-------------------------------")

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: ', accuracy)

CV-results:
 [0.82926829 0.81300813 0.82113821 0.81300813 0.7696477  0.80758808
 0.82655827 0.80978261 0.75271739 0.83152174] 
-------------------------------
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best hyperparameters: {'model__C': 0.615848211066026, 'model__penalty': 'l2', 'model__solver': 'saga'} 
-------------------------------
Test accuracy:  0.8134490238611713


In [30]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)

print(classification_report(y_test, y_pred))

Confusion matrix:
 [[627 138]
 [ 34 123]]
              precision    recall  f1-score   support

     Natural       0.95      0.82      0.88       765
   Unnatural       0.47      0.78      0.59       157

    accuracy                           0.81       922
   macro avg       0.71      0.80      0.73       922
weighted avg       0.87      0.81      0.83       922



#Black baseline with SMOTE

In [31]:
df_black_X = df_black[['sex',
                       'age_at_time_of_death',
                       'Offense',]]

df_black_Y = df_black['Natural']

X_train, X_test, y_train, y_test = train_test_split(df_black_X, df_black_Y, test_size=0.2)

preprocess_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), ['age_at_time_of_death']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['Offense']),
    ('binary', OrdinalEncoder(), ['sex'])
])

model_black = LogisticRegression()

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('smote', SMOTE(sampling_strategy='auto')),
    ('model', model_black)
])

cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring = 'accuracy')


print("CV-results:\n", cv_results, "\n-------------------------------")

cv = StratifiedKFold(n_splits=10)

param_grid = [
    {'model__penalty' : ['l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['lbfgs','liblinear', 'saga']}
]

gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv = cv, verbose=True, n_jobs=-1)

gridsearch.fit(X_train, y_train)

print("Best hyperparameters:", gridsearch.best_params_, "\n-------------------------------")

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: ', accuracy)

CV-results:
 [0.752      0.74       0.78       0.792      0.784      0.776
 0.7751004  0.77911647 0.80321285 0.73895582] 
-------------------------------
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best hyperparameters: {'model__C': 0.08858667904100823, 'model__penalty': 'l2', 'model__solver': 'lbfgs'} 
-------------------------------
Test accuracy:  0.7756410256410257


In [32]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)

print(classification_report(y_test, y_pred))

Confusion matrix:
 [[416 128]
 [ 12  68]]
              precision    recall  f1-score   support

     Natural       0.97      0.76      0.86       544
   Unnatural       0.35      0.85      0.49        80

    accuracy                           0.78       624
   macro avg       0.66      0.81      0.67       624
weighted avg       0.89      0.78      0.81       624



# Hispanic baseline with SMOTE

In [33]:
df_hispanic_X = df_hispanic[['sex',
                       'age_at_time_of_death',
                       'Offense',]]

df_hispanic_Y = df_hispanic['Natural']

X_train, X_test, y_train, y_test = train_test_split(df_hispanic_X, df_hispanic_Y, test_size=0.2)

preprocess_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), ['age_at_time_of_death']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['Offense']),
    ('binary', OrdinalEncoder(), ['sex'])
])

model_hispanic = LogisticRegression()

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('smote', SMOTE(sampling_strategy='auto')),
    ('model', model_hispanic)
])

cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring = 'accuracy')

print("CV-results:\n", cv_results, "\n-------------------------------")

cv = StratifiedKFold(n_splits=10)

param_grid = [
    {'model__penalty' : ['l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['lbfgs','liblinear', 'saga']}
]

gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv = cv, verbose=True, n_jobs=-1)

gridsearch.fit(X_train, y_train)

print("Best hyperparameters:", gridsearch.best_params_, "\n-------------------------------")

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: ', accuracy)

CV-results:
 [0.81818182 0.8008658  0.82251082 0.78787879 0.80952381 0.76190476
 0.77489177 0.82683983 0.77489177 0.81385281] 
-------------------------------
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best hyperparameters: {'model__C': 78.47599703514607, 'model__penalty': 'l2', 'model__solver': 'liblinear'} 
-------------------------------
Test accuracy:  0.8062283737024222


In [34]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)

print(classification_report(y_test, y_pred))

Confusion matrix:
 [[372  91]
 [ 21  94]]
              precision    recall  f1-score   support

     Natural       0.95      0.80      0.87       463
   Unnatural       0.51      0.82      0.63       115

    accuracy                           0.81       578
   macro avg       0.73      0.81      0.75       578
weighted avg       0.86      0.81      0.82       578



While SMOTE helped improve recall, it did so at a dramatic expense of overall precision.

# Modification 1: use weighted loss functions

One solution recommended by Dr. Orchard was to introduce class weights. This would then influence the cost function (https://medium.com/@rafaelnduarte/class-weight-smote-random-over-and-under-sampling-bca603378e02). Let's try applying this to our Black machine.

In [35]:
df_black_X = df_black[['sex',
                       'age_at_time_of_death',
                       'Offense',]]

df_black_Y = df_black['Natural']

X_train, X_test, y_train, y_test = train_test_split(df_black_X, df_black_Y, test_size=0.2)

preprocess_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), ['age_at_time_of_death']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['Offense']),
    ('binary', OrdinalEncoder(), ['sex'])
])

model_black = LogisticRegression(class_weight='balanced')

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('smote', SMOTE(sampling_strategy='auto')),
    ('model', model_black)
])

cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring = 'accuracy')


print("CV-results:\n", cv_results, "\n-------------------------------")

cv = StratifiedKFold(n_splits=10)

param_grid = [
    {'model__penalty' : ['l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['lbfgs','liblinear', 'saga']}
]

gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv = cv, verbose=True, n_jobs=-1)

gridsearch.fit(X_train, y_train)

print("Best hyperparameters:", gridsearch.best_params_, "\n-------------------------------")

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: ', accuracy)

CV-results:
 [0.768      0.788      0.78       0.816      0.732      0.824
 0.78714859 0.78313253 0.74698795 0.75903614] 
-------------------------------
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best hyperparameters: {'model__C': 206.913808111479, 'model__penalty': 'l2', 'model__solver': 'lbfgs'} 
-------------------------------
Test accuracy:  0.7323717948717948


In [36]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)

print(classification_report(y_test, y_pred))

Confusion matrix:
 [[398 143]
 [ 24  59]]
              precision    recall  f1-score   support

     Natural       0.94      0.74      0.83       541
   Unnatural       0.29      0.71      0.41        83

    accuracy                           0.73       624
   macro avg       0.62      0.72      0.62       624
weighted avg       0.86      0.73      0.77       624



The precision and recall seem to remain relatively the same, indicating no real improvement.

#Modification 2: try ADASYN

In [37]:
from imblearn.over_sampling import ADASYN

df_black_X = df_black[['sex',
                       'age_at_time_of_death',
                       'Offense',]]

df_black_Y = df_black['Natural']

X_train, X_test, y_train, y_test = train_test_split(df_black_X, df_black_Y, test_size=0.2)

preprocess_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), ['age_at_time_of_death']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['Offense']),
    ('binary', OrdinalEncoder(), ['sex'])
])

model_black = LogisticRegression()

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('adasyn', ADASYN(sampling_strategy='auto')),
    ('model', model_black)
])

cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring = 'accuracy')


print("CV-results:\n", cv_results, "\n-------------------------------")

cv = StratifiedKFold(n_splits=10)

param_grid = [
    {'model__penalty' : ['l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['lbfgs','liblinear', 'saga']}
]

gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv = cv, verbose=True, n_jobs=-1)

gridsearch.fit(X_train, y_train)

print("Best hyperparameters:", gridsearch.best_params_, "\n-------------------------------")

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: ', accuracy)

CV-results:
 [0.74       0.76       0.772      0.776      0.8        0.708
 0.73493976 0.75903614 0.72289157 0.76706827] 
-------------------------------
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best hyperparameters: {'model__C': 545.5594781168514, 'model__penalty': 'l2', 'model__solver': 'saga'} 
-------------------------------
Test accuracy:  0.782051282051282


In [38]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)

print(classification_report(y_test, y_pred))

Confusion matrix:
 [[418 115]
 [ 21  70]]
              precision    recall  f1-score   support

     Natural       0.95      0.78      0.86       533
   Unnatural       0.38      0.77      0.51        91

    accuracy                           0.78       624
   macro avg       0.67      0.78      0.68       624
weighted avg       0.87      0.78      0.81       624



While the precision marginally increased, the recall fell as well.

#General remarks

In our presentation, we included SMOTE. However, in hindsight, this may have interfered with our performance metrics. For now, we will repeat the analysis, but without oversampling of the minority class.