In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from path import Path
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

In [3]:
file_path = Path("../ml_df.csv")
ml_df = pd.read_csv(file_path)
ml_df.head()

Unnamed: 0,ZipCode,PdDistricts,Months,Day,TimeOfDay,Resolutions,Category_ASSAULT,Category_BURGLARY,Category_DRUG/NARCOTIC,Category_FRAUD,...,Descript_POSSESSION OF NARCOTICS PARAPHERNALIA,Descript_RESISTING ARREST,Descript_STOLEN AUTOMOBILE,Descript_STOLEN TRUCK,Descript_SUSPICIOUS OCCURRENCE,Descript_THREATS AGAINST LIFE,Descript_TRAFFIC VIOLATION,Descript_TRAFFIC VIOLATION ARREST,Descript_TRESPASSING,Descript_WARRANT ARREST
0,-0.860927,0.950634,-0.443307,-1.465466,0.720326,0,-0.315846,-0.202024,-0.171087,-0.134147,...,-0.085168,-0.093327,-0.155175,-0.101159,-0.141017,-0.107157,-0.111078,-0.123178,-0.092808,-0.145267
1,-0.860927,0.950634,-0.443307,-1.465466,0.720326,0,-0.315846,-0.202024,-0.171087,-0.134147,...,-0.085168,-0.093327,-0.155175,-0.101159,-0.141017,-0.107157,-0.111078,-0.123178,-0.092808,-0.145267
2,0.909206,-1.566836,-1.598508,-0.970159,-1.132944,0,-0.315846,-0.202024,-0.171087,-0.134147,...,-0.085168,-0.093327,-0.155175,-0.101159,-0.141017,-0.107157,-0.111078,-0.123178,-0.092808,6.883863
3,-0.439467,1.669911,-0.443307,1.01107,-0.206309,1,-0.315846,-0.202024,-0.171087,-0.134147,...,-0.085168,-0.093327,-0.155175,-0.101159,-0.141017,-0.107157,-0.111078,-0.123178,-0.092808,-0.145267
4,-0.945219,-0.487921,-0.443307,-1.465466,1.646961,1,-0.315846,-0.202024,-0.171087,-0.134147,...,-0.085168,-0.093327,-0.155175,-0.101159,-0.141017,-0.107157,-0.111078,-0.123178,-0.092808,-0.145267


In [4]:
#Look at number of resolutions
ml_df.Resolutions.value_counts()

1    107779
0     39416
Name: Resolutions, dtype: int64

In [5]:
#create features
X = ml_df.drop("Resolutions", axis=1)

#create target
y = ml_df["Resolutions"]

In [6]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
# try random over sampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 80781, 1: 80781})

In [8]:
# train the classifier
brf = BalancedRandomForestClassifier()
brf.fit(X_resampled, y_resampled)
y_pred = brf.predict(X_test)

In [9]:
# Calculated the balanced accuracy score
balanced_acc_score = balanced_accuracy_score(y_test,y_pred)
print(f"balanced accuracy score = {balanced_acc_score*100:.2f}%")

balanced accuracy score = 81.71%


In [10]:
# Random oversampling didn't help our model beat the standard of 83% set by the baseline random forest model
# Try Synthetic oversampling
from imblearn.over_sampling import SMOTE
X_resampled2, y_resampled2 = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train, y_train)

In [11]:
# train the classifier
brf = BalancedRandomForestClassifier()
brf.fit(X_resampled2, y_resampled2)
y_pred2 = brf.predict(X_test)

In [12]:
# Calculated the balanced accuracy score
balanced_acc_score2 = balanced_accuracy_score(y_test,y_pred2)
print(f"balanced accuracy score = {balanced_acc_score2*100:.2f}%")

balanced accuracy score = 81.56%


In [13]:
# SMOTE oversampling didn't help our model either
# We'll try random undersampling next
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled3, y_resampled3 = ros.fit_resample(X_train, y_train)
Counter(y_resampled3)

Counter({0: 29615, 1: 29615})

In [14]:
# train the classifier
brf = BalancedRandomForestClassifier()
brf.fit(X_resampled3, y_resampled3)
y_pred3 = brf.predict(X_test)

In [15]:
# Calculated the balanced accuracy score
balanced_acc_score3 = balanced_accuracy_score(y_test,y_pred3)
print(f"balanced accuracy score = {balanced_acc_score3*100:.2f}%")

balanced accuracy score = 83.15%


In [16]:
#Random under sampling helped our model a bit. But it still hasn't surpassed our baseline of 83%
# Try 

KeyboardInterrupt: 

In [None]:
# train the classifier
brf = BalancedRandomForestClassifier()
brf.fit(X_resampled4, y_resampled4)
y_pred4 = brf.predict(X_test)