In [410]:
import warnings
warnings.filterwarnings('ignore')

In [411]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [412]:
cleaned_crime = pd.read_csv('Resources/crime_chip_for_model.csv')
cleaned_crime.head()

Unnamed: 0.1,Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,state,location,address,chip_latitude,chip_longitude,chipotle,Safety
0,0,60601,27,THEFT,3,,,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
1,1,60601,27,CRIMINAL DAMAGE,4,41.883932,-87.679964,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
2,2,60601,27,THEFT,3,41.896569,-87.636063,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
3,3,60601,27,SEX OFFENSE,3,41.883937,-87.683368,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
4,4,60601,27,NARCOTICS,4,41.892856,-87.710137,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad


In [413]:
cleaned_crime = cleaned_crime.drop(columns=['Unnamed: 0', 'state', 'location', 'address'])
cleaned_crime = cleaned_crime.dropna()
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace('#', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace(',', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].astype('int')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].str.replace(',', '')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].astype('float')
cleaned_crime['Population'] = cleaned_crime['Population'].str.replace(',', '')
cleaned_crime['Population'] = cleaned_crime['Population'].astype('int')
sample = cleaned_crime.sample(30000)
sample = sample.reset_index()
sample = sample.drop(columns=['index'])
sample.head()

Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,chip_latitude,chip_longitude,chipotle,Safety
0,60602,42,DECEPTIVE PRACTICE,3,41.88195,-87.633336,False,False,70,810.58,6855,41.882277,-87.627121,1,Bad
1,60657,7,OTHER OFFENSE,3,41.725687,-87.570565,False,True,66789,31204.02,112,41.932835,-87.668025,1,Bad
2,60652,40,ROBBERY,8,41.996803,-87.673947,False,False,39126,8425.81,757,41.754775,-87.740664,1,Bad
3,60612,16,CRIMINAL DAMAGE,4,41.795188,-87.671907,False,False,37990,10195.72,586,41.872368,-87.677238,1,Bad
4,60612,16,ASSAULT,7,41.778622,-87.659337,True,True,37990,10195.72,586,41.872368,-87.677238,1,Bad


In [414]:
primary_type = sample.Primary_Type.value_counts()

In [415]:
# Determine which values to replace if counts are less than ...?
replace_primary = list(primary_type[primary_type < 1000].index)

# Replace in dataframe
for primary in replace_primary:
    sample.Primary_Type = sample.Primary_Type.replace(primary,"Other")
    
# Check to make sure binning was successful
sample.Primary_Type.value_counts()

BATTERY                6334
THEFT                  5896
CRIMINAL DAMAGE        3620
Other                  2915
ASSAULT                2575
DECEPTIVE PRACTICE     1865
OTHER OFFENSE          1849
BURGLARY               1396
MOTOR VEHICLE THEFT    1282
NARCOTICS              1157
WEAPONS VIOLATION      1111
Name: Primary_Type, dtype: int64

In [416]:
# Generate our categorical variable lists
crime_obj = list(sample.dtypes[sample.dtypes == 'object'].index)

In [417]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned_crime[crime_obj]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_obj)
encode_df.head()

Unnamed: 0,Primary_Type_ARSON,Primary_Type_ASSAULT,Primary_Type_BATTERY,Primary_Type_BURGLARY,Primary_Type_CONCEALED CARRY LICENSE VIOLATION,Primary_Type_CRIM SEXUAL ASSAULT,Primary_Type_CRIMINAL DAMAGE,Primary_Type_CRIMINAL SEXUAL ASSAULT,Primary_Type_CRIMINAL TRESPASS,Primary_Type_DECEPTIVE PRACTICE,...,Primary_Type_PROSTITUTION,Primary_Type_PUBLIC INDECENCY,Primary_Type_PUBLIC PEACE VIOLATION,Primary_Type_ROBBERY,Primary_Type_SEX OFFENSE,Primary_Type_STALKING,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [418]:
# Merge one-hot encoded features and drop the originals
sample = sample.merge(encode_df, left_index=True, right_index=True)
sample = sample.drop(columns=crime_obj,axis=1)
sample.head()

Unnamed: 0,ZIP,Ward,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,...,Primary_Type_PROSTITUTION,Primary_Type_PUBLIC INDECENCY,Primary_Type_PUBLIC PEACE VIOLATION,Primary_Type_ROBBERY,Primary_Type_SEX OFFENSE,Primary_Type_STALKING,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
0,60602,42,3,41.88195,-87.633336,False,False,70,810.58,6855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,60657,7,3,41.725687,-87.570565,False,True,66789,31204.02,112,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,60652,40,8,41.996803,-87.673947,False,False,39126,8425.81,757,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,60612,16,4,41.795188,-87.671907,False,False,37990,10195.72,586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,60612,16,7,41.778622,-87.659337,True,True,37990,10195.72,586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [419]:
# Split our preprocessed data into our features and target arrays
X = sample.drop(columns=['Safety'],axis=1)
y = sample['Safety']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Resampling Techniques

### Naives Random

In [420]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)

In [421]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model_ros = LogisticRegression(solver='lbfgs', random_state=1)
model_ros.fit(X_resampled_ros, y_resampled_ros)

LogisticRegression(random_state=1)

In [422]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred_ros = model_ros.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ros)

0.5022931442080378

In [423]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_ros)

array([[3432, 3618],
       [ 217,  233]], dtype=int64)

In [424]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.49      0.52      0.64      0.50      0.25      7050
        1.0       0.06      0.52      0.49      0.11      0.50      0.25       450

avg / total       0.89      0.49      0.52      0.61      0.50      0.25      7500



### SMOTE

In [425]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [426]:
# Train the Logistic Regression model using the resampled data
model_smote = LogisticRegression(solver='lbfgs', random_state=1)
model_smote.fit(X_resampled_smote, y_resampled_smote)

LogisticRegression(random_state=1)

In [427]:
# Calculated the balanced accuracy score
y_pred_smote = model_smote.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smote)

0.4832387706855792

In [428]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smote)

array([[3978, 3072],
       [ 269,  181]], dtype=int64)

In [429]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smote))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.56      0.40      0.70      0.48      0.23      7050
        1.0       0.06      0.40      0.56      0.10      0.48      0.22       450

avg / total       0.88      0.55      0.41      0.67      0.48      0.23      7500



### Cluster Centroids

In [430]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train, y_train)

In [431]:
# Train the Logistic Regression model using the resampled data
model_cc = LogisticRegression(solver='lbfgs', random_state=1)
model_cc.fit(X_resampled_cc, y_resampled_cc)

LogisticRegression(random_state=1)

In [432]:
# Calculated the balanced accuracy score
y_pred_cc = model_cc.predict(X_test)
balanced_accuracy_score(y_test, y_pred_cc)

0.5106382978723405

In [433]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_cc)

array([[3675, 3375],
       [ 225,  225]], dtype=int64)

In [434]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.52      0.50      0.67      0.51      0.26      7050
        1.0       0.06      0.50      0.52      0.11      0.51      0.26       450

avg / total       0.89      0.52      0.50      0.64      0.51      0.26      7500



### Combination (Over and Under) Sampling

In [435]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=0)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn.fit_resample(X, y)

In [436]:
# Train the Logistic Regression model using the resampled data
model_smoteenn = LogisticRegression(solver='lbfgs', random_state=1)
model_smoteenn.fit(X_resampled_smoteenn, y_resampled_smoteenn)

LogisticRegression(random_state=1)

In [437]:
# Calculated the balanced accuracy score
y_pred_smoteenn = model_smoteenn.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smoteenn)

0.5155082742316784

In [438]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smoteenn)

array([[3070, 3980],
       [ 182,  268]], dtype=int64)

In [439]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smoteenn))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.44      0.60      0.60      0.51      0.26      7050
        1.0       0.06      0.60      0.44      0.11      0.51      0.26       450

avg / total       0.89      0.45      0.59      0.57      0.51      0.26      7500



# Ensemble Techniques

### Balanced Random Forest

In [440]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [441]:
# Calculated the balanced accuracy score
y_pred_brfc = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brfc)

0.9352482269503546

In [442]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brfc)

array([[6560,  490],
       [  27,  423]], dtype=int64)

In [443]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.93      0.94      0.96      0.94      0.87      7050
        1.0       0.46      0.94      0.93      0.62      0.94      0.88       450

avg / total       0.96      0.93      0.94      0.94      0.94      0.87      7500



In [444]:
# List the features sorted in descending order by feature importance
importances = brfc_model.feature_importances_
sorted(zip(brfc_model.feature_importances_, X.columns), reverse=True)

[(0.46846282676790635, 'Primary_Type_ASSAULT'),
 (0.08895665689257837, 'Primary_Type_THEFT'),
 (0.08165191108202052, 'Primary_Type_BATTERY'),
 (0.05051543293122887, 'Longitude'),
 (0.05030993820551153, 'Latitude'),
 (0.04822871685093796, 'Primary_Type_DECEPTIVE PRACTICE'),
 (0.040707779088361734, 'Primary_Type_CRIMINAL DAMAGE'),
 (0.02087267671444719, 'rankings'),
 (0.01724625413221877, 'Primary_Type_BURGLARY'),
 (0.0171475827909164, 'Primary_Type_OTHER OFFENSE'),
 (0.015119699806775877, 'Primary_Type_MOTOR VEHICLE THEFT'),
 (0.012163241311598554, 'Primary_Type_NARCOTICS'),
 (0.008393418604180717, 'Primary_Type_ROBBERY'),
 (0.008123265434910532, 'chip_latitude'),
 (0.007854694769521501, 'Primary_Type_WEAPONS VIOLATION'),
 (0.007677651180465466, 'chip_longitude'),
 (0.006957734415195485, 'Primary_Type_CONCEALED CARRY LICENSE VIOLATION'),
 (0.006760158875245541, 'Primary_Type_CRIMINAL TRESPASS'),
 (0.006064021755847914, 'Ward'),
 (0.005684853215375099, 'Domestic'),
 (0.005390524800132548

### Easy Ensemble AdaBoost Classifier

In [445]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
ee_model.fit(X_train, y_train)
# Train the EasyEnsembleClassifier

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [446]:
# Calculated the balanced accuracy score
y_pred_ee = ee_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ee)

0.9457919621749409

In [447]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_ee)

array([[6411,  639],
       [   8,  442]], dtype=int64)

In [448]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_ee))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.91      0.98      0.95      0.95      0.89      7050
        1.0       0.41      0.98      0.91      0.58      0.95      0.90       450

avg / total       0.96      0.91      0.98      0.93      0.95      0.89      7500

