In [250]:
import warnings
warnings.filterwarnings('ignore')

In [251]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [252]:
cleaned_crime = pd.read_csv('Resources/crime_chip_for_model.csv')
cleaned_crime.head()

Unnamed: 0.1,Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,...,National_Rank,state,location,address,chip_latitude,chip_longitude,chipotle,Safety,Arrest_1,Domestic_1
0,0,60601,27,THEFT,3,,,0,0,5591,...,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,1,False,False
1,1,60601,27,CRIMINAL DAMAGE,4,41.883932,-87.679964,0,0,5591,...,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,0,False,False
2,2,60601,27,THEFT,3,41.896569,-87.636063,0,0,5591,...,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,1,False,False
3,3,60601,27,SEX OFFENSE,3,41.883937,-87.683368,1,0,5591,...,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,1,True,False
4,4,60601,27,NARCOTICS,4,41.892856,-87.710137,1,0,5591,...,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,0,True,False


In [253]:
cleaned_crime = cleaned_crime.drop(columns=['Unnamed: 0', 'state', 'location', 'address' ,'chip_latitude', 'chip_longitude', 'Arrest_1', 'Domestic_1'])
cleaned_crime = cleaned_crime.dropna()
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace('#', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace(',', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].astype('int')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].str.replace(',', '')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].astype('float')
cleaned_crime['Population'] = cleaned_crime['Population'].str.replace(',', '')
cleaned_crime['Population'] = cleaned_crime['Population'].astype('int')
sample = cleaned_crime.sample(n=30000)
sample = sample.reset_index()
sample = sample.drop(columns=['index'])
sample.head()

Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,chipotle,Safety
0,60639,34,CRIMINAL DAMAGE,4,41.682615,-87.633615,0,1,92951,19854.06,214,1,0
1,60601,27,WEAPONS VIOLATION,5,41.898106,-87.718842,1,0,5591,17101.15,271,1,0
2,60601,27,CRIMINAL DAMAGE,4,41.881442,-87.667838,0,0,5591,17101.15,271,1,0
3,60607,29,CRIMINAL DAMAGE,4,41.908245,-87.77557,0,0,15552,6428.11,1194,1,0
4,60657,7,WEAPONS VIOLATION,3,41.74192,-87.561219,0,0,66789,31204.02,112,1,1


In [254]:
primary_type = sample.Primary_Type.value_counts()

In [255]:
# Determine which values to replace if counts are less than ...?
replace_primary = list(primary_type[primary_type < 1000].index)

# Replace in dataframe
for primary in replace_primary:
    sample.Primary_Type = sample.Primary_Type.replace(primary,"OTHER OFFENSE")
    
# Check to make sure binning was successful
sample.Primary_Type.value_counts()

BATTERY                6291
THEFT                  5795
OTHER OFFENSE          3829
CRIMINAL DAMAGE        3681
ASSAULT                2642
DECEPTIVE PRACTICE     1834
BURGLARY               1317
MOTOR VEHICLE THEFT    1294
NARCOTICS              1168
WEAPONS VIOLATION      1092
ROBBERY                1057
Name: Primary_Type, dtype: int64

In [256]:
# Generate our categorical variable lists
crime_obj = list(sample.dtypes[sample.dtypes == 'object'].index)

In [257]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned_crime[crime_obj]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_obj)
encode_df.head()

Unnamed: 0,Primary_Type_ARSON,Primary_Type_ASSAULT,Primary_Type_BATTERY,Primary_Type_BURGLARY,Primary_Type_CONCEALED CARRY LICENSE VIOLATION,Primary_Type_CRIM SEXUAL ASSAULT,Primary_Type_CRIMINAL DAMAGE,Primary_Type_CRIMINAL SEXUAL ASSAULT,Primary_Type_CRIMINAL TRESPASS,Primary_Type_DECEPTIVE PRACTICE,...,Primary_Type_OTHER NARCOTIC VIOLATION,Primary_Type_OTHER OFFENSE,Primary_Type_PROSTITUTION,Primary_Type_PUBLIC INDECENCY,Primary_Type_PUBLIC PEACE VIOLATION,Primary_Type_ROBBERY,Primary_Type_SEX OFFENSE,Primary_Type_STALKING,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
# Merge one-hot encoded features and drop the originals
# sample = sample.merge(encode_df, left_index=True, right_index=True)
sample = sample.drop(columns=crime_obj,axis=1)
sample.head()

Unnamed: 0,ZIP,Ward,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,chipotle,Safety
0,60639,34,4,41.682615,-87.633615,0,1,92951,19854.06,214,1,0
1,60601,27,5,41.898106,-87.718842,1,0,5591,17101.15,271,1,0
2,60601,27,4,41.881442,-87.667838,0,0,5591,17101.15,271,1,0
3,60607,29,4,41.908245,-87.77557,0,0,15552,6428.11,1194,1,0
4,60657,7,3,41.74192,-87.561219,0,0,66789,31204.02,112,1,1


In [259]:
# Split our preprocessed data into our features and target arrays
X = sample.drop(columns=['chipotle','Ward','Population','rankings','Latitude','Longitude'],axis=1)
y = sample['chipotle']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Resampling Techniques

### Naives Random

In [260]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)

In [261]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model_ros = LogisticRegression(solver='lbfgs', random_state=1)
model_ros.fit(X_resampled_ros, y_resampled_ros)

LogisticRegression(random_state=1)

In [262]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred_ros = model_ros.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ros)

0.5261783749438314

In [263]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_ros)

array([[2162,  660],
       [3339, 1339]], dtype=int64)

In [264]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.39      0.77      0.29      0.52      0.47      0.23      2822
          1       0.67      0.29      0.77      0.40      0.47      0.21      4678

avg / total       0.57      0.47      0.59      0.45      0.47      0.22      7500



### SMOTE

In [265]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [266]:
# Train the Logistic Regression model using the resampled data
model_smote = LogisticRegression(solver='lbfgs', random_state=1)
model_smote.fit(X_resampled_smote, y_resampled_smote)

LogisticRegression(random_state=1)

In [267]:
# Calculated the balanced accuracy score
y_pred_smote = model_smote.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smote)

0.5390330782173535

In [268]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smote)

array([[2392,  430],
       [3600, 1078]], dtype=int64)

In [269]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smote))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.40      0.85      0.23      0.54      0.44      0.21      2822
          1       0.71      0.23      0.85      0.35      0.44      0.18      4678

avg / total       0.60      0.46      0.62      0.42      0.44      0.19      7500



### Cluster Centroids

In [270]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train, y_train)

In [271]:
# Train the Logistic Regression model using the resampled data
model_cc = LogisticRegression(solver='lbfgs', random_state=1)
model_cc.fit(X_resampled_cc, y_resampled_cc)

LogisticRegression(random_state=1)

In [272]:
# Calculated the balanced accuracy score
y_pred_cc = model_cc.predict(X_test)
balanced_accuracy_score(y_test, y_pred_cc)

0.5452323086577127

In [273]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_cc)

array([[2392,  430],
       [3542, 1136]], dtype=int64)

In [274]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.40      0.85      0.24      0.55      0.45      0.22      2822
          1       0.73      0.24      0.85      0.36      0.45      0.19      4678

avg / total       0.60      0.47      0.62      0.43      0.45      0.20      7500



### Combination (Over and Under) Sampling

In [275]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=0)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn.fit_resample(X, y)

In [276]:
# Train the Logistic Regression model using the resampled data
model_smoteenn = LogisticRegression(solver='lbfgs', random_state=1)
model_smoteenn.fit(X_resampled_smoteenn, y_resampled_smoteenn)

LogisticRegression(random_state=1)

In [277]:
# Calculated the balanced accuracy score
y_pred_smoteenn = model_smoteenn.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smoteenn)

0.5261783749438314

In [278]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smoteenn)

array([[2162,  660],
       [3339, 1339]], dtype=int64)

In [279]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smoteenn))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.39      0.77      0.29      0.52      0.47      0.23      2822
          1       0.67      0.29      0.77      0.40      0.47      0.21      4678

avg / total       0.57      0.47      0.59      0.45      0.47      0.22      7500



# Ensemble Techniques

### Balanced Random Forest

In [280]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [281]:
# Calculated the balanced accuracy score
y_pred_brfc = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brfc)

1.0

In [282]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brfc)

array([[2822,    0],
       [   0, 4678]], dtype=int64)

In [283]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      2822
          1       1.00      1.00      1.00      1.00      1.00      1.00      4678

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      7500



In [284]:
# List the features sorted in descending order by feature importance
importances = brfc_model.feature_importances_
sorted(zip(brfc_model.feature_importances_, X.columns), reverse=True)

[(0.4021372069048553, 'ZIP'),
 (0.29946969256982064, 'People/Sq.Mile'),
 (0.2958119688644478, 'National_Rank'),
 (0.001179080080704252, 'Safety'),
 (0.0010192444276757345, 'Domestic'),
 (0.0003828071524960761, 'Arrest')]

### Easy Ensemble AdaBoost Classifier

In [285]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
ee_model.fit(X_train, y_train)
# Train the EasyEnsembleClassifier

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [286]:
# Calculated the balanced accuracy score
y_pred_ee = ee_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ee)

1.0

In [287]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_ee)

array([[2822,    0],
       [   0, 4678]], dtype=int64)

In [288]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_ee))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      2822
          1       1.00      1.00      1.00      1.00      1.00      1.00      4678

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      7500

