In [276]:
import warnings
warnings.filterwarnings('ignore')

In [277]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [278]:
cleaned_crime = pd.read_csv('Resources/crime_chip_for_model.csv')
cleaned_crime.head()

Unnamed: 0.1,Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,state,location,address,chip_latitude,chip_longitude,chipotle,Safety
0,0,60601,27,THEFT,3,,,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
1,1,60601,27,CRIMINAL DAMAGE,4,41.883932,-87.679964,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
2,2,60601,27,THEFT,3,41.896569,-87.636063,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
3,3,60601,27,SEX OFFENSE,3,41.883937,-87.683368,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
4,4,60601,27,NARCOTICS,4,41.892856,-87.710137,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad


In [279]:
cleaned_crime = cleaned_crime.drop(columns=['Unnamed: 0', 'state', 'location', 'address'])
cleaned_crime = cleaned_crime.dropna()
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace('#', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace(',', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].astype('int')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].str.replace(',', '')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].astype('float')
cleaned_crime['Population'] = cleaned_crime['Population'].str.replace(',', '')
cleaned_crime['Population'] = cleaned_crime['Population'].astype('int')
cleaned_crime.head()

Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,chip_latitude,chip_longitude,chipotle,Safety
1,60601,27,CRIMINAL DAMAGE,4,41.883932,-87.679964,False,False,5591,17101.15,271,41.887288,-87.624848,1,Bad
2,60601,27,THEFT,3,41.896569,-87.636063,False,False,5591,17101.15,271,41.887288,-87.624848,1,Bad
3,60601,27,SEX OFFENSE,3,41.883937,-87.683368,True,False,5591,17101.15,271,41.887288,-87.624848,1,Bad
4,60601,27,NARCOTICS,4,41.892856,-87.710137,True,False,5591,17101.15,271,41.887288,-87.624848,1,Bad
5,60601,27,OFFENSE INVOLVING CHILDREN,7,41.901683,-87.718962,False,True,5591,17101.15,271,41.887288,-87.624848,1,Bad


In [280]:
primary_type = cleaned_crime.Primary_Type.value_counts()

In [281]:
# Determine which values to replace if counts are less than ...?
replace_primary = list(primary_type[primary_type < 1000].index)

# Replace in dataframe
for primary in replace_primary:
    cleaned_crime.Primary_Type = cleaned_crime.Primary_Type.replace(primary,"Other")
    
# Check to make sure binning was successful
cleaned_crime.Primary_Type.value_counts()

BATTERY                15820
THEFT                  14747
CRIMINAL DAMAGE         9184
ASSAULT                 6521
OTHER OFFENSE           4639
DECEPTIVE PRACTICE      4575
BURGLARY                3528
Other                   3243
MOTOR VEHICLE THEFT     3151
NARCOTICS               2890
WEAPONS VIOLATION       2689
ROBBERY                 2512
CRIMINAL TRESPASS       1611
Name: Primary_Type, dtype: int64

In [282]:
# Generate our categorical variable lists
crime_obj = list(cleaned_crime.dtypes[cleaned_crime.dtypes == 'object'].index)

In [283]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned_crime[crime_obj]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_obj)
encode_df.head()

Unnamed: 0,Primary_Type_ASSAULT,Primary_Type_BATTERY,Primary_Type_BURGLARY,Primary_Type_CRIMINAL DAMAGE,Primary_Type_CRIMINAL TRESPASS,Primary_Type_DECEPTIVE PRACTICE,Primary_Type_MOTOR VEHICLE THEFT,Primary_Type_NARCOTICS,Primary_Type_OTHER OFFENSE,Primary_Type_Other,Primary_Type_ROBBERY,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [284]:
# Merge one-hot encoded features and drop the originals
cleaned_crime = cleaned_crime.merge(encode_df, left_index=True, right_index=True)
cleaned_crime = cleaned_crime.drop(columns=crime_obj,axis=1)
cleaned_crime.head()

Unnamed: 0,ZIP,Ward,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,...,Primary_Type_DECEPTIVE PRACTICE,Primary_Type_MOTOR VEHICLE THEFT,Primary_Type_NARCOTICS,Primary_Type_OTHER OFFENSE,Primary_Type_Other,Primary_Type_ROBBERY,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
1,60601,27,4,41.883932,-87.679964,False,False,5591,17101.15,271,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,60601,27,3,41.896569,-87.636063,False,False,5591,17101.15,271,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,60601,27,3,41.883937,-87.683368,True,False,5591,17101.15,271,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,60601,27,4,41.892856,-87.710137,True,False,5591,17101.15,271,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,60601,27,7,41.901683,-87.718962,False,True,5591,17101.15,271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [285]:
# Split our preprocessed data into our features and target arrays
X = cleaned_crime.drop(columns=['Safety_Bad','Safety_Good'],axis=1)
y = cleaned_crime['Safety_Good']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Resampling Techniques

### Naives Random

In [286]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)

In [287]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model_ros = LogisticRegression(solver='lbfgs', random_state=1)
model_ros.fit(X_resampled_ros, y_resampled_ros)

LogisticRegression(random_state=1)

In [288]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred_ros = model_ros.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ros)

0.5079853688403481

In [289]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_ros)

array([[ 2111, 10068],
       [  124,   664]], dtype=int64)

In [290]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.17      0.84      0.29      0.38      0.14     12179
        1.0       0.06      0.84      0.17      0.12      0.38      0.16       788

avg / total       0.89      0.21      0.80      0.28      0.38      0.14     12967



### SMOTE

In [291]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [292]:
# Train the Logistic Regression model using the resampled data
model_smote = LogisticRegression(solver='lbfgs', random_state=1)
model_smote.fit(X_resampled_smote, y_resampled_smote)

LogisticRegression(random_state=1)

In [293]:
# Calculated the balanced accuracy score
y_pred_smote = model_smote.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smote)

0.5058199642973696

In [294]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smote)

array([[2321, 9858],
       [ 141,  647]], dtype=int64)

In [295]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smote))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.19      0.82      0.32      0.40      0.15     12179
        1.0       0.06      0.82      0.19      0.11      0.40      0.17       788

avg / total       0.89      0.23      0.78      0.30      0.40      0.15     12967



### Cluster Centroids

In [296]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train, y_train)

In [297]:
# Train the Logistic Regression model using the resampled data
model_cc = LogisticRegression(solver='lbfgs', random_state=1)
model_cc.fit(X_resampled_cc, y_resampled_cc)

LogisticRegression(random_state=1)

In [298]:
# Calculated the balanced accuracy score
y_pred_cc = model_cc.predict(X_test)
balanced_accuracy_score(y_test, y_pred_cc)

0.5136017289475976

In [299]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_cc)

array([[6529, 5650],
       [ 401,  387]], dtype=int64)

In [300]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.54      0.49      0.68      0.51      0.26     12179
        1.0       0.06      0.49      0.54      0.11      0.51      0.26       788

avg / total       0.89      0.53      0.49      0.65      0.51      0.26     12967



### Combination (Over and Under) Sampling

In [301]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=0)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn.fit_resample(X, y)

In [302]:
# Train the Logistic Regression model using the resampled data
model_smoteenn = LogisticRegression(solver='lbfgs', random_state=1)
model_smoteenn.fit(X_resampled_smoteenn, y_resampled_smoteenn)

LogisticRegression(random_state=1)

In [303]:
# Calculated the balanced accuracy score
y_pred_smoteenn = model_smoteenn.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smoteenn)

0.5058199642973696

In [304]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smoteenn)

array([[2321, 9858],
       [ 141,  647]], dtype=int64)

In [305]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smoteenn))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.19      0.82      0.32      0.40      0.15     12179
        1.0       0.06      0.82      0.19      0.11      0.40      0.17       788

avg / total       0.89      0.23      0.78      0.30      0.40      0.15     12967



# Ensemble Techniques

### Balanced Random Forest

In [306]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [307]:
# Calculated the balanced accuracy score
y_pred_brfc = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brfc)

0.9374838752566934

In [308]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brfc)

array([[11259,   920],
       [   39,   749]], dtype=int64)

In [309]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.92      0.95      0.96      0.94      0.88     12179
        1.0       0.45      0.95      0.92      0.61      0.94      0.88       788

avg / total       0.96      0.93      0.95      0.94      0.94      0.88     12967



In [310]:
# List the features sorted in descending order by feature importance
importances = brfc_model.feature_importances_
sorted(zip(brfc_model.feature_importances_, X.columns), reverse=True)

[(0.5134809531982681, 'Primary_Type_ASSAULT'),
 (0.07357571471074288, 'Primary_Type_BATTERY'),
 (0.06699280276745866, 'Primary_Type_THEFT'),
 (0.06090997288376309, 'Latitude'),
 (0.060739013876999114, 'Longitude'),
 (0.05185175219670812, 'Primary_Type_DECEPTIVE PRACTICE'),
 (0.03682596588408578, 'Primary_Type_CRIMINAL DAMAGE'),
 (0.020739498560144464, 'rankings'),
 (0.012824379155287282, 'Primary_Type_BURGLARY'),
 (0.012717372961112054, 'Primary_Type_MOTOR VEHICLE THEFT'),
 (0.012370699234980931, 'Primary_Type_NARCOTICS'),
 (0.011948994167052673, 'Primary_Type_OTHER OFFENSE'),
 (0.008904560703258728, 'Primary_Type_Other'),
 (0.008624010240666159, 'Primary_Type_ROBBERY'),
 (0.0058342926434960165, 'Domestic'),
 (0.005784500770652778, 'Arrest'),
 (0.005691779971305466, 'chip_longitude'),
 (0.005368554960448152, 'Primary_Type_WEAPONS VIOLATION'),
 (0.005115237855063011, 'chip_latitude'),
 (0.004362857550368177, 'Ward'),
 (0.004148803437695163, 'Primary_Type_CRIMINAL TRESPASS'),
 (0.0029678

### Easy Ensemble AdaBoost Classifier

In [311]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
ee_model.fit(X_train, y_train)
# Train the EasyEnsembleClassifier

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [312]:
# Calculated the balanced accuracy score
y_pred_ee = ee_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ee)

0.9383168393794261

In [313]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_ee)

array([[11032,  1147],
       [   23,   765]], dtype=int64)

In [314]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_ee))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.91      0.97      0.95      0.94      0.87     12179
        1.0       0.40      0.97      0.91      0.57      0.94      0.89       788

avg / total       0.96      0.91      0.97      0.93      0.94      0.87     12967

