In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
cleaned_crime = pd.read_csv('Resources/crime_chip_for_model.csv')
cleaned_crime.head()

Unnamed: 0.1,Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,state,location,address,chip_latitude,chip_longitude,chipotle,Safety
0,0,60601,27,THEFT,3,,,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
1,1,60601,27,CRIMINAL DAMAGE,4,41.883932,-87.679964,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
2,2,60601,27,THEFT,3,41.896569,-87.636063,False,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
3,3,60601,27,SEX OFFENSE,3,41.883937,-87.683368,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad
4,4,60601,27,NARCOTICS,4,41.892856,-87.710137,True,False,5591,17101.15,#271,Illinois,Chicago,"316 N Michigan Ave Chicago, IL 60601 US",41.887288,-87.624848,1,Bad


In [5]:
def get_weighted_sample(df,n):
    def get_class_prob(x):
        weight_x = int(np.rint(n * len(x[x.click != 0]) / len(df[df.click != 0])))
        sampled_x = x.sample(weight_x).reset_index(drop=True)
        return (sampled_x)
        # we are grouping by the target class we use for the proportions

    weighted_sample = df.groupby('event_type').apply(get_class_prob)
    print(weighted_sample["event_type"].value_counts())
    return (weighted_sample)

sample = get_weighted_sample(cleaned_crime,30000)
sample

KeyError: 'event_type'

In [161]:
cleaned_crime = cleaned_crime.drop(columns=['Unnamed: 0', 'state', 'location', 'address' ,'chip_latitude', 'chip_longitude'])
cleaned_crime = cleaned_crime.dropna()
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace('#', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].str.replace(',', '')
cleaned_crime['National_Rank'] = cleaned_crime['National_Rank'].astype('int')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].str.replace(',', '')
cleaned_crime['People/Sq.Mile'] = cleaned_crime['People/Sq.Mile'].astype('float')
cleaned_crime['Population'] = cleaned_crime['Population'].str.replace(',', '')
cleaned_crime['Population'] = cleaned_crime['Population'].astype('int')
sample = cleaned_crime.sample(n=30000)
sample = sample.reset_index()
sample = sample.drop(columns=['index'])
sample.head()

Unnamed: 0,ZIP,Ward,Primary_Type,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,chipotle,Safety
0,60602,26,BURGLARY,6,41.909784,-87.734127,False,False,70,810.58,6855,1,Bad
1,60657,7,CRIMINAL DAMAGE,4,41.763647,-87.586265,False,False,66789,31204.02,112,1,Bad
2,60622,36,DECEPTIVE PRACTICE,3,41.91949,-87.759691,False,False,76015,17489.0,258,0,Bad
3,60659,50,OFFENSE INVOLVING CHILDREN,3,41.994828,-87.699545,False,True,39155,19859.02,213,1,Bad
4,60602,26,BURGLARY,6,41.916262,-87.716739,False,False,70,810.58,6855,1,Bad


In [162]:
primary_type = sample.Primary_Type.value_counts()

In [6]:
# Determine which values to replace if counts are less than ...?
replace_primary = list(primary_type[primary_type < 1000].index)

# Replace in dataframe
for primary in replace_primary:
    sample.Primary_Type = sample.Primary_Type.replace(primary,"OTHER OFFENSE")
    
# Check to make sure binning was successful
sample.Primary_Type.value_counts()

NameError: name 'primary_type' is not defined

In [164]:
# Generate our categorical variable lists
crime_obj = list(sample.dtypes[sample.dtypes == 'object'].index)

In [165]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned_crime[crime_obj]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_obj)
encode_df.head()

Unnamed: 0,Primary_Type_ARSON,Primary_Type_ASSAULT,Primary_Type_BATTERY,Primary_Type_BURGLARY,Primary_Type_CONCEALED CARRY LICENSE VIOLATION,Primary_Type_CRIM SEXUAL ASSAULT,Primary_Type_CRIMINAL DAMAGE,Primary_Type_CRIMINAL SEXUAL ASSAULT,Primary_Type_CRIMINAL TRESPASS,Primary_Type_DECEPTIVE PRACTICE,...,Primary_Type_PROSTITUTION,Primary_Type_PUBLIC INDECENCY,Primary_Type_PUBLIC PEACE VIOLATION,Primary_Type_ROBBERY,Primary_Type_SEX OFFENSE,Primary_Type_STALKING,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [166]:
# Merge one-hot encoded features and drop the originals
sample = sample.merge(encode_df, left_index=True, right_index=True)
sample = sample.drop(columns=crime_obj,axis=1)
sample.head()

Unnamed: 0,ZIP,Ward,rankings,Latitude,Longitude,Arrest,Domestic,Population,People/Sq.Mile,National_Rank,...,Primary_Type_PROSTITUTION,Primary_Type_PUBLIC INDECENCY,Primary_Type_PUBLIC PEACE VIOLATION,Primary_Type_ROBBERY,Primary_Type_SEX OFFENSE,Primary_Type_STALKING,Primary_Type_THEFT,Primary_Type_WEAPONS VIOLATION,Safety_Bad,Safety_Good
0,60602,26,6,41.909784,-87.734127,False,False,70,810.58,6855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,60657,7,4,41.763647,-87.586265,False,False,66789,31204.02,112,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,60622,36,3,41.91949,-87.759691,False,False,76015,17489.0,258,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,60659,50,3,41.994828,-87.699545,False,True,39155,19859.02,213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,60602,26,6,41.916262,-87.716739,False,False,70,810.58,6855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [167]:
# Split our preprocessed data into our features and target arrays
X = sample.drop(columns=['chipotle'],axis=1)
y = sample['chipotle']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Resampling Techniques

### Naives Random

In [168]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)

In [169]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model_ros = LogisticRegression(solver='lbfgs', random_state=1)
model_ros.fit(X_resampled_ros, y_resampled_ros)

LogisticRegression(random_state=1)

In [170]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred_ros = model_ros.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ros)

0.616732340356009

In [171]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_ros)

array([[1926,  920],
       [2063, 2591]], dtype=int64)

In [172]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.68      0.56      0.56      0.61      0.38      2846
          1       0.74      0.56      0.68      0.63      0.61      0.37      4654

avg / total       0.64      0.60      0.63      0.61      0.61      0.38      7500



### SMOTE

In [173]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [174]:
# Train the Logistic Regression model using the resampled data
model_smote = LogisticRegression(solver='lbfgs', random_state=1)
model_smote.fit(X_resampled_smote, y_resampled_smote)

LogisticRegression(random_state=1)

In [175]:
# Calculated the balanced accuracy score
y_pred_smote = model_smote.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smote)

0.6544487834311443

In [176]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smote)

array([[2285,  561],
       [2299, 2355]], dtype=int64)

In [177]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smote))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.50      0.80      0.51      0.62      0.64      0.42      2846
          1       0.81      0.51      0.80      0.62      0.64      0.39      4654

avg / total       0.69      0.62      0.69      0.62      0.64      0.40      7500



### Cluster Centroids

In [178]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train, y_train)

In [179]:
# Train the Logistic Regression model using the resampled data
model_cc = LogisticRegression(solver='lbfgs', random_state=1)
model_cc.fit(X_resampled_cc, y_resampled_cc)

LogisticRegression(random_state=1)

In [180]:
# Calculated the balanced accuracy score
y_pred_cc = model_cc.predict(X_test)
balanced_accuracy_score(y_test, y_pred_cc)

0.6048111161678376

In [181]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_cc)

array([[1808, 1038],
       [1981, 2673]], dtype=int64)

In [182]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.64      0.57      0.54      0.60      0.37      2846
          1       0.72      0.57      0.64      0.64      0.60      0.36      4654

avg / total       0.63      0.60      0.61      0.60      0.60      0.36      7500



### Combination (Over and Under) Sampling

In [183]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=0)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn.fit_resample(X, y)

In [184]:
# Train the Logistic Regression model using the resampled data
model_smoteenn = LogisticRegression(solver='lbfgs', random_state=1)
model_smoteenn.fit(X_resampled_smoteenn, y_resampled_smoteenn)

LogisticRegression(random_state=1)

In [185]:
# Calculated the balanced accuracy score
y_pred_smoteenn = model_smoteenn.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smoteenn)

0.6804479239554244

In [186]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_smoteenn)

array([[2285,  561],
       [2057, 2597]], dtype=int64)

In [187]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_smoteenn))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.53      0.80      0.56      0.64      0.67      0.46      2846
          1       0.82      0.56      0.80      0.66      0.67      0.44      4654

avg / total       0.71      0.65      0.71      0.65      0.67      0.45      7500



# Ensemble Techniques

### Balanced Random Forest

In [188]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [189]:
# Calculated the balanced accuracy score
y_pred_brfc = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brfc)

1.0

In [190]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brfc)

array([[2846,    0],
       [   0, 4654]], dtype=int64)

In [191]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      2846
          1       1.00      1.00      1.00      1.00      1.00      1.00      4654

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      7500



In [192]:
# List the features sorted in descending order by feature importance
importances = brfc_model.feature_importances_
sorted(zip(brfc_model.feature_importances_, X.columns), reverse=True)

[(0.192748449314745, 'People/Sq.Mile'),
 (0.17811185826054618, 'Population'),
 (0.17447432527080234, 'ZIP'),
 (0.17221812152129443, 'National_Rank'),
 (0.15402229594525368, 'Ward'),
 (0.06953172026272202, 'Latitude'),
 (0.05450832214763789, 'Longitude'),
 (0.0011603823493585273, 'rankings'),
 (0.0007100553850966744, 'Domestic'),
 (0.00034671768828469445, 'Arrest'),
 (0.0001300862419524161, 'Safety_Bad'),
 (0.00012305773371559895, 'Safety_Good'),
 (0.00012235695713013378, 'Primary_Type_ASSAULT'),
 (0.00011766138631463764, 'Primary_Type_OTHER OFFENSE'),
 (0.00011505349139550997, 'Primary_Type_WEAPONS VIOLATION'),
 (0.00010834832650737047, 'Primary_Type_DECEPTIVE PRACTICE'),
 (0.0001083271203531448, 'Primary_Type_NARCOTICS'),
 (9.929905687491392e-05, 'Primary_Type_BATTERY'),
 (9.795461659531515e-05, 'Primary_Type_MOTOR VEHICLE THEFT'),
 (9.694033938015772e-05, 'Primary_Type_CRIMINAL DAMAGE'),
 (9.520316561189722e-05, 'Primary_Type_THEFT'),
 (9.517186010165456e-05, 'Primary_Type_CRIMINAL T

### Easy Ensemble AdaBoost Classifier

In [193]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
ee_model.fit(X_train, y_train)
# Train the EasyEnsembleClassifier

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [194]:
# Calculated the balanced accuracy score
y_pred_ee = ee_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_ee)

1.0

In [195]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_ee)

array([[2846,    0],
       [   0, 4654]], dtype=int64)

In [196]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_ee))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      2846
          1       1.00      1.00      1.00      1.00      1.00      1.00      4654

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      7500

