In [1]:
# Import our dependencies
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from path import Path
from sklearn.preprocessing import LabelEncoder as le
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load data
# Read the pre-cleaned CSV and discard the saved index column ('Unnamed: 0')
# in one chained expression, then preview the first rows.
file_path = Path("../Resources/cleaned_df.csv")
cleaned_df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])
cleaned_df.head()

Unnamed: 0,Category,PdDistrict,Resolution
0,WEAPON LAWS,SOUTHERN,"ARREST, BOOKED"
1,WEAPON LAWS,SOUTHERN,"ARREST, BOOKED"
2,WARRANTS,BAYVIEW,"ARREST, BOOKED"
3,NON-CRIMINAL,TENDERLOIN,NONE
4,NON-CRIMINAL,MISSION,NONE


In [3]:
# Generate our categorical variable list
# Collect the name of every column whose dtype is "object" (string-valued columns).
cleaned_cat = [
    column_name
    for column_name, column_dtype in cleaned_df.dtypes.items()
    if column_dtype == "object"
]

In [4]:
# Check the number of unique values in each column
# A high count flags columns whose rare levels may need binning before encoding.
cleaned_df.loc[:, cleaned_cat].nunique()

Category      39
PdDistrict    10
Resolution     2
dtype: int64

In [5]:
# Print out the Category value counts
# Frequency of each Category value, most common first; kept in a variable so
# the rare categories can be selected from it afterwards.
category_counts = cleaned_df["Category"].value_counts()
category_counts

LARCENY/THEFT                  40284
OTHER OFFENSES                 19030
NON-CRIMINAL                   17112
ASSAULT                        13352
VANDALISM                       8548
VEHICLE THEFT                   6207
WARRANTS                        5824
BURGLARY                        5772
SUSPICIOUS OCC                  5540
DRUG/NARCOTIC                   4186
MISSING PERSON                  3998
ROBBERY                         3216
FRAUD                           2602
TRESPASS                        1787
SECONDARY CODES                 1685
WEAPON LAWS                     1582
SEX OFFENSES, FORCIBLE           889
STOLEN PROPERTY                  857
RECOVERED VEHICLE                732
DISORDERLY CONDUCT               643
PROSTITUTION                     593
FORGERY/COUNTERFEITING           567
DRUNKENNESS                      462
DRIVING UNDER THE INFLUENCE      376
ARSON                            279
KIDNAPPING                       248
EMBEZZLEMENT                     168
... (remaining rows of the output truncated)

In [6]:
# Determine which values to replace
# Categories with fewer than 1000 recorded incidents are folded into a single
# "Other" bucket so the encoded feature is not dominated by sparse levels.
replace_category = list(category_counts[category_counts < 1000].index)

# Replace in DataFrame
# One vectorized pass: flag every row whose Category is in the rare list and
# overwrite it with "Other". This yields the same result as calling
# .replace() once per rare category, without rescanning the whole column
# for each one.
is_rare_category = cleaned_df["Category"].isin(replace_category)
cleaned_df["Category"] = cleaned_df["Category"].mask(is_rare_category, "Other")

# Check to make sure binning was successful
cleaned_df.Category.value_counts()

LARCENY/THEFT      40284
OTHER OFFENSES     19030
NON-CRIMINAL       17112
ASSAULT            13352
VANDALISM           8548
Other               6470
VEHICLE THEFT       6207
WARRANTS            5824
BURGLARY            5772
SUSPICIOUS OCC      5540
DRUG/NARCOTIC       4186
MISSING PERSON      3998
ROBBERY             3216
FRAUD               2602
TRESPASS            1787
SECONDARY CODES     1685
WEAPON LAWS         1582
Name: Category, dtype: int64

In [7]:
# Transform objects to integers
# Fit one LabelEncoder per column and keep it in `label_encoders`, so the
# integer codes can be mapped back to the original labels later, e.g.
# label_encoders["Resolution"].classes_ shows which Resolution value became
# class 0 and which became class 1 in the metrics further down.
# NOTE: this cell overwrites cleaned_df with its encoded form; re-running it
# without reloading the data re-fits the encoders on the integer codes.
label_encoders = {
    column: le().fit(cleaned_df[column]) for column in cleaned_df.columns
}
cleaned_df = cleaned_df.apply(
    lambda column: label_encoders[column.name].transform(column)
)
cleaned_df.dtypes

Category      int32
PdDistrict    int32
Resolution    int32
dtype: object

In [8]:
# Create Features and Target
# Target: the encoded Resolution column as a NumPy array.
y = cleaned_df["Resolution"].values
# Features: every other column. drop() returns a new frame, so cleaned_df
# itself is left unchanged.
X = cleaned_df.drop(columns=["Resolution"])

In [9]:
# Split training/test datasets
# Stratify on y so both splits keep the same class proportions; the fixed
# random_state makes the partition repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Fit the StandardScaler
# The scaler learns its statistics from the training split only, so nothing
# about the test split leaks into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale the data
# Apply the same fitted transform to both splits.
# NOTE(review): the AdaBoost cell in this notebook fits on the unscaled
# X_train / X_test; confirm whether these scaled arrays are used anywhere.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [11]:
# Create AdaBoost classifier object
# random_state pins the estimator's internal randomness so that a
# Restart & Run All reproduces the same model and metrics; 78 matches the
# seed already used for the train/test split.
abc = AdaBoostClassifier(n_estimators=100,
                         learning_rate=1.0,
                         random_state=78)
# Train AdaBoost classifier
# Fitted on the unscaled features: the default base learner is a decision
# stump, whose threshold splits do not depend on feature scale.
model = abc.fit(X_train, y_train)

# Predict the response for test dataset
# y_pred is reused by the evaluation cells that follow.
y_pred = model.predict(X_test)

In [12]:
# Display the confusion matrix
# Reuses the y_pred produced by the training cell instead of running
# model.predict(X_test) a second time.
# Rows are the true classes and columns are the predicted classes.
confusion_matrix(y_test, y_pred)

array([[ 6062,  3792],
       [ 2497, 24448]], dtype=int64)

In [13]:
# Calculate the balanced accuracy score
# Balanced accuracy is the mean of the per-class recalls, so the majority
# class cannot dominate the figure the way it does with plain accuracy.
balanced_acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"balanced accuracy score: {balanced_acc_score*100:.2f}%")

balanced accuracy score: 76.13%


In [14]:
# Print the imbalanced classification report
# Build the per-class report text first, then print it.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.71      0.62      0.91      0.66      0.75      0.54      9854
          1       0.87      0.91      0.62      0.89      0.75      0.57     26945

avg / total       0.82      0.83      0.69      0.83      0.75      0.57     36799

