In [1]:
import pandas as pd
import sklearn
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import multilabel_confusion_matrix

%matplotlib inline

import statsmodels.api as sm
from statsmodels.api import OLS

In [2]:
crime_property_light_population= pd.read_csv("./model_data/crime_light_density.csv")
crime_property_light_population.head()

predictors = ['OFFENSE_CODE_GROUP','SHOOTING', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'Population density (per square mile of land area)', 'BLDG_VAL', 'LAND_VAL','light_density']
df = crime_property_light_population[predictors]
df = df.dropna()

In [19]:
top5 = df['OFFENSE_CODE_GROUP'].value_counts().index[:5]

#cut the df
df = df[df['OFFENSE_CODE_GROUP'].isin(top5)]

In [20]:
#Let's label encode & one-hot encode the Categoricals (SHOOTING, DAY_OF_WEEK)

enc = OneHotEncoder(handle_unknown='ignore')

le = LabelEncoder() 
  
df['SHOOTING']= le.fit_transform(df['SHOOTING'])
df['DAY_OF_WEEK']= le.fit_transform(df['DAY_OF_WEEK']) 
df['OFFENSE_CODE_GROUP'] = le.fit_transform(df['OFFENSE_CODE_GROUP'])

In [21]:
df_cat = enc.fit_transform(df[["MONTH","SHOOTING","DAY_OF_WEEK"]]).toarray()

df_cat = pd.DataFrame(df_cat, columns=enc.get_feature_names(['MONTH','SHOOTING', 'DAY_OF_WEEK']))
df = pd.concat([df, df_cat], axis=1)
df = df.dropna()

df.head()

Unnamed: 0,OFFENSE_CODE_GROUP,SHOOTING,MONTH,DAY_OF_WEEK,HOUR,Population density (per square mile of land area),BLDG_VAL,LAND_VAL,light_density,MONTH_1,...,MONTH_12.0,SHOOTING_0,SHOOTING_1,DAY_OF_WEEK_0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6
6,0.0,0.0,9.0,3.0,3.0,26693.9,3362.723369,581.882742,0.699176,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,3.0,0.0,9.0,3.0,5.0,24074.6,5490.169358,260.905478,0.7289,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,9.0,3.0,1.0,8352.4,2009.317606,59.185114,0.525616,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16,3.0,0.0,9.0,3.0,0.0,15913.2,1293.230035,31.015855,0.568578,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
18,0.0,0.0,9.0,3.0,3.0,14884.4,1040.984096,31.260815,0.640733,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [22]:

X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'OFFENSE_CODE_GROUP'], 
                                                         df.OFFENSE_CODE_GROUP, test_size=0.2, 
                                                         random_state = 109, 
                                                         stratify = df.OFFENSE_CODE_GROUP)

In [23]:

X_train.head()

Unnamed: 0,SHOOTING,MONTH,DAY_OF_WEEK,HOUR,Population density (per square mile of land area),BLDG_VAL,LAND_VAL,light_density,MONTH_1,MONTH_2,...,MONTH_12.0,SHOOTING_0,SHOOTING_1,DAY_OF_WEEK_0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6
101935,0.0,8.0,1.0,18.0,15744.4,10385.587535,109.607753,0.644978,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
18909,0.0,7.0,1.0,17.0,16236.4,1135.641813,46.780408,0.574557,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
107578,0.0,7.0,3.0,20.0,21720.8,6001.500969,516.524296,0.738507,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
130942,0.0,5.0,4.0,15.0,40159.2,8464.863983,333.361435,0.714606,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
78168,0.0,11.0,6.0,16.0,24074.6,5490.169358,260.905478,0.7289,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#Let's scale the variables
min_max_scaler = MinMaxScaler()
numerical = ["HOUR", "BLDG_VAL", "LAND_VAL", "light_density", "MONTH", "Population density (per square mile of land area)" ]
X_normalized = (X_train - X_train.min(axis=0)) / (X_train.max(axis=0) - X_train.min(axis=0))
min_max_scaler.fit(X_train)
X_normalized_tst = pd.DataFrame(min_max_scaler.transform(X_test))
X_normalized_tst.columns = X_test.columns


display(X_normalized.head())
display(X_normalized_tst.head())


Unnamed: 0,SHOOTING,MONTH,DAY_OF_WEEK,HOUR,Population density (per square mile of land area),BLDG_VAL,LAND_VAL,light_density,MONTH_1,MONTH_2,...,MONTH_12.0,SHOOTING_0,SHOOTING_1,DAY_OF_WEEK_0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6
101935,0.0,0.636364,0.166667,0.782609,0.203114,0.080711,0.11679,0.368458,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
18909,0.0,0.545455,0.166667,0.73913,0.210523,0.008147,0.033911,0.243188,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
107578,0.0,0.545455,0.5,0.869565,0.293108,0.046319,0.653572,0.534835,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
130942,0.0,0.363636,0.666667,0.652174,0.570757,0.065643,0.411953,0.492319,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
78168,0.0,0.909091,1.0,0.695652,0.328552,0.042308,0.316373,0.517746,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,SHOOTING,MONTH,DAY_OF_WEEK,HOUR,Population density (per square mile of land area),BLDG_VAL,LAND_VAL,light_density,MONTH_1,MONTH_2,...,MONTH_12.0,SHOOTING_0,SHOOTING_1,DAY_OF_WEEK_0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6
0,0.0,0.727273,0.333333,0.695652,0.210523,0.008147,0.033911,0.243188,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.636364,0.833333,0.652174,0.328552,0.042308,0.316373,0.517746,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.181818,0.666667,0.73913,0.334295,0.028569,0.11121,0.614133,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.333333,0.869565,0.190164,0.007405,0.013439,0.360906,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.272727,0.333333,0.391304,0.051409,0.0,0.009073,0.086291,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Let's first start with Logistic Model

logreg = LogisticRegression(random_state = 0, solver='lbfgs', multi_class='multinomial', max_iter = 1000)
logreg.fit(X_normalized, y_train)
y_pred = logreg.predict(X_normalized_tst)

In [26]:
mcm = multilabel_confusion_matrix(y_test, y_pred, sample_weight=None, labels=None, samplewise=False)
logreg.score(X_normalized_tst , y_test)

0.30972968053153727

In [27]:
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_normalized, y_train) 
  
# accuracy on X_test 
accuracy = knn.score(X_normalized_tst, y_test) 

In [28]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test, y_pred)

print(cm)

[[   0  398  134 1134    0]
 [   1 1225  126 1060    0]
 [   0  589  216 1657    0]
 [   0  656  237 1962    0]
 [   0  458  133 1001    0]]


In [29]:
import pickle

pickle.dump(logreg, open('logreg', 'wb'))
pickle.dump(knn, open('knn', 'wb'))