# Crime Prediction using Machine Learning

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [2]:
df_crime = pd.read_csv("data/crime-final.csv")
df_crime

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,GEO_NAME,LAT,LON,D_SCHOOL,...,YOUTH_FEMALE,ADULT_TOTAL,ADULT_MALE,ADULT_FEMALE,SENIOR_TOTAL,SENIOR_MALE,SENIOR_FEMALE,HOUSEHOLD_INCOME,EMPLOYMENT_RATE,UNEMPLOYMENT_RATE
0,Break and Enter Commercial,2019,3,7,2,6,Vancouver Granville,49.266678,-123.129029,779.657216,...,6245.0,63710.0,29620.0,34090.0,16375.0,7005.0,9370.0,48325.0,63.7,5.3
1,Break and Enter Commercial,2013,4,19,7,43,Vancouver Centre,49.282901,-123.126558,1221.974992,...,5675.0,81300.0,42965.0,38330.0,11260.0,5475.0,5785.0,62040.0,69.0,6.2
2,Break and Enter Commercial,2014,12,31,20,0,Vancouver Centre,49.282786,-123.126215,1203.742734,...,5675.0,81300.0,42965.0,38330.0,11260.0,5475.0,5785.0,62040.0,69.0,6.2
3,Break and Enter Commercial,2017,8,24,3,59,Vancouver Centre,49.282661,-123.126206,1190.086741,...,5005.0,84545.0,44410.0,40135.0,15520.0,7595.0,7925.0,70080.0,70.4,5.2
4,Break and Enter Commercial,2015,2,13,2,32,Vancouver Centre,49.282762,-123.126178,1200.494511,...,5675.0,81300.0,42965.0,38330.0,11260.0,5475.0,5785.0,62040.0,69.0,6.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559878,Vehicle Collision or Pedestrian Struck (with I...,2003,11,19,17,35,Vancouver South,49.233763,-123.123888,87.002489,...,8410.0,79230.0,37360.0,41855.0,19000.0,8225.0,10765.0,42430.0,56.1,8.2
559879,Vehicle Collision or Pedestrian Struck (with I...,2010,8,4,17,15,Vancouver Centre,49.233862,-123.123875,91.134276,...,7220.0,109020.0,55535.0,53465.0,15365.0,7005.0,8340.0,82775.0,70.5,6.0
559880,Vehicle Collision or Pedestrian Struck (with I...,2004,4,19,13,5,Vancouver South,49.232530,-123.116083,497.656899,...,8410.0,79230.0,37360.0,41855.0,19000.0,8225.0,10765.0,42430.0,56.1,8.2
559881,Vehicle Collision or Pedestrian Struck (with I...,2004,6,6,14,56,Vancouver South,49.232530,-123.116083,497.656899,...,8410.0,79230.0,37360.0,41855.0,19000.0,8225.0,10765.0,42430.0,56.1,8.2


In [3]:
df_crime.columns

Index(['TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'GEO_NAME', 'LAT',
       'LON', 'D_SCHOOL', 'SCHOOL_SCORE', 'POPULATION_DENSITY',
       'POPULATION_PERCENTAGE_CHANGE', 'POP_TOTAL', 'POP_MALE', 'POP_FEMALE',
       'CHILDREN_TOTAL', 'CHILDREN_MALE', 'CHILDREN_FEMALE', 'YOUTH_TOTAL',
       'YOUTH_MALE', 'YOUTH_FEMALE', 'ADULT_TOTAL', 'ADULT_MALE',
       'ADULT_FEMALE', 'SENIOR_TOTAL', 'SENIOR_MALE', 'SENIOR_FEMALE',
       'HOUSEHOLD_INCOME', 'EMPLOYMENT_RATE', 'UNEMPLOYMENT_RATE'],
      dtype='object')

## Model 1: Crime and School

Features: YEAR, MONTH, DAY, HOUR, MINUTE, LAT, LON, D_SCHOOL, SCHOOL_SCORE

Label: TYPE

In [4]:
df1 = df_crime[['TYPE','YEAR','MONTH','DAY','HOUR','MINUTE','LAT','LON','D_SCHOOL','SCHOOL_SCORE']]
df1

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,LAT,LON,D_SCHOOL,SCHOOL_SCORE
0,Break and Enter Commercial,2019,3,7,2,6,49.266678,-123.129029,779.657216,6.4
1,Break and Enter Commercial,2013,4,19,7,43,49.282901,-123.126558,1221.974992,6.4
2,Break and Enter Commercial,2014,12,31,20,0,49.282786,-123.126215,1203.742734,6.4
3,Break and Enter Commercial,2017,8,24,3,59,49.282661,-123.126206,1190.086741,6.4
4,Break and Enter Commercial,2015,2,13,2,32,49.282762,-123.126178,1200.494511,6.4
...,...,...,...,...,...,...,...,...,...,...
559878,Vehicle Collision or Pedestrian Struck (with I...,2003,11,19,17,35,49.233763,-123.123888,87.002489,8.7
559879,Vehicle Collision or Pedestrian Struck (with I...,2010,8,4,17,15,49.233862,-123.123875,91.134276,8.7
559880,Vehicle Collision or Pedestrian Struck (with I...,2004,4,19,13,5,49.232530,-123.116083,497.656899,8.7
559881,Vehicle Collision or Pedestrian Struck (with I...,2004,6,6,14,56,49.232530,-123.116083,497.656899,8.7


In [5]:
# Scale data and split data into train and test
scaler = StandardScaler()
y = df1['TYPE'].values
X = df1.iloc[:,1:].values
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print("Training set: ", X_train.shape, y_train.shape)
print("Testing set: ", X_test.shape, y_test.shape)

Training set:  (447906, 9) (447906,)
Testing set:  (111977, 9) (111977,)


### K Nearest Neighbor

In [19]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors = 7).fit(X_train,y_train)
pred_kNN = kNN.predict(X_test)
pred_kNN[0:5]

array(['Break and Enter Residential/Other', 'Theft from Vehicle',
       'Other Theft', 'Other Theft', 'Break and Enter Residential/Other'],
      dtype=object)

In [20]:
print("KNN Accuracy: ", accuracy_score(y_test, pred_kNN))

KNN Accuracy:  0.39661716245300377


In [32]:
# Print confusion matrix
confusion_matrix(y_test, pred_kNN)

array([[ 1349,   610,  1322,   554,  3697,   105,   110,     0,    86],
       [  601,  3831,  1393,   869,  5839,   133,   329,     0,   216],
       [ 1239,  1990,  3010,  1629,  8037,   206,   311,     0,   180],
       [  270,   634,   778,  8181,  2858,   124,    65,     0,   115],
       [ 1521,  3551,  3778,  2876, 28300,   566,   744,     0,   190],
       [  222,   523,   595,   675,  3634,   425,    65,     0,    37],
       [  353,  1210,   862,   411,  5001,    60,   453,     0,    64],
       [    4,     7,    15,     4,    25,     4,     1,     0,     1],
       [  341,   961,   766,   913,  1741,    57,    86,     0,   264]])

In [23]:
# Find best k
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):    
    kNN = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    pred_kNN=kNN.predict(X_test)
    mean_acc[n-1] = accuracy_score(y_test, pred_kNN)
    std_acc[n-1]=np.std(pred_kNN==y_test)/np.sqrt(pred_kNN.shape[0])

mean_acc

array([0.32837994, 0.31921734, 0.34395456, 0.36850425, 0.38257857,
       0.39021406, 0.39661716, 0.40392223, 0.40912866])

In [27]:
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

The best accuracy was with 0.40912866034989326 with k= 9


### Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
decisionTree = DecisionTreeClassifier()
decisionTree.fit(X_train,y_train)
pred_decisionTree = decisionTree.predict(X_test)
pred_decisionTree [0:5]

array(['Theft from Vehicle', 'Theft from Vehicle', 'Other Theft',
       'Mischief', 'Mischief'], dtype=object)

In [29]:
print("Decision Tree's Accuracy: ", accuracy_score(y_test, pred_decisionTree))

Decision Tree's Accuracy:  0.40031435026835865


In [33]:
confusion_matrix(y_test, pred_decisionTree)

array([[ 1888,   662,  1502,   322,  2249,   461,   475,     5,   269],
       [  719,  3884,  2193,   197,  3841,   663,  1152,     8,   554],
       [ 1603,  2191,  3837,   839,  5301,   901,  1165,    10,   755],
       [  313,   178,   896,  9992,   950,   321,   186,     3,   186],
       [ 2547,  4109,  5992,  1013, 20785,  2578,  3712,     7,   783],
       [  403,   654,   862,   291,  2331,  1125,   385,     2,   123],
       [  480,  1258,  1167,   165,  3443,   403,  1298,     4,   196],
       [    9,     7,     7,     3,     9,     2,     4,     0,    20],
       [  341,   534,   808,   183,   835,   182,   214,    15,  2017]])

### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
pred_rfc = rfc.predict(X_test)
pred_rfc[0:5]

array(['Break and Enter Residential/Other', 'Mischief', 'Other Theft',
       'Other Theft', 'Other Theft'], dtype=object)

In [7]:
print("Random Forest Accuracy: ", accuracy_score(y_test, pred_rfc))

Random Forest Accuracy:  0.5193477231931557


In [8]:
confusion_matrix(y_test, pred_rfc)

array([[ 1923,   374,  1320,   213,  3472,   119,   106,     0,   149],
       [  288,  4917,  1412,    55,  5910,   157,   290,     0,   336],
       [  998,  1761,  4005,   649,  8339,   214,   287,     1,   443],
       [   78,    88,   531, 10963,  1065,    63,    25,     0,   114],
       [  935,  2683,  2653,   588, 32952,   531,   983,     0,   342],
       [  165,   448,   541,   227,  3948,   795,    48,     1,    95],
       [  146,   947,   557,   101,  5780,    65,   518,     0,   142],
       [    2,     7,    15,     1,    12,     0,     0,     0,    23],
       [  189,   685,   807,   177,  1049,    33,    38,     1,  2082]])

In [14]:
print(classification_report(y_test, pred_rfc))

                                                        precision    recall  f1-score   support

                            Break and Enter Commercial       0.41      0.25      0.31      7676
                     Break and Enter Residential/Other       0.41      0.37      0.39     13365
                                              Mischief       0.34      0.24      0.28     16697
                                           Other Theft       0.84      0.85      0.85     12927
                                    Theft from Vehicle       0.53      0.79      0.63     41667
                                      Theft of Bicycle       0.40      0.13      0.19      6268
                                      Theft of Vehicle       0.23      0.06      0.10      8256
Vehicle Collision or Pedestrian Struck (with Fatality)       0.00      0.00      0.00        60
  Vehicle Collision or Pedestrian Struck (with Injury)       0.56      0.41      0.47      5061

                                      

## Model 2: Crime with school and census
Features:'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'GEO_NAME' (one hot encoded), 'LAT', 'LON', 'D_SCHOOL', 'SCHOOL_SCORE', 'POPULATION_DENSITY', 'POPULATION_PERCENTAGE_CHANGE', 'POP_MALE', 'CHILDREN_TOTAL', 'YOUTH_MALE', 'ADULT_TOTAL', 'ADULT_MALE', 'ADULT_FEMALE', 'HOUSEHOLD_INCOME', 'EMPLOYMENT_RATE'

Label: TYPE

In [49]:
df2 = df_crime[['TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'GEO_NAME', 'LAT',
       'LON', 'D_SCHOOL', 'SCHOOL_SCORE', 'POPULATION_DENSITY', 'POPULATION_PERCENTAGE_CHANGE', 
        'POP_MALE', 'CHILDREN_TOTAL', 'YOUTH_MALE', 
        'ADULT_TOTAL', 'ADULT_MALE', 'ADULT_FEMALE',
        'HOUSEHOLD_INCOME', 'EMPLOYMENT_RATE'
        ]]
df2

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,GEO_NAME,LAT,LON,D_SCHOOL,...,POPULATION_DENSITY,POPULATION_PERCENTAGE_CHANGE,POP_MALE,CHILDREN_TOTAL,YOUTH_MALE,ADULT_TOTAL,ADULT_MALE,ADULT_FEMALE,HOUSEHOLD_INCOME,EMPLOYMENT_RATE
0,Break and Enter Commercial,2019,3,7,2,6,Vancouver Granville,49.266678,-123.129029,779.657216,...,4620.300000,3.6,48295.0,11500.0,5630.0,63710.0,29620.0,34090.0,48325.0,63.7
1,Break and Enter Commercial,2013,4,19,7,43,Vancouver Centre,49.282901,-123.126558,1221.974992,...,9206.648697,13.6,53000.0,5795.0,5145.0,81300.0,42965.0,38330.0,62040.0,69.0
2,Break and Enter Commercial,2014,12,31,20,0,Vancouver Centre,49.282786,-123.126215,1203.742734,...,9206.648697,13.6,53000.0,5795.0,5145.0,81300.0,42965.0,38330.0,62040.0,69.0
3,Break and Enter Commercial,2017,8,24,3,59,Vancouver Centre,49.282661,-123.126206,1190.086741,...,10466.500000,13.6,59980.0,6970.0,4410.0,84545.0,44410.0,40135.0,70080.0,70.4
4,Break and Enter Commercial,2015,2,13,2,32,Vancouver Centre,49.282762,-123.126178,1200.494511,...,9206.648697,13.6,53000.0,5795.0,5145.0,81300.0,42965.0,38330.0,62040.0,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559878,Vehicle Collision or Pedestrian Struck (with I...,2003,11,19,17,35,Vancouver South,49.233763,-123.123888,87.002489,...,4718.500000,2.8,59350.0,17795.0,8975.0,79230.0,37360.0,41855.0,42430.0,56.1
559879,Vehicle Collision or Pedestrian Struck (with I...,2010,8,4,17,15,Vancouver Centre,49.233862,-123.123875,91.134276,...,9104.500000,11.0,68550.0,8250.0,6370.0,109020.0,55535.0,53465.0,82775.0,70.5
559880,Vehicle Collision or Pedestrian Struck (with I...,2004,4,19,13,5,Vancouver South,49.232530,-123.116083,497.656899,...,4718.500000,2.8,59350.0,17795.0,8975.0,79230.0,37360.0,41855.0,42430.0,56.1
559881,Vehicle Collision or Pedestrian Struck (with I...,2004,6,6,14,56,Vancouver South,49.232530,-123.116083,497.656899,...,4718.500000,2.8,59350.0,17795.0,8975.0,79230.0,37360.0,41855.0,42430.0,56.1


In [51]:
# One-hot encoding districts
district_onehot = pd.get_dummies(df2[['GEO_NAME']], prefix="", prefix_sep="")
df2 = pd.concat([df2, district_onehot], axis=1)
df2.drop(['GEO_NAME'], axis=1, inplace=True)
df2

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,LAT,LON,D_SCHOOL,SCHOOL_SCORE,...,ADULT_MALE,ADULT_FEMALE,HOUSEHOLD_INCOME,EMPLOYMENT_RATE,Vancouver Centre,Vancouver East,Vancouver Granville,Vancouver Kingsway,Vancouver Quadra,Vancouver South
0,Break and Enter Commercial,2019,3,7,2,6,49.266678,-123.129029,779.657216,6.4,...,29620.0,34090.0,48325.0,63.7,0,0,1,0,0,0
1,Break and Enter Commercial,2013,4,19,7,43,49.282901,-123.126558,1221.974992,6.4,...,42965.0,38330.0,62040.0,69.0,1,0,0,0,0,0
2,Break and Enter Commercial,2014,12,31,20,0,49.282786,-123.126215,1203.742734,6.4,...,42965.0,38330.0,62040.0,69.0,1,0,0,0,0,0
3,Break and Enter Commercial,2017,8,24,3,59,49.282661,-123.126206,1190.086741,6.4,...,44410.0,40135.0,70080.0,70.4,1,0,0,0,0,0
4,Break and Enter Commercial,2015,2,13,2,32,49.282762,-123.126178,1200.494511,6.4,...,42965.0,38330.0,62040.0,69.0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559878,Vehicle Collision or Pedestrian Struck (with I...,2003,11,19,17,35,49.233763,-123.123888,87.002489,8.7,...,37360.0,41855.0,42430.0,56.1,0,0,0,0,0,1
559879,Vehicle Collision or Pedestrian Struck (with I...,2010,8,4,17,15,49.233862,-123.123875,91.134276,8.7,...,55535.0,53465.0,82775.0,70.5,1,0,0,0,0,0
559880,Vehicle Collision or Pedestrian Struck (with I...,2004,4,19,13,5,49.232530,-123.116083,497.656899,8.7,...,37360.0,41855.0,42430.0,56.1,0,0,0,0,0,1
559881,Vehicle Collision or Pedestrian Struck (with I...,2004,6,6,14,56,49.232530,-123.116083,497.656899,8.7,...,37360.0,41855.0,42430.0,56.1,0,0,0,0,0,1


In [52]:
# Scale data and split data into train and test
scaler = StandardScaler()
y = df2['TYPE'].values
X = df2.iloc[:,1:].values
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print("Training set: ", X_train.shape, y_train.shape)
print("Testing set: ", X_test.shape, y_test.shape)

Training set:  (447906, 25) (447906,)
Testing set:  (111977, 25) (111977,)


In [53]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
pred_rfc = rfc.predict(X_test)
pred_rfc[0:5]

array(['Theft from Vehicle', 'Break and Enter Residential/Other',
       'Theft from Vehicle', 'Other Theft', 'Other Theft'], dtype=object)

In [54]:
print("Random Forest Accuracy: ", accuracy_score(y_test, pred_rfc))

Random Forest Accuracy:  0.5115872009430508


In [55]:
print(confusion_matrix(y_test, pred_rfc))

[[ 1910   365  1325   267  3584   120   106     0   126]
 [  285  4751  1438    86  6108   145   278     0   274]
 [  883  1781  3925   736  8523   174   274     0   355]
 [   94   116   456 10876  1203    58    30     0    92]
 [  877  2618  2582   700 33014   479   906     0   314]
 [  167   444   499   280  4105   714    60     0    74]
 [  139   935   542   115  5906    67   513     0   101]
 [    1     6    13     2    20     0     0     1    15]
 [  168   740   865   291  1285    37    54     2  1582]]


In [56]:
print(classification_report(y_test, pred_rfc))

                                                        precision    recall  f1-score   support

                            Break and Enter Commercial       0.42      0.24      0.31      7803
                     Break and Enter Residential/Other       0.40      0.36      0.38     13365
                                              Mischief       0.34      0.24      0.28     16651
                                           Other Theft       0.81      0.84      0.83     12925
                                    Theft from Vehicle       0.52      0.80      0.63     41490
                                      Theft of Bicycle       0.40      0.11      0.18      6343
                                      Theft of Vehicle       0.23      0.06      0.10      8318
Vehicle Collision or Pedestrian Struck (with Fatality)       0.33      0.02      0.03        58
  Vehicle Collision or Pedestrian Struck (with Injury)       0.54      0.31      0.40      5024

                                      