In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost
%matplotlib inline
plt.style.use('seaborn')

In [2]:
#List of metric for classiffication models
def metrics_classific(y, predicted, proba_predictions):
    from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
    from matplotlib import pyplot
    confusion_matrix = confusion_matrix(y, predicted)
    print(confusion_matrix)
    print(classification_report(y, predicted))
    
    # keep probabilities for the positive outcome only
    lr_probs = proba_predictions[:, 1]
    
    # calculate precision and recall for each threshold
    lr_precision, lr_recall, _ = precision_recall_curve(y, lr_probs)

    # calculate scores
    lr_f1, lr_auc = f1_score(y, predicted), auc(lr_recall, lr_precision)

    # summarize scores
    print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))

    #plot the precision-recall curves
    no_skill = len(y[y == 1]) / len(y)
    pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
    # axis labels
    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()

In [3]:
df_00 = pd.read_csv('data/Chicago_Crimes_2001_to_2004.csv', error_bad_lines=False)
df_00.shape

b'Skipping line 1513591: expected 23 fields, saw 24\n'


(1923515, 23)

In [4]:
df_00.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,879,4786321,HM399414,01/01/2004 12:01:00 AM,082XX S COLES AVE,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,...,7.0,46.0,6,,,2004.0,08/17/2015 03:03:40 PM,,,
1,2544,4676906,HM278933,03/01/2003 12:00:00 AM,004XX W 42ND PL,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,...,11.0,61.0,26,1173974.0,1876760.0,2003.0,04/15/2016 08:55:02 AM,41.8172,-87.637328,"(41.817229156, -87.637328162)"
2,2919,4789749,HM402220,06/20/2004 11:00:00 AM,025XX N KIMBALL AVE,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,...,35.0,22.0,20,,,2004.0,08/17/2015 03:03:40 PM,,,
3,2927,4789765,HM402058,12/30/2004 08:00:00 PM,045XX W MONTANA ST,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,...,31.0,20.0,6,,,2004.0,08/17/2015 03:03:40 PM,,,
4,3302,4677901,HM275615,05/01/2003 01:00:00 AM,111XX S NORMAL AVE,841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,False,...,34.0,49.0,6,1174948.0,1831050.0,2003.0,04/15/2016 08:55:02 AM,41.6918,-87.635116,"(41.691784636, -87.635115968)"


In [5]:
columns_to_drop = ['Unnamed: 0', 'Case Number', 'IUCR','Updated On','Year', 'FBI Code', 'Beat','Ward','Community Area', 'Location','Block']

In [6]:
df_01 = df_00.drop(columns_to_drop, axis=1).dropna()
df_01.shape

(1892808, 12)

In [7]:
# convert dates to pandas datetime format
df_01.Date = pd.to_datetime(df_01.Date, format='%m/%d/%Y %I:%M:%S %p')

In [8]:
df_01['le_hour']= pd.to_datetime(df_01['Date']).dt.hour
df_01['Year']= pd.to_datetime(df_01['Date']).dt.year
def binary_hour(hour):
    if hour>=18 or hour<6:
        return 0
    else:
        return 1

df_01['le_binary_hour'] = df_01['le_hour'].apply(binary_hour)

In [9]:
df_01['Year'].value_counts()

2001    565181
2002    471550
2003    471183
2004    384894
Name: Year, dtype: int64

In [10]:
df_01 = df_01[df_01['Year'] == 2003]

In [11]:
df_01['Year'].value_counts()

2003    471183
Name: Year, dtype: int64

In [12]:
df_01[['Y Coordinate','Latitude']] = df_01[['Y Coordinate','Latitude']].apply(pd.to_numeric)
df_01.loc[df_01['Primary Type'] == 'CRIM SEXUAL ASSAULT' , df_01.columns=='Primary Type'] = 'OTHER OFFENSE'
df_01.loc[df_01['Primary Type'] == 'SEX OFFENSE' , df_01.columns=='Primary Type'] = 'OTHER OFFENSE'

In [13]:
df_01['Primary Type'].value_counts()[:8]

THEFT                  97537
BATTERY                87754
CRIMINAL DAMAGE        54657
NARCOTICS              53687
OTHER OFFENSE          34322
ASSAULT                29245
BURGLARY               24993
MOTOR VEHICLE THEFT    22652
Name: Primary Type, dtype: int64

In [14]:
df_01['Location Description'].value_counts()[:5]

STREET       144580
RESIDENCE     80273
APARTMENT     39202
SIDEWALK      37379
OTHER         17745
Name: Location Description, dtype: int64

In [15]:
loc_to_change  = list(df_01['Location Description'].value_counts()[5:].index)

type_to_change = list(df_01['Primary Type'].value_counts()[8:].index)

df_01.loc[df_01['Location Description'].isin(loc_to_change) , df_01.columns=='Location Description'] = 'OTHER'

df_01.loc[df_01['Primary Type'].isin(type_to_change) , df_01.columns=='Primary Type'] = 'OTHER'

In [16]:
df_01['Primary Type'].value_counts()

THEFT                  97537
BATTERY                87754
OTHER                  66336
CRIMINAL DAMAGE        54657
NARCOTICS              53687
OTHER OFFENSE          34322
ASSAULT                29245
BURGLARY               24993
MOTOR VEHICLE THEFT    22652
Name: Primary Type, dtype: int64

In [17]:
df_01['Location Description'].value_counts()

OTHER        169749
STREET       144580
RESIDENCE     80273
APARTMENT     39202
SIDEWALK      37379
Name: Location Description, dtype: int64

In [18]:
df_01.le_binary_hour.value_counts()

0    235944
1    235239
Name: le_binary_hour, dtype: int64

In [19]:
df_01.Arrest.value_counts()

False    330882
True     140301
Name: Arrest, dtype: int64

In [20]:
categoric_column_list = ['Primary Type','Location Description']

In [21]:
from sklearn.preprocessing import LabelEncoder
df_label =df_01.copy()
df_label[categoric_column_list] = df_label[categoric_column_list].apply(LabelEncoder().fit_transform)

In [22]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471183 entries, 1 to 1923511
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   ID                    471183 non-null  int64         
 1   Date                  471183 non-null  datetime64[ns]
 2   Primary Type          471183 non-null  int32         
 3   Description           471183 non-null  object        
 4   Location Description  471183 non-null  int32         
 5   Arrest                471183 non-null  bool          
 6   Domestic              471183 non-null  bool          
 7   District              471183 non-null  float64       
 8   X Coordinate          471183 non-null  float64       
 9   Y Coordinate          471183 non-null  float64       
 10  Latitude              471183 non-null  float64       
 11  Longitude             471183 non-null  float64       
 12  le_hour               471183 non-null  int64         
 13

In [23]:
categoric_column_list

['Primary Type', 'Location Description']

# Description

<b>Xgboost</b>

In [24]:
df_01.columns

Index(['ID', 'Date', 'Primary Type', 'Description', 'Location Description',
       'Arrest', 'Domestic', 'District', 'X Coordinate', 'Y Coordinate',
       'Latitude', 'Longitude', 'le_hour', 'Year', 'le_binary_hour'],
      dtype='object')

In [26]:
y = df_label['Location Description']
X = df_label[['Primary Type','Domestic', 'District',
       'X Coordinate', 'Y Coordinate', 'Latitude', 'Longitude','Arrest','le_hour']]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [28]:
X_train.shape, X_test.shape,

((235591, 9), (235592, 9))

In [29]:
# scale_pos_weight
weights = y.value_counts()[0]/y.value_counts()[1]
weights 

0.2309409775609871

In [30]:
xgb = xgboost.XGBClassifier(scale_pos_weight=weights, n_estimators=3_000, tree_method='gpu_hist', predictor='gpu_predictor')

In [31]:
%%time
model = xgb
model.fit(X_train,y_train)
y_test.value_counts()

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 7min 18s


1    84813
4    72531
2    39955
0    19603
3    18690
Name: Location Description, dtype: int64

In [32]:
proba_predictions = model.predict_proba(X_test)
predictions = model.predict(X_test)

In [37]:
df_01['Location Description'].unique().tolist()

['RESIDENCE', 'OTHER', 'APARTMENT', 'STREET', 'SIDEWALK']

In [38]:
df_01['Location Description'].value_counts()

OTHER        169749
STREET       144580
RESIDENCE     80273
APARTMENT     39202
SIDEWALK      37379
Name: Location Description, dtype: int64

In [None]:
# y_test.value_counts()

In [33]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
from matplotlib import pyplot
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[ 5551  3535  6354   700  3463]
 [ 2164 56481  6079  2896 17193]
 [ 4388  7066 18622  1175  8704]
 [  714  4777  1547  4553  7099]
 [ 2059 17367  6769  4435 41901]]
              precision    recall  f1-score   support

           0       0.37      0.28      0.32     19603
           1       0.63      0.67      0.65     84813
           2       0.47      0.47      0.47     39955
           3       0.33      0.24      0.28     18690
           4       0.53      0.58      0.56     72531

    accuracy                           0.54    235592
   macro avg       0.47      0.45      0.46    235592
weighted avg       0.53      0.54      0.53    235592



<b>linear models: KNN, Logist, SVM</b> 

In [25]:
df_on_hot = pd.get_dummies(df_label, columns = ['Primary Type'],sparse=True, drop_first=True)
df_on_hot.shape

(471183, 22)

In [26]:
df_01.columns

Index(['ID', 'Date', 'Primary Type', 'Description', 'Location Description',
       'Arrest', 'Domestic', 'District', 'X Coordinate', 'Y Coordinate',
       'Latitude', 'Longitude', 'le_hour', 'Year', 'le_binary_hour'],
      dtype='object')

In [27]:
y = df_on_hot['Location Description']
X = df_on_hot.drop(['ID', 'Date','Location Description','Description','Year','le_binary_hour'],axis=1)
X.shape, y.shape

((471183, 16), (471183,))

In [28]:
categoric_column_list

['Primary Type', 'Location Description']

In [29]:
columns_to_scale =['X Coordinate',
 'Y Coordinate',
 'Latitude',
 'Longitude']

In [30]:
X_scaled = X.copy()
X_scaled[columns_to_scale] = StandardScaler().fit_transform(X[columns_to_scale].values)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.7, random_state=42)
X_train.shape, X_test.shape

((141354, 16), (329829, 16))

In [32]:
%%time
logistic_model = LogisticRegression(class_weight='balanced').fit(X_train,y_train)
proba_predictions = logistic_model.predict_proba(X_test)
predictions = logistic_model.predict(X_test)

Wall time: 15.3 s


In [33]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
from matplotlib import pyplot
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[12442  3724  5126  3767  2401]
 [13164 48407 12401 29480 15325]
 [14743  8727 18959  8298  5399]
 [ 1822  4478  1606 17397   871]
 [ 6182 29281 10133 30730 24966]]
              precision    recall  f1-score   support

           0       0.26      0.45      0.33     27460
           1       0.51      0.41      0.45    118777
           2       0.39      0.34      0.36     56126
           3       0.19      0.66      0.30     26174
           4       0.51      0.25      0.33    101292

    accuracy                           0.37    329829
   macro avg       0.37      0.42      0.36    329829
weighted avg       0.44      0.37      0.38    329829



In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.95, random_state=42)
X_train.shape, X_test.shape

((23559, 16), (447624, 16))

In [35]:
%%time
SVC_model = SVC(kernel="rbf",class_weight='balanced',C = 3).fit(X_train,y_train)

# proba_predictions = SVC_model.predict_proba(X_test)
predictions = SVC_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
from matplotlib import pyplot
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[16567  5367  5644  5896  3736]
 [14816 70346  9538 42861 23706]
 [19319 14659 21206 11932  9139]
 [ 2130  6276  1077 25154   870]
 [ 6253 35094  7378 46117 42543]]
              precision    recall  f1-score   support

           0       0.28      0.45      0.34     37210
           1       0.53      0.44      0.48    161267
           2       0.47      0.28      0.35     76255
           3       0.19      0.71      0.30     35507
           4       0.53      0.31      0.39    137385

    accuracy                           0.39    447624
   macro avg       0.40      0.44      0.37    447624
weighted avg       0.47      0.39      0.41    447624

Wall time: 7min 58s


In [36]:
%%time
SVC_model = SVC(kernel="linear").fit(X_train,y_train)

# proba_predictions = SVC_model.predict_proba(X_test)
predictions = SVC_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
from matplotlib import pyplot
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[     0  16848  16420      0   3942]
 [     2 115227  12221      0  33817]
 [     1  33566  31915      0  10773]
 [     5  18959   2984      0  13559]
 [     4  65818  12277      0  59286]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     37210
           1       0.46      0.71      0.56    161267
           2       0.42      0.42      0.42     76255
           3       0.00      0.00      0.00     35507
           4       0.49      0.43      0.46    137385

    accuracy                           0.46    447624
   macro avg       0.27      0.31      0.29    447624
weighted avg       0.39      0.46      0.41    447624

Wall time: 1h 38min 50s


In [37]:
%%time
knn_model = KNeighborsClassifier(n_neighbors=4).fit(X_train,y_train)
proba_predictions = knn_model.predict_proba(X_test)
predictions = knn_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, f1_score, auc
from matplotlib import pyplot
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[10225 11581  8167  1234  6003]
 [11419 94468 13752  6813 34815]
 [12108 26487 21759  2206 13695]
 [ 3184 11838  2493  7584 10408]
 [ 8871 50743 11751 10392 55628]]
              precision    recall  f1-score   support

           0       0.22      0.27      0.25     37210
           1       0.48      0.59      0.53    161267
           2       0.38      0.29      0.32     76255
           3       0.27      0.21      0.24     35507
           4       0.46      0.40      0.43    137385

    accuracy                           0.42    447624
   macro avg       0.36      0.35      0.35    447624
weighted avg       0.42      0.42      0.42    447624

Wall time: 45.5 s
