https://www.kaggle.com/c/sf-crime/discussion

# 1. Read the cleaned and merged data set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import datetime
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [26]:
df = pd.read_csv('/Users/wanranli/Downloads/!Capstone Project 1/SF_crime/datasets/p0.csv', parse_dates=['Date'], index_col='Date')
df.head(3)

Unnamed: 0_level_0,Unnamed: 0,Address,Analysis Neighborhoods,Category,Current Police Districts,Current Supervisor Districts,DayOfWeek,Descript,IncidntNum,Location,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2017-12-31,0,800 Block of BRYANT ST,34.0,ASSAULT,1.0,10.0,Sunday,BATTERY,180000417,"(37.775420706711, -122.40340479147905)",SOUTHERN,NONE,32.0,23:58,-122.403405,37.775421
2017-12-31,1,500 Block of JONES ST,36.0,ASSAULT,5.0,10.0,Sunday,AGGRAVATED ASSAULT WITH A KNIFE,180000069,"(37.78627745916602, -122.41299907500884)",TENDERLOIN,"ARREST, BOOKED",20.0,23:56,-122.412999,37.786277
2017-12-31,4,500 Block of VALENCIA ST,20.0,LARCENY/THEFT,3.0,5.0,Sunday,PETTY THEFT OF PROPERTY,186005077,"(37.76408889445322, -122.42187648849193)",MISSION,NONE,37.0,23:55,-122.421876,37.764089


In [27]:
df.shape

(1822068, 16)

In [28]:
# to count the number of null values.
df.isnull().sum()

Unnamed: 0                          0
Address                          6415
Analysis Neighborhoods           7276
Category                            8
Current Police Districts         7489
Current Supervisor Districts     6940
DayOfWeek                           0
Descript                            0
IncidntNum                          0
Location                         6415
PdDistrict                          1
Resolution                          0
SF Find Neighborhoods           13495
Time                                0
X                                6415
Y                                6415
dtype: int64

In [29]:
# to capitalize all letters in PdDistrict column so that the values are consistant
df['PdDistrict'] = df['PdDistrict'].str.upper()
df.PdDistrict.value_counts()

SOUTHERN      331121
MISSION       239336
NORTHERN      230836
CENTRAL       202557
BAYVIEW       176448
INGLESIDE     157834
TARAVAL       139662
TENDERLOIN    138339
PARK          103713
RICHMOND       99828
OUT OF SF       2393
Name: PdDistrict, dtype: int64

In [30]:
# As the most common PdDistrict is Southern, we fill the null value with Southern district
df['PdDistrict'].fillna('SOUTHERN',inplace=True)
df['Category'].fillna('theft',inplace=True)

In [31]:
# to see the unique values of category
df.Category.unique()

array(['ASSAULT', 'LARCENY/THEFT', 'DRUNKENNESS', 'NON-CRIMINAL',
       'SEX OFFENSES, FORCIBLE', 'VANDALISM', 'SECONDARY CODES',
       'VEHICLE THEFT', 'SUSPICIOUS OCC', 'WEAPON LAWS', 'OTHER OFFENSES',
       'TRESPASS', 'MISSING PERSON', 'BURGLARY', 'STOLEN PROPERTY',
       'WARRANTS', 'FRAUD', 'DRUG/NARCOTIC', 'ROBBERY', 'PROSTITUTION',
       'SUICIDE', 'RECOVERED VEHICLE', 'FORGERY/COUNTERFEITING', 'ARSON',
       'BAD CHECKS', 'EMBEZZLEMENT', 'RUNAWAY', 'DISORDERLY CONDUCT',
       'DRIVING UNDER THE INFLUENCE', 'SEX OFFENSES, NON FORCIBLE',
       'KIDNAPPING', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EXTORTION', 'LOITERING', 'GAMBLING', 'PORNOGRAPHY/OBSCENE MAT',
       'TREA', 'Other Miscellaneous', 'Lost Property', 'Larceny Theft',
       'Fraud', 'Suspicious Occ', 'Recovered Vehicle', 'Burglary',
       'Non-Criminal', 'Missing Person', 'Other Offenses',
       'Malicious Mischief', 'Assault', 'Robbery',
       'Forgery And Counterfeiting', 'Miscellaneous Inv

In [32]:
# As the categories are broad and not consistent, we updated the category names
category_dict = {'ASSAULT': 'assault', 'LARCENY/THEFT':'theft','DRUNKENNESS':'alcohol_related', 
                 'NON-CRIMINAL':'aided_case','SEX OFFENSES, FORCIBLE':'sex_related', 
                 'VANDALISM': 'vandalism', 'SECONDARY CODES': 'others','VEHICLE THEFT': 'theft', 
                 'SUSPICIOUS OCC':'others', 'WEAPON LAWS':'others', 'OTHER OFFENSES':'traffic_related',
                 'TRESPASS': 'others', 'MISSING PERSON':'aided_case', 'BURGLARY':'robbery', 
                 'STOLEN PROPERTY':'theft','WARRANTS':'others', 'FRAUD':'fraud', 
                 'DRUG/NARCOTIC':'drug_related', 'ROBBERY':'robbery', 'PROSTITUTION':'sex_related',
                 'SUICIDE': 'suicide', 'RECOVERED VEHICLE':'theft', 'FORGERY/COUNTERFEITING':'fraud', 
                 'ARSON':'arson','BAD CHECKS':'fraud', 'EMBEZZLEMENT':'fraud', 'RUNAWAY':'others', 
                 'DISORDERLY CONDUCT':'others','DRIVING UNDER THE INFLUENCE':'alcohol_related', 
                 'SEX OFFENSES, NON FORCIBLE':'sex_related','KIDNAPPING':'kidnapping', 
                 'FAMILY OFFENSES':'others', 'LIQUOR LAWS':'alcohol_related', 'BRIBERY':'others',
                 'EXTORTION':'extortion', 'LOITERING':'others', 'GAMBLING':'gambling', 
                 'PORNOGRAPHY/OBSCENE MAT':'sex_related','TREA':'others', 'Other Miscellaneous':'others', 
                 'Lost Property':'aided_case', 'Larceny Theft': 'theft','Fraud':'fraud', 
                 'Suspicious Occ':'others', 'Recovered Vehicle':'aided_case', 'Burglary':'robbery',
                 'Non-Criminal':'aided_case', 'Missing Person':'aided_case', 'Other Offenses':'traffic_related',
                 'Malicious Mischief':'others', 'Assault':'assault', 'Robbery':'robbery',
                 'Forgery And Counterfeiting':'fraud', 'Miscellaneous Investigation':'others',
                 'Sex Offense':'sex_related', 'Motor Vehicle Theft':'theft', 'Courtesy Report':'others',
                 'Case Closure':'others', 'Other':'others', 'Drug Offense':'drug_related', 
                 'Warrant':'others','Disorderly Conduct':'others', 'Embezzlement':'fraud',
                 'Offences Against The Family And Children':'others', 'Weapons Carrying Etc':'others',
                 'Traffic Violation Arrest':'traffic_related', 'Civil Sidewalks':'others',
                 'Human Trafficking (A), Commercial Sex Acts':'sex_related', 'Juvenile Offenses':'others',
                 'Arson':'arson', 'Stolen Property':'theft', 'Vehicle Impounded':'vandalism', 
                 'Drug Violation':'drug_related','Traffic Collision':'traffic_related', 
                 'Prostitution':'sex_related', 'Weapons Offense':'others',
                 'Family Offense':'others', 'Vandalism':'vandalism', 'Fire Report':'others', 
                 'Rape':'sex_related', 'Suicide':'suicide','Vehicle Misplaced':'aided_case', 
                 'Suspicious':'others', 'Liquor Laws':'alcohol_related',
                 'Human Trafficking, Commercial Sex Acts':'sex_related', 'Homicide':'murder',
                 'Motor Vehicle Theft?':'theft', 'Gambling':'gambling', 'Weapons Offence':'others'}
df1 = df.replace({'Category': category_dict})
df1.Category.unique()

array(['assault', 'theft', 'alcohol_related', 'aided_case', 'sex_related',
       'vandalism', 'others', 'traffic_related', 'robbery', 'fraud',
       'drug_related', 'suicide', 'arson', 'kidnapping', 'extortion',
       'gambling', 'murder'], dtype=object)

In [33]:
# to see the unique values of Resolution column
df1.Resolution.unique()

array(['NONE', 'ARREST, BOOKED', 'EXCEPTIONAL CLEARANCE', 'UNFOUNDED',
       'JUVENILE BOOKED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO',
       'JUVENILE DIVERTED', 'ARREST, CITED', 'PSYCHOPATHIC CASE',
       'NOT PROSECUTED', 'LOCATED', 'PROSECUTED FOR LESSER OFFENSE',
       'JUVENILE CITED', 'COMPLAINANT REFUSES TO PROSECUTE',
       'DISTRICT ATTORNEY REFUSES TO PROSECUTE',
       'PROSECUTED BY OUTSIDE AGENCY', 'JUVENILE ADMONISHED',
       'Cite or Arrest Adult', 'Open or Active', 'Unfounded',
       'Exceptional Adult', 'Cite or Arrest Juvenile',
       'Exceptional Juvenile'], dtype=object)

In [34]:
# to make names of the Resolution column consistent
res_dict = {'NONE':'none', 'ARREST, BOOKED':'arrest', 'EXCEPTIONAL CLEARANCE':'others', 'UNFOUNDED':'others',
       'JUVENILE BOOKED':'juvenile_related', 'CLEARED-CONTACT JUVENILE FOR MORE INFO':'juvenile_related',
       'JUVENILE DIVERTED':'juvenile_related', 'ARREST, CITED':'arrest', 'PSYCHOPATHIC CASE':'others',
       'NOT PROSECUTED':'others', 'LOCATED':'others', 'PROSECUTED FOR LESSER OFFENSE':'others',
       'JUVENILE CITED':'juvenile_related', 'COMPLAINANT REFUSES TO PROSECUTE':'others',
       'DISTRICT ATTORNEY REFUSES TO PROSECUTE':'others',
       'PROSECUTED BY OUTSIDE AGENCY':'others', 'JUVENILE ADMONISHED':'juvenile_related',
       'Cite or Arrest Adult':'arrest', 'Open or Active':'others', 'Unfounded':'others',
       'Exceptional Adult':'others', 'Cite or Arrest Juvenile':'juvenile_related',
       'Exceptional Juvenile':'juvenile_related'}
df1 = df1.replace({'Resolution': res_dict})
df1.Resolution.unique()

array(['none', 'arrest', 'others', 'juvenile_related'], dtype=object)

In [35]:
df1.head()

Unnamed: 0_level_0,Unnamed: 0,Address,Analysis Neighborhoods,Category,Current Police Districts,Current Supervisor Districts,DayOfWeek,Descript,IncidntNum,Location,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2017-12-31,0,800 Block of BRYANT ST,34.0,assault,1.0,10.0,Sunday,BATTERY,180000417,"(37.775420706711, -122.40340479147905)",SOUTHERN,none,32.0,23:58,-122.403405,37.775421
2017-12-31,1,500 Block of JONES ST,36.0,assault,5.0,10.0,Sunday,AGGRAVATED ASSAULT WITH A KNIFE,180000069,"(37.78627745916602, -122.41299907500884)",TENDERLOIN,arrest,20.0,23:56,-122.412999,37.786277
2017-12-31,4,500 Block of VALENCIA ST,20.0,theft,3.0,5.0,Sunday,PETTY THEFT OF PROPERTY,186005077,"(37.76408889445322, -122.42187648849193)",MISSION,none,37.0,23:55,-122.421876,37.764089
2017-12-31,5,700 Block of HARRISON ST,34.0,theft,1.0,10.0,Sunday,PETTY THEFT OF PROPERTY,176001461,"(37.782137249161906, -122.3978145063337)",SOUTHERN,none,32.0,23:50,-122.397815,37.782137
2017-12-31,6,HEMLOCK ST / POLK ST,21.0,alcohol_related,4.0,3.0,Sunday,UNDER INFLUENCE OF ALCOHOL IN A PUBLIC PLACE,180000025,"(37.787280707610776, -122.42002147104692)",NORTHERN,arrest,50.0,23:46,-122.420021,37.787281


In [36]:
df1.drop(['Unnamed: 0','Address','Current Supervisor Districts','IncidntNum','Descript','Location','Current Police Districts'],
         axis=1,inplace=True)

In [37]:
df1.head()

Unnamed: 0_level_0,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-31,34.0,assault,Sunday,SOUTHERN,none,32.0,23:58,-122.403405,37.775421
2017-12-31,36.0,assault,Sunday,TENDERLOIN,arrest,20.0,23:56,-122.412999,37.786277
2017-12-31,20.0,theft,Sunday,MISSION,none,37.0,23:55,-122.421876,37.764089
2017-12-31,34.0,theft,Sunday,SOUTHERN,none,32.0,23:50,-122.397815,37.782137
2017-12-31,21.0,alcohol_related,Sunday,NORTHERN,arrest,50.0,23:46,-122.420021,37.787281


In [38]:
df1.to_csv('/Users/wanranli/Downloads/!Capstone Project 1/SF_crime/datasets/police.csv')

# 2. Read the new simplified clean dataset

In [39]:
df_clean = pd.read_csv('/Users/wanranli/Downloads/!Capstone Project 1/SF_crime/datasets/police.csv')
df_clean.head()

Unnamed: 0,Date,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
0,2017-12-31,34.0,assault,Sunday,SOUTHERN,none,32.0,23:58,-122.403405,37.775421
1,2017-12-31,36.0,assault,Sunday,TENDERLOIN,arrest,20.0,23:56,-122.412999,37.786277
2,2017-12-31,20.0,theft,Sunday,MISSION,none,37.0,23:55,-122.421876,37.764089
3,2017-12-31,34.0,theft,Sunday,SOUTHERN,none,32.0,23:50,-122.397815,37.782137
4,2017-12-31,21.0,alcohol_related,Sunday,NORTHERN,arrest,50.0,23:46,-122.420021,37.787281


In [40]:
df_clean.isnull().sum()

Date                          0
Analysis Neighborhoods     7276
Category                      0
DayOfWeek                     0
PdDistrict                    0
Resolution                    0
SF Find Neighborhoods     13495
Time                          0
X                          6415
Y                          6415
dtype: int64

In [41]:
df_clean['Analysis Neighborhoods'].fillna(0, inplace=True)
df_clean['SF Find Neighborhoods'].fillna(0, inplace=True)
df_clean['X'].fillna(0, inplace=True)
df_clean['Y'].fillna(0, inplace=True)
df_clean.isnull().sum()

Date                      0
Analysis Neighborhoods    0
Category                  0
DayOfWeek                 0
PdDistrict                0
Resolution                0
SF Find Neighborhoods     0
Time                      0
X                         0
Y                         0
dtype: int64

In [42]:
df_clean['Category'].unique()

array(['assault', 'theft', 'alcohol_related', 'aided_case', 'sex_related',
       'vandalism', 'others', 'traffic_related', 'robbery', 'fraud',
       'drug_related', 'suicide', 'arson', 'kidnapping', 'extortion',
       'gambling', 'murder'], dtype=object)

In [43]:
# converting the target labels into numeric form 
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder() 
df1['Category']= label_encoder.fit_transform(df1['Category']) 
df1['Category'].unique() 

array([ 3, 14,  1,  0, 12, 16, 10, 15, 11,  6,  4, 13,  2,  8,  5,  7,  9])

In [44]:
pd_dict = {'SOUTHERN': 1, 'MISSION': 2, 'NORTHERN':3, 'CENTRAL':4, 'BAYVIEW':5, 'INGLESIDE':6, 'TARAVAL':7,
           'TENDERLOIN':8, 'PARK':9, 'RICHMOND':10, 'OUT OF SF':11}

In [45]:
df_clean.replace({'PdDistrict': pd_dict}, inplace=True)
df_clean.head()

Unnamed: 0,Date,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
0,2017-12-31,34.0,assault,Sunday,1,none,32.0,23:58,-122.403405,37.775421
1,2017-12-31,36.0,assault,Sunday,8,arrest,20.0,23:56,-122.412999,37.786277
2,2017-12-31,20.0,theft,Sunday,2,none,37.0,23:55,-122.421876,37.764089
3,2017-12-31,34.0,theft,Sunday,1,none,32.0,23:50,-122.397815,37.782137
4,2017-12-31,21.0,alcohol_related,Sunday,3,arrest,50.0,23:46,-122.420021,37.787281


In [46]:
day_dict = {'Monday':1, 'Tuesday':2, 'Wednesday':3,'Thursday':4,'Friday':5 ,  'Saturday':6, 'Sunday':7}
df_clean.replace({'DayOfWeek':day_dict}, inplace=True)
df_clean.head()

Unnamed: 0,Date,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
0,2017-12-31,34.0,assault,7,1,none,32.0,23:58,-122.403405,37.775421
1,2017-12-31,36.0,assault,7,8,arrest,20.0,23:56,-122.412999,37.786277
2,2017-12-31,20.0,theft,7,2,none,37.0,23:55,-122.421876,37.764089
3,2017-12-31,34.0,theft,7,1,none,32.0,23:50,-122.397815,37.782137
4,2017-12-31,21.0,alcohol_related,7,3,arrest,50.0,23:46,-122.420021,37.787281


In [47]:
resolution_dict = {'none':0, 'arrest':1, 'juvenile_related':2,'others':3}
df_clean.replace({'Resolution':resolution_dict}, inplace=True)
df_clean.head()

Unnamed: 0,Date,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y
0,2017-12-31,34.0,assault,7,1,0,32.0,23:58,-122.403405,37.775421
1,2017-12-31,36.0,assault,7,8,1,20.0,23:56,-122.412999,37.786277
2,2017-12-31,20.0,theft,7,2,0,37.0,23:55,-122.421876,37.764089
3,2017-12-31,34.0,theft,7,1,0,32.0,23:50,-122.397815,37.782137
4,2017-12-31,21.0,alcohol_related,7,3,1,50.0,23:46,-122.420021,37.787281


In [48]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1822068 entries, 0 to 1822067
Data columns (total 10 columns):
Date                      object
Analysis Neighborhoods    float64
Category                  object
DayOfWeek                 int64
PdDistrict                int64
Resolution                int64
SF Find Neighborhoods     float64
Time                      object
X                         float64
Y                         float64
dtypes: float64(4), int64(3), object(3)
memory usage: 139.0+ MB


In [49]:
df_clean['Date'] = pd.to_datetime(df_clean['Date'])
df_clean['Day'] = df_clean['Date'].dt.day
df_clean['Month'] = df_clean['Date'].dt.month
df_clean['Year'] = df_clean['Date'].dt.year

df_clean['Time'] = pd.to_datetime(df_clean['Time'])
df_clean['Hour'] = df_clean['Time'].dt.hour
df_clean['Minute'] = df_clean['Time'].dt.minute
df_clean.head()

Unnamed: 0,Date,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,Time,X,Y,Day,Month,Year,Hour,Minute
0,2017-12-31,34.0,assault,7,1,0,32.0,2020-02-08 23:58:00,-122.403405,37.775421,31,12,2017,23,58
1,2017-12-31,36.0,assault,7,8,1,20.0,2020-02-08 23:56:00,-122.412999,37.786277,31,12,2017,23,56
2,2017-12-31,20.0,theft,7,2,0,37.0,2020-02-08 23:55:00,-122.421876,37.764089,31,12,2017,23,55
3,2017-12-31,34.0,theft,7,1,0,32.0,2020-02-08 23:50:00,-122.397815,37.782137,31,12,2017,23,50
4,2017-12-31,21.0,alcohol_related,7,3,1,50.0,2020-02-08 23:46:00,-122.420021,37.787281,31,12,2017,23,46


In [50]:
df_clean.drop(['Date','Time'], axis=1, inplace=True)
df_clean.head()

Unnamed: 0,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,X,Y,Day,Month,Year,Hour,Minute
0,34.0,assault,7,1,0,32.0,-122.403405,37.775421,31,12,2017,23,58
1,36.0,assault,7,8,1,20.0,-122.412999,37.786277,31,12,2017,23,56
2,20.0,theft,7,2,0,37.0,-122.421876,37.764089,31,12,2017,23,55
3,34.0,theft,7,1,0,32.0,-122.397815,37.782137,31,12,2017,23,50
4,21.0,alcohol_related,7,3,1,50.0,-122.420021,37.787281,31,12,2017,23,46


In [51]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1822068 entries, 0 to 1822067
Data columns (total 13 columns):
Analysis Neighborhoods    float64
Category                  object
DayOfWeek                 int64
PdDistrict                int64
Resolution                int64
SF Find Neighborhoods     float64
X                         float64
Y                         float64
Day                       int64
Month                     int64
Year                      int64
Hour                      int64
Minute                    int64
dtypes: float64(4), int64(8), object(1)
memory usage: 180.7+ MB


In [30]:
# Save the data for ML
df_clean.to_csv('/Users/wanranli/Downloads/!Capstone Project 1/SF_crime/datasets/police_ML.csv')

# 3. Import Dataset Prepped for ML

In [2]:
df_clean = pd.read_csv('/Users/wanranli/Downloads/!Capstone Project 1/SF_crime/datasets/police_ML.csv')
df_clean.head()

Unnamed: 0.1,Unnamed: 0,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,X,Y,Day,Month,Year,Hour,Minute
0,0,34.0,3,7,1,0,32.0,-122.403405,37.775421,31,12,2017,23,58
1,1,36.0,3,7,8,1,20.0,-122.412999,37.786277,31,12,2017,23,56
2,2,20.0,14,7,2,0,37.0,-122.421876,37.764089,31,12,2017,23,55
3,3,34.0,14,7,1,0,32.0,-122.397815,37.782137,31,12,2017,23,50
4,4,21.0,1,7,3,1,50.0,-122.420021,37.787281,31,12,2017,23,46


In [3]:
# df_clean = df_clean[df_clean['Year'].isin([2018,2017])]
df_clean = df_clean[df_clean['Year']==2018]
df_clean

Unnamed: 0.1,Unnamed: 0,Analysis Neighborhoods,Category,DayOfWeek,PdDistrict,Resolution,SF Find Neighborhoods,X,Y,Day,Month,Year,Hour,Minute
1711355,1711355,8.0,10,7,1,1,32.0,-122.404795,37.784908,2,12,2018,0,45
1711356,1711356,36.0,0,6,4,3,19.0,-122.408036,37.786410,1,12,2018,20,30
1711357,1711357,20.0,14,5,2,3,53.0,-122.416549,37.766871,16,11,2018,1,34
1711358,1711358,34.0,14,7,1,3,32.0,-122.407015,37.777400,19,8,2018,23,0
1711359,1711359,0.0,14,1,5,3,0.0,0.000000,0.000000,31,12,2018,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822063,1822063,36.0,10,7,1,3,21.0,-122.419397,37.775422,3,6,2018,7,0
1822064,1822064,6.0,11,3,4,3,106.0,-122.401907,37.796876,17,1,2018,23,0
1822065,1822065,3.0,14,7,3,3,26.0,-122.432280,37.772829,30,12,2018,20,20
1822066,1822066,18.0,0,3,9,3,24.0,-122.446284,37.774871,24,1,2018,5,0


In [4]:
df_clean.shape

(110713, 14)

In [5]:
df_clean.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
# importing alll the necessary packages to use the various classification algorithms
from sklearn.linear_model import LogisticRegression  # for Logistic Regression algorithm
from sklearn.model_selection import train_test_split #to split the dataset for training and testing
from sklearn.neighbors import KNeighborsClassifier  # for K nearest neighbours
from sklearn import svm  #for Support Vector Machine (SVM) Algorithm
from sklearn import metrics #for checking the model accuracy
from sklearn.tree import DecisionTreeClassifier #for using Decision Tree Algoithm

from time import sleep

In [7]:
df_clean.columns

Index(['Analysis Neighborhoods', 'Category', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'SF Find Neighborhoods', 'X', 'Y', 'Day', 'Month', 'Year',
       'Hour', 'Minute'],
      dtype='object')

In [10]:
features = ['Analysis Neighborhoods', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'SF Find Neighborhoods', 'X', 'Y', 'Day', 'Month', 'Year',
       'Hour', 'Minute']
X = df_clean[features]
y = df_clean.Category

# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
parameters = {'gamma':[1,2,3], 'min_child_weight':[0,1,2], 'learning_rate': [0.25, 0.2, 0.1], 
              'subsample':[1,0.8], 'scale_pos_weight':[0.95, 0.9], 'max_delta_step':[0,0.05]}

In [10]:
!pip install xgboost



In [12]:
from xgboost import XGBClassifier, cv, plot_importance
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# instantiate a classifier
xg_clf = XGBClassifier(objective = 'multi:softprob', random_state = 42, silent = 1, n_estimators=10, max_depth= 8,
                      num_class=17)

# RandomSearch
grid_search = GridSearchCV(param_grid= parameters, estimator = xg_clf, cv=3, scoring='accuracy')
grid_search.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=8, min_child_weight=1,
                                     missing=None, n_estimators=10, n_jobs=1,
                                     nthread=None, num_class=17,
                                     objective='multi:softprob',
                                     random_state=42, reg_a...eg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=1,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'gamma': [1, 2, 3], 'learning_rate': [0.25, 0.2, 0.1],
                         'max_delta_step': [0,

In [13]:
from sklearn.metrics import accuracy_score, mean_squared_error

In [14]:
# Print best parameters and results
print(grid_search.best_params_)
print(grid_search.best_score_)

{'gamma': 1, 'learning_rate': 0.25, 'max_delta_step': 0, 'min_child_weight': 1, 'scale_pos_weight': 0.95, 'subsample': 0.8}
0.5114646640601815


In [15]:
# optimized parameters
optimized_parameters = {'objective':'multi:softprob',
                        'n_estimators': 10,
                        'random_state': 42,
                        'silent': 0,
                        'subsample': 1, 
                        'scale_pos_weight': 1, 
                        'min_child_weight': 2, 
                        'max_depth': 8, 
                        'max_delta_step': 0, 
                        'learning_rate': 0.2, 
                        'gamma': 3,
                        'num_class':17
                       }

In [16]:
# instantiate a classifier
xg_clf_opt = XGBClassifier(**optimized_parameters)
xg_clf_opt.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=3,
              learning_rate=0.2, max_delta_step=0, max_depth=8,
              min_child_weight=2, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, num_class=17, objective='multi:softprob',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=0, subsample=1, verbosity=1)

In [17]:
# predict the labels
y_opt_pred = xg_clf_opt.predict(X_train)

# score model
score = accuracy_score(y_train, y_opt_pred)
print("The accuracy of the optimized classifier is {}".format(round(score, 5)))

The accuracy of the optimized classifier is 0.52841


In [63]:
# use classification_report to evaluate the performance
from sklearn.metrics import classification_report
report = classification_report(y_train, y_opt_pred, output_dict=True)
report

  'precision', 'predicted', average, warn_for)


{'0': {'precision': 0.5114060555785981,
  'recall': 0.1976753507014028,
  'f1-score': 0.2851361507775915,
  'support': 12475},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15},
 '2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 174},
 '3': {'precision': 0.7777777777777778,
  'recall': 0.005011933174224344,
  'f1-score': 0.009959686981266303,
  'support': 4190},
 '4': {'precision': 0.5555555555555556,
  'recall': 0.19148936170212766,
  'f1-score': 0.2848101265822785,
  'support': 1175},
 '6': {'precision': 0.5495495495495496,
  'recall': 0.048355132778438364,
  'f1-score': 0.08888888888888888,
  'support': 2523},
 '7': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3},
 '9': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6},
 '10': {'precision': 0.5014968192590745,
  'recall': 0.4448932167754786,
  'f1-score': 0.47150228685352413,
  'support': 18074},
 '11': {'precision': 0.4282700421940928,
  'recall': 0.039494163424

In [86]:
df_report = pd.DataFrame(report).transpose()
df_report

Unnamed: 0,precision,recall,f1-score,support
0,0.511406,0.197675,0.285136,12475.0
1,0.0,0.0,0.0,15.0
2,0.0,0.0,0.0,174.0
3,0.777778,0.005012,0.00996,4190.0
4,0.555556,0.191489,0.28481,1175.0
6,0.54955,0.048355,0.088889,2523.0
7,0.0,0.0,0.0,3.0
9,0.0,0.0,0.0,6.0
10,0.501497,0.444893,0.471502,18074.0
11,0.42827,0.039494,0.072319,5140.0


In [98]:
name = {'3':'assault', '14': 'theft', '1': 'alcohol_related', '0': 'aided_case', '12': 'sex_related',
       '16':'vandalism', '10':'others', '15':'traffic_related', '11': 'robbery', '6': 'fraud',
       '4': 'drug_related', '13': 'suicide', '2': 'arson', '8': 'kidnapping', '5': 'extortion',
       '7':'gambling', '9':'murder', 'accuracy':'accuracy', 'macro avg':'macro avg', 'weighted avg':'weighted avg'}

In [99]:
df_report.index

Index(['0', '1', '2', '3', '4', '6', '7', '9', '10', '11', '12', '13', '14',
       '15', '16', 'accuracy', 'macro avg', 'weighted avg'],
      dtype='object')

In [100]:
df_report['category'] = df_report.index
df_report

Unnamed: 0,precision,recall,f1-score,support,category
0,0.511406,0.197675,0.285136,12475.0,0
1,0.0,0.0,0.0,15.0,1
2,0.0,0.0,0.0,174.0,2
3,0.777778,0.005012,0.00996,4190.0,3
4,0.555556,0.191489,0.28481,1175.0,4
6,0.54955,0.048355,0.088889,2523.0,6
7,0.0,0.0,0.0,3.0,7
9,0.0,0.0,0.0,6.0,9
10,0.501497,0.444893,0.471502,18074.0,10
11,0.42827,0.039494,0.072319,5140.0,11


In [102]:
df_report['category'] = df_report['category'].map(name)
df_report

Unnamed: 0,precision,recall,f1-score,support,category
0,0.511406,0.197675,0.285136,12475.0,aided_case
1,0.0,0.0,0.0,15.0,alcohol_related
2,0.0,0.0,0.0,174.0,arson
3,0.777778,0.005012,0.00996,4190.0,assault
4,0.555556,0.191489,0.28481,1175.0,drug_related
6,0.54955,0.048355,0.088889,2523.0,fraud
7,0.0,0.0,0.0,3.0,gambling
9,0.0,0.0,0.0,6.0,murder
10,0.501497,0.444893,0.471502,18074.0,others
11,0.42827,0.039494,0.072319,5140.0,robbery


In [103]:
df_report.set_index('category')

Unnamed: 0_level_0,precision,recall,f1-score,support
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aided_case,0.511406,0.197675,0.285136,12475.0
alcohol_related,0.0,0.0,0.0,15.0
arson,0.0,0.0,0.0,174.0
assault,0.777778,0.005012,0.00996,4190.0
drug_related,0.555556,0.191489,0.28481,1175.0
fraud,0.54955,0.048355,0.088889,2523.0
gambling,0.0,0.0,0.0,3.0
murder,0.0,0.0,0.0,6.0
others,0.501497,0.444893,0.471502,18074.0
robbery,0.42827,0.039494,0.072319,5140.0
