<a href="https://colab.research.google.com/github/mounirouadi/Gridsearch-and-Multinomial-Models-with-SF-Crime-Data/blob/main/Proj2_gridsearch_multinomial_sf_crime.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Necessary lab imports

In [1]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV


import seaborn as sns

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
!gdown --id 1I_7TRwMgQoxpISpVH0G5sPMUsTdpOhx0

Downloading...
From: https://drive.google.com/uc?id=1I_7TRwMgQoxpISpVH0G5sPMUsTdpOhx0
To: /content/sf_crime_train.csv
0.00B [00:00, ?B/s]2.29MB [00:00, 73.4MB/s]


#1. Read in the data

In [3]:
crime_csv = 'sf_crime_train.csv'

In [4]:
#read in the data using pandas
sf_crime = pd.read_csv(crime_csv)
sf_crime.drop('DayOfWeek',axis=1,inplace=True)
sf_crime.head()

Unnamed: 0,Dates,Category,Descript,PdDistrict,Resolution,Address,X,Y
0,5/13/15 23:53,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,5/13/15 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,5/13/15 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
sf_crime.shape

(18000, 8)

In [6]:
#checking if there's any missing value in our dataframe
sf_crime.isnull().values.any()

False

the returned value is false, which means there's no missing data (NaN) in our dataframe and we don't need to fix anything.

In [7]:
#checking the datatype of our dataframe
sf_crime.dtypes

Dates          object
Category       object
Descript       object
PdDistrict     object
Resolution     object
Address        object
X             float64
Y             float64
dtype: object

#2. Create column for year, month, day, hour, time, and date from 'Dates' column.

Everything looks good, both X and Y are in float64 type, there's nothing to fix anything.

In [8]:
sf_crime['Dates'] = pd.to_datetime(sf_crime['Dates'])

In [9]:
# create a new column for 'Year','Month',and 'Day_of_Week'
sf_crime['Year'] = sf_crime['Dates'].dt.year
sf_crime['Month'] = sf_crime['Dates'].dt.month
sf_crime['Day_of_Week'] = sf_crime['Dates'].dt.dayofweek
#check the first couple rows to make sure it's what you want
sf_crime.head(2)

Unnamed: 0,Dates,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2


In [10]:
# create a column for the 'Hour','Time', and 'Date'
sf_crime['Hour'] = sf_crime['Dates'].dt.hour
sf_crime['Time'] = sf_crime['Dates'].dt.time
sf_crime['Date'] = sf_crime['Dates'].dt.date

In [15]:
# Drop the 'Dates' column
sf_crime=sf_crime.drop(['Dates'], axis = 1)
#check the first couple rows to make sure it's what you want
sf_crime.head(2)

Unnamed: 0,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week,Hour,Time,Date
0,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2,23,23:53:00,2015-05-13
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2,23,23:53:00,2015-05-13


#3. Validate and clean the data.

In [17]:
sf_crime.value_counts('Category')

Category
LARCENY/THEFT                  4885
OTHER OFFENSES                 2291
NON-CRIMINAL                   2255
ASSAULT                        1536
VEHICLE THEFT                   967
VANDALISM                       877
BURGLARY                        732
WARRANTS                        728
SUSPICIOUS OCC                  592
MISSING PERSON                  535
DRUG/NARCOTIC                   496
ROBBERY                         465
FRAUD                           363
SECONDARY CODES                 261
WEAPON LAWS                     212
TRESPASS                        130
STOLEN PROPERTY                 111
SEX OFFENSES FORCIBLE           103
FORGERY/COUNTERFEITING           85
DRUNKENNESS                      74
KIDNAPPING                       50
PROSTITUTION                     44
DRIVING UNDER THE INFLUENCE      42
DISORDERLY CONDUCT               37
ARSON                            35
LIQUOR LAWS                      25
RUNAWAY                          16
EMBEZZLEMENT       

In [49]:
sf_crime.loc[sf_crime['Category'] == 'ASSUALT','Category'] = 'ASSAULT'

In [50]:
sf_crime.loc[sf_crime['Category'] == 'TRESPASSING','Category'] = 'TRESPASS'

In [51]:
# have a look to see whether you have all the days of the week in your data
sf_crime['Day_of_Week'].value_counts()

2    2930
4    2733
5    2556
3    2479
6    2456
0    2447
1    2399
Name: Day_of_Week, dtype: int64

In [52]:
# have a look at the value counts for 'Descript', 'PdDistrict', and 'Resolution' to make sure it all checks out
sf_crime['Descript'].value_counts()

GRAND THEFT FROM LOCKED AUTO                              2127
STOLEN AUTOMOBILE                                          625
AIDED CASE, MENTAL DISTURBED                               591
DRIVERS LICENSE, SUSPENDED OR REVOKED                      589
BATTERY                                                    520
                                                          ... 
STOLEN TRAILER                                               1
ROBBERY OF A BANK WITH A KNIFE                               1
ATTEMPTED KIDNAPPING, ADULT VICTIM                           1
INTERFERRING WITH A POLICE OFFICER                           1
DRIVING WHILE UNDER THE INFLUENCE OF ALCOHOL, W/INJURY       1
Name: Descript, Length: 510, dtype: int64

In [53]:
sf_crime['PdDistrict'].value_counts()

SOUTHERN      3287
NORTHERN      2250
CENTRAL       2206
MISSION       2118
BAYVIEW       1678
INGLESIDE     1628
TARAVAL       1426
TENDERLOIN    1327
RICHMOND      1101
PARK           979
Name: PdDistrict, dtype: int64

In [54]:
sf_crime['Resolution'].value_counts()

NONE                                      12862
ARREST, BOOKED                             4455
UNFOUNDED                                   367
ARREST, CITED                               100
JUVENILE BOOKED                              94
EXCEPTIONAL CLEARANCE                        58
PSYCHOPATHIC CASE                            28
LOCATED                                      25
CLEARED-CONTACT JUVENILE FOR MORE INFO       10
NOT PROSECUTED                                1
Name: Resolution, dtype: int64

In [55]:
# use .describe() to see whether the location coordinates seem appropriate
sf_crime.describe()

Unnamed: 0,X,Y,Year,Month,Day_of_Week,Hour
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,-122.423639,37.768466,2015.0,3.489944,3.008,13.646833
std,0.026532,0.024391,0.0,0.868554,1.966564,6.53904
min,-122.513642,37.708154,2015.0,2.0,0.0,0.0
25%,-122.434199,37.753838,2015.0,3.0,1.0,10.0
50%,-122.416949,37.775608,2015.0,3.0,3.0,15.0
75%,-122.406539,37.78539,2015.0,4.0,5.0,19.0
max,-122.365565,37.819923,2015.0,5.0,6.0,23.0


#4. Set up a target and predictor matrix for predicting violent crime vs. non-violent crime vs. non-crimes.

In [57]:
NVC = ['BAD CHECKS','BRIBERY','DRUG/NARCOTIC','DRUNKENNESS',
     'EMBEZZLEMENT','FORGERY/COUNTERFEITING','FRAUD',
     'GAMBLING','LIQUOR','LOITERING','TRESPASS','OTHER OFFENSES']

NOT_C = ['NON-CRIMINAL','RUNAWAY','SECONDARY CODES','SUSPICIOUS OCC','WARRANTS']

#use a list comprehension to get all the categories in sf_crime['Category'].unique() that are NOT in the lists above
VC = [crime for crime in sf_crime['Category'].unique() if crime not in NVC+NOT_C]

In [58]:
#add a column called 'Type' into your dataframe that stores whether the observation was:
#Non-Violent, Violent, or Non-Crime
#use .map()!
def typecrime(x):
    if x in NOT_C: return 'NOT_CRIMINAL'
    if x in NVC: return 'NON-VIOLENT'
    if x in VC: return 'VIOLENT_CRIME'

sf_crime['Type']= sf_crime['Category'].map(typecrime)

In [59]:
sf_crime.head()

Unnamed: 0,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week,Hour,Time,Date,Type
0,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2,23,23:53:00,2015-05-13,NOT_CRIMINAL
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,2,23,23:53:00,2015-05-13,NON-VIOLENT
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,2,23,23:33:00,2015-05-13,NON-VIOLENT
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,2,23,23:30:00,2015-05-13,VIOLENT_CRIME
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,2,23,23:30:00,2015-05-13,VIOLENT_CRIME


In [60]:
#find the baseline accuracy:
sf_crime['Type'].value_counts().max() / len(sf_crime)

0.5931111111111111

In [61]:
#create a target array with 'Type'
#create a predictor matrix with 'Day_of_Week','Month','Year','PdDistrict','Hour', and 'Resolution'
y = sf_crime['Type']
X = sf_crime[['Day_of_Week','Month','Year','PdDistrict','Hour','Resolution']]

In [62]:
#use pd.get_dummies() to dummify your categorical variables
#remember to drop a column!
X = pd.get_dummies(X,drop_first=True)
X.head()

Unnamed: 0,Day_of_Week,Month,Year,Hour,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,"Resolution_ARREST, CITED",Resolution_CLEARED-CONTACT JUVENILE FOR MORE INFO,Resolution_EXCEPTIONAL CLEARANCE,Resolution_JUVENILE BOOKED,Resolution_LOCATED,Resolution_NONE,Resolution_NOT PROSECUTED,Resolution_PSYCHOPATHIC CASE,Resolution_UNFOUNDED
0,2,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,2,5,2015,23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


#5. Create a train / test / split and standardize the predictor matrices

In [63]:
#create a 50/50 train test split; 
#stratify based on your target variable
#use a random state of 2018
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.5, stratify=y, random_state=2018)

In [64]:
#standardise your predictor matrices
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

#6. Create a basic Logistic Regression model and use cross_val_score to assess its performance on your training data

In [65]:
#create a default Logistic Regression model and find its mean cross-validated accuracy with your training data
#use 5 cross-validation folds
lr = LogisticRegression()
cross_val_score(lr, X_train_ss, y_train, cv=5).mean()

0.6366666666666667

In [66]:
#create a confusion matrix with cross_val_predict
predictions = cross_val_predict(lr, X_train_ss, y_train, cv=5)
confusion = confusion_matrix(y_test,predictions)
pd.DataFrame(confusion,
             columns=sorted(y_train.unique()),
             index=sorted(y_train.unique()))

Unnamed: 0,NON-VIOLENT,NOT_CRIMINAL,VIOLENT_CRIME
NON-VIOLENT,379,45,1312
NOT_CRIMINAL,435,49,1442
VIOLENT_CRIME,1097,127,4114


#7. Find the optimal hyperparameters (optimal regularization) to predict your crime categories using GridSearchCV.

In [68]:
#create a hyperparameter dictionary for a logistic regression
hparams={'penalty':['l1','l2'] , 'solver':['liblinear'] , 'C':np.logspace(-3,0,50)}

In [69]:
#create a gridsearch object using LogisticRegression() and the dictionary you created above
crime_gs=GridSearchCV(LogisticRegression() , hparams , n_jobs=-1 , cv=5)

In [70]:
#fit the gridsearch object on your training data
crime_gs.fit(X_train_ss,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.001     , 0.0011514 , 0.00...
       0.03393222, 0.0390694 , 0.04498433, 0.05179475, 0.05963623,
       0.06866488, 0.07906043, 0.09102982, 0.10481131, 0.12067926,
       0.13894955, 0.15998587, 0.184207  , 0.21209509, 0.24420531,
       0.28117687, 0.32374575, 0.37275937, 0.42919343, 0.49417134,
       0.5689866 

In [71]:
#print out the best parameters
crime_gs.best_params_

{'C': 0.09102981779915217, 'penalty': 'l1', 'solver': 'liblinear'}

In [72]:
#print out the best mean cross-validated score
crime_gs.best_score_

0.6373333333333333

In [73]:
#assign your best estimator to the variable 'best_logreg'
best_lr=crime_gs.best_estimator_

In [74]:
#score your model on your testing data
best_lr.score(X_test_ss,y_test)

0.629

#8. Print out a classification report for your best_logreg model

In [75]:
#use your test data to create your classification report
predictions = best_lr.predict(X_test_ss)
print(classification_report(y_test, predictions))

               precision    recall  f1-score   support

  NON-VIOLENT       0.45      0.51      0.48      1736
 NOT_CRIMINAL       0.62      0.07      0.12      1926
VIOLENT_CRIME       0.68      0.87      0.76      5338

     accuracy                           0.63      9000
    macro avg       0.59      0.48      0.46      9000
 weighted avg       0.62      0.63      0.57      9000

