In [1]:
import numpy as np
import pandas as pd
import patsy
import sklearn.linear_model as linear_model

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

#### Read in data and conduct an exploratory data analysis.
#### Resolve any data issues you identify and articulate why you did what you did.

In [49]:
sf_crime = pd.read_csv('datasets/sf_crime_train.csv')

In [50]:
sf_crime.head()

## since we will be predicting violent crime versus non-violent crime versus non-crimes
## which are based off of category, we can drop 'descript'
## can also drop address because there could be plenty of formatting variations
## and we have x,y as lat, long

sf_crime.drop(['Descript','Address'], inplace=True, axis=1)

In [54]:
## convert dayofweek to numbers so that if we need to look at data by day,
## we can have the days in the correct order

sf_crime.day = [1 if day == 'Sunday' else 2 if day == 'Monday' else 3 if day == 'Tuesday' else 4 if day == 'Wednesday' else 5 if day == 'Thursday' else 6 if day == 'Friday' else 7 for day in sf_crime.DayOfWeek]
sf_crime.drop('DayOfWeek', inplace=True, axis=1)


#### Create column for hour, month, and year from 'Dates' column.
##### Hint: pd.to_datetime may be helpful.

In [55]:
# print sf_crime.head()
sf_crime['Dates'] = pd.to_datetime(sf_crime.Dates)

sf_crime['hour'] = sf_crime.Dates.dt.hour
sf_crime['month'] = sf_crime.Dates.dt.month
sf_crime['year'] = sf_crime.Dates.dt.year

### Build a logit model predicting violent crime versus non-violent crime versus non-crimes.

##### We've given you the various crimes that fall into specific categories so you can use these to create these subcategories 

#### Non-Violent Crimes: bad checks, bribery, drug/narcotic, drunkenness, embezzlement, forgery/counterfeiting, fraud, gambling, liquor, loitering, trespass.

#### Non-Crimes: non-criminal, runaway, secondary codes, suspicious occ, warrants.

#### Violent Crimes: everything else.

###### Hint: What type of model do you need here? What should your "baseline" category be?

Need a logistic regression model
baseline = violent crime

#### Fit a model with five folds and lasso regularization
#### Use Cs=15 to test a grid of 15 distinct parameters
#### Remember: Cs describes the inverse of regularization strength

In [58]:
import patsy
nvc = ['BAD CHECKS', 'BRIBERY', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'LIQUOR', 'LOITERING', 'TRESSPASS']
nc = ['NON-CRIMINAL', 'RUNAWAY', 'SECONDARY CODES', 'SUSPICIOUS OCC', 'WARRANTS']

sf_crime['Subcategory'] = ['Non-Violent Crime' if x in (nvc) else 'Non-Crime' if x in (nc) else 'Violent Crime' for x in sf_crime['Category']]


In [85]:
X = patsy.dmatrix('~ C(PdDistrict) + C(Resolution)', sf_crime)
X = pd.DataFrame(X, columns=X.design_info.column_names)

y = sf_crime.Subcategory

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=77)


logreg_cv = LogisticRegressionCV(solver='liblinear', Cs=15, cv=5, penalty='l1')
logreg_cv.fit(X_test, y_test)
logreg_cv.score(X_test, y_test)
y_pred = logreg_cv.predict(X_test)
print ('Max auc_roc:', logreg_cv.scores_)

# 0.79562658

#Classification report:
from sklearn.metrics import classification_report
print classification_report(y_test, y_pred)


#Confusion Matrix:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm
# array([[ 421, 1182],
#        [  39, 4298]])


('Max auc_roc:', {'Violent Crime': array([[ 0.26997477,  0.26997477,  0.73002523,  0.73002523,  0.73002523,
         0.73759462,  0.73759462,  0.73591253,  0.73591253,  0.73507149,
         0.73507149,  0.73507149,  0.73507149,  0.73507149,  0.73507149],
       [ 0.26997477,  0.26997477,  0.73002523,  0.73002523,  0.73002523,
         0.74011775,  0.74011775,  0.74095879,  0.74095879,  0.74095879,
         0.74095879,  0.74095879,  0.74095879,  0.74095879,  0.74095879],
       [ 0.27020202,  0.27020202,  0.72979798,  0.72979798,  0.72979798,
         0.74242424,  0.74242424,  0.74326599,  0.74326599,  0.74326599,
         0.74326599,  0.74326599,  0.74326599,  0.74326599,  0.74326599],
       [ 0.26958719,  0.26958719,  0.73041281,  0.73041281,  0.73041281,
         0.73294019,  0.73294019,  0.73378265,  0.73294019,  0.73294019,
         0.73294019,  0.73294019,  0.73294019,  0.73294019,  0.73294019],
       [ 0.26958719,  0.26958719,  0.73041281,  0.73041281,  0.73041281,
         0.7

  'precision', 'predicted', average, warn_for)


array([[  86,    0, 1182],
       [   7,    0,  328],
       [  39,    0, 4298]])

In [79]:
# find best C per class
print('best C for class:')
best_C = {logreg_cv.classes_[i]:x for i, (x, c) in enumerate(zip(logreg_cv.C_, logreg_cv.classes_))}
print(best_C)

best C for class:
{0.0: 1.0}


In [89]:
cm = confusion_matrix(y_test, y_pred)
cm = pd.DataFrame(cm, columns=(logreg_cv.classes_ + ' PREDICTED'), index=(logreg_cv.classes_ + ' TRUE'))
cm

Unnamed: 0,Non-Crime PREDICTED,Non-Violent Crime PREDICTED,Violent Crime PREDICTED
Non-Crime TRUE,86,0,1182
Non-Violent Crime TRUE,7,0,328
Violent Crime TRUE,39,0,4298


#### Fit regular logit model to 'DRUG/NARCOTIC' and 'BURGLARY' classes; be sure to specify the Y = 1 and Y = 0 classes. Use lasso penalty

In [None]:
import statsmodels.formula.api as sm
import pandas as pd

model = sm.logit(
    "Subcategory ~ ",
    data = sf_crime
).fit()

model.summary()

#### Build confusion matrices for the models above


#### Run gridsearch using GridSearchCV and 5 folds
#### Score on accuracy; what does this metric tell us?

In [None]:
logreg = LogisticRegression()
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

#### Find the best parameter

#### Use this parameter to .fit, .predict, and print a classification_report (we've already imported this for you) for our X and Y

#### When looking at the classification report, remember:
##### Precision = True Positives / (True Positives + False Positives)

A precision score of 1 indicates that the classifier never mistakenly added observations from another class. A precision score of 0 would mean that the classifier misclassified every instance of the current class.
 
##### Recall = True Positives / (True Positives + False Negatives)
A recall score of 1 indicates that the classifier correctly predicted (found) all observations of the current class (by implication, no false negatives, or misclassifications of the current class). A recall score of 0 alternatively means that the classifier missed all observations of the current class.

##### F1-Score = 2 * (Precision * Recall) / (Precision + Recall)
The f1-score's best value is 1 and worst value is 0, like the precision and recall scores. It is a useful metric for taking into account both measures at once. 

##### Support is simply the number of observations of the labelled class.

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html