In [1]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV

In [16]:
# prep data, covert date to datetime, split for train and test, and build model
sf_crime = pd.read_csv('../../assets/datasets/sf_crime_train.csv')

sf_crime = sf_crime.dropna()

sf_crime['Dates'] = pd.to_datetime(sf_crime.Dates)
sf_crime_dates = pd.DatetimeIndex(sf_crime.Dates.values, dtype='datetime64[ns]', freq=None)

sf_crime['hour'] = sf_crime_dates.hour
sf_crime['month'] = sf_crime_dates.month
sf_crime['year'] = sf_crime_dates.year

In [14]:
subset = ['VEHICLE THEFT','BURGLARY','DRUG/NARCOTIC']
sf_crime_sub = sf_crime[sf_crime['Category'].str.contains('|'.join(subset))]

#sf_sample = sf_crime_sub.sample(frac=0.50)

X = patsy.dmatrix('~ C(hour) + C(DayOfWeek) + C(PdDistrict)', sf_crime_sub)
Y = sf_crime_sub.Category.values

# split for train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, stratify=Y, random_state=77)

['VEHICLE THEFT' 'VEHICLE THEFT' 'VEHICLE THEFT' ..., 'VEHICLE THEFT'
 'BURGLARY' 'VEHICLE THEFT']


In [8]:
# fit model with five folds and lasso regularization
# use Cs=15 to test a grid of 15 distinct parameters
# remeber: Cs describes the inverse of regularization strength
logreg_cv = LogisticRegressionCV(Cs = 15, cv = 5, penalty = 'l1', solver='liblinear') # update inputs here
logreg_cv.fit(X_train, Y_train)

LogisticRegressionCV(Cs=15, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [9]:
# find best C per class
print('best C for class:')
best_C = {logreg_cv.classes_[i]:x for i, (x, c) in enumerate(zip(logreg_cv.C_, logreg_cv.classes_))}
print(best_C)

best C for class:
{'DRUG/NARCOTIC': 1.0, 'VEHICLE THEFT': 3.7275937203149381, 'BURGLARY': 1.0}


In [27]:
# fit regular logit model to 'DRUG/NARCOTIC' and 'BURGLARY' classes
# use lasso penalty

subset = ['BURGLARY','DRUG/NARCOTIC']
sf_crime_sub = sf_crime[sf_crime['Category'].str.contains('|'.join(subset))]

logreg_2 = LogisticRegressionCV(penalty = 'l1', Cs = 15, cv = 5, solver='liblinear')
model = logreg_2.fit(X_train, Y_train)

In [28]:
# build confusion matrices for the models above
Y_1_pred = logreg_cv.predict(X_train)
Y_2_pred = logreg_2.predict(X_train)

conmat_1 = confusion_matrix(Y_train, Y_1_pred, labels=logreg_cv.classes_)
conmat_1 = pd.DataFrame(conmat_1, columns=logreg_cv.classes_, index=logreg_cv.classes_)

conmat_2 = confusion_matrix(Y_train, Y_2_pred, labels=logreg_2.classes_)
conmat_2 = pd.DataFrame(conmat_2, columns=logreg_2.classes_, index=logreg_2.classes_)


In [31]:
# print classification reports
print(classification_report(Y_train, Y_1_pred))
print(classification_report(Y_train, Y_2_pred))

             precision    recall  f1-score   support

   BURGLARY       0.70      0.45      0.55       490
DRUG/NARCOTIC       0.82      0.30      0.43       332
VEHICLE THEFT       0.00      0.00      0.00         0

avg / total       0.75      0.39      0.50       822

             precision    recall  f1-score   support

   BURGLARY       0.69      0.90      0.78       490
DRUG/NARCOTIC       0.73      0.40      0.52       332

avg / total       0.71      0.70      0.67       822



  'recall', 'true', average, warn_for)


In [36]:
# run gridsearch using GridSearchCV and 5 folds
# score on f1_macro; what does this metric tell us?
logreg = LogisticRegression()
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gridsearch = GridSearchCV(estimator = logreg, param_grid={'penalty': penalties, 'C': C_vals}, cv = 5, scoring = 'f1_macro', verbose = True)
gridsearch.fit(X, Y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_macro',
       verbose=True)

In [37]:
# find the best parameter
print(gridsearch.best_score_)

0.610059200587


In [None]:
# use this parameter to .fit, .predict, and print a classification_report for our X and Y
