## Modelling

### Imports

In [88]:
import pandas as pd
import numpy as np
import pickle

from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, auc, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

### Load Data

In [None]:
df = pickle.load(open('complete_data_set.pkl', 'rb'))

In [2]:
# df[df['broad_category'] == 'PROPERTY CRIME']['description'].value_counts()

In [3]:
# theft_from_auto = ['GRAND THEFT FROM LOCKED AUTO', 'PETTY THEFT FROM LOCKED AUTO',
#                    'GRAND THEFT FROM UNLOCKED AUTO', 'PETTY THEFT FROM UNLOCKED AUTO',
#                    'ATTEMPTED THEFT FROM LOCKED VEHICLE']

# theft_of_property = ['PETTY THEFT OF PROPERTY', 'GRAND THEFT OF PROPERTY']

# shoplifting_pickpocket = ['PETTY THEFT SHOPLIFTING', 'GRAND THEFT SHOPLIFTING', 'GRAND THEFT PICKPOCKET']

# def split_theft_category(input_val):
#     if input_val in theft_from_auto:
#         return 'THEFT FROM AUTO'
#     elif input_val in theft_of_property:
#         return 'THEFT OF PROPERTY'
#     elif input_val in shoplifting_pickpocket:
#         return 'SHOPLIFTING/PICKPOCKET'
#     else:
#         return input_val

In [89]:
subset_categories = ['LARCENY/THEFT','ASSAULT','VANDALISM','VEHICLE THEFT','BURGLARY','DRUG/NARCOTIC','ROBBERY']
df = df[df['category'].isin(subset_categories)]

In [90]:
df['category'].value_counts()

LARCENY/THEFT    53809
ASSAULT          17563
VANDALISM        11533
VEHICLE THEFT     8346
BURGLARY          7641
DRUG/NARCOTIC     5402
ROBBERY           4289
Name: category, dtype: int64

In [91]:
df.loc[df['description'] == 'PETTY THEFT OF PROPERTY', 'category'] = 'THEFT OF PROPERTY'
df.loc[df['description'] == 'GRAND THEFT OF PROPERTY', 'category'] = 'THEFT OF PROPERTY'
df.loc[df['description'] == 'GRAND THEFT FROM LOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT FROM LOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'GRAND THEFT FROM UNLOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT FROM UNLOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'ATTEMPTED THEFT FROM LOCKED VEHICLE', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT SHOPLIFTING', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['description'] == 'GRAND THEFT SHOPLIFTING', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['description'] == 'GRAND THEFT PICKPOCKET', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['category'] == 'LARCENY/THEFT', 'category'] = 'OTHER THEFT'

In [92]:
df['category'].value_counts()

THEFT FROM AUTO           33501
ASSAULT                   17563
VANDALISM                 11533
THEFT OF PROPERTY          8414
VEHICLE THEFT              8346
OTHER THEFT                8114
BURGLARY                   7641
DRUG/NARCOTIC              5402
ROBBERY                    4289
SHOPLIFTING/PICKPOCKET     3780
Name: category, dtype: int64

In [93]:
df = df[df['category'] != 'BURGLARY']
df = df[df['category'] != 'OTHER THEFT']

In [94]:
df.columns

Index(['category', 'description', 'day_of_week', 'pd_district', 'resolution',
       'address', 'longitude', 'latitude', 'date_time', 'hour_of_day', 'month',
       'broad_category', 'geometry', 'id', 'pop10_sqmi', 'pop2010', 'sqmi',
       'zip_code', 'median_income', 'z_index', 'dist_to_train_station',
       'num_close_train_stations', 'dist_to_police_station',
       'num_close_police_stations', 'dist_to_dispensary',
       'num_close_dispensaries', 'dist_to_health_facility',
       'num_close_health_facilities', 'dist_to_shelter', 'num_close_shelters',
       'dist_to_union_sq'],
      dtype='object')

In [95]:
df['category'].value_counts()

THEFT FROM AUTO           33501
ASSAULT                   17563
VANDALISM                 11533
THEFT OF PROPERTY          8414
VEHICLE THEFT              8346
DRUG/NARCOTIC              5402
ROBBERY                    4289
SHOPLIFTING/PICKPOCKET     3780
Name: category, dtype: int64

In [96]:
data_set = df.drop(['longitude','latitude','geometry','id','pop2010','sqmi',
                    'address','resolution','description','broad_category'], axis=1)

In [97]:
data_set.columns

Index(['category', 'day_of_week', 'pd_district', 'date_time', 'hour_of_day',
       'month', 'pop10_sqmi', 'zip_code', 'median_income', 'z_index',
       'dist_to_train_station', 'num_close_train_stations',
       'dist_to_police_station', 'num_close_police_stations',
       'dist_to_dispensary', 'num_close_dispensaries',
       'dist_to_health_facility', 'num_close_health_facilities',
       'dist_to_shelter', 'num_close_shelters', 'dist_to_union_sq'],
      dtype='object')

In [98]:
data_set['z_index'].fillna(data_set['z_index'].mean(), inplace=True)
data_set['pop10_sqmi'] = data_set['pop10_sqmi'].apply(lambda x: float(x))
data_set['pop10_sqmi'].fillna(data_set['pop10_sqmi'].mean(), inplace=True)
data_set['median_income'].fillna(data_set['median_income'].median(), inplace=True)

data_set['z_index'] = preprocessing.scale(data_set['z_index'], with_mean=False)
data_set['pop10_sqmi'] = preprocessing.scale(data_set['pop10_sqmi'], with_mean=False)
data_set['median_income'] = preprocessing.scale(data_set['median_income'], with_mean=False)

In [99]:
def category_to_numeric(input_val):
    if input_val == 'ASSAULT':
        return 1
    elif input_val == 'SHOPLIFTING/PICKPOCKET':
        return 2
    elif input_val == 'THEFT FROM AUTO':
        return 3
    elif input_val == 'DRUG/NARCOTIC':
        return 4
    elif input_val == 'VANDALISM':
        return 5
    elif input_val == 'ROBBERY':
        return 6
    elif input_val == 'VEHICLE THEFT':
        return 7
    else:
        return 8

In [100]:
data_set['category'] = data_set['category'].apply(category_to_numeric)

In [101]:
data_set.sample(5)

Unnamed: 0,category,day_of_week,pd_district,date_time,hour_of_day,month,pop10_sqmi,zip_code,median_income,z_index,...,num_close_train_stations,dist_to_police_station,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,dist_to_health_facility,num_close_health_facilities,dist_to_shelter,num_close_shelters,dist_to_union_sq
92523,3,Tuesday,INGLESIDE,2016-02-09 18:15:00,18,Feb,1.597911,94112,3.266508,2.156278,...,0,0.013125,0,0.002726,2,0.019917,0,0.01507,0,0.080042
46889,5,Wednesday,NORTHERN,2016-05-25 17:20:00,17,May,3.807212,94102,1.266892,1.905441,...,1,0.008186,1,0.006608,5,0.002297,17,0.00155,27,0.015043
51719,5,Sunday,NORTHERN,2016-06-19 23:00:00,23,Jun,3.807212,94102,1.266892,1.905441,...,1,0.007932,1,0.004828,3,0.001209,17,0.000388,26,0.014487
115349,3,Saturday,TARAVAL,2016-01-23 12:00:00,12,Jan,1.597911,94112,3.266508,2.156278,...,1,0.008557,1,0.007577,1,0.013442,0,0.007783,1,0.082303
98804,5,Friday,TENDERLOIN,2016-06-03 07:30:00,7,Jun,3.807212,94102,1.266892,1.905441,...,2,0.003056,1,0.002131,4,0.001848,13,0.001218,40,0.005229


In [102]:
quick_model_data = data_set.drop(['date_time'], axis=1)

In [103]:
quick_model_data = pd.get_dummies(quick_model_data, columns=['day_of_week','pd_district',
                                                             'month','zip_code','hour_of_day'])

In [104]:
quick_model_data.columns

Index(['category', 'pop10_sqmi', 'median_income', 'z_index',
       'dist_to_train_station', 'num_close_train_stations',
       'dist_to_police_station', 'num_close_police_stations',
       'dist_to_dispensary', 'num_close_dispensaries',
       'dist_to_health_facility', 'num_close_health_facilities',
       'dist_to_shelter', 'num_close_shelters', 'dist_to_union_sq',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'pd_district_BAYVIEW', 'pd_district_CENTRAL',
       'pd_district_INGLESIDE', 'pd_district_MISSION', 'pd_district_NORTHERN',
       'pd_district_PARK', 'pd_district_RICHMOND', 'pd_district_SOUTHERN',
       'pd_district_TARAVAL', 'pd_district_TENDERLOIN', 'month_Apr',
       'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul',
       'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct',
       'month_Sep', 'zip_code_941

In [20]:
# model = GradientBoostingClassifier()

In [21]:
# y = quick_model_data['category']
# X = quick_model_data.drop('category', axis=1)

In [22]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)

In [23]:
# model.fit(X_train, y_train)

In [24]:
# y_pred = model.predict(X_test)

In [25]:
# score = accuracy_score(y_test, y_pred)

In [26]:
# y_categories = list(y.unique())
# target_names = [y for y in y_categories]
# print(classification_report(y_test, y_pred, target_names=target_names))

In [105]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def train_score(classifier,x,y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4444)
    y_train=np.ravel(y_train)
    clf = classifier.fit(X_train, y_train)
    
    # accuracy for test & train:
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    
    print("Training Data Accuracy: %0.2f" %(train_acc))
    print("Test Data Accuracy:     %0.2f" %(test_acc))
    print ('\n')
    
    y_true = y_test
    y_pred = clf.predict(X_test)

    conf = confusion_matrix(y_true, y_pred)
    print(conf)

    print ('\n')
    print ("Precision:              %0.2f" %(conf[0, 0] / (conf[0, 0] + conf[1, 0])))
    print ("Recall:                 %0.2f"% (conf[0, 0] / (conf[0, 0] + conf[0, 1])))
    
    cm=confusion_matrix(y_true, y_pred, labels=None)
    
    plt.figure()
    plot_confusion_matrix(cm)

In [28]:
# train_score(model, X, y)

In [106]:
def plot_roc_curve(input_y_true, input_y_score, title):
    fpr, tpr,_ = roc_curve(input_y_true, input_y_score)
    roc_auc = auc(fpr, tpr)

    # build a rectangle in axes coords
    left, width = 0.05, 0.90
    bottom, height = 0.05, 0.90
    right = left + width
    top = bottom + height

    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    p = patches.Rectangle((left, bottom), width, height, fill=False, transform=ax.transAxes, clip_on=False)
    auc_string = 'AUC: {:.4f}'.format(roc_auc)
    ax.text(right, bottom, auc_string, horizontalalignment='right', verticalalignment='bottom', transform=ax.transAxes, size=14)
    ax.add_patch(p)

    # Plotting our Baseline..
    plt.title(title)
    plt.plot([0,1],[0,1], alpha=0.5, linewidth=3.0)
    plt.plot(fpr, tpr, alpha=0.5, linewidth=3.0)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    
    return plt

In [30]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
# model_2 = GradientBoostingClassifier()
# model_2.fit(X_train, y_train)
# model_y_score = model_2.predict_proba(X_test)[:,1]
# plot_roc_curve(y_test, model_y_score, 'ROC curve')

In [31]:
# y_test = label_binarize(np.ravel(y_test), classes)
# y_test

In [32]:
# 'ASSAULT':1
# 'SHOPLIFTING/PICKPOCKET':2
# 'THEFT FROM AUTO':3
# 'DRUG/NARCOTIC':4
# 'VANDALISM':5
# 'ROBBERY':6
# 'VEHICLE THEFT':7
# 'THEFT OF PROPERTY':8

In [33]:
# len(y)

In [34]:
# y_categories = [1, 2, 3, 4, 5, 6, 7, 8]
# y_pred = model.predict(X_test)
# y_test = np.ravel(y_test)
# # target_names = [y for y in y_categories]
# print(classification_report(y_test, y_pred, target_names=y_categories))


In [36]:
# knn = KNeighborsClassifier(n_neighbors=300)
# scores = cross_val_score(knn, X, y, cv=3, scoring='accuracy')
# print(scores)

In [38]:
# # k_range = list(range(1, 31))
# k_range = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750]
# weight_options = ['uniform', 'distance']
# # param_grid = dict(n_neighbors=k_range, weights=weight_options)
# param_dist = dict(n_neighbors=k_range, weights=weight_options)

In [39]:
# param_dist

{'n_neighbors': [50,
  100,
  150,
  200,
  250,
  300,
  350,
  400,
  450,
  500,
  550,
  600,
  650,
  700,
  750],
 'weights': ['uniform', 'distance']}

In [40]:
# knn = KNeighborsClassifier()

In [41]:
# # model_sample = quick_model_data.sample(30000)
# y = model_sample['category']
# X = model_sample.drop('category', axis=1)

In [42]:
# # n_iter controls the number of searches
# rand = RandomizedSearchCV(knn, param_dist, cv=5, scoring='accuracy', n_iter=10, random_state=42)
# rand.fit(X, y)
# rand.grid_scores_

In [44]:
# # view the complete results
# grid.grid_scores_

In [45]:
# # examine the best model
# print(rand.best_score_)
# print(rand.best_params_)

In [46]:
# quick_model_data['category'].value_counts()

In [109]:
cat_1 = quick_model_data[quick_model_data['category'] == 1].sample(2000)
cat_2 = quick_model_data[quick_model_data['category'] == 2].sample(2000)
cat_3 = quick_model_data[quick_model_data['category'] == 3].sample(2000)
cat_4 = quick_model_data[quick_model_data['category'] == 4].sample(2000)
cat_5 = quick_model_data[quick_model_data['category'] == 5].sample(2000)
cat_6 = quick_model_data[quick_model_data['category'] == 6].sample(2000)
cat_7 = quick_model_data[quick_model_data['category'] == 7].sample(2000)
cat_8 = quick_model_data[quick_model_data['category'] == 8].sample(2000)

In [110]:
balanced_sample = pd.concat([cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_7, cat_8])

In [111]:
y = balanced_sample['category']
X = balanced_sample.drop('category', axis=1)

In [50]:
# # n_iter controls the number of searches
# knn = KNeighborsClassifier()
# rand = RandomizedSearchCV(knn, param_dist, cv=5, scoring='accuracy', n_iter=10, random_state=42)
# rand.fit(X, y)
# rand.grid_scores_



[mean: 0.28369, std: 0.00726, params: {'weights': 'distance', 'n_neighbors': 700},
 mean: 0.29175, std: 0.00906, params: {'weights': 'distance', 'n_neighbors': 400},
 mean: 0.28425, std: 0.00824, params: {'weights': 'distance', 'n_neighbors': 600},
 mean: 0.28825, std: 0.00805, params: {'weights': 'distance', 'n_neighbors': 450},
 mean: 0.27481, std: 0.00878, params: {'weights': 'uniform', 'n_neighbors': 250},
 mean: 0.29456, std: 0.00706, params: {'weights': 'distance', 'n_neighbors': 250},
 mean: 0.24419, std: 0.00793, params: {'weights': 'uniform', 'n_neighbors': 750},
 mean: 0.25062, std: 0.00764, params: {'weights': 'uniform', 'n_neighbors': 650},
 mean: 0.27219, std: 0.01084, params: {'weights': 'uniform', 'n_neighbors': 350},
 mean: 0.29369, std: 0.00511, params: {'weights': 'uniform', 'n_neighbors': 50}]

In [51]:
# rand.cv_results_

{'mean_fit_time': array([ 0.0672524 ,  0.06082511,  0.06594925,  0.05990233,  0.06035933,
         0.06008029,  0.06054153,  0.06154995,  0.06054912,  0.06132159]),
 'mean_score_time': array([ 2.94600549,  2.26143312,  2.78876138,  2.35199952,  1.75545793,
         1.79904385,  2.85063229,  2.6545836 ,  2.02700744,  1.05474501]),
 'mean_test_score': array([ 0.2836875,  0.29175  ,  0.28425  ,  0.28825  ,  0.2748125,
         0.2945625,  0.2441875,  0.250625 ,  0.2721875,  0.2936875]),
 'mean_train_score': array([ 0.98860938,  0.98860938,  0.98860938,  0.98860938,  0.28523438,
         0.98860938,  0.24892188,  0.2540625 ,  0.27820312,  0.33551563]),
 'param_n_neighbors': masked_array(data = [700 400 600 450 250 250 750 650 350 50],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_weights': masked_array(data = ['distance' 'distance' 'distance' 'distance' 'uniform' 'distance' 'uniform'
  'uniform' 'uniform' 'uniform'],
  

In [None]:
# # view the complete results
# grid.grid_scores_

In [52]:
# # examine the best model
# print(rand.best_score_)
# print(rand.best_params_)

0.2945625
{'weights': 'distance', 'n_neighbors': 250}


In [57]:
# best_knn = KNeighborsClassifier(n_neighbors=250, weights='distance')

In [58]:
# y = quick_model_data['category']
# X = quick_model_data.drop('category', axis=1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)

In [59]:
# best_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=250, p=2,
           weights='distance')

In [62]:
# best_knn_y_pred = best_knn.predict(X_test)

In [66]:
# y_categories = list(y.unique())
# target_names = [y for y in y_categories]
print(classification_report(y_test, best_knn_y_pred))

             precision    recall  f1-score   support

          1       0.37      0.51      0.43      5198
          2       0.41      0.37      0.39      1129
          3       0.50      0.81      0.62     10110
          4       0.55      0.50      0.53      1571
          5       0.21      0.04      0.06      3481
          6       0.13      0.03      0.05      1275
          7       0.57      0.11      0.18      2519
          8       0.31      0.07      0.11      2566

avg / total       0.41      0.45      0.39     27849



In [67]:
# 'ASSAULT':1
# 'SHOPLIFTING/PICKPOCKET':2
# 'THEFT FROM AUTO':3
# 'DRUG/NARCOTIC':4
# 'VANDALISM':5
# 'ROBBERY':6
# 'VEHICLE THEFT':7
# 'THEFT OF PROPERTY':8

In [80]:
# grad = GradientBoostingClassifier(learning_rate=0.2, n_estimators=200, max_depth=4)

# # k_range = list(range(1, 31))
# # learning_rates = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
# # n_estimators = [50, 75, 100, 125, 150, 175, 200, 225, 250, 300, 325, 350]
# # max_depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# # param_grid = dict(n_neighbors=k_range, weights=weight_options)
# # param_dist = dict(learning_rate=0.15, n_estimators=150, max_depth=3)

# y = balanced_sample['category']
# X = balanced_sample.drop('category', axis=1)

# # n_iter controls the number of searches
# # rand = RandomizedSearchCV(grad, param_dist, cv=3, scoring='accuracy', n_iter=3, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
# # rand.grid_scores_

In [81]:
# # examine the best model
# print(rand.best_score_)
# print(rand.best_params_)

In [82]:
# grad.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [83]:
# grad_y_pred = grad.predict(X_test)

In [84]:
# score = accuracy_score(y_test, grad_y_pred)

In [85]:
# score

0.33916666666666667

In [86]:
# print(classification_report(y_test, grad_y_pred))

             precision    recall  f1-score   support

          1       0.24      0.24      0.24       594
          2       0.57      0.64      0.61       577
          3       0.32      0.34      0.33       609
          4       0.48      0.53      0.51       581
          5       0.21      0.20      0.20       582
          6       0.23      0.18      0.20       641
          7       0.31      0.34      0.32       622
          8       0.29      0.27      0.28       594

avg / total       0.33      0.34      0.33      4800



In [112]:
random_forest = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [118]:
# grad = GradientBoostingClassifier(learning_rate=0.2, n_estimators=200, max_depth=4)

# k_range = list(range(1, 31))
# learning_rates = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
n_estimators = [2, 5, 10, 20, 40, 50, 75, 100, 125, 150, 175, 200, 225, 250, 300, 325, 350]
max_depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depths)
# param_dist = dict(learning_rate=0.15, n_estimators=150, max_depth=3)

y = balanced_sample['category']
X = balanced_sample.drop('category', axis=1)

# n_iter controls the number of searches
rand = RandomizedSearchCV(random_forest, param_grid, cv=3)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
# rand.grid_scores_

In [119]:
rand.fit(X, y)
# rand.grid_scores_

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [2, 5, 10, 20, 40, 50, 75, 100, 125, 150, 175, 200, 225, 250, 300, 325, 350], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [120]:
rand.grid_scores_



[mean: 0.30688, std: 0.00338, params: {'n_estimators': 150, 'max_depth': 8},
 mean: 0.28650, std: 0.00494, params: {'n_estimators': 5, 'max_depth': 7},
 mean: 0.28181, std: 0.00034, params: {'n_estimators': 250, 'max_depth': 5},
 mean: 0.30294, std: 0.00857, params: {'n_estimators': 10, 'max_depth': 9},
 mean: 0.32056, std: 0.00550, params: {'n_estimators': 75, 'max_depth': 10},
 mean: 0.24538, std: 0.00509, params: {'n_estimators': 225, 'max_depth': 1},
 mean: 0.26894, std: 0.00483, params: {'n_estimators': 250, 'max_depth': 3},
 mean: 0.30144, std: 0.00395, params: {'n_estimators': 250, 'max_depth': 7},
 mean: 0.29037, std: 0.00184, params: {'n_estimators': 200, 'max_depth': 6},
 mean: 0.30012, std: 0.00239, params: {'n_estimators': 200, 'max_depth': 7}]

In [122]:
print(rand.best_score_)
print(rand.best_params_)

0.3205625
{'n_estimators': 75, 'max_depth': 10}


In [123]:
new_rand = RandomForestClassifier()

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)

In [126]:
new_rand.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [127]:
new_rand.feature_importances_

array([ 0.01866316,  0.02517072,  0.01950668,  0.07859484,  0.03590709,
        0.06252765,  0.00927506,  0.05750702,  0.02129084,  0.07512296,
        0.04746401,  0.05786216,  0.0646771 ,  0.12358718,  0.00734726,
        0.00701218,  0.00717028,  0.00844251,  0.00700827,  0.00664661,
        0.00690042,  0.00361129,  0.00665685,  0.00459715,  0.00445058,
        0.00438447,  0.00091681,  0.00283098,  0.00682869,  0.00133169,
        0.01197392,  0.00527692,  0.00468774,  0.00483366,  0.00627543,
        0.00619414,  0.00565532,  0.00574805,  0.00634846,  0.00431424,
        0.00495701,  0.0055619 ,  0.00426451,  0.00367058,  0.00575331,
        0.00053948,  0.00115172,  0.00119234,  0.00829345,  0.0018084 ,
        0.00350141,  0.00069436,  0.00295966,  0.00066936,  0.00069639,
        0.00018056,  0.00122855,  0.00063233,  0.0001387 ,  0.0007495 ,
        0.00083542,  0.00315616,  0.00042054,  0.00012438,  0.0005815 ,
        0.00358696,  0.00196056,  0.00115016,  0.00013218,  0.  

In [128]:
importances = new_rand.feature_importances_

In [129]:
std = np.std([tree.feature_importances_ for tree in new_rand.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 13 (0.123587)
2. feature 3 (0.078595)
3. feature 9 (0.075123)
4. feature 12 (0.064677)
5. feature 5 (0.062528)
6. feature 11 (0.057862)
7. feature 7 (0.057507)
8. feature 10 (0.047464)
9. feature 4 (0.035907)
10. feature 1 (0.025171)
11. feature 8 (0.021291)
12. feature 2 (0.019507)
13. feature 0 (0.018663)
14. feature 30 (0.011974)
15. feature 6 (0.009275)
16. feature 17 (0.008443)
17. feature 48 (0.008293)
18. feature 14 (0.007347)
19. feature 16 (0.007170)
20. feature 15 (0.007012)
21. feature 18 (0.007008)
22. feature 20 (0.006900)
23. feature 28 (0.006829)
24. feature 22 (0.006657)
25. feature 19 (0.006647)
26. feature 38 (0.006348)
27. feature 34 (0.006275)
28. feature 35 (0.006194)
29. feature 90 (0.006113)
30. feature 92 (0.005902)
31. feature 44 (0.005753)
32. feature 37 (0.005748)
33. feature 91 (0.005703)
34. feature 36 (0.005655)
35. feature 41 (0.005562)
36. feature 88 (0.005535)
37. feature 87 (0.005380)
38. feature 93 (0.005307)
39. feature 31

In [130]:
X_train.columns

Index(['pop10_sqmi', 'median_income', 'z_index', 'dist_to_train_station',
       'num_close_train_stations', 'dist_to_police_station',
       'num_close_police_stations', 'dist_to_dispensary',
       'num_close_dispensaries', 'dist_to_health_facility',
       'num_close_health_facilities', 'dist_to_shelter', 'num_close_shelters',
       'dist_to_union_sq', 'day_of_week_Friday', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday', 'pd_district_BAYVIEW',
       'pd_district_CENTRAL', 'pd_district_INGLESIDE', 'pd_district_MISSION',
       'pd_district_NORTHERN', 'pd_district_PARK', 'pd_district_RICHMOND',
       'pd_district_SOUTHERN', 'pd_district_TARAVAL', 'pd_district_TENDERLOIN',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'zip_code_94102', 'zip_code_9410

In [131]:
new_log = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
new_log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [132]:
log_y_pred = new_log.predict(X_test)

In [133]:
score = accuracy_score(y_test, log_y_pred)
score

0.28749999999999998