# Modelling the data

## Imports

In [173]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import validation_curve, cross_val_score, train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, auc, confusion_matrix, f1_score, roc_curve, recall_score, precision_score
import os

## Load Data

In [67]:
df = pd.read_csv('complete_data_set_inc_elevation.csv')

## Filter data

In [68]:
subset_categories = ['LARCENY/THEFT','ASSAULT','VANDALISM','VEHICLE THEFT','BURGLARY','DRUG/NARCOTIC','ROBBERY']
df = df[df['category'].isin(subset_categories)]

In [69]:
len(df)

108583

In [70]:
df['category'].value_counts()

LARCENY/THEFT    53809
ASSAULT          17563
VANDALISM        11533
VEHICLE THEFT     8346
BURGLARY          7641
DRUG/NARCOTIC     5402
ROBBERY           4289
Name: category, dtype: int64

In [71]:
df.loc[df['description'] == 'PETTY THEFT OF PROPERTY', 'category'] = 'THEFT OF PROPERTY'
df.loc[df['description'] == 'GRAND THEFT OF PROPERTY', 'category'] = 'THEFT OF PROPERTY'
df.loc[df['description'] == 'GRAND THEFT FROM LOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT FROM LOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'GRAND THEFT FROM UNLOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT FROM UNLOCKED AUTO', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'ATTEMPTED THEFT FROM LOCKED VEHICLE', 'category'] = 'THEFT FROM AUTO'
df.loc[df['description'] == 'PETTY THEFT SHOPLIFTING', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['description'] == 'GRAND THEFT SHOPLIFTING', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['description'] == 'GRAND THEFT PICKPOCKET', 'category'] = 'SHOPLIFTING/PICKPOCKET'
df.loc[df['category'] == 'LARCENY/THEFT', 'category'] = 'OTHER THEFT'

In [72]:
df['category'].value_counts()

THEFT FROM AUTO           33501
ASSAULT                   17563
VANDALISM                 11533
THEFT OF PROPERTY          8414
VEHICLE THEFT              8346
OTHER THEFT                8114
BURGLARY                   7641
DRUG/NARCOTIC              5402
ROBBERY                    4289
SHOPLIFTING/PICKPOCKET     3780
Name: category, dtype: int64

In [73]:
df = df[df['category'] != 'BURGLARY']
df = df[df['category'] != 'OTHER THEFT']

In [74]:
df['category'].value_counts()

THEFT FROM AUTO           33501
ASSAULT                   17563
VANDALISM                 11533
THEFT OF PROPERTY          8414
VEHICLE THEFT              8346
DRUG/NARCOTIC              5402
ROBBERY                    4289
SHOPLIFTING/PICKPOCKET     3780
Name: category, dtype: int64

## Fill N/A values and Scale

In [75]:
df['z_index'].fillna(df['z_index'].mean(), inplace=True)
df['pop10_sqmi'] = df['pop10_sqmi'].apply(lambda x: float(x))
df['pop10_sqmi'].fillna(df['pop10_sqmi'].mean(), inplace=True)
df['median_income'].fillna(df['median_income'].median(), inplace=True)

df['z_index'] = preprocessing.scale(df['z_index'])
df['pop10_sqmi'] = preprocessing.scale(df['pop10_sqmi'])
df['median_income'] = preprocessing.scale(df['median_income'])
df['altitude'] = preprocessing.scale(df['altitude'])

In [76]:
len(df)

92828

In [77]:
df.sample(5)

Unnamed: 0,category,description,day_of_week,pd_district,resolution,address,longitude,latitude,date_time,hour_of_day,...,dist_to_police_station,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,dist_to_health_facility,num_close_health_facilities,dist_to_shelter,num_close_shelters,dist_to_union_sq,altitude
103418,THEFT OF PROPERTY,GRAND THEFT OF PROPERTY,Thursday,TENDERLOIN,NONE,POWELL ST / OFARRELL ST,-122.40804,37.786409,2016-08-18 18:00:00,18,...,0.005576,1,0.004357,4,0.00295,11,0.001201,37,0.001733,-0.542875
123637,ASSAULT,BATTERY,Tuesday,INGLESIDE,NONE,2700 Block of DIAMOND ST,-122.43385,37.734786,2016-11-15 15:06:00,15,...,0.015972,0,0.005949,1,0.008065,2,0.010814,0,0.059431,0.635757
74557,ASSAULT,ATTEMPTED SIMPLE ASSAULT,Wednesday,MISSION,NONE,1800 Block of MISSION ST,-122.419971,37.767454,2016-10-12 15:00:00,15,...,0.005034,1,0.001397,8,0.00203,8,0.000712,21,0.024082,-0.792611
63548,ROBBERY,"SHOPLIFTING, FORCE AGAINST AGENT",Sunday,NORTHERN,NONE,1200 Block of SUTTER ST,-122.420942,37.787596,2016-08-21 09:00:00,9,...,0.008953,1,0.001809,1,0.001853,12,0.00171,14,0.01345,0.43475
18705,ASSAULT,BATTERY,Friday,INGLESIDE,NONE,3300 Block of MISSION ST,-122.421645,37.742997,2017-04-14 13:25:00,13,...,0.019855,0,0.000827,2,0.003473,8,0.00237,3,0.047227,-0.009902


## Change categories to numeric values

In [78]:
def category_to_numeric(input_val):
    if input_val == 'ASSAULT':
        return 1
    elif input_val == 'SHOPLIFTING/PICKPOCKET':
        return 2
    elif input_val == 'THEFT FROM AUTO':
        return 3
    elif input_val == 'DRUG/NARCOTIC':
        return 4
    elif input_val == 'VANDALISM':
        return 5
    elif input_val == 'ROBBERY':
        return 6
    elif input_val == 'VEHICLE THEFT':
        return 7
    else:
        return 8

In [79]:
df['category'] = df['category'].apply(category_to_numeric)

In [80]:
df.sample(6)

Unnamed: 0,category,description,day_of_week,pd_district,resolution,address,longitude,latitude,date_time,hour_of_day,...,dist_to_police_station,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,dist_to_health_facility,num_close_health_facilities,dist_to_shelter,num_close_shelters,dist_to_union_sq,altitude
27308,3,GRAND THEFT FROM LOCKED AUTO,Tuesday,MISSION,NONE,HARRISON ST / 24TH ST,-122.411902,37.752628,2017-04-11 17:12:00,17,...,0.014371,0,0.008379,3,0.005412,10,0.005157,1,0.0357,-0.558103
25334,3,GRAND THEFT FROM LOCKED AUTO,Thursday,PARK,NONE,600 Block of PANORAMA DR,-122.451898,37.753848,2017-03-09 19:00:00,19,...,0.014354,0,0.025934,0,0.00814,2,0.013514,0,0.056048,5.009184
107318,3,GRAND THEFT FROM LOCKED AUTO,Sunday,RICHMOND,NONE,4500 Block of CABRILLO ST,-122.507369,37.773437,2016-10-16 22:00:00,22,...,0.039389,0,0.0369,0,0.010027,0,0.022474,0,0.100933,-0.570285
30892,2,PETTY THEFT SHOPLIFTING,Friday,BAYVIEW,"ARREST, BOOKED",400 Block of RHODEISLAND ST,-122.402571,37.764126,2016-02-12 12:16:00,12,...,0.015534,0,0.015927,0,0.008286,5,0.005373,3,0.024432,-0.667743
103089,8,PETTY THEFT OF PROPERTY,Friday,SOUTHERN,NONE,MARKET ST / SPEAR ST,-122.39563,37.79379,2016-07-29 17:00:00,17,...,0.015119,0,0.006099,2,0.009463,1,0.003558,6,0.013182,-0.954026
44351,2,PETTY THEFT SHOPLIFTING,Saturday,SOUTHERN,"ARREST, BOOKED",800 Block of MARKET ST,-122.406521,37.785063,2016-05-14 15:11:00,15,...,0.006526,1,0.003291,4,0.004467,9,0.000408,39,0.003149,-0.698199


## Group hour of day into 3 categories

In [81]:
def group_times(val):
    d = {
        0:'night',
        1:'night',
        2:'night',
        3:'night',
        4:'night',
        5:'night',
        6:'night',
        7:'night',
        8:'day',
        9:'day',
        10:'day',
        11:'day',
        12:'day',
        13:'day',
        14:'day',
        15:'day',
        16:'evening',
        17:'evening',
        18:'evening',
        19:'evening',
        20:'evening',
        21:'evening',
        22:'evening',
        23:'night'
    }
    
    return d[val]

In [82]:
df['time_of_day'] = df['hour_of_day'].apply(group_times)
df.drop('hour_of_day', axis=1, inplace=True)

In [83]:
df.sample(6)

Unnamed: 0,category,description,day_of_week,pd_district,resolution,address,longitude,latitude,date_time,month,...,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,dist_to_health_facility,num_close_health_facilities,dist_to_shelter,num_close_shelters,dist_to_union_sq,altitude,time_of_day
102780,3,GRAND THEFT FROM UNLOCKED AUTO,Sunday,TENDERLOIN,NONE,200 Block of GOLDEN GATE AV,-122.414263,37.781757,2016-08-07 12:30:00,Aug,...,1,0.004796,5,0.000967,20,0.000781,44,0.009242,-0.405825,day
1795,2,PETTY THEFT SHOPLIFTING,Saturday,SOUTHERN,"ARREST, BOOKED",800 Block of MARKET ST,-122.406521,37.785063,2017-01-14 15:56:00,Jan,...,1,0.003291,4,0.004467,9,0.000408,39,0.003149,-0.698199,day
100203,3,GRAND THEFT FROM LOCKED AUTO,Thursday,RICHMOND,NONE,600 Block of POINTLOBOS AV,-122.510525,37.77984,2016-06-30 18:30:00,Jun,...,0,0.039397,0,0.013541,0,0.024416,0,0.103352,0.638802,evening
95097,3,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,2016-03-28 10:30:00,Mar,...,0,0.006862,3,0.003678,3,0.00095,28,0.013282,-0.920525,day
123906,3,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,6TH ST / MISSION ST,-122.408711,37.780971,2016-03-07 12:10:00,Mar,...,1,0.001059,5,0.002007,11,0.000607,41,0.007188,-0.667743,day
9091,3,PETTY THEFT FROM LOCKED AUTO,Wednesday,SOUTHERN,NONE,500 Block of 9TH ST,-122.407702,37.770697,2017-02-22 12:30:00,Feb,...,0,0.008911,3,0.00539,4,0.002082,23,0.01736,-0.893115,day


In [84]:
df.columns

Index(['category', 'description', 'day_of_week', 'pd_district', 'resolution',
       'address', 'longitude', 'latitude', 'date_time', 'month',
       'broad_category', 'geometry', 'id', 'pop10_sqmi', 'pop2010', 'sqmi',
       'zip_code', 'median_income', 'z_index', 'dist_to_train_station',
       'num_close_train_stations', 'dist_to_police_station',
       'num_close_police_stations', 'dist_to_dispensary',
       'num_close_dispensaries', 'dist_to_health_facility',
       'num_close_health_facilities', 'dist_to_shelter', 'num_close_shelters',
       'dist_to_union_sq', 'altitude', 'time_of_day'],
      dtype='object')

## Convert categorical data into dummies

In [85]:
model_df = df.drop(['description','resolution','address','longitude','latitude','broad_category',
                    'geometry','id','date_time','pop2010','sqmi'], axis=1)

In [86]:
model_df.shape

(92828, 21)

In [87]:
model_df.columns

Index(['category', 'day_of_week', 'pd_district', 'month', 'pop10_sqmi',
       'zip_code', 'median_income', 'z_index', 'dist_to_train_station',
       'num_close_train_stations', 'dist_to_police_station',
       'num_close_police_stations', 'dist_to_dispensary',
       'num_close_dispensaries', 'dist_to_health_facility',
       'num_close_health_facilities', 'dist_to_shelter', 'num_close_shelters',
       'dist_to_union_sq', 'altitude', 'time_of_day'],
      dtype='object')

In [88]:
prepared_df = pd.get_dummies(model_df, columns=['day_of_week','pd_district','month','zip_code','time_of_day'])

In [89]:
prepared_df.shape

(92828, 74)

In [90]:
prepared_df.columns

Index(['category', 'pop10_sqmi', 'median_income', 'z_index',
       'dist_to_train_station', 'num_close_train_stations',
       'dist_to_police_station', 'num_close_police_stations',
       'dist_to_dispensary', 'num_close_dispensaries',
       'dist_to_health_facility', 'num_close_health_facilities',
       'dist_to_shelter', 'num_close_shelters', 'dist_to_union_sq', 'altitude',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'pd_district_BAYVIEW', 'pd_district_CENTRAL',
       'pd_district_INGLESIDE', 'pd_district_MISSION', 'pd_district_NORTHERN',
       'pd_district_PARK', 'pd_district_RICHMOND', 'pd_district_SOUTHERN',
       'pd_district_TARAVAL', 'pd_district_TENDERLOIN', 'month_Apr',
       'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul',
       'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct',
       'month_Sep', '

In [92]:
prepared_df.sample(5)

Unnamed: 0,category,pop10_sqmi,median_income,z_index,dist_to_train_station,num_close_train_stations,dist_to_police_station,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,...,zip_code_94127.0,zip_code_94130.0,zip_code_94131.0,zip_code_94132.0,zip_code_94133.0,zip_code_94134.0,zip_code_94158.0,time_of_day_day,time_of_day_evening,time_of_day_night
2481,3,2.03517,-1.4966,-1.059951,0.008401,1,0.009574,1,0.005655,4,...,0,0,0,0,0,0,0,0,1,0
124217,3,-1.361795,0.354,-0.464602,0.027117,0,0.028051,0,0.01417,0,...,0,0,0,1,0,0,0,0,1,0
58435,1,2.03517,-1.4966,-1.059951,0.002599,2,0.001755,1,0.005372,5,...,0,0,0,0,0,0,0,1,0,0
32999,1,-0.389682,1.528841,0.942278,0.019625,0,0.01695,0,0.012212,0,...,0,0,0,0,0,0,0,1,0,0
3702,1,-0.317831,-0.998935,-0.492623,0.00247,2,0.006218,1,0.001174,5,...,0,0,0,0,0,0,0,1,0,0


## Create X and y

In [93]:
X = prepared_df.drop('category', axis=1)

In [94]:
X.head()

Unnamed: 0,pop10_sqmi,median_income,z_index,dist_to_train_station,num_close_train_stations,dist_to_police_station,num_close_police_stations,dist_to_dispensary,num_close_dispensaries,dist_to_health_facility,...,zip_code_94127.0,zip_code_94130.0,zip_code_94131.0,zip_code_94132.0,zip_code_94133.0,zip_code_94134.0,zip_code_94158.0,time_of_day_day,time_of_day_evening,time_of_day_night
0,-1.361795,0.354,-0.464602,0.028985,0,0.016293,0,0.01236,0,0.020727,...,0,0,0,1,0,0,0,0,1,0
1,-1.361795,0.354,-0.464602,0.028985,0,0.016293,0,0.01236,0,0.020727,...,0,0,0,1,0,0,0,0,1,0
2,2.03517,-1.4966,-1.059951,0.003517,1,0.006849,1,0.003571,6,0.001241,...,0,0,0,0,0,0,0,0,1,0
3,0.010387,0.285698,-0.000561,0.004819,1,0.010118,0,0.00532,4,0.00503,...,0,0,0,0,0,0,0,1,0,0
4,-1.361795,0.354,-0.464602,0.028985,0,0.016293,0,0.01236,0,0.020727,...,0,0,0,1,0,0,0,1,0,0


In [95]:
y = prepared_df['category']

In [96]:
y.unique()

array([1, 2, 3, 4, 5, 6, 7, 8])

## Feature Selection

In [98]:
X.columns

Index(['pop10_sqmi', 'median_income', 'z_index', 'dist_to_train_station',
       'num_close_train_stations', 'dist_to_police_station',
       'num_close_police_stations', 'dist_to_dispensary',
       'num_close_dispensaries', 'dist_to_health_facility',
       'num_close_health_facilities', 'dist_to_shelter', 'num_close_shelters',
       'dist_to_union_sq', 'altitude', 'day_of_week_Friday',
       'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
       'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
       'pd_district_BAYVIEW', 'pd_district_CENTRAL', 'pd_district_INGLESIDE',
       'pd_district_MISSION', 'pd_district_NORTHERN', 'pd_district_PARK',
       'pd_district_RICHMOND', 'pd_district_SOUTHERN', 'pd_district_TARAVAL',
       'pd_district_TENDERLOIN', 'month_Apr', 'month_Aug', 'month_Dec',
       'month_Feb', 'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar',
       'month_May', 'month_Nov', 'month_Oct', 'month_Sep', 'zip_code_94102.0',


In [99]:
print(X.shape)
print(y.shape)

(92828, 73)
(92828,)


### Examine feature importances using a Random Forest Classifier

In [100]:
rf = RandomForestClassifier()
rf = rf.fit(X, y)

In [103]:
features = list(zip(X.columns, rf.feature_importances_))

In [107]:
features = sorted(features, key=lambda x: x[1], reverse=True)

In [110]:
model = SelectFromModel(rf, prefit=True)

In [111]:
X_new = model.transform(X)

In [113]:
X.shape

(92828, 73)

In [112]:
X_new.shape

(92828, 31)

In [119]:
features[0:30]

[('dist_to_union_sq', 0.068550792358501839),
 ('dist_to_train_station', 0.067317886398206211),
 ('dist_to_health_facility', 0.062054760604308944),
 ('dist_to_police_station', 0.061502079963091663),
 ('dist_to_dispensary', 0.06130159206604343),
 ('dist_to_shelter', 0.060512061761589589),
 ('altitude', 0.060400694610814799),
 ('num_close_shelters', 0.02619455150178094),
 ('day_of_week_Friday', 0.025161380105772947),
 ('day_of_week_Saturday', 0.024452254478526354),
 ('day_of_week_Wednesday', 0.024108333921286449),
 ('month_Jan', 0.024013717226003428),
 ('month_Mar', 0.023873331122724335),
 ('day_of_week_Sunday', 0.023716785219819465),
 ('month_Feb', 0.023481381022028715),
 ('day_of_week_Monday', 0.02313735948502749),
 ('day_of_week_Tuesday', 0.023023950178647425),
 ('day_of_week_Thursday', 0.022813445539399298),
 ('num_close_health_facilities', 0.021457658164269009),
 ('month_Apr', 0.020778375904478771),
 ('time_of_day_evening', 0.019432424030054445),
 ('time_of_day_day', 0.01933762460235

## Modelling

### Models, Parameters, Grid search

In [145]:
models_dict = {}

models_dict['gradient_boost'] = GradientBoostingClassifier()
models_dict['random_forest'] = RandomForestClassifier()
models_dict['logistic'] = LogisticRegression()
models_dict['knn'] = KNeighborsClassifier()
models_dict['bagging'] = BaggingClassifier()
models_dict['adaboost'] = AdaBoostClassifier()
models_dict['extra_trees'] = ExtraTreesClassifier()

param_dict = {}

param_dict['gradient_boost'] = {
    'model__n_estimators': [10, 100, 300],
    'model__max_depth': [2, 3, 4, 5, 6],
    'model__max_features': ['sqrt', 'auto'],
    'model__min_samples_split': [5, 10, 20, 40]
}

param_dict['random_forest'] = {
    'model__n_estimators': [10, 100, 300],
    'model__max_depth': [2, 3, 4, 5, 6],
    'model__max_features': ['sqrt', 'auto'],
    'model__min_samples_split': [5, 10, 20, 40]
}

param_dict['bagging'] = {
    'model__n_estimators': [10, 100, 300]
}

param_dict['adaboost'] = {
    'model__n_estimators': [10, 100, 300]
}

param_dict['extra_trees'] = {
    'model__n_estimators': [10, 100, 300],
    'model__max_depth': [2, 3, 4, 5, 6],
    'model__max_features': ['sqrt', 'auto'],
    'model__min_samples_split': [5, 10, 20, 40]
}

param_dict['logistic'] = {
    'model__penalty': ['l1', 'l2']
}

param_dict['knn'] = {
    'model__n_neighbors': [5, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'model__weights': ['uniform', 'distance']
}

In [146]:
def run_pipelines(model_dict, X, y, feature_list, param_dict):
    '''
    Runs through a pipeline for each type of model.
    feature_list = list of tuples.
    param_dict = a nested dictionary that contains the hyper parameters that need to be tuned.
    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        
    grid_dict = {}
    
    for name, model in sorted(model_dict.items()):
        # Feature selection is usually used as a pre-processing step before doing the actual learning. 
        
        if feature_list:
            combined_features = FeatureUnion(feature_list)

            steps = [
                ('features', combined_features),
                # Evaluate feature importances and select the most relevant features.
                # In this example, it is using Lasso to determine feature importances
                ('feature_selection', SelectFromModel(RandomForestClassifier())),
                # Train on the transformed output, i.e. using only relevant features
                ('model', model)
                    ]
        else:
            steps = [
                ('feature_selection', SelectFromModel(RandomForestClassifier())),
                ('model', model)
                ]
        
        regression_pipeline = Pipeline(steps)
        
        if name in param_dict:
            parameters = param_dict[name]
        else:
            return print('Incorrect parameters in the parameter dictionary.')
            
        grid_dict[name] = GridSearchCV(regression_pipeline, parameters, n_jobs=3, verbose=1)
        train_fit = grid_dict[name].fit(X_train, y_train)
        
        # Output results
        print('Model: ', name)
        print('Best Score: %0.3f' % train_fit.best_score_)
        print('Optimal Parameters: ', train_fit.best_params_)
         
    return grid_dict

In [147]:
grid = run_pipelines(models_dict, X, y, None, param_dict)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=3)]: Done   9 out of   9 | elapsed:   40.7s finished


Model:  adaboost
Best Score: 0.412
Optimal Parameters:  {'model__n_estimators': 300}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=3)]: Done   9 out of   9 | elapsed:  3.9min finished


Model:  bagging
Best Score: 0.438
Optimal Parameters:  {'model__n_estimators': 300}
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 360 out of 360 | elapsed:  9.3min finished


Model:  extra_trees
Best Score: 0.381
Optimal Parameters:  {'model__max_depth': 6, 'model__max_features': 'auto', 'model__min_samples_split': 20, 'model__n_estimators': 10}
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  8.2min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed: 55.6min
[Parallel(n_jobs=3)]: Done 360 out of 360 | elapsed: 168.4min finished


Model:  gradient_boost
Best Score: 0.457
Optimal Parameters:  {'model__max_depth': 6, 'model__max_features': 'sqrt', 'model__min_samples_split': 20, 'model__n_estimators': 300}
Fitting 3 folds for each of 22 candidates, totalling 66 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 10.2min
[Parallel(n_jobs=3)]: Done  66 out of  66 | elapsed: 16.6min finished


Model:  knn
Best Score: 0.431
Optimal Parameters:  {'model__n_neighbors': 250, 'model__weights': 'distance'}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:   46.6s finished


Model:  logistic
Best Score: 0.384
Optimal Parameters:  {'model__penalty': 'l1'}
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.3min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done 360 out of 360 | elapsed: 12.8min finished


Model:  random_forest
Best Score: 0.410
Optimal Parameters:  {'model__max_depth': 6, 'model__max_features': 'auto', 'model__min_samples_split': 10, 'model__n_estimators': 10}


In [149]:
grid.keys()

dict_keys(['adaboost', 'bagging', 'extra_trees', 'gradient_boost', 'knn', 'logistic', 'random_forest'])

In [150]:
grid['knn']

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('feature_selection', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weig...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'model__n_neighbors': [5, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'model__weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

### Stack all the best models

In [170]:
def stack_models(base_model_list, final_model, X_train, X_test, y_train, y_test, path, beta=0.5):
    '''
    DESCRIPTION:
        - Using the stacking ensemble method, a stacked model is outputted.
    INPUT: 
        - base_model_list is the list of model objects to be used in building the stacked model.
        - final_model is the model object that will be used to train on the predicted values of the previous models.
        - X_train, X_test, y_train, and y_test are training and validation data.
        - path is the directory to save the final model
        - beta is an optional parameter for the fbeta scoring metric
    OUTPUT:
        - final_model is the trained stacked model.
        - score_dict is the various scoring metrics used to validate the model.
    '''
    df_ensemble_train = pd.DataFrame()
    df_ensemble_test = pd.DataFrame()
    
    score_dict = {}
    
    for i,model in enumerate(base_model_list):
        
        model.fit(X_train, y_train)
        print("FIT: ", model)
        
        train_pred = model.predict(X_train)
        print("PREDICT TRAIN: ", model)
        
        test_pred = model.predict(X_test)
        print("PREDICT TEST: ", model)
        
        col_name = 'M' + str(i+1)
        df_ensemble_train[col_name] = train_pred
        df_ensemble_test[col_name] = test_pred
    
    final_model.fit(df_ensemble_train, y_train)
    print("FIT FINAL MODEL: ", final_model)
    
    y_pred = final_model.predict(df_ensemble_test)
    print("PREDICT: ", final_model)
        
#     score_dict['acc'] = accuracy_score(y_test, y_pred)
#     score_dict['pre'] = precision_score(y_test, y_pred)
#     score_dict['rec'] = recall_score(y_test, y_pred)
#     score_dict['f1'] = f1_score(y_test, y_pred)
    score_dict['report'] = classification_report(y_test, y_pred)

#     print('Accuracy: ', score_dict['acc'])
#     print('Precision: ', score_dict['pre'])
#     print('Recall: ', score_dict['rec'])
#     print('F1: ', score_dict['f1'])
    print('Classification Report:\n', score_dict['report'])

    if not os.path.isdir(path):
        os.mkdir(path)
        
    loc = os.path.join(path, str(final_model) + '.pkl')
    joblib.dump(final_model, loc)
    
    return final_model, score_dict

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3)

In [177]:
grad_boost = GradientBoostingClassifier(max_depth=6, max_features='sqrt', min_samples_split=20, n_estimators=300)
bagging = BaggingClassifier(n_estimators=300)
knn = KNeighborsClassifier(n_neighbors=250, weights='distance')
random_forest = RandomForestClassifier(max_depth=6, max_features='auto', min_samples_split=10, n_estimators=10)

model_list = [grad_boost, bagging, knn, random_forest]

for model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    c_report = classification_report(y_test, y_pred)
    print('Classification Report:\n', c_report)

Classification Report:
              precision    recall  f1-score   support

          1       0.40      0.46      0.43      5265
          2       0.54      0.52      0.53      1082
          3       0.51      0.83      0.64     10068
          4       0.54      0.38      0.44      1673
          5       0.27      0.05      0.09      3437
          6       0.16      0.03      0.06      1323
          7       0.31      0.12      0.17      2555
          8       0.38      0.17      0.24      2446

avg / total       0.42      0.47      0.41     27849

Classification Report:
              precision    recall  f1-score   support

          1       0.42      0.48      0.45      5265
          2       0.50      0.47      0.48      1082
          3       0.57      0.71      0.63     10068
          4       0.55      0.50      0.52      1673
          5       0.21      0.14      0.17      3437
          6       0.15      0.09      0.11      1323
          7       0.33      0.25      0.28     

  'precision', 'predicted', average, warn_for)


In [178]:
fin_model, fin_score_dict = stack_models(model_list, 
                                                  LogisticRegression(), 
                                                  X_train, 
                                                  X_test, 
                                                  y_train, 
                                                  y_test,
                                                  path='')

FIT:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=20, min_weight_fraction_leaf=0.0,
              n_estimators=300, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
PREDICT TRAIN:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=20, min_weight_fraction_leaf=0.0,
              n_estimators=300, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
PREDICT TEST:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1

FileNotFoundError: [Errno 2] No such file or directory: ''

Stacking doesn't yield any benefit in this case