In [30]:
#import statements
from sklearn import ensemble
from sklearn import grid_search

import pandas as pd
import numpy as np


#read files
train = pd.read_csv('trainingData.csv')
test = pd.read_csv('testingData.csv')

#Log Loss quantifies the accuracy of a classifier by penalising false classifications.
def log_loss_binary(actual, predicted, eps = 1e-15):
    predicted = np.minimum(np.maximum(predicted, eps), 1 - eps)
    
    return -(sum(actual * np.log(predicted) + (1. - actual) * np.log(1. - predicted))) / len(actual)

#check data types
print('Food type has a data type of:  ' + str(train['foodType'].dtype))
print( 'Burglary has a data type of:   ' + str(train['burglary'].dtype))
print ('Ward has a data type of:       ' + str(train['ward'].dtype))
print ('Sanitation has a data type of: ' + str(train['sanitation'].dtype))

print ("")

#return dummy head data
dummy_data = pd.get_dummies(train['foodType'].head())

#concatenate dummy foodtype data plotted against foodtype head data
print (pd.concat([train['foodType'].head(), dummy_data], axis = 1))

#get dummy data
dummy_train = pd.get_dummies(train['foodType'])

print ("")

#check clean data
print ('Food type has:  ' + str(sum(train['foodType'].isnull()))   + '  missing values')
print ('Burglary has:   ' + str(sum(train['burglary'].isnull()))   + '  missing values')
print ('Ward has:       ' + str(sum(train['ward'].isnull()))       + ' missing values')
print ('Sanitation has: ' + str(sum(train['sanitation'].isnull())) + '  missing values')

#create a missing indicator and then set missings to an imputed or arbitrary value.
train['ward_missing'] = 1. * (train['ward'].isnull())

train.loc[train['ward_missing'] == 1, 'ward'] = 0.

#Fit GBM
baseline_model = ensemble.GradientBoostingRegressor(
                                                    loss = 'ls', 
                                                    learning_rate = 0.1, 
                                                    subsample = 0.7,  
                                                    min_samples_leaf = 5, 
                                                    max_depth = 5
                                                   )

numeric_predictors = ['burglary', 'ward', 'sanitation', 'ward_missing']

X = pd.concat([dummy_train, train[numeric_predictors]], axis = 1)
y = train['response']

trees_to_search_over = {'n_estimators': range(100, 401, 100)}
grid_model = grid_search.GridSearchCV(estimator = baseline_model, 
                                      param_grid = trees_to_search_over, 
                                      n_jobs = 4,
                                      iid = False, 
                                      verbose = 2,
                                      cv = 5)

grid_model.fit(X, y)

# print out the best cv fold combination:"
print ('Best parameter combinations:' + str(grid_model.best_params_))

# fit the estimator with the best cv fold combination:
grid_model.estimator.fit(X, y)

Food type has a data type of:  object
Burglary has a data type of:   float64
Ward has a data type of:       float64
Sanitation has a data type of: float64

  foodType  dinner  mart  unknown
0     mart       0     1        0
1  unknown       0     0        1
2   dinner       1     0        0
3   dinner       1     0        0
4  unknown       0     0        1

Food type has:  0  missing values
Burglary has:   0  missing values
Ward has:       32 missing values
Sanitation has: 0  missing values


ValueError: Parameter values for parameter (n_estimators) need to be a sequence.