In [13]:
import pandas as pd, sys, time
from sklearn import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib

In [7]:
# add main directory to path in order to import other parts of the shelter animal outcomes project
sys.path.extend(['/Users/pc3sq/Google Drive/Data Science Library/!notebooks/shelter animal outcomes kaggle/'])

In [82]:
from src.pipeline import cleaning_pipeline, transform_pipeline
from src.prepare_submission import process_probas

## Prepping Test Set For Predictions

In [31]:
submission = pd.read_csv('../data/test.csv')

In [32]:
clean_submission = cleaning_pipeline(submission)

Index([u'ID', u'Name', u'DateTime', u'AnimalType', u'SexuponOutcome',
       u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')


In [33]:
trans_submission = transform_pipeline(clean_submission)

In [34]:
X_submission = trans_submission.drop('ID', axis=1)

## Prepping & Splitting Training Set For Grid Search

In [27]:
transformed = pd.read_csv('../data/shelter-transform-2016-07-28.csv')

In [28]:
transformed.head()

Unnamed: 0,OutcomeType,AgeuponOutcome,ageInDaysAtOutcome,hasAge,Dog,pureBreed,hasName,Multi-Colored,Patterned,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown
0,Return_to_owner,365,365.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
1,Euthanasia,365,365.0,1,0,0,1,1,1,0.0,0.0,0.0,1.0,0.0
2,Adoption,730,730.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
3,Transfer,21,21.0,1,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0
4,Transfer,730,730.0,1,1,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [36]:
trans_X = transformed.drop('OutcomeType', axis=1)

In [37]:
trans_y = transformed['OutcomeType']

In [50]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(trans_X, trans_y, test_size=0.25, random_state=10)

# <center>Gridsearch w/ GBM</center>

In [65]:
def gridsearch_gbm(params_grid, X_train, y_train):
    model_start = time.time()  
    gbc = GradientBoostingClassifier()
    best_model = GridSearchCV(gbc, param_grid=params_grid, n_jobs=-1)
    model_end = time.time()

    print "Grid search took: ", model_end - model_start
    
    fit_start = time.time()
    best_model.fit(X_train, y_train)
    fit_end = time.time()
    print "Fitting took: ", fit_end - fit_start
    
    return best_model

def gridsearch_stats(model, X_test, y_test):
    print model.best_params_
    
    probas = model.predict_proba(X_test)
    print log_loss(y_test, probas)
    
    return probas


## Attempt 1

In [60]:
params01 = {
            'max_depth': [8],
            'min_samples_leaf': [1000], 
            "learning_rate": [0.1, 0.05, 0.01], 
            "max_features": ["sqrt", 3, 10]
            }

gbm_gc01 = gridsearch_gbm(params01, X_train, y_train)

Grid search took:  0.000370979309082
Fitting took:  75.5929970741


In [64]:
gridsearch_stats(gbm_gc01, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 1000}
0.848833997465


array([[  2.34069737e-01,   5.57051044e-03,   7.66808899e-02,
          5.68846713e-01,   1.14832150e-01],
       [  9.21415745e-01,   8.14169046e-04,   7.23022169e-04,
          5.99293231e-03,   7.10541314e-02],
       [  7.39322586e-01,   1.14172231e-03,   3.86128161e-03,
          5.39079874e-02,   2.01766423e-01],
       ..., 
       [  5.00729275e-01,   9.97972528e-04,   1.42364381e-02,
          1.12597893e-01,   3.71438421e-01],
       [  3.34274822e-01,   1.20124139e-03,   6.83995202e-02,
          4.30505641e-01,   1.65618775e-01],
       [  1.80082164e-04,   1.78827989e-02,   2.54307253e-02,
          2.49143738e-03,   9.54014956e-01]])

+ Keeping learning_rate = 0.1

## Attempt 2

In [66]:
params02 = {
            'max_depth': [10, 15],
            'min_samples_leaf': [1000], 
            "learning_rate": [0.1], 
            "max_features": [10]
            }

gbm_gc02 = gridsearch_gbm(params02, X_train, y_train)

Grid search took:  0.00048303604126
Fitting took:  33.6684710979


In [67]:
gridsearch_stats(gbm_gc02, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 15, 'min_samples_leaf': 1000}
0.849199137568


array([[  2.38276658e-01,   5.79562711e-03,   8.07676570e-02,
          5.65709831e-01,   1.09450226e-01],
       [  9.18039140e-01,   1.09364601e-03,   7.00444578e-04,
          6.39060751e-03,   7.37761616e-02],
       [  7.55185230e-01,   1.37706375e-03,   3.75698202e-03,
          4.98700457e-02,   1.89810679e-01],
       ..., 
       [  5.00435546e-01,   1.01932743e-03,   1.50573274e-02,
          1.12454332e-01,   3.71033467e-01],
       [  3.38900862e-01,   1.11785650e-03,   6.76450330e-02,
          4.29359816e-01,   1.62976432e-01],
       [  1.56891032e-04,   1.72850605e-02,   2.45506858e-02,
          2.90760863e-03,   9.55099754e-01]])

+ Deeper max-depth

## Attempt 3

In [70]:
params03 = {
            'max_depth': [18, 25],
            'min_samples_leaf': [500, 1000], 
            "learning_rate": [0.1], 
            "max_features": [10, 13] # 13 is the max features
            }

gbm_gc03 = gridsearch_gbm(params03, X_train, y_train)

Grid search took:  0.000117063522339
Fitting took:  114.977802038


In [71]:
gridsearch_stats(gbm_gc03, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 25, 'min_samples_leaf': 500}
0.84788841092


array([[  1.62988376e-01,   5.79724501e-03,   7.77530812e-02,
          6.51505134e-01,   1.01956164e-01],
       [  8.84663347e-01,   3.57955672e-04,   1.42333796e-03,
          4.25804709e-03,   1.09297312e-01],
       [  7.62834126e-01,   9.30908845e-04,   4.38087356e-03,
          4.57230177e-02,   1.86131074e-01],
       ..., 
       [  5.22247953e-01,   5.05979246e-04,   1.51926760e-02,
          1.09737595e-01,   3.52315796e-01],
       [  3.33336509e-01,   2.56319152e-03,   6.27378043e-02,
          4.29302670e-01,   1.72059825e-01],
       [  2.72742989e-04,   3.31566143e-02,   1.91492305e-02,
          1.54199786e-03,   9.45879414e-01]])

+ Lower min_samples_leaf
+ Deeper max_depth
+ Keep max_features = 10

## Attempt 4

In [74]:
params04 = {
            'max_depth': [30, 40],
            'min_samples_leaf': [100, 300], 
            "learning_rate": [0.1], 
            "max_features": [10]
            }

gbm_gc04 = gridsearch_gbm(params04, X_train, y_train)

Grid search took:  6.103515625e-05
Fitting took:  117.023581982


In [75]:
gridsearch_stats(gbm_gc04, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 30, 'min_samples_leaf': 300}
0.851779505476


array([[  2.07445394e-01,   4.04962052e-03,   4.25240132e-02,
          6.21313495e-01,   1.24667478e-01],
       [  8.86804980e-01,   2.69704549e-04,   1.34174140e-03,
          3.06258736e-03,   1.08520987e-01],
       [  7.56471181e-01,   1.31484771e-03,   6.10069446e-03,
          5.85130913e-02,   1.77600185e-01],
       ..., 
       [  5.42128719e-01,   1.11965406e-04,   1.57739439e-02,
          1.02174874e-01,   3.39810497e-01],
       [  3.23794944e-01,   3.60831554e-03,   5.80861068e-02,
          4.34860477e-01,   1.79650156e-01],
       [  8.08341876e-05,   1.74326290e-02,   1.84683276e-02,
          9.34133032e-04,   9.63084076e-01]])

## Using Best Params From Attempt 3

In [76]:
best_params = {'max_features': 10, 'learning_rate': 0.1, 'max_depth': 25, 'min_samples_leaf': 500}

In [77]:
# Using random state 7 like previous train test splits
X_train1, X_test1, y_train1, y_test1 = cross_validation.train_test_split(trans_X, trans_y, test_size=0.25, random_state=7)

In [78]:
probas03 = gbm_gc03.predict_proba(X_test1)

In [79]:
log_loss(y_test1, probas03)

0.83028614635739051

In [81]:
submission_probas = gbm_gc03.predict_proba(X_submission)

In [85]:
def process_probas(model, pred_probas, raw_test_set):
    submission = pd.DataFrame(pred_probas, columns=model.classes_)
    submission['ID'] = raw_test_set['ID']
    return submission[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

In [89]:
final_submission = process_probas(gbm_gc03.best_estimator_, submission_probas, test)

In [92]:
# final_submission.to_csv('../data/submission-2016-07-28-gridsearch-gbm.csv', index=False)

In [93]:
test_final_submission = pd.read_csv('../data/submission-2016-07-28-gridsearch-gbm.csv')

In [95]:
test_final_submission.shape

(11456, 6)

In [96]:
test_final_submission.columns

Index([u'ID', u'Adoption', u'Died', u'Euthanasia', u'Return_to_owner',
       u'Transfer'],
      dtype='object')

In [98]:
## saving best model
# joblib.dump(gbm_gc03.best_estimator_, '../models/2016-07-28-gbm-gridsearch/gbm_gc03.pkl')