In [36]:
import pandas as pd, sys, time
from sklearn import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib

In [37]:
# add main directory to path in order to import other parts of the shelter animal outcomes project
sys.path.extend(['/Users/pc3sq/Google Drive/Data Science Library/!notebooks/animal-shelter-outcomes/'])

In [38]:
from src.pipeline import cleaning_pipeline, transform_pipeline
from src.prepare_submission import process_probas

## Prepping Test Set For Predictions

In [39]:
submission = pd.read_csv('../data/test.csv')

In [40]:
clean_submission = cleaning_pipeline(submission)

Index([u'ID', u'Name', u'DateTime', u'AnimalType', u'SexuponOutcome',
       u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')


In [41]:
trans_submission = transform_pipeline(clean_submission)

In [42]:
X_submission = trans_submission.drop('ID', axis=1)

## Prepping & Splitting Training Set For Grid Search

In [43]:
transformed = pd.read_csv('../data/shelter-transform-2016-07-28.csv')

In [44]:
transformed.head()

Unnamed: 0,OutcomeType,AgeuponOutcome,ageInDaysAtOutcome,hasAge,Dog,pureBreed,hasName,Multi-Colored,Patterned,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown
0,Return_to_owner,365,365.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
1,Euthanasia,365,365.0,1,0,0,1,1,1,0.0,0.0,0.0,1.0,0.0
2,Adoption,730,730.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
3,Transfer,21,21.0,1,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0
4,Transfer,730,730.0,1,1,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [45]:
trans_X = transformed.drop('OutcomeType', axis=1)

In [46]:
trans_y = transformed['OutcomeType']

In [47]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(trans_X, trans_y, test_size=0.25, random_state=10)

# <center>Gridsearch w/ GBM</center>

In [48]:
def gridsearch_gbm(params_grid, X_train, y_train):
    model_start = time.time()  
    gbc = GradientBoostingClassifier()
    best_model = GridSearchCV(gbc, param_grid=params_grid, n_jobs=-1)
    model_end = time.time()

    print "Grid search took: ", model_end - model_start
    
    fit_start = time.time()
    best_model.fit(X_train, y_train)
    fit_end = time.time()
    print "Fitting took: ", fit_end - fit_start
    
    return best_model

def gridsearch_stats(model, X_test, y_test):
    print model.best_params_
    
    probas = model.predict_proba(X_test)
    print log_loss(y_test, probas)
    
    return probas


## Attempt 1

In [49]:
params01 = {
            'max_depth': [8],
            'min_samples_leaf': [1000], 
            "learning_rate": [0.1, 0.05, 0.01], 
            "max_features": ["sqrt", 3, 10]
            }

gbm_gc01 = gridsearch_gbm(params01, X_train, y_train)

Grid search took:  8.29696655273e-05
Fitting took:  50.4349400997


In [50]:
gridsearch_stats(gbm_gc01, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 1000}
0.84865256231


array([[  2.38335346e-01,   5.66571852e-03,   7.58748536e-02,
          5.68882554e-01,   1.11241528e-01],
       [  9.14839354e-01,   1.01507027e-03,   7.73187196e-04,
          6.32941115e-03,   7.70429769e-02],
       [  7.57412765e-01,   1.74718971e-03,   4.13647548e-03,
          4.32789118e-02,   1.93424658e-01],
       ..., 
       [  4.98958607e-01,   9.42222823e-04,   1.48138296e-02,
          1.12325923e-01,   3.72959417e-01],
       [  3.39265806e-01,   1.23875733e-03,   6.64493162e-02,
          4.28196640e-01,   1.64849481e-01],
       [  2.35002894e-04,   1.89652328e-02,   2.55407916e-02,
          3.06307458e-03,   9.52195898e-01]])

+ Keeping learning_rate = 0.1

## Attempt 2

In [51]:
params02 = {
            'max_depth': [10, 15],
            'min_samples_leaf': [1000], 
            "learning_rate": [0.1], 
            "max_features": [10]
            }

gbm_gc02 = gridsearch_gbm(params02, X_train, y_train)

Grid search took:  7.08103179932e-05
Fitting took:  20.3236610889


In [52]:
gridsearch_stats(gbm_gc02, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 15, 'min_samples_leaf': 1000}
0.848959544101


array([[  2.43399172e-01,   5.50761068e-03,   7.72582478e-02,
          5.64956553e-01,   1.08878416e-01],
       [  9.17902282e-01,   1.04981273e-03,   7.60572327e-04,
          5.70096293e-03,   7.45863700e-02],
       [  7.54929213e-01,   1.33020514e-03,   4.89032157e-03,
          4.91884618e-02,   1.89661799e-01],
       ..., 
       [  5.02029118e-01,   1.18769613e-03,   1.30210869e-02,
          1.10421266e-01,   3.73340833e-01],
       [  3.36289662e-01,   1.31365852e-03,   6.52508233e-02,
          4.33311551e-01,   1.63834305e-01],
       [  1.58531567e-04,   1.73497052e-02,   2.60276394e-02,
          2.82874131e-03,   9.53635383e-01]])

+ Deeper max-depth

## Attempt 3

In [53]:
params03 = {
            'max_depth': [18, 25],
            'min_samples_leaf': [500, 1000], 
            "learning_rate": [0.1], 
            "max_features": [10, 13] # 13 is the max features
            }

gbm_gc03 = gridsearch_gbm(params03, X_train, y_train)

Grid search took:  4.79221343994e-05
Fitting took:  72.2075340748


In [54]:
gridsearch_stats(gbm_gc03, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 25, 'min_samples_leaf': 500}
0.847954668457


array([[  1.57783596e-01,   5.26578826e-03,   8.21334305e-02,
          6.53762348e-01,   1.01054837e-01],
       [  8.86214084e-01,   3.48524489e-04,   1.41733110e-03,
          3.43033457e-03,   1.08589726e-01],
       [  7.72342185e-01,   1.25967557e-03,   5.03009956e-03,
          4.41633367e-02,   1.77204703e-01],
       ..., 
       [  5.26679086e-01,   2.46552408e-04,   1.45323231e-02,
          1.12375715e-01,   3.46166323e-01],
       [  3.32078063e-01,   2.21975829e-03,   6.07997206e-02,
          4.34103490e-01,   1.70798968e-01],
       [  3.05854392e-04,   3.66201094e-02,   1.83905811e-02,
          1.42170881e-03,   9.43261746e-01]])

+ Lower min_samples_leaf
+ Deeper max_depth
+ Keep max_features = 10

## Attempt 4

In [55]:
params04 = {
            'max_depth': [30, 40],
            'min_samples_leaf': [100, 300], 
            "learning_rate": [0.1], 
            "max_features": [10]
            }

gbm_gc04 = gridsearch_gbm(params04, X_train, y_train)

Grid search took:  4.91142272949e-05
Fitting took:  72.5444891453


In [56]:
gridsearch_stats(gbm_gc04, X_test, y_test)

{'max_features': 10, 'learning_rate': 0.1, 'max_depth': 40, 'min_samples_leaf': 300}
0.851588279696


array([[  2.17431103e-01,   4.17372051e-03,   4.71661940e-02,
          6.12477490e-01,   1.18751493e-01],
       [  8.88464215e-01,   3.14783543e-04,   1.01351920e-03,
          2.78044396e-03,   1.07427038e-01],
       [  7.55159243e-01,   9.89230893e-04,   6.56518958e-03,
          5.89834428e-02,   1.78302894e-01],
       ..., 
       [  5.37696609e-01,   8.38246530e-05,   1.56643615e-02,
          9.94437843e-02,   3.47111420e-01],
       [  3.23635268e-01,   3.21016769e-03,   5.71683009e-02,
          4.36180910e-01,   1.79805353e-01],
       [  1.02544305e-04,   1.69688537e-02,   2.01714741e-02,
          4.24224704e-04,   9.62332903e-01]])

## Using Best Params From Attempt 3

In [57]:
best_params = {'max_features': 10, 'learning_rate': 0.1, 'max_depth': 25, 'min_samples_leaf': 500}

In [58]:
# Using random state 7 like previous train test splits
X_train1, X_test1, y_train1, y_test1 = cross_validation.train_test_split(trans_X, trans_y, test_size=0.25, random_state=7)

In [59]:
probas03 = gbm_gc03.predict_proba(X_test1)

In [60]:
log_loss(y_test1, probas03)

0.83080436521487722

In [61]:
submission_probas = gbm_gc03.predict_proba(X_submission)

In [62]:
def process_probas(model, pred_probas, raw_test_set):
    submission = pd.DataFrame(pred_probas, columns=model.classes_)
    submission['ID'] = raw_test_set['ID']
    return submission[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

In [63]:
final_submission = process_probas(gbm_gc03.best_estimator_, submission_probas, test)

NameError: name 'test' is not defined

In [None]:
# final_submission.to_csv('../data/submission-2016-07-28-gridsearch-gbm.csv', index=False)

In [None]:
test_final_submission = pd.read_csv('../data/submission-2016-07-28-gridsearch-gbm.csv')

In [None]:
test_final_submission.shape

In [None]:
test_final_submission.columns

In [None]:
## saving best model
# joblib.dump(gbm_gc03.best_estimator_, '../models/2016-07-28-gbm-gridsearch/gbm_gc03.pkl')