In [1]:
import pandas as pd, numpy as np, sys
from sklearn import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Put the project root on sys.path so the `src` package of the shelter animal outcomes project can be imported.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when running on another machine.
project_root = '/Users/pc3sq/Google Drive/Data Science Library/!notebooks/animal-shelter-outcomes/'
sys.path.append(project_root)

In [4]:
from src.pipeline import cleaning_pipeline, transform_pipeline

# Create New Gradient Boosting Model w/ New Features

In [5]:
# Load the transformed shelter dataset (engineered features plus the OutcomeType label).
dframe = pd.read_csv('../data/shelter-transform-2016-07-28.csv')

In [6]:
# Preview the first rows to sanity-check the loaded columns.
dframe.head()

Unnamed: 0,OutcomeType,AgeuponOutcome,ageInDaysAtOutcome,hasAge,Dog,pureBreed,hasName,Multi-Colored,Patterned,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown
0,Return_to_owner,365,365.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
1,Euthanasia,365,365.0,1,0,0,1,1,1,0.0,0.0,0.0,1.0,0.0
2,Adoption,730,730.0,1,1,0,1,1,0,0.0,0.0,1.0,0.0,0.0
3,Transfer,21,21.0,1,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0
4,Transfer,730,730.0,1,1,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Feature matrix: every column except the target label.
X = dframe.drop(['OutcomeType'], axis=1)
# Display the feature names that the model will be trained on.
X.columns

Index([u'AgeuponOutcome', u'ageInDaysAtOutcome', u'hasAge', u'Dog',
       u'pureBreed', u'hasName', u'Multi-Colored', u'Patterned',
       u'SexuponOutcome_Intact Female', u'SexuponOutcome_Intact Male',
       u'SexuponOutcome_Neutered Male', u'SexuponOutcome_Spayed Female',
       u'SexuponOutcome_Unknown'],
      dtype='object')

In [8]:
# Target labels: the OutcomeType column.
y = dframe['OutcomeType']

In [9]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# newer versions provide train_test_split in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=7)

Previous modeling attempts have already shown that a gradient boosting model is the superior choice. Let's see how a GBM model performs with the new set of features.

In [10]:
# Fit a gradient boosting classifier (100 boosting stages, seeded for repeatability) on the training split.
gbm = GradientBoostingClassifier(n_estimators=100,random_state=7).fit(X_train,y_train)

In [11]:
# Predicted class probabilities for the held-out rows; columns are ordered as in gbm.classes_.
gbm_probas = gbm.predict_proba(X_test)

In [12]:
# Multi-class log loss on the held-out split (lower is better).
# 0.85 log loss is approximately what we had with the last set of transformed features
log_loss(y_test, gbm_probas)

0.85329749803152133

# Prepare Submission

In [13]:
# Load the sample submission; it is used below as the reference for the expected column layout.
sample = pd.read_csv('../data/sample_submission.csv')

In [14]:
# Preview the sample submission to see the required columns and their order.
sample.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,1,0,0,0,0
1,2,1,0,0,0,0
2,3,1,0,0,0,0
3,4,1,0,0,0,0
4,5,1,0,0,0,0


In [15]:
# Load the raw (uncleaned, untransformed) test set that the submission must cover.
test = pd.read_csv('../data/test.csv')

In [16]:
# Preview the raw test records (note the ID column used to key the submission).
test.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [17]:
# create submission dframe
# Dry run of the layout only: gbm_probas holds predictions for the held-out X_test rows,
# not for the records in test.csv. One column per class label, named from gbm.classes_.
submission_example = pd.DataFrame(gbm_probas, columns=gbm.classes_)

In [18]:
# Attach IDs from the raw test set. The assignment aligns on index labels, and the probabilities
# here come from the hold-out split, so this pairing only demonstrates the column layout.
submission_example['ID'] = test['ID']

In [19]:
# Reorder the columns so that ID comes first, followed by the outcome classes.
prepared_submission = submission_example[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

In [20]:
# check manually prepared submission columns match the sample columns
# (element-wise comparison: one boolean per column position, all should be True)
prepared_submission.columns == sample.columns

array([ True,  True,  True,  True,  True,  True], dtype=bool)

# Create Submission

In [None]:
# Run the project's cleaning pipeline on the raw test set.
# NOTE(review): cleaning_pipeline is defined in src.pipeline and not shown here; the notes in this
# notebook suggest it may drop records containing nulls -- confirm the row count afterwards.
clean_submission = cleaning_pipeline(test)

In [26]:
# submission requires we predict on all 11456 observations in the test set
# need to figure out which records are being dropped during cleaning because they contain nulls
# instead, we should fill in the nulls and preserve these records

In [None]:
# Build model features from the cleaned test records.
# NOTE(review): test_set=True presumably keeps the ID column and skips label handling -- confirm in src.pipeline.
transformed_submission = transform_pipeline(clean_submission, test_set=True)

In [None]:
# check for nulls before predicting -- these features are passed straight to gbm.predict_proba below
np.sum(transformed_submission.isnull())

In [None]:
# Drop the identifier so that only model inputs remain.
# NOTE(review): assumes the remaining columns match the training features in X (names and order) -- TODO confirm.
submission_X = transformed_submission.drop('ID', axis=1)

In [None]:
# Predicted class probabilities for the transformed test records; columns are ordered as in gbm.classes_.
submission_probas = gbm.predict_proba(submission_X)

In [None]:
# submission has correct shape w/o ID: there must be one row of probabilities per raw test record
# (parenthesised print produces the same output on Python 2 and is valid on Python 3)
print(submission_probas.shape[0] == test.shape[0])
submission_probas.shape

In [22]:
# save to src under 'prepare_submission' module
def process_probas(model, pred_probas, raw_test_set):
    """Arrange predicted class probabilities into the submission layout.

    Parameters
    ----------
    model : fitted classifier exposing ``classes_``
        ``model.classes_`` supplies the column labels for ``pred_probas``.
    pred_probas : array-like, shape (n_rows, n_classes)
        Predicted probabilities, one column per entry of ``model.classes_``.
    raw_test_set : pandas.DataFrame
        Frame with an ``ID`` column. IDs are paired with the rows of
        ``pred_probas`` by position, regardless of the frame's index labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID, Adoption, Died, Euthanasia, Return_to_owner, Transfer``,
        in that order.

    Raises
    ------
    KeyError
        If ``raw_test_set`` has no ``ID`` column, or one of the five outcome
        classes is missing from ``model.classes_``.
    """
    submission = pd.DataFrame(pred_probas, columns=model.classes_)
    # Column assignment aligns on index labels. The probability frame carries a
    # fresh 0..n-1 index, so give the IDs the same positional index; otherwise a
    # filtered or re-indexed test frame would produce NaN or mismatched IDs.
    submission['ID'] = raw_test_set['ID'].reset_index(drop=True)
    return submission[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

In [23]:
# create a test submission using the hold-out probabilities (gbm_probas) to exercise process_probas;
# the IDs come from the raw test set, so this output only checks the layout, not real predictions
test_process_probas = process_probas(gbm, gbm_probas, test)

In [27]:
# testing if process_probas is returning the same columns as the sample submission
# (element-wise comparison: one boolean per column position, all should be True)
test_process_probas.columns == sample.columns

array([ True,  True,  True,  True,  True,  True], dtype=bool)

In [28]:
# quick look at the test submission created by process_probas()
# (probabilities here are hold-out predictions, shown only to inspect the format)
test_process_probas.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.411901,0.009103,0.148407,0.221508,0.209081
1,2,0.024276,0.030181,0.120592,0.013148,0.811803
2,3,0.010274,0.007551,0.030129,0.033124,0.918923
3,4,0.037235,0.004793,0.053249,0.177797,0.726927
4,5,0.562923,0.00164,0.016214,0.243667,0.175557


In [None]:
# create submission with the probabilities predicted by gradient boosting model on the test set
# IDs are taken from the raw test frame, so submission_probas needs one row per raw test record, in the same order
prepared_submission = process_probas(gbm, submission_probas, test)

In [None]:
# checking prepared submission is correct shape
# (parenthesised print produces the same output on Python 2 and is valid on Python 3)
print(prepared_submission.shape)
prepared_submission.head()

In [None]:
# # saving final submission
# prepared_submission.to_csv('../data/submission-2016-07-28-final.csv', index=False)

In [None]:
# Read the saved submission back from disk to verify what was written.
# NOTE(review): the to_csv call that writes this file is commented out in this notebook, so this
# read depends on the file already existing from an earlier run.
checking_submission = pd.read_csv('../data/submission-2016-07-28-final.csv')

In [None]:
# Confirm the file read back from disk has the expected shape and columns.
# (parenthesised print produces the same output on Python 2 and is valid on Python 3)
print(checking_submission.shape)
checking_submission.head()

In [None]:
##save gbm model
# joblib.dump(gbm, '../models/2016-07-28-gbm/2016-07-28-gbm.pkl') 