In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

from skopt import BayesSearchCV

from xgboost import XGBClassifier

%matplotlib inline

In [123]:
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')

In [134]:
df = pd.read_csv('./data/all_categoricals.csv', index_col = 0)

In [135]:
df.columns.values

array(['animal_id', 'age', 'dow', 'gender', 'mix', 'name',
       'animal_type_Cat', 'animal_type_Dog', 'fixed_status_fixed',
       'fixed_status_intact', 'group_Herding', 'group_Hound',
       'group_Non-Sporting', 'group_Sporting', 'group_Terrier',
       'group_Toy', 'group_Working', 'group_long hair',
       'group_short hair', 'intake_condition_Aged',
       'intake_condition_Feral', 'intake_condition_Injured',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'intake_condition_Sick', 'intake_season_fall',
       'intake_season_spring', 'intake_season_summer',
       'intake_season_winter', 'intake_type_Euthanasia Request',
       'intake_type_Owner Surrender', 'intake_type_Public Assist',
       'intake_type_Stray', 'simple_color_Black', 'simple_color_Blue',
       'simple_color_Brown', 'simple_color_Gray', 'simple_color_Red',
       'simple_color_Sable', 'simple_color_Tricolor',
       'simple_colo

In [136]:
# We don't need animal id for modeling
df_model = df.iloc[:, 1:]

In [139]:
df_model = df_model[~df_model.duplicated()]

In [143]:
df = df[df['outcome_type'] != 'Rto-Adopt']
df = df[df['outcome_type'] != 'Missing']
df = df[df['outcome_type'] != 'Disposal']

In [144]:
df['outcome_type'].value_counts()

Adoption           29439
Transfer           22113
Return to Owner    12108
Euthanasia          2805
Died                 535
Name: outcome_type, dtype: int64

In [147]:
df_model.columns.values

array(['age', 'dow', 'gender', 'mix', 'name', 'animal_type_Cat',
       'animal_type_Dog', 'fixed_status_fixed', 'fixed_status_intact',
       'group_Herding', 'group_Hound', 'group_Non-Sporting',
       'group_Sporting', 'group_Terrier', 'group_Toy', 'group_Working',
       'group_long hair', 'group_short hair', 'intake_condition_Aged',
       'intake_condition_Feral', 'intake_condition_Injured',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'intake_condition_Sick', 'intake_season_fall',
       'intake_season_spring', 'intake_season_summer',
       'intake_season_winter', 'intake_type_Euthanasia Request',
       'intake_type_Owner Surrender', 'intake_type_Public Assist',
       'intake_type_Stray', 'simple_color_Black', 'simple_color_Blue',
       'simple_color_Brown', 'simple_color_Gray', 'simple_color_Red',
       'simple_color_Sable', 'simple_color_Tricolor',
       'simple_color_White', 'simple_co

In [145]:
X.shape

(67000, 55)

In [94]:
X = df_test.iloc[:, :-1]

In [95]:
y = df_test.iloc[:, -1]

In [111]:
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [112]:
le.classes_

array(['Adoption', 'Died', 'Euthanasia', 'Return to Owner', 'Transfer'],
      dtype=object)

In [113]:
y = le.transform(y)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 2325)

In [119]:
rfc = RandomForestClassifier(n_estimators = 250, n_jobs = -1)

scores = cross_val_score(rfc, X_train, y_train, scoring='f1_micro', cv = 3)

In [120]:
scores

array([0.61318052, 0.61432836, 0.62144734])

## XGBoost hyperparameter tuning

In [124]:
xg = XGBClassifier(n_jobs = -1)

tuning_params = {'max_depth': [5, 10], 'learning_rate': [1e-3, 1e-2, 1e-1], 
                 'min_child_weight': [1, 3, 5]}

bs = BayesSearchCV(xg, tuning_params, cv=3, verbose=True, scoring = 'f1_micro', n_jobs=-1, n_iter = 10)

bs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   45.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   57.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   50.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.5s finished


BayesSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_iter=10, n_jobs=-1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
       refit=True, return_train_score=False, scoring='f1_micro',
       search_spaces={'max_depth': [5, 10], 'learning_rate': [0.001, 0.01, 0.1], 'min_child_weight': [1, 3, 5]},
       verbose=True)

In [125]:
bs.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3}

In [126]:
bs.best_score_

0.6513432835820896

In [129]:
scores = cross_val_score(XGBClassifier(learning_rate = 0.1, max_depth = 7, min_child_weight = 3), cv=3,
                         n_jobs = -1, scoring='neg_log_loss', X=X_train, y=y_train)

In [130]:
scores

array([-0.85496438, -0.85031197, -0.83949584])