In [227]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skopt import BayesSearchCV

from xgboost import XGBClassifier

%matplotlib inline

In [232]:
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')

In [160]:
df = pd.read_csv('./data/all_categoricals.csv', index_col = 0)

In [161]:
df.columns.values

array(['animal_id', 'age', 'dow', 'gender', 'mix', 'name',
       'animal_type_Cat', 'animal_type_Dog', 'fixed_status_fixed',
       'fixed_status_intact', 'group_Herding', 'group_Hound',
       'group_Non-Sporting', 'group_Sporting', 'group_Terrier',
       'group_Toy', 'group_Working', 'group_long hair',
       'group_short hair', 'intake_condition_Aged',
       'intake_condition_Feral', 'intake_condition_Injured',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'intake_condition_Sick', 'intake_season_fall',
       'intake_season_spring', 'intake_season_summer',
       'intake_season_winter', 'intake_type_Euthanasia Request',
       'intake_type_Owner Surrender', 'intake_type_Public Assist',
       'intake_type_Stray', 'simple_color_Black', 'simple_color_Blue',
       'simple_color_Brown', 'simple_color_Gray', 'simple_color_Red',
       'simple_color_Sable', 'simple_color_Tricolor',
       'simple_colo

In [162]:
df = df[df['outcome_type'] != 'Rto-Adopt']
df = df[df['outcome_type'] != 'Missing']
df = df[df['outcome_type'] != 'Disposal']

In [208]:
# We don't need animal id for modeling
df_model = df.iloc[:, 1:]

In [209]:
df_model = df_model[~df_model.duplicated()]

In [210]:
df['outcome_type'].value_counts()

Adoption           29439
Transfer           22113
Return to Owner    12108
Euthanasia          2805
Died                 535
Name: outcome_type, dtype: int64

In [211]:
df_model['outcome_type'].value_counts()

Adoption           28548
Transfer           19298
Return to Owner    11990
Euthanasia          2780
Died                 527
Name: outcome_type, dtype: int64

Drop the days in shelter feature for now. This will be used later in regression analysis if needed.

In [212]:
df_model = df_model.drop(labels=['days_in_shelter'], axis=1)

In [213]:
df_model.head()

Unnamed: 0,age,dow,gender,mix,name,animal_type_Cat,animal_type_Dog,fixed_status_fixed,fixed_status_intact,group_Herding,...,hour_in_22,hour_in_23,hour_in_3,hour_in_4,hour_in_5,hour_in_6,hour_in_7,hour_in_8,hour_in_9,outcome_type
0,2920,6,1,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Return to Owner
1,330,3,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,Return to Owner
2,1460,6,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Return to Owner
3,730,5,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Return to Owner
4,730,3,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,Return to Owner


In [214]:
X = df_model.iloc[:, :-1]

In [215]:
X = pd.concat([X, pd.get_dummies(df_model['dow'].astype(str))], axis=1)

In [216]:
y = df_model.iloc[:, -1]

In [217]:
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [218]:
le.classes_

array(['Adoption', 'Died', 'Euthanasia', 'Return to Owner', 'Transfer'],
      dtype=object)

In [219]:
y = le.transform(y)

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 2325)

In [237]:
rfc = RandomForestClassifier()

tuning_params_rf = {'n_estimators': [300, 800], 'max_features': ['sqrt'], 'max_leaf_nodes': [50, 100],
                    'max_depth': [50, 200]}

bs = BayesSearchCV(rfc, tuning_params_rf, n_jobs=-1, n_iter = 20, scoring='f1_micro', cv=3)

bs.fit(X_train, y_train)

BayesSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_iter=20, n_jobs=-1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
       refit=True, return_train_score=False, scoring='f1_micro',
       search_spaces={'n_estimators': [300, 800], 'max_features': ['sqrt'], 'max_leaf_nodes': [50, 100], 'max_depth': [50, 200]},
       verbose=0)

In [238]:
bs.best_score_

0.623688155922039

## XGBoost hyperparameter tuning

In [None]:
xg = XGBClassifier(n_jobs = -1)

tuning_params = {'max_depth': [5, 90], 'learning_rate': [0, 1], 'objective': ['multi:softprob'],
                 'min_child_weight': [1, 3, 5]}

bs = BayesSearchCV(xg, tuning_params, cv=3, verbose=True, scoring = 'neg_log_loss', n_jobs=-1, n_iter = 10)

bs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
bs.best_params_

In [None]:
bs.best_score_

In [228]:
pipe = make_pipeline(StandardScaler(), XGBClassifier(learning_rate=0.1, max_depth=7, min_child_weight=3))

In [229]:
scores = cross_val_score(pipe, cv=3, n_jobs = -1, scoring='f1_micro', X=X_train, y=y_train)

In [230]:
scores

array([0.64014696, 0.6450019 , 0.64375317])