In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from catboost import CatBoostClassifier

from skopt import BayesSearchCV

from xgboost import XGBClassifier

%matplotlib inline

In [2]:
# Load the modelling table. index_col = 0 makes the first CSV column the
# DataFrame index instead of a feature column.
df = pd.read_csv('./data/model12218.csv', index_col = 0)

In [7]:
# Summarise column dtypes and non-null counts to spot columns with missing
# values and columns whose dtype needs adjusting before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64339 entries, 0 to 75010
Data columns (total 18 columns):
age                 64339 non-null int64
animal_type         64339 non-null object
dow                 64339 non-null int64
fixed_status        64339 non-null object
gender              64339 non-null object
group               64339 non-null object
intake_condition    64339 non-null object
intake_season       64339 non-null object
intake_type         64339 non-null object
mix                 64339 non-null int64
simple_color        64339 non-null object
hour_in             64339 non-null int64
simple_group        64339 non-null object
shade               64339 non-null object
given_name          47018 non-null object
name_freq           64339 non-null int64
days_in_shelter     64339 non-null int64
outcome_type        64339 non-null object
dtypes: int64(6), object(12)
memory usage: 9.3+ MB


In [5]:
# Force every gender value to a plain string.
# NOTE(review): presumably so the column can be passed to CatBoost as a
# categorical feature (selected by position in the modelling cell) — confirm.
df['gender'] = df['gender'].astype(str)

In [8]:
# dow, mix and hour_in are loaded as int64 but are cast to str here so they
# are carried forward as labels rather than numeric magnitudes.
for col in ('dow', 'mix', 'hour_in'):
    df[col] = df[col].astype(str)

In [11]:
# Remove days_in_shelter from the table.
# NOTE(review): presumably dropped because it is only known once the outcome
# has happened (target leakage) — confirm.
df = df.drop(columns=['days_in_shelter'])

In [14]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [15]:
# Renumber rows 0..n-1 after filtering and discard the old index values.
df = df.reset_index(drop=True)

In [20]:
# Split by position: every column except the last is a feature, and the last
# column is the prediction target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [29]:
# Exclude the raw given_name column from the feature matrix.
X = X.drop(columns=['given_name'])

In [None]:
# Encode the target labels as integer class ids. `le` is kept so that
# predictions can be mapped back to label names with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)

In [36]:
# Hold out a test set (default 75/25 split). A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
# Preview the feature matrix; the column order shown here is what the
# positional cat_features indices passed to CatBoost refer to.
X.head()

Unnamed: 0,age,animal_type,dow,fixed_status,gender,group,intake_condition,intake_season,intake_type,mix,simple_color,hour_in,simple_group,shade,name_freq
0,2920,Dog,6,fixed,1,Sporting,Normal,summer,Stray,0,White,12,Sporting,Light,52
1,330,Dog,3,intact,0,Hound,Normal,spring,Stray,1,Sable,18,Non-Sporting,Light,1
2,1460,Dog,6,fixed,0,Working,Normal,spring,Stray,1,Yellow,10,Non-Sporting,Light,22
3,730,Dog,5,fixed,0,Sporting,Normal,winter,Owner Surrender,1,Brown,12,Sporting,Medium,32
4,730,Dog,3,intact,0,Sporting,Normal,summer,Public Assist,1,Black,14,Sporting,Dark,1


In [39]:
# Baseline CatBoost multi-class model.
# cat_features are positional indices into X: columns 1..13 are treated as
# categorical, leaving columns 0 and 14 as numeric features.
cb = CatBoostClassifier(
    one_hot_max_size=24,
    learning_rate=0.1,
    cat_features=list(range(1, 14)),
    loss_function='MultiClass',
)

# Candidate hyper-parameters for a later search (not used by the
# cross-validation below). cat_features is deliberately NOT listed: it is a
# fixed property of the data set on `cb`, and a search would otherwise try
# each integer in the list as a separate candidate value.
tuning_params = {
    'one_hot_max_size': [24],
    'learning_rate': [1e-2, 1],
    'loss_function': ['MultiClass'],
}

# 3-fold cross-validated negative log-loss on the training split
# (values closer to 0 are better).
scores = cross_val_score(
    cb,
    X_train,
    y_train,
    cv=3,
    scoring='neg_log_loss',
    n_jobs=-1,
)

scores

array([-0.87083723, -0.8677593 , -0.87302842])