# Base CatBoost Classifier

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
# Per-sample weights read from the 'weight' column. They are loaded here but
# not referenced again in the visible part of this notebook.
weights = pd.read_csv('data/005_weights.csv')['weight'].values

# Report how many GPUs TensorFlow can see on this machine.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Feature matrices: read with 'id' as the index, then keep only the raw
# NumPy values (the index itself is discarded by `.values`).
X_submit = pd.read_csv('./data/420_X_submit.csv', index_col='id').values
X_train = pd.read_csv('./data/420_X_train.csv', index_col='id').values
X_test = pd.read_csv('./data/420_X_test.csv', index_col='id').values

# y_submit stays a DataFrame: its 'id' index is used later to label the
# rows of the submission file.
y_submit = pd.read_csv('./data/004_test.csv', index_col='id')

# Targets: squeeze removes singleton axes from the loaded label arrays.
y_train = pd.read_csv('./data/410_y_train.csv', index_col='id').values.squeeze()
y_test = pd.read_csv('./data/410_y_test.csv', index_col='id').values.squeeze()

X_train.shape

Num GPUs Available:  1


(80000, 310)

In [6]:
# Wrap the training arrays in a CatBoost Pool. Both names are bound to the
# same Pool object, exactly as in the chained assignment this replaces.
catboost_pool = Pool(X_train, y_train)
train_data = catboost_pool

# Baseline multiclass model: shallow trees (depth 3), up to 1000 boosting
# rounds, fixed seed for repeatability, per-iteration logging enabled.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=1000,
    learning_rate=0.3,
    depth=3,
    random_seed=100,
    verbose=True,
)

In [7]:
# Fit the baseline model on the training pool. Early stopping watches the
# loss on (X_test, y_test) and halts after 50 rounds without improvement.
# NOTE(review): the same test split is scored again further down, so that
# score is not from fully unseen data - confirm this is acceptable.
model.fit(
    train_data,
    early_stopping_rounds=50,
    eval_set=(X_test, y_test),
)

0:	learn: 2.0488372	test: 2.0452692	best: 2.0452692 (0)	total: 181ms	remaining: 3m
1:	learn: 1.8751076	test: 1.8709106	best: 1.8709106 (1)	total: 380ms	remaining: 3m 9s
2:	learn: 1.7801817	test: 1.7748372	best: 1.7748372 (2)	total: 580ms	remaining: 3m 12s
3:	learn: 1.7110984	test: 1.7046853	best: 1.7046853 (3)	total: 772ms	remaining: 3m 12s
4:	learn: 1.6717900	test: 1.6657314	best: 1.6657314 (4)	total: 967ms	remaining: 3m 12s
5:	learn: 1.6442868	test: 1.6383728	best: 1.6383728 (5)	total: 1.16s	remaining: 3m 12s
6:	learn: 1.6215890	test: 1.6156791	best: 1.6156791 (6)	total: 1.35s	remaining: 3m 11s
7:	learn: 1.6075642	test: 1.6016738	best: 1.6016738 (7)	total: 1.54s	remaining: 3m 10s
8:	learn: 1.5905064	test: 1.5854625	best: 1.5854625 (8)	total: 1.72s	remaining: 3m 9s
9:	learn: 1.5786018	test: 1.5732075	best: 1.5732075 (9)	total: 1.9s	remaining: 3m 7s
10:	learn: 1.5706288	test: 1.5650933	best: 1.5650933 (10)	total: 2.09s	remaining: 3m 8s
11:	learn: 1.5647167	test: 1.5596763	best: 1.55967

<catboost.core.CatBoostClassifier at 0x7f1ab04fb860>

In [12]:
%%time
model_1 = CatBoostClassifier(iterations=1000,
                           depth=3,
                           learning_rate=0.3,
                           loss_function='MultiClass',
                           random_seed=100,
                           verbose=False)

skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X_train, y_train)

y_train_pred_1 = []
for train_index, test_index in tqdm(skf.split(X_train, y_train)):
    xt, xv = X_train[train_index], X_train[test_index]
    yt, yv = y_train[train_index], y_train[test_index]
    # train the model
    model_1.fit(xt, yt, eval_set=(xv, yv), early_stopping_rounds=50)
    y_train_pred_1.append(model_1.predict_proba(xv))
    
y_train_pred_1 = np.concatenate(y_train_pred_1)
print(metrics.log_loss(y_train, y_train_pred_1))
model_1.fit(X_train, y_train)
y_test_pred_1   = model_1.predict_proba(X_test)
y_submit_pred_1 = model_1.predict_proba(X_submit)

3it [07:14, 144.70s/it]


1.7650022660661466
CPU times: user 7h 28min 59s, sys: 3min 57s, total: 7h 32min 57s
Wall time: 9min 42s


In [18]:
# Sanity check: show the predicted class-probability vector for the first
# row of X_test.
model.predict_proba(
    X_test
)[0]

array([3.72795167e-02, 1.05688112e-01, 1.39449749e-01, 4.65960562e-01,
       6.18493688e-02, 4.42553326e-07, 4.23636925e-03, 3.85579013e-04,
       1.12447234e-02, 7.56575017e-04, 9.93453177e-02, 4.09139673e-03,
       6.97122874e-02])

In [17]:
# Multiclass log loss of the baseline `model` on the test split.
metrics.log_loss(
    y_test,
    model.predict_proba(X_test),
)

1.2535156185199543

In [19]:
# Class probabilities for the submission rows, as predicted by `model`.
submit_proba = model.predict_proba(X_submit)

# Name the columns class0..class{k-1}. The count comes from the width of the
# prediction array instead of a hard-coded 13, so the labels always match
# what the model actually returned.
cols = ['class' + str(i) for i in range(submit_proba.shape[1])]

# Build the frame in one step with the regular constructor.
df_submit = pd.DataFrame(submit_proba, columns=cols)
df_submit.head()

Unnamed: 0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
0,0.005582,0.004618,0.131281,0.034383,0.060293,3e-06,0.001705,0.009857,0.005033,0.001585,0.687529,0.000682,0.057449
1,0.000103,0.008279,0.574617,0.189202,0.135532,2e-06,0.002116,0.045804,0.002447,0.005064,0.033413,0.000267,0.003156
2,0.000423,0.012707,0.206986,0.669474,0.029933,2.4e-05,0.004084,0.007063,0.012507,0.011111,0.035352,3.6e-05,0.0103
3,0.003144,0.087299,0.263002,0.349634,0.179223,1e-06,0.003705,0.022504,0.010675,0.004856,0.054652,0.001681,0.019624
4,0.024929,0.014629,0.099065,0.04425,0.078238,2e-06,0.005994,0.000822,0.014037,0.003582,0.678733,0.000373,0.035346


In [20]:
# Attach the submission ids (the 'id' index of y_submit) to the predicted
# probabilities and use them as the frame's index.
# NOTE(review): this pairs rows purely by position, so it assumes
# 004_test.csv and 420_X_submit.csv list samples in the same order - confirm.
submit_ids = y_submit.reset_index()

# Drop whatever index df_submit currently carries. On a first run this is a
# no-op; on a re-run it discards the 'id' index set below, so the cell gives
# the same result each time instead of concatenating on mismatched indexes.
class_probs = df_submit.reset_index(drop=True)

# pd.concat(axis=1) would silently pad with NaN if the lengths differed.
if len(submit_ids) != len(class_probs):
    raise ValueError(
        'Row count mismatch: %d ids vs %d prediction rows'
        % (len(submit_ids), len(class_probs))
    )

df_submit = pd.concat([submit_ids, class_probs], axis=1).set_index('id')
df_submit

Unnamed: 0_level_0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
151807,0.005582,0.004618,0.131281,0.034383,0.060293,2.843937e-06,0.001705,0.009857,0.005033,0.001585,0.687529,0.000682,0.057449
118131,0.000103,0.008279,0.574617,0.189202,0.135532,1.969099e-06,0.002116,0.045804,0.002447,0.005064,0.033413,0.000267,0.003156
110921,0.000423,0.012707,0.206986,0.669474,0.029933,2.413884e-05,0.004084,0.007063,0.012507,0.011111,0.035352,0.000036,0.010300
105149,0.003144,0.087299,0.263002,0.349634,0.179223,1.050636e-06,0.003705,0.022504,0.010675,0.004856,0.054652,0.001681,0.019624
143868,0.024929,0.014629,0.099065,0.044250,0.078238,2.157348e-06,0.005994,0.000822,0.014037,0.003582,0.678733,0.000373,0.035346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146316,0.004114,0.025404,0.163209,0.463184,0.195058,4.942182e-06,0.004477,0.008506,0.017196,0.014622,0.049548,0.000997,0.053679
121816,0.004409,0.003018,0.277026,0.046098,0.302755,2.728172e-06,0.000485,0.000101,0.000354,0.000481,0.362857,0.000810,0.001603
106217,0.000156,0.030419,0.029325,0.087973,0.790838,2.327046e-06,0.006013,0.001413,0.007338,0.001689,0.042735,0.000401,0.001698
103515,0.014806,0.012331,0.263947,0.092129,0.132663,1.561182e-06,0.002608,0.000537,0.007262,0.003687,0.212446,0.001194,0.256387


In [21]:
# Write the submission to disk; the frame's index is written as the first
# column (pandas default, spelled out here).
df_submit.to_csv('008_submit.csv', index=True)

In [23]:
from joblib import dump, load

In [24]:
# Persist model_1 (the cross-validated model, refit on all of X_train) with
# joblib.
dump(value=model_1, filename='M_336/521_base_ml_model_1.joblib')

['M_336/521_base_ml_model_1.joblib']