# Base CatBoost Classifier

In [12]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
# Sample weights (not referenced by the cells visible in this section).
weights = pd.read_csv('data/005_weights.csv')['weight'].values

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


def load_stacked_features(split):
    """Assemble the feature frame for one data split.

    Three CSV files from ``./data`` are concatenated column-wise, in this
    order:

    * ``420_X_<split>.csv``       -- kept as is, including its ``id`` column
    * ``441_X_<split>.csv``       -- with its ``id`` column dropped
    * ``520_X_<split>_preds.csv`` -- read with ``header=None``, so its
      columns are labelled with integers

    Parameters
    ----------
    split : str
        Split name used in the file names: ``'submit'``, ``'train'`` or
        ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per sample, with the columns of the three files side by side.

    Raises
    ------
    ValueError
        If the three files do not contain the same number of rows. A
        positional ``axis=1`` concat would otherwise pad the shorter frames
        with NaN without any warning.
    """
    parts = [
        pd.read_csv('./data/420_X_{}.csv'.format(split)),
        pd.read_csv('./data/441_X_{}.csv'.format(split)).drop('id', axis=1),
        pd.read_csv('./data/520_X_{}_preds.csv'.format(split), header=None),
    ]

    row_counts = [len(part) for part in parts]
    if len(set(row_counts)) != 1:
        raise ValueError(
            "Feature files for split '{}' have different row counts: {}".format(
                split, row_counts
            )
        )

    return pd.concat(parts, axis=1)


X_submit = load_stacked_features('submit')
X_train = load_stacked_features('train')
X_test = load_stacked_features('test')

Num GPUs Available:  1


In [13]:
def _require_same_ids(split_name, features, targets):
    """Raise ValueError unless both frames carry the same ids in the same order."""
    if not features.index.equals(targets.index):
        raise ValueError(
            "Row ids of the '{}' features and labels do not match; "
            "rows would be paired incorrectly.".format(split_name)
        )


# Index every feature frame by row id so it can be compared with its labels.
X_submit = X_submit.set_index('id')
X_train  = X_train.set_index('id')
X_test   = X_test.set_index('id')

# Label / id frames come from separate files, also indexed by id.
y_submit = pd.read_csv('./data/004_test.csv', index_col='id')
y_train = pd.read_csv('./data/410_y_train.csv', index_col='id')
y_test = pd.read_csv('./data/410_y_test.csv', index_col='id')

# From here on features and labels are plain arrays that are paired purely by
# position, so confirm the id order agrees while the ids are still available.
_require_same_ids('submit', X_submit, y_submit)
_require_same_ids('train', X_train, y_train)
_require_same_ids('test', X_test, y_test)

# Drop to numpy arrays; y_submit stays a DataFrame because its id index is
# reused later when the submission frame is built.
X_submit = X_submit.values
X_train = X_train.values
X_test = X_test.values
y_train = np.squeeze(y_train.values)
y_test = np.squeeze(y_test.values)

X_train.shape

(80000, 499)

In [14]:
# Sanity check: the squeezed training target should be 1-D, one entry per training row.
y_train.shape

(80000,)

In [15]:
# Hyperparameters of the multi-class CatBoost model, gathered in one place.
# The numpy arrays are handed to the estimator directly, so no catboost Pool
# objects are constructed.
catboost_params = dict(
    iterations=5000,
    depth=4,
    learning_rate=0.1,
    loss_function='MultiClass',
    random_seed=100,
    verbose=True,
)

model = CatBoostClassifier(**catboost_params)

In [16]:
# Sanity check: the held-out matrix should have the same number of columns as X_train.
X_test.shape

(20000, 499)

In [None]:
# Fit on the training split. The held-out split is supplied as the evaluation
# set, and early stopping is configured with a patience of 35 rounds.
holdout = (X_test, y_test)
model.fit(
    X_train,
    y_train,
    eval_set=holdout,
    early_stopping_rounds=35,
)

0:	learn: 2.2220086	test: 2.2132791	best: 2.2132791 (0)	total: 210ms	remaining: 17m 27s
1:	learn: 2.0505333	test: 2.0372428	best: 2.0372428 (1)	total: 367ms	remaining: 15m 16s
2:	learn: 1.9287027	test: 1.9123301	best: 1.9123301 (2)	total: 529ms	remaining: 14m 40s
3:	learn: 1.8405485	test: 1.8223995	best: 1.8223995 (3)	total: 683ms	remaining: 14m 13s
4:	learn: 1.7637233	test: 1.7442571	best: 1.7442571 (4)	total: 840ms	remaining: 13m 59s
5:	learn: 1.7048845	test: 1.6843396	best: 1.6843396 (5)	total: 1000ms	remaining: 13m 52s
6:	learn: 1.6550538	test: 1.6338131	best: 1.6338131 (6)	total: 1.2s	remaining: 14m 12s
7:	learn: 1.6128498	test: 1.5906512	best: 1.5906512 (7)	total: 1.39s	remaining: 14m 29s
8:	learn: 1.5785703	test: 1.5556815	best: 1.5556815 (8)	total: 1.59s	remaining: 14m 40s
9:	learn: 1.5499039	test: 1.5266054	best: 1.5266054 (9)	total: 1.77s	remaining: 14m 44s
10:	learn: 1.5260387	test: 1.5022620	best: 1.5022620 (10)	total: 1.95s	remaining: 14m 46s
11:	learn: 1.5038557	test: 1.4

In [14]:
# Class probabilities for the submission rows, one row per sample.
submit_proba = model.predict_proba(X_submit)
df_submit = pd.DataFrame.from_records(submit_proba)

# Label the 13 probability columns class0 ... class12.
cols = ['class{}'.format(i) for i in range(13)]
df_submit.columns = cols

df_submit.head()

Unnamed: 0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
0,0.004388,0.005987,0.268693,0.027795,0.086402,4.5e-05,0.003958,0.00788,0.004884,0.002027,0.53117,0.001448,0.055321
1,0.000857,0.007049,0.746808,0.115228,0.069057,1.3e-05,0.002329,0.009772,0.003443,0.009114,0.034279,0.000483,0.001569
2,0.001516,0.0308,0.112514,0.763034,0.007659,1.2e-05,0.008596,0.007813,0.015281,0.009396,0.031261,0.000154,0.011964
3,0.000771,0.100787,0.215823,0.322645,0.218926,1e-05,0.007986,0.039855,0.009397,0.012762,0.056495,0.002869,0.011673
4,0.006651,0.001857,0.026181,0.00466,0.020327,3e-06,0.000953,0.000298,0.000742,0.000491,0.932489,0.000329,0.005019


In [15]:
# Label every prediction row with its submission id.
#
# The id index of y_submit is assigned directly rather than concatenating
# y_submit.reset_index() and calling set_index('id'):
#   * re-running this cell gives the same frame, whereas a second concat
#     would align a positional index against the id index and pad with NaN;
#   * a row-count mismatch between ids and predictions raises ValueError
#     instead of silently producing NaN-filled rows.
# Rows are matched by position, so y_submit must list the ids in the same
# order as the rows of X_submit.
df_submit.index = y_submit.index
df_submit

Unnamed: 0_level_0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
151807,0.004388,0.005987,0.268693,0.027795,0.086402,4.536730e-05,0.003958,0.007880,0.004884,0.002027,0.531170,0.001448,0.055321
118131,0.000857,0.007049,0.746808,0.115228,0.069057,1.323127e-05,0.002329,0.009772,0.003443,0.009114,0.034279,0.000483,0.001569
110921,0.001516,0.030800,0.112514,0.763034,0.007659,1.196816e-05,0.008596,0.007813,0.015281,0.009396,0.031261,0.000154,0.011964
105149,0.000771,0.100787,0.215823,0.322645,0.218926,9.892000e-06,0.007986,0.039855,0.009397,0.012762,0.056495,0.002869,0.011673
143868,0.006651,0.001857,0.026181,0.004660,0.020327,3.077644e-06,0.000953,0.000298,0.000742,0.000491,0.932489,0.000329,0.005019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146316,0.001383,0.016119,0.069230,0.666364,0.172940,1.445683e-06,0.002787,0.009917,0.006679,0.011875,0.018662,0.000428,0.023614
121816,0.002497,0.001699,0.141787,0.020752,0.288340,1.423993e-05,0.000396,0.000142,0.000559,0.000315,0.539467,0.000555,0.003477
106217,0.000187,0.020435,0.017967,0.073731,0.848943,1.886945e-06,0.004102,0.001753,0.009350,0.001956,0.018282,0.001689,0.001603
103515,0.017822,0.011354,0.100587,0.071624,0.192909,2.218350e-06,0.003110,0.000609,0.011776,0.005890,0.077504,0.001405,0.505407


In [16]:
# Write the submission frame (index included) to CSV.
submission_path = '013_submit.csv'
df_submit.to_csv(submission_path)

In [18]:
from joblib import dump, load

In [19]:
# Persist the fitted model with joblib; dump returns the list of written files.
model_path = 'M_336/623_stack_cat.joblib'
dump(model, model_path)

['M_336/623_stack_cat.joblib']