# Base CatBoost Classifier

In [2]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
# Per-sample weights; loaded here but not referenced by any of the cells shown in this notebook.
weights = pd.read_csv('data/005_weights.csv')['weight'].values

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


def load_stacked_features(split):
    """Load the feature frame for one data split, indexed by ``id``.

    Reads ``./data/420_X_<split>.csv`` (has a header row and supplies the
    ``id`` column) and appends, column-wise, the header-less prediction files
    ``./data/510_X_<split>_preds.csv`` and ``./data/520_X_<split>_preds.csv``.

    Parameters
    ----------
    split : str
        Split name used in the file names: 'submit', 'train' or 'test'.

    Returns
    -------
    pandas.DataFrame
        The concatenated columns of the three files with ``id`` as the index.

    Raises
    ------
    ValueError
        If the three files do not have the same number of rows. A column-wise
        concat would otherwise pad the shorter files with NaN rows.
    """
    parts = [
        pd.read_csv('./data/420_X_{}.csv'.format(split)),
        pd.read_csv('./data/510_X_{}_preds.csv'.format(split), header=None),
        pd.read_csv('./data/520_X_{}_preds.csv'.format(split), header=None),
    ]
    row_counts = [len(part) for part in parts]
    if len(set(row_counts)) != 1:
        raise ValueError(
            "Row count mismatch between feature files for split '{}': {}".format(split, row_counts)
        )
    return pd.concat(parts, axis=1).set_index('id')


def _require_matching_ids(split, features, targets):
    """Raise ValueError unless both frames carry the same ids in the same order."""
    if not features.index.equals(targets.index):
        raise ValueError(
            "ids of the feature rows and of the target rows differ for split '{}'".format(split)
        )


X_submit = load_stacked_features('submit')
X_train = load_stacked_features('train')
X_test = load_stacked_features('test')

y_submit = pd.read_csv('./data/004_test.csv', index_col='id')
y_train = pd.read_csv('./data/410_y_train.csv', index_col='id')
y_test = pd.read_csv('./data/410_y_test.csv', index_col='id')

# The ids are discarded below and every later step pairs rows purely by
# position, so verify the pairing while the ids are still available.
_require_matching_ids('submit', X_submit, y_submit)
_require_matching_ids('train', X_train, y_train)
_require_matching_ids('test', X_test, y_test)

# Drop down to plain NumPy arrays; the single-column label frames become 1-D.
X_submit = X_submit.values
X_train = X_train.values
X_test = X_test.values
y_train = np.squeeze(y_train.values)
y_test = np.squeeze(y_test.values)

X_train.shape

Num GPUs Available:  1


(80000, 427)

In [5]:
# Bundle the training features and labels into a CatBoost Pool.
# Both names are bound to the same Pool object.
catboost_pool = Pool(X_train, y_train)
train_data = catboost_pool

# Settings for the multi-class CatBoost model, collected in one place.
cat_params = dict(
    iterations=1000,
    depth=3,
    learning_rate=0.1,
    loss_function='MultiClass',
    random_seed=100,
    verbose=True,
)
model = CatBoostClassifier(**cat_params)

In [6]:
# Fit on the training pool. The (X_test, y_test) pair is passed as the
# evaluation set, with early stopping configured at 30 rounds.
holdout = (X_test, y_test)
model.fit(train_data, eval_set=holdout, early_stopping_rounds=30)

0:	learn: 2.2516606	test: 2.2450068	best: 2.2450068 (0)	total: 145ms	remaining: 2m 24s
1:	learn: 2.0723252	test: 2.0613543	best: 2.0613543 (1)	total: 290ms	remaining: 2m 24s
2:	learn: 1.9536033	test: 1.9411242	best: 1.9411242 (2)	total: 441ms	remaining: 2m 26s
3:	learn: 1.8630801	test: 1.8489878	best: 1.8489878 (3)	total: 590ms	remaining: 2m 26s
4:	learn: 1.7927271	test: 1.7769215	best: 1.7769215 (4)	total: 743ms	remaining: 2m 27s
5:	learn: 1.7346103	test: 1.7178937	best: 1.7178937 (5)	total: 888ms	remaining: 2m 27s
6:	learn: 1.6870313	test: 1.6689655	best: 1.6689655 (6)	total: 1.03s	remaining: 2m 25s
7:	learn: 1.6416122	test: 1.6223330	best: 1.6223330 (7)	total: 1.17s	remaining: 2m 25s
8:	learn: 1.6040495	test: 1.5845277	best: 1.5845277 (8)	total: 1.33s	remaining: 2m 26s
9:	learn: 1.5744894	test: 1.5545371	best: 1.5545371 (9)	total: 1.47s	remaining: 2m 25s
10:	learn: 1.5483789	test: 1.5280582	best: 1.5280582 (10)	total: 1.62s	remaining: 2m 26s
11:	learn: 1.5238228	test: 1.5030256	best

<catboost.core.CatBoostClassifier at 0x7f0f9c137518>

In [7]:
# Column labels for the 13 probability columns: class0 .. class12.
# NOTE(review): assumes column i of predict_proba corresponds to label i;
# confirm against model.classes_.
cols = ['class{}'.format(i) for i in range(13)]

# Per-class probabilities for the submission rows, one row per sample.
submit_proba = model.predict_proba(X_submit)
df_submit = pd.DataFrame.from_records(submit_proba)
df_submit.columns = cols
df_submit.head()

Unnamed: 0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
0,0.003978,0.01037,0.243877,0.030266,0.072331,0.000146,0.004269,0.006686,0.00807,0.002642,0.570881,0.001477,0.045008
1,0.001145,0.007652,0.775455,0.092409,0.064654,3.9e-05,0.001901,0.007002,0.004016,0.00796,0.034787,0.000662,0.002319
2,0.002015,0.027583,0.136887,0.710563,0.01279,6.7e-05,0.010542,0.008062,0.017073,0.012697,0.047322,0.000172,0.014228
3,0.001101,0.108876,0.229989,0.297944,0.195681,1.3e-05,0.008653,0.037459,0.012246,0.011997,0.078381,0.003117,0.014542
4,0.003715,0.001921,0.028438,0.00801,0.019942,1.5e-05,0.001184,0.000424,0.001339,0.000539,0.926107,0.000373,0.007992


In [8]:
# Attach the submission ids to the class probabilities, pairing rows by position.
# NOTE(review): this assumes the rows of ./data/004_test.csv are in the same
# order as the rows that produced df_submit; confirm upstream.
if len(df_submit) != len(y_submit):
    # A column-wise concat of frames with different lengths would pad the
    # shorter one with NaN instead of failing, so fail loudly here.
    raise ValueError(
        'Row count mismatch: {} prediction rows vs {} submission ids'.format(
            len(df_submit), len(y_submit)
        )
    )

# reset_index(drop=True) leaves a frame with the default positional index
# unchanged. It is here so that re-running this cell after df_submit has
# already been indexed by id does not misalign the concat.
df_submit = pd.concat([
    y_submit.reset_index(),
    df_submit.reset_index(drop=True),
], axis=1).set_index('id')
df_submit

Unnamed: 0_level_0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
151807,0.003978,0.010370,0.243877,0.030266,0.072331,0.000146,0.004269,0.006686,0.008070,0.002642,0.570881,0.001477,0.045008
118131,0.001145,0.007652,0.775455,0.092409,0.064654,0.000039,0.001901,0.007002,0.004016,0.007960,0.034787,0.000662,0.002319
110921,0.002015,0.027583,0.136887,0.710563,0.012790,0.000067,0.010542,0.008062,0.017073,0.012697,0.047322,0.000172,0.014228
105149,0.001101,0.108876,0.229989,0.297944,0.195681,0.000013,0.008653,0.037459,0.012246,0.011997,0.078381,0.003117,0.014542
143868,0.003715,0.001921,0.028438,0.008010,0.019942,0.000015,0.001184,0.000424,0.001339,0.000539,0.926107,0.000373,0.007992
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146316,0.001270,0.015076,0.083511,0.661049,0.161917,0.000003,0.002402,0.006610,0.008117,0.014804,0.023718,0.000759,0.020764
121816,0.003232,0.002082,0.157796,0.024265,0.221112,0.000037,0.000822,0.000318,0.001090,0.000459,0.582551,0.000671,0.005566
106217,0.000390,0.021515,0.015176,0.086981,0.831125,0.000007,0.003079,0.002584,0.009338,0.001742,0.023960,0.001342,0.002761
103515,0.015986,0.015474,0.122236,0.068242,0.210135,0.000008,0.005060,0.001182,0.014761,0.009718,0.087760,0.001395,0.448044


In [9]:
# Write the id-indexed class probabilities to the submission CSV.
submission_path = '009_submit.csv'
df_submit.to_csv(submission_path)

In [10]:
from joblib import dump, load

In [12]:
# Persist the fitted model with joblib. The target directory is created first
# so the dump does not fail on a checkout where it does not exist yet.
model_path = 'M_336/622_stack_cat.joblib'
pathlib.Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(model, model_path)

['M_336/622_stack_cat.joblib']