# Base CatBoost Classifier

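References:

- [CatBoost vs. Light GBM vs. XGBoost (Towards Data Science)](https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db)
- [CatBoost documentation](https://catboost.ai/docs/)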

![](https://miro.medium.com/max/1467/1*09uNKZvIG2rhSpjTTnrDvw.png)
![](https://miro.medium.com/max/1870/1*A0b_ahXOrrijazzJengwYw.png)

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier

from catboost import Pool, CatBoostClassifier


# Feature matrices: read each split with `id` as the index, then keep every
# column except the last three as a plain NumPy array.
X_train = pd.read_csv('./data/401_X_train.csv', index_col='id').values[:, :-3]
X_test = pd.read_csv('./data/401_X_test.csv', index_col='id').values[:, :-3]
X_submit = pd.read_csv('./data/401_X_submit.csv', index_col='id').values[:, :-3]

# Targets for the train/test splits, with length-1 axes squeezed away.
y_train = np.squeeze(pd.read_csv('./data/401_y_train.csv', index_col='id').values)
y_test = np.squeeze(pd.read_csv('./data/401_y_test.csv', index_col='id').values)

# Kept as a DataFrame: its `id` index is reused when the submission is built.
y_submit = pd.read_csv('./data/004_test.csv', index_col='id')

# Display the training matrix dimensions as the cell output.
X_train.shape

(80000, 36)

In [2]:
from utils.metrics import Metric

# Read the weights table and pull its `weight` column out as a NumPy array;
# the bare name on the last line displays it as the cell output.
weights_table = pd.read_csv('data/005_weights.csv')
weights = weights_table['weight'].values
weights

array([6.9100e-03, 2.1780e-02, 2.2122e-01, 2.7304e-01, 1.7781e-01,
       4.0000e-05, 9.5300e-03, 7.5400e-03, 1.6660e-02, 6.9600e-03,
       1.9220e-01, 2.0900e-03, 6.4220e-02])

In [3]:
# Training pool; feature columns 33, 34 and 35 are declared categorical.
catboost_pool = Pool(data=X_train, label=y_train, cat_features=[33, 34, 35])
train_data = catboost_pool  # both names refer to the same Pool object

# Multi-class CatBoost settings, gathered in one place before construction.
model_params = {
    'iterations': 1000,
    'depth': 2,
    'learning_rate': 1,
    'loss_function': 'MultiClass',
    'random_seed': 100,
    'verbose': True,
}
model = CatBoostClassifier(**model_params)

In [None]:
# Leftover scratch cell: a bare reference to the imported function, so running
# it only displays the function object. No cross-validation is performed here.
# TODO(review): delete this cell or replace it with a real cross_val_predict call.
cross_val_predict

In [4]:
# train the model
model.fit(train_data, 
          eval_set=(X_test, y_test), 
          early_stopping_rounds=50)

0:	learn: 1.9359819	test: 1.9257218	best: 1.9257218 (0)	total: 265ms	remaining: 4m 24s
1:	learn: 1.8578718	test: 1.8637277	best: 1.8637277 (1)	total: 427ms	remaining: 3m 33s
2:	learn: 1.8101315	test: 1.8091093	best: 1.8091093 (2)	total: 585ms	remaining: 3m 14s
3:	learn: 1.7548782	test: 1.7610432	best: 1.7610432 (3)	total: 793ms	remaining: 3m 17s
4:	learn: 1.7138847	test: 1.7134612	best: 1.7134612 (4)	total: 979ms	remaining: 3m 14s
5:	learn: 1.6968923	test: 1.6992124	best: 1.6992124 (5)	total: 1.15s	remaining: 3m 11s
6:	learn: 1.6788963	test: 1.6820746	best: 1.6820746 (6)	total: 1.34s	remaining: 3m 9s
7:	learn: 1.6638618	test: 1.6645270	best: 1.6645270 (7)	total: 1.52s	remaining: 3m 9s
8:	learn: 1.6580917	test: 1.6593131	best: 1.6593131 (8)	total: 1.71s	remaining: 3m 7s
9:	learn: 1.6517721	test: 1.6529183	best: 1.6529183 (9)	total: 1.88s	remaining: 3m 6s
10:	learn: 1.6461907	test: 1.6466000	best: 1.6466000 (10)	total: 2.08s	remaining: 3m 6s
11:	learn: 1.6415175	test: 1.6419663	best: 1.6

<catboost.core.CatBoostClassifier at 0x7f1cb8014898>

In [5]:
# Class-probability matrix for the held-out rows (one row per sample).
test_proba = model.predict_proba(X_test)
test_proba

array([[3.22317469e-03, 8.09439795e-02, 1.61082551e-01, ...,
        7.93311487e-02, 1.28297229e-03, 7.94998363e-02],
       [1.68974308e-02, 2.17262064e-02, 2.98438899e-01, ...,
        7.60252003e-02, 4.79278476e-04, 5.40506806e-02],
       [2.88481302e-04, 1.04154737e-02, 2.54619106e-03, ...,
        7.69007182e-03, 5.69268600e-06, 9.70232497e-03],
       ...,
       [2.40185317e-02, 1.67910537e-02, 1.33688879e-01, ...,
        5.10210803e-02, 2.30006887e-03, 3.13535919e-01],
       [4.23406802e-03, 4.44298603e-03, 1.24623796e-01, ...,
        5.71054004e-01, 1.37250382e-03, 7.87904941e-03],
       [1.64613759e-03, 6.86457042e-03, 3.90129208e-02, ...,
        1.14457308e-02, 2.18940089e-06, 4.00738193e-01]])

In [6]:
# One-hot view of the held-out labels: one `cls_*` indicator column per label
# value that occurs in y_test.
y_test_dummies = pd.get_dummies(y_test, prefix='cls')
y_test_dummies.values

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [7]:
# Build the scorer from the per-class weights loaded earlier. Metric comes from
# the project module utils.metrics; its internals are not visible in this notebook.
metric = Metric(weights)

In [8]:
# Same one-hot encoding of y_test as computed earlier in the notebook, shown
# again as a raw array.
onehot_frame = pd.get_dummies(y_test, prefix='cls')
onehot_frame.values

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [9]:
# Predicted class probabilities for the first held-out row only.
first_row_proba = model.predict_proba(X_test)[0]
first_row_proba

array([3.22317469e-03, 8.09439795e-02, 1.61082551e-01, 5.05708758e-01,
       4.78567146e-02, 8.27753446e-07, 8.35205270e-03, 1.52818190e-02,
       1.47971268e-02, 2.63903862e-03, 7.93311487e-02, 1.28297229e-03,
       7.94998363e-02])

In [11]:
# Score the held-out predictions with the weighted metric.
#
# The labels are one-hot encoded against the model's own class order rather
# than against whatever labels happen to occur in y_test: a plain
# pd.get_dummies(y_test) emits a column only for labels that are present, so a
# class missing from the held-out split would shift every later column out of
# line with predict_proba. dtype is pinned to uint8 because newer pandas
# versions default to bool indicators.
test_proba = model.predict_proba(X_test)
y_test_onehot = pd.get_dummies(
    pd.Categorical(y_test, categories=model.classes_),
    prefix='cls',
    dtype=np.uint8,
).values

# Fail loudly instead of scoring misaligned arrays.
assert y_test_onehot.shape == test_proba.shape, (
    'one-hot targets {} do not match predicted probabilities {}'.format(
        y_test_onehot.shape, test_proba.shape
    )
)

metric.eval_metric(y_test_onehot, test_proba)

0.3439473969652732

In [12]:
# Predicted class probabilities for the submission rows.
submit_proba = model.predict_proba(X_submit)

# Column labels class0 .. class12, one per predicted class.
cols = []
for class_idx in range(13):
    cols.append('class' + str(class_idx))

df_submit = pd.DataFrame.from_records(submit_proba)
df_submit.columns = cols

# Preview the first rows only.
df_submit.head()

Unnamed: 0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
0,0.006156,0.019999,0.230786,0.030114,0.061058,1.075911e-06,0.003884,0.033828,0.0127,0.003175,0.543941,0.006731,0.047628
1,0.000288,0.017637,0.713987,0.066899,0.085888,1.762302e-06,0.001667,0.001647,0.002478,0.0017,0.102514,0.000837,0.004457
2,0.000229,0.057991,0.175213,0.681788,0.001079,9.300412e-07,0.003794,0.027765,0.0084,0.00995,0.014656,8.6e-05,0.019048
3,0.001136,0.054771,0.28714,0.319186,0.18795,7.959553e-07,0.011447,0.029432,0.009269,0.009482,0.046661,0.001301,0.042225
4,0.002719,0.006624,0.077707,0.014104,0.178175,8.867174e-07,0.000804,0.002433,0.001538,0.000247,0.695583,0.000757,0.019309


In [128]:
# Attach the submission ids to the predicted probabilities and index by `id`.
#
# The column-wise concat aligns rows on the index, so both frames must have
# the same number of rows; otherwise pandas would silently pad with NaN.
assert len(y_submit) == len(df_submit), 'id / prediction row count mismatch'

# reset_index(drop=True) on df_submit makes this cell safe to re-run: after a
# first run df_submit is already indexed by `id`, and concatenating that
# against the fresh 0..n-1 index of y_submit.reset_index() would misalign rows.
df_submit = pd.concat(
    [y_submit.reset_index(), df_submit.reset_index(drop=True)],
    axis=1,
).set_index('id')
df_submit

Unnamed: 0_level_0,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
151807,0.000146,0.003092,0.303957,0.029689,0.274254,0.000003,0.000294,0.001030,0.018898,0.010838,0.345569,0.000218,0.012012
118131,0.000001,0.005125,0.858612,0.018627,0.077862,0.000019,0.000777,0.005471,0.000336,0.000923,0.031510,0.000008,0.000729
110921,0.000561,0.026784,0.201211,0.371557,0.082146,0.000018,0.001487,0.000876,0.269563,0.018790,0.018680,0.000749,0.007578
105149,0.002588,0.233498,0.112588,0.035842,0.119365,0.000028,0.002670,0.322062,0.039811,0.002813,0.097249,0.000254,0.031233
143868,0.000129,0.000236,0.032598,0.022839,0.023431,0.000023,0.000131,0.002986,0.000925,0.004754,0.907525,0.000383,0.004040
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146316,0.000060,0.006250,0.185938,0.732060,0.031699,0.000004,0.000118,0.000191,0.013680,0.000291,0.015041,0.000007,0.014660
121816,0.006305,0.013297,0.107097,0.117748,0.435716,0.000081,0.013726,0.002540,0.002869,0.089389,0.192658,0.000130,0.018444
106217,0.000139,0.111097,0.072940,0.567772,0.155628,0.000003,0.004499,0.000165,0.002349,0.002182,0.080525,0.000017,0.002684
103515,0.006400,0.005073,0.730552,0.083353,0.077665,0.000010,0.000890,0.003703,0.005735,0.038603,0.023453,0.000196,0.024368


In [130]:
# Persist the submission frame to disk; the frame's index is written as the
# first CSV column.
df_submit.to_csv(path_or_buf='001_submit.csv', index=True)