# Base CatBoost Regressor

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
weights = pd.read_csv('data/005_weights.csv')['weight'].values

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

X_submit = pd.concat([ 
    pd.read_csv('./data/420_X_submit.csv'), 
    pd.read_csv('./data/520_X_submit_preds.csv', header=None),
], axis=1)

X_train = pd.concat([
    pd.read_csv('./data/420_X_train.csv'), 
    pd.read_csv('./data/520_X_train_preds.csv', header=None)
], axis=1)

X_test = pd.concat([
    pd.read_csv('./data/420_X_test.csv'),
    pd.read_csv('./data/520_X_test_preds.csv', header=None), 
], axis=1)

X_submit = X_submit.set_index('id')
X_train  = X_train.set_index('id')
X_test   = X_test.set_index('id')

y_submit = pd.read_csv('./data/004_test.csv', index_col='id')
y_train = pd.read_csv('./data/410_y_train.csv', index_col='id')
y_test = pd.read_csv('./data/410_y_test.csv', index_col='id')

X_submit = X_submit.values
X_train = X_train.values
X_test = X_test.values
y_train = np.squeeze(y_train.values)
y_test = np.squeeze(y_test.values)

X_train.shape

Num GPUs Available:  1


(80000, 375)

In [2]:
%%time
model_1 = CatBoostClassifier(iterations=1500,
                           depth=4,
                           learning_rate=0.3,
                           loss_function='MultiClass',
                           random_seed=100,
                           verbose=False)

skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X_train, y_train)

y_train_pred_1 = []
for train_index, test_index in tqdm(skf.split(X_train, y_train)):
    xt, xv = X_train[train_index], X_train[test_index]
    yt, yv = y_train[train_index], y_train[test_index]
    # train the model
    model_1.fit(xt, yt, eval_set=(xv, yv), early_stopping_rounds=35)
    y_train_pred_1.append(model_1.predict_proba(xv))
    
y_train_pred_1 = np.concatenate(y_train_pred_1)
print(metrics.log_loss(y_train, y_train_pred_1))
model_1.fit(X_train, y_train)
y_test_pred_1   = model_1.predict_proba(X_test)
y_submit_pred_1 = model_1.predict_proba(X_submit)

3it [02:21, 47.03s/it]


1.7886362774002267
CPU times: user 4h 56min 4s, sys: 1min 49s, total: 4h 57min 53s
Wall time: 6min 23s


In [4]:
model_1.predict_proba(X_test)[0]

array([1.65072966e-02, 6.29526617e-02, 7.38880672e-02, 3.84214333e-01,
       1.28845757e-01, 3.28368630e-08, 5.17906181e-03, 1.77161749e-03,
       1.10185682e-01, 8.97361059e-04, 1.71374653e-01, 9.15474469e-03,
       3.50287307e-02])

In [5]:
metrics.log_loss(y_test, model_1.predict_proba(X_test))

1.212342016756957

In [8]:
X_train_preds = np.concatenate([
    y_train_pred_1
], axis=1)


X_test_preds = np.concatenate([
    y_test_pred_1
], axis=1)

X_submit_preds = np.concatenate([
    y_submit_pred_1
], axis=1)

X_train_preds.shape, X_test_preds.shape, X_submit_preds.shape

((80000, 13), (20000, 13), (53240, 13))

In [9]:
X_tmp = np.concatenate((
    X_train_preds,
    X_test_preds,
    X_submit_preds
), axis=0)

X_tmp.shape

(153240, 13)

In [10]:
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(n_quantiles=10, random_state=100, output_distribution='normal')

In [11]:
qt.fit(X_tmp)

X_train_preds  = qt.transform(X_train_preds)
X_test_preds   = qt.transform(X_test_preds)
X_submit_preds = qt.transform(X_submit_preds)

In [13]:
from joblib import dump, load

In [14]:
dump(model_1, 'M_336/522_base_ml_model_1.joblib')
dump(qt, 'M_336/522_base_ml_qt_normal.joblib')

['M_336/522_base_ml_qt_normal.joblib']

In [15]:
np.savetxt('data/522_X_train_preds.csv' , X_train_preds , delimiter=",")
np.savetxt('data/522_X_test_preds.csv'  , X_test_preds  , delimiter=",")
np.savetxt('data/522_X_submit_preds.csv', X_submit_preds, delimiter=",")