# Base ML models

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
# Per-sample weights taken from the `weight` column of the weights file.
weights = pd.read_csv('data/005_weights.csv')['weight'].values

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Feature matrices: read with `id` as the index and converted straight to NumPy arrays.
X_submit = pd.read_csv('./data/420_X_submit.csv', index_col='id').values
X_train = pd.read_csv('./data/420_X_train.csv', index_col='id').values
X_test = pd.read_csv('./data/420_X_test.csv', index_col='id').values

# y_submit stays a DataFrame; the train/test targets are converted to arrays
# and squeezed so that singleton axes are dropped.
y_submit = pd.read_csv('./data/004_test.csv', index_col='id')
y_train = np.squeeze(pd.read_csv('./data/410_y_train.csv', index_col='id').values)
y_test = np.squeeze(pd.read_csv('./data/410_y_test.csv', index_col='id').values)

X_train.shape

Num GPUs Available:  1


(80000, 310)

In [52]:
%%time
model_1 = CatBoostClassifier(iterations=1000,
                           depth=2,
                           learning_rate=1,
                           loss_function='MultiClass',
                           random_seed=100,
                           verbose=False)

skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X_train, y_train)

y_train_pred_1 = []
for train_index, test_index in tqdm(skf.split(X_train, y_train)):
    xt, xv = X_train[train_index], X_train[test_index]
    yt, yv = y_train[train_index], y_train[test_index]
    # train the model
    model_1.fit(xt, yt, eval_set=(xv, yv), early_stopping_rounds=30)
    y_train_pred_1.append(model_1.predict_proba(xv))
    
y_train_pred_1 = np.concatenate(y_train_pred_1)
print(metrics.log_loss(y_train, y_train_pred_1))
model_1.fit(X_train, y_train)
y_test_pred_1   = model_1.predict_proba(X_test)
y_submit_pred_1 = model_1.predict_proba(X_submit)

3it [02:16, 45.65s/it]


1.8406305364035296
CPU times: user 3h 38min 19s, sys: 1min 46s, total: 3h 40min 6s
Wall time: 4min 49s


In [6]:
%%time
model_2 = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    verbosity=1,
    silent=None,
    objective='multi:softmax',
    booster='gbtree',
    eval_metric='mlogloss',
    n_jobs=36,
    nthread=36,
    gamma=5,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
    reg_alpha=0.5,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=100,
    seed=None,
    missing=None,
    importance_type='gain'
)

y_train_pred_2  = cross_val_predict(model_2, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_2))
model_2.fit(X_train, y_train)
y_test_pred_2   = model_2.predict_proba(X_test)
y_submit_pred_2 = model_2.predict_proba(X_submit)

1.3429361670810729
CPU times: user 1h 27min 28s, sys: 5.53 s, total: 1h 27min 33s
Wall time: 2min 35s


In [5]:
%%time
model_3 = ensemble.RandomForestClassifier(n_estimators=200, max_depth=4, max_features=0.4, n_jobs=36, random_state=100)

y_train_pred_3  = cross_val_predict(model_3, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_3))
model_3.fit(X_train, y_train)
y_test_pred_3   = model_3.predict_proba(X_test)
y_submit_pred_3 = model_3.predict_proba(X_submit)

1.606489195361814
CPU times: user 3min 54s, sys: 4.69 s, total: 3min 59s
Wall time: 43.7 s


In [3]:
%%time
model_4 = ensemble.RandomForestClassifier(n_estimators=300, max_depth=6, max_features=0.5, n_jobs=36, random_state=100)

y_train_pred_4  = cross_val_predict(model_4, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_4))
model_4.fit(X_train, y_train)
y_test_pred_4   = model_4.predict_proba(X_test)
y_submit_pred_4 = model_4.predict_proba(X_submit)

1.5479847591245361
CPU times: user 10min 39s, sys: 4.29 s, total: 10min 43s
Wall time: 1min 34s


In [29]:
%%time
model_5 = XGBClassifier(
    max_depth=3,
    learning_rate=0.3,
    n_estimators=100,
    verbosity=1,
    silent=None,
    objective='multi:softmax',
    booster='gbtree',
    eval_metric='mlogloss',
    n_jobs=40,
    nthread=40,
    gamma=1,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.5,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.7,
    reg_alpha=0.5,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=100,
    seed=None,
    missing=None,
    importance_type='gain'
)

y_train_pred_5  = cross_val_predict(model_5, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_5))
model_5.fit(X_train, y_train)
y_test_pred_5   = model_5.predict_proba(X_test)
y_submit_pred_5 = model_5.predict_proba(X_submit)

1.3268524651420768
CPU times: user 1h 21min 20s, sys: 4.57 s, total: 1h 21min 24s
Wall time: 2min 10s


In [31]:
%%time
model_6 = XGBClassifier(
    max_depth=4,
    learning_rate=0.3,
    n_estimators=100,
    verbosity=1,
    silent=None,
    objective='reg:logistic',
    eval_metric='merror',
    booster='gbtree',
    n_jobs=40,
    nthread=40,
    gamma=1,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.6,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
    reg_alpha=0.5,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=100,
    seed=None,
    missing=None,
    importance_type='gain'
)

y_train_pred_6  = cross_val_predict(model_6, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_6))
model_6.fit(X_train, y_train)
y_test_pred_6   = model_6.predict_proba(X_test)
y_submit_pred_6 = model_6.predict_proba(X_submit)

1.2983929338992457
CPU times: user 1h 20min 18s, sys: 4.33 s, total: 1h 20min 23s
Wall time: 2min 8s


## Merge and Transform

In [61]:
# Place the per-model class-probability blocks side by side, one column group
# per model. Model 1's predictions remain excluded, as they were before.
X_train_preds = np.hstack((
    y_train_pred_2,
    y_train_pred_3,
    y_train_pred_4,
    y_train_pred_5,
    y_train_pred_6,
))

X_test_preds = np.hstack((
    y_test_pred_2,
    y_test_pred_3,
    y_test_pred_4,
    y_test_pred_5,
    y_test_pred_6,
))

X_submit_preds = np.hstack((
    y_submit_pred_2,
    y_submit_pred_3,
    y_submit_pred_4,
    y_submit_pred_5,
    y_submit_pred_6,
))

# Displayed as the cell output: the shapes of the three stacked matrices.
tuple(stacked.shape for stacked in (X_train_preds, X_test_preds, X_submit_preds))

((80000, 65), (20000, 65), (53240, 65))

In [63]:
# Row-wise stack of the train, test and submit prediction matrices.
X_tmp = np.vstack((X_train_preds, X_test_preds, X_submit_preds))

X_tmp.shape

(153240, 65)

In [64]:
from sklearn.preprocessing import QuantileTransformer
# Quantile transformer mapping each column to a normal output distribution.
qt = QuantileTransformer(
    output_distribution='normal',
    n_quantiles=10,
    random_state=100,
)

In [65]:
# Fit the transformer on the combined matrix, then transform each split.
# The three names are rebound to their transformed versions, so re-running
# this cell alone would transform the already-transformed data again.
qt.fit(X_tmp)

X_train_preds, X_test_preds, X_submit_preds = (
    qt.transform(split) for split in (X_train_preds, X_test_preds, X_submit_preds)
)

## Save models and predictions

In [66]:
from joblib import dump, load

In [67]:
# Persist the six fitted base models under numbered file names.
for slot, fitted in enumerate((model_1, model_2, model_3, model_4, model_5, model_6), start=1):
    dump(fitted, f'M_336/520_base_ml_model_{slot}.joblib')

# Persist the fitted quantile transformer; kept last so its return value is the cell output.
dump(qt, 'M_336/520_base_ml_qt_normal.joblib')

['M_336/520_base_ml_qt_normal.joblib']

In [68]:
# Write the three transformed prediction matrices as comma-separated text.
for csv_path, matrix in (
    ('data/520_X_train_preds.csv', X_train_preds),
    ('data/520_X_test_preds.csv', X_test_preds),
    ('data/520_X_submit_preds.csv', X_submit_preds),
):
    np.savetxt(csv_path, matrix, delimiter=",")