# Base ML models

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tensorflow as tf

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
from catboost import Pool, CatBoostClassifier
from utils.metrics import Metric
from tqdm import tqdm
# Values of the 'weight' column of the weights file, as a NumPy array.
weights = pd.read_csv('data/005_weights.csv')['weight'].values


# Feature tables: read with 'id' as the index, then reduced to bare NumPy
# arrays (the index and column labels are discarded).
X_submit = pd.read_csv('./data/410_X_submit.csv', index_col='id').values
X_train = pd.read_csv('./data/410_X_train.csv', index_col='id').values
X_test = pd.read_csv('./data/410_X_test.csv', index_col='id').values

# Left as a DataFrame; this cell never converts it to an array.
y_submit = pd.read_csv('./data/004_test.csv', index_col='id')

# Target tables: converted to arrays with any length-1 axes squeezed away.
y_train = pd.read_csv('./data/410_y_train.csv', index_col='id').values.squeeze()
y_test = pd.read_csv('./data/410_y_test.csv', index_col='id').values.squeeze()

# Displayed as the cell output.
X_train.shape

(80000, 280)

In [5]:
%%time
# Base model 1: gradient-boosted trees (depth 4, learning rate 0.3, 100 rounds).
#
# The objective is declared as 'multi:softprob' because the target is
# multiclass and the class probabilities from predict_proba are what this cell
# consumes. The previous value ('reg:logistic') is a single-output objective
# that the scikit-learn wrapper replaces with a multiclass one when it sees
# more than two classes, so the declared setting did not describe the model
# that was actually trained.
#
# The deprecated aliases silent/seed/nthread and missing=None were dropped:
# verbosity, random_state and n_jobs already carry those settings, and the
# wrapper's default handling of missing values is kept.
model_1 = XGBClassifier(
    max_depth=4,
    learning_rate=0.3,
    n_estimators=100,
    verbosity=1,
    objective='multi:softprob',
    eval_metric='mlogloss',
    booster='gbtree',
    n_jobs=40,
    gamma=1,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.6,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
    reg_alpha=0.5,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=100,
    importance_type='gain'
)

# 3-fold cross-validated class probabilities for the training rows: every row
# is predicted by a model fitted on the other folds.
y_train_pred_1  = cross_val_predict(model_1, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_1))

# Refit on all training rows before predicting the held-out and submission sets.
model_1.fit(X_train, y_train)
y_test_pred_1   = model_1.predict_proba(X_test)
y_submit_pred_1 = model_1.predict_proba(X_submit)

1.3804222034256206
CPU times: user 1h 16min 12s, sys: 4.32 s, total: 1h 16min 16s
Wall time: 2min 1s


In [4]:
%%time
# Base model 2: gradient-boosted trees (depth 5, learning rate 0.1, 100 rounds).
#
# The objective is 'multi:softprob' rather than 'multi:softmax': softmax is
# the variant that outputs hard class labels, while everything below relies on
# predict_proba. Depending on the xgboost version, 'multi:softmax' combined
# with predict_proba is silently overridden, worked around, or rejected, so
# the probability objective is stated explicitly.
#
# The deprecated aliases silent/seed/nthread and missing=None were dropped:
# verbosity, random_state and n_jobs already carry those settings, and the
# wrapper's default handling of missing values is kept.
model_2 = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    verbosity=1,
    objective='multi:softprob',
    booster='gbtree',
    eval_metric='mlogloss',
    n_jobs=-1,
    gamma=5,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
    reg_alpha=0.5,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=100,
    importance_type='gain'
)

# 3-fold cross-validated class probabilities for the training rows: every row
# is predicted by a model fitted on the other folds.
y_train_pred_2  = cross_val_predict(model_2, X_train, y_train, cv=3, method='predict_proba')
print(metrics.log_loss(y_train, y_train_pred_2))

# Refit on all training rows before predicting the held-out and submission sets.
model_2.fit(X_train, y_train)
y_test_pred_2   = model_2.predict_proba(X_test)
y_submit_pred_2 = model_2.predict_proba(X_submit)

1.4161252554443664
CPU times: user 3h 9min 56s, sys: 17.6 s, total: 3h 10min 14s
Wall time: 4min 2s


In [7]:
%%time
# Base model 3: random forest of 200 trees capped at depth 4; max_features=0.4
# is a fraction, so each split draws from 40% of the columns.
model_3 = ensemble.RandomForestClassifier(
    random_state=100,
    n_jobs=-1,
    max_features=0.4,
    max_depth=4,
    n_estimators=200,
)

# 3-fold cross-validated class probabilities for the training rows.
y_train_pred_3 = cross_val_predict(
    estimator=model_3,
    X=X_train,
    y=y_train,
    method='predict_proba',
    cv=3,
)
print(metrics.log_loss(y_train, y_train_pred_3))

# fit() returns the fitted estimator, so the full-data refit is chained
# directly into the first prediction call.
y_test_pred_3 = model_3.fit(X_train, y_train).predict_proba(X_test)
y_submit_pred_3 = model_3.predict_proba(X_submit)

1.644515654040536
CPU times: user 6min 11s, sys: 8.38 s, total: 6min 20s
Wall time: 52.3 s


In [8]:
%%time
# Base model 4: random forest of 300 trees capped at depth 6; max_features=0.5
# is a fraction, so each split draws from half of the columns.
model_4 = ensemble.RandomForestClassifier(
    random_state=100,
    n_jobs=-1,
    max_features=0.5,
    max_depth=6,
    n_estimators=300,
)

# 3-fold cross-validated class probabilities for the training rows.
y_train_pred_4 = cross_val_predict(
    estimator=model_4,
    X=X_train,
    y=y_train,
    method='predict_proba',
    cv=3,
)
print(metrics.log_loss(y_train, y_train_pred_4))

# fit() returns the fitted estimator, so the full-data refit is chained
# directly into the first prediction call.
y_test_pred_4 = model_4.fit(X_train, y_train).predict_proba(X_test)
y_submit_pred_4 = model_4.predict_proba(X_submit)

1.6001131929267354
CPU times: user 18min 43s, sys: 9.23 s, total: 18min 52s
Wall time: 1min 47s


## Merge and Transform

In [9]:
# Place the four models' probability matrices side by side (column-wise), once
# per data split. The inputs are 2-D, so hstack joins them along axis 1.
X_train_preds = np.hstack(
    (y_train_pred_1, y_train_pred_2, y_train_pred_3, y_train_pred_4)
)

X_test_preds = np.hstack(
    (y_test_pred_1, y_test_pred_2, y_test_pred_3, y_test_pred_4)
)

X_submit_preds = np.hstack(
    (y_submit_pred_1, y_submit_pred_2, y_submit_pred_3, y_submit_pred_4)
)

# Displayed as the cell output: one shape per split.
X_train_preds.shape, X_test_preds.shape, X_submit_preds.shape

((80000, 52), (20000, 52), (53240, 52))

In [10]:
# Pile the three prediction matrices on top of each other (row-wise) into a
# single array covering every split.
X_tmp = np.vstack((X_train_preds, X_test_preds, X_submit_preds))

# Displayed as the cell output.
X_tmp.shape

(153240, 52)

In [11]:
from sklearn.preprocessing import QuantileTransformer

# Quantile transformer that maps each column onto a normal distribution,
# estimating the mapping from 10 quantiles; seeded for repeatability.
qt = QuantileTransformer(
    output_distribution='normal',
    n_quantiles=10,
    random_state=100,
)

In [12]:
# Learn the quantile mapping from the stacked matrix of all three splits.
# NOTE(review): this means the transformer also sees the test and submission
# rows while fitting; confirm that this is intended.
qt.fit(X_tmp)

# Replace each prediction matrix with its transformed version.
# NOTE(review): the names are overwritten in place, so running this cell a
# second time would transform already-transformed values.
X_train_preds, X_test_preds, X_submit_preds = (
    qt.transform(split_preds)
    for split_preds in (X_train_preds, X_test_preds, X_submit_preds)
)

## Save models

In [13]:
from joblib import dump, load

In [14]:
# Make sure the output directory exists before writing anything: dump() does
# not create missing parent directories, and failing here would come only
# after the long training cells have already run.
pathlib.Path('M_336').mkdir(parents=True, exist_ok=True)

# Persist the four fitted base models.
dump(model_1, 'M_336/510_base_ml_model_1.joblib')
dump(model_2, 'M_336/510_base_ml_model_2.joblib')
dump(model_3, 'M_336/510_base_ml_model_3.joblib')
dump(model_4, 'M_336/510_base_ml_model_4.joblib')

# Persist the fitted quantile transformer; as the last expression, its return
# value (the list of written file names) is shown as the cell output.
dump(qt, 'M_336/510_base_ml_qt_normal.joblib')

['M_336/510_base_ml_qt_normal.joblib']

In [15]:
# Write each transformed prediction matrix to a comma-separated text file
# (no header row, one matrix row per line).
np.savetxt(fname='data/510_X_train_preds.csv', X=X_train_preds, delimiter=",")
np.savetxt(fname='data/510_X_test_preds.csv', X=X_test_preds, delimiter=",")
np.savetxt(fname='data/510_X_submit_preds.csv', X=X_submit_preds, delimiter=",")