In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import optuna

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv(r'data/train.csv')
df_test = pd.read_csv(r'data/test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

In [None]:
print(f'train na: {df_train.isna().sum().max()}')
print(f'test na: {df_test.isna().sum().max()}')

In [None]:
df_train.target.value_counts()

In [None]:
x = df_train.drop(['id', 'target'], 1)
y = df_train.target

ID = df_test.id
df_test.drop('id', 1, inplace=True)

In [None]:
num_cols = x.select_dtypes(include='number').columns.tolist()
cat_cols = x.select_dtypes(exclude='number').columns.tolist()

In [None]:
cols = cat_cols + num_cols
df = pd.concat([x[cols], df_test[cols]], 0)
df = pd.get_dummies(df, columns=cat_cols)

In [8]:
x = df[:len(x)]
test = df[len(x):]

NameError: name 'df' is not defined

In [10]:
x.shape, test.shape

((300000, 642), (200000, 642))

# params

In [11]:
early_stopping_rounds = 50
n_trials = 30
test_size = 0.1
n_splits = 5 # kfold

In [12]:
folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)
x_valid_temp = len(next(iter(folds.split(x)))[0])

# catboost and optuna

In [13]:
def objective(trial, data=x, target=y):
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=test_size, random_state=1)

    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'random_seed': 1,
        'task_type': 'GPU',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'bootstrap_type': 'Poisson'
    }
    
    model = CatBoostClassifier(**params)  
    model.fit(x_train, y_train, eval_set = [(x_valid,y_valid)], early_stopping_rounds=early_stopping_rounds, verbose=False) # was 222
    y_pred = model.predict_proba(x_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred)

    return roc_auc

In [14]:
%%time
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=n_trials) # was 30
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

[32m[I 2021-03-25 18:51:50,274][0m A new study created in memory with name: no-name-bc47e188-54fc-4c1d-bfe6-eee3e01e70cd[0m
[32m[I 2021-03-25 18:53:08,873][0m Trial 0 finished with value: 0.8964096325035159 and parameters: {'max_depth': 8, 'learning_rate': 0.1, 'n_estimators': 2475, 'max_bin': 254, 'min_data_in_leaf': 8, 'l2_leaf_reg': 0.03074485903865801, 'subsample': 0.6516535736775653}. Best is trial 0 with value: 0.8964096325035159.[0m
[32m[I 2021-03-25 18:53:54,472][0m Trial 1 finished with value: 0.8968586726555974 and parameters: {'max_depth': 9, 'learning_rate': 0.02, 'n_estimators': 1874, 'max_bin': 383, 'min_data_in_leaf': 294, 'l2_leaf_reg': 0.00032811340148320513, 'subsample': 0.47180449406861513}. Best is trial 1 with value: 0.8968586726555974.[0m
[32m[I 2021-03-25 18:54:00,916][0m Trial 2 finished with value: 0.8648952973485229 and parameters: {'max_depth': 3, 'learning_rate': 0.005, 'n_estimators': 284, 'max_bin': 261, 'min_data_in_leaf': 148, 'l2_leaf_reg': 0

[32m[I 2021-03-25 19:03:52,771][0m Trial 25 finished with value: 0.8974482131651644 and parameters: {'max_depth': 8, 'learning_rate': 0.05, 'n_estimators': 1095, 'max_bin': 351, 'min_data_in_leaf': 40, 'l2_leaf_reg': 0.3511667681668909, 'subsample': 0.42161380824905037}. Best is trial 18 with value: 0.8986170437499553.[0m
[32m[I 2021-03-25 19:04:34,318][0m Trial 26 finished with value: 0.8979916467965792 and parameters: {'max_depth': 10, 'learning_rate': 0.05, 'n_estimators': 2055, 'max_bin': 285, 'min_data_in_leaf': 108, 'l2_leaf_reg': 0.9606140667465336, 'subsample': 0.34838163215975054}. Best is trial 18 with value: 0.8986170437499553.[0m
[32m[I 2021-03-25 19:04:59,882][0m Trial 27 finished with value: 0.8970324883587303 and parameters: {'max_depth': 10, 'learning_rate': 0.05, 'n_estimators': 1304, 'max_bin': 281, 'min_data_in_leaf': 112, 'l2_leaf_reg': 0.7931743899645288, 'subsample': 0.26217158094282716}. Best is trial 18 with value: 0.8986170437499553.[0m
[32m[I 2021-03

Number of finished trials: 30
Best trial: {'max_depth': 9, 'learning_rate': 0.05, 'n_estimators': 2152, 'max_bin': 306, 'min_data_in_leaf': 119, 'l2_leaf_reg': 0.5491778959136009, 'subsample': 0.3813342356089708}
Best value: 0.8986170437499553
CPU times: user 17min 18s, sys: 6min 25s, total: 23min 43s
Wall time: 13min 59s


# catboost visuals

In [15]:
# optuna.visualization.plot_optimization_history(study)

In [16]:
# optuna.visualization.plot_param_importances(study)

# catboost and best params

In [17]:
paramsCB = study.best_trial.params
paramsCB['task_type'] = 'GPU'
paramsCB['loss_function'] = 'Logloss'
paramsCB['eval_metric'] = 'AUC'
paramsCB['random_seed'] = 1
paramsCB['bootstrap_type'] = 'Poisson'

In [18]:
%%time

folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)

predictions_catb = np.zeros(len(test))
roc_auc_list_catb = []

for fold, (train_idx, valid_idx) in enumerate(folds.split(x, y)):
    
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(**paramsCB)
   
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False, early_stopping_rounds=early_stopping_rounds) # was 222
    
    predictions_catb += model.predict_proba(test)[:, 1] / folds.n_splits
    valid_preds_catb = model.predict_proba(x_valid)[:, 1]
    
    roc_auc_list_catb.append(roc_auc_score(y_valid, valid_preds_catb))
    
roc_auc_catb = max(roc_auc_list_catb)
roc_auc_catb

CPU times: user 3min 33s, sys: 1min 17s, total: 4min 50s
Wall time: 2min 44s


In [19]:
submission_catb = pd.DataFrame({'id': ID, 'target': predictions_catb})
submission_catb.to_csv('submissionCB.csv', index = False)

# xgb and optuna

In [20]:
def objective(trial, data=x, target=y):
    x_train, x_valid, y_train, y_valid=train_test_split(x, y, test_size=test_size, random_state=1)

    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 1,
        'use_label_encoder': False,
        'eval_metric': 'auc'

    }
    
    model = XGBClassifier(**params)  
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=early_stopping_rounds, verbose=False) # was 222
    y_pred = model.predict_proba(x_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred)

    return roc_auc

In [21]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials) # was 30
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

[32m[I 2021-03-25 19:08:35,496][0m A new study created in memory with name: no-name-4db4928b-a2bd-4e84-869e-6776a4f9e233[0m
[32m[I 2021-03-25 19:08:55,256][0m Trial 0 finished with value: 0.8907465342575941 and parameters: {'max_depth': 4, 'learning_rate': 0.02, 'n_estimators': 766, 'min_child_weight': 148, 'gamma': 0.0006816392257218367, 'alpha': 0.0007736267559768395, 'lambda': 0.03690197077299625, 'colsample_bytree': 0.47784915007063355, 'subsample': 0.7759281600488638}. Best is trial 0 with value: 0.8907465342575941.[0m
[32m[I 2021-03-25 19:09:11,864][0m Trial 1 finished with value: 0.897170648382018 and parameters: {'max_depth': 5, 'learning_rate': 0.08, 'n_estimators': 550, 'min_child_weight': 83, 'gamma': 0.041326581682098386, 'alpha': 4.042777701583641, 'lambda': 0.24312217429420901, 'colsample_bytree': 0.5375253726149076, 'subsample': 0.5293556921333198}. Best is trial 1 with value: 0.897170648382018.[0m
[32m[I 2021-03-25 19:09:39,515][0m Trial 2 finished with value

[32m[I 2021-03-25 19:18:04,399][0m Trial 20 finished with value: 0.8999977089963753 and parameters: {'max_depth': 9, 'learning_rate': 0.02, 'n_estimators': 1378, 'min_child_weight': 33, 'gamma': 0.5164231023684138, 'alpha': 0.002286269033367302, 'lambda': 0.00010431617561165493, 'colsample_bytree': 0.4314008847003251, 'subsample': 0.7821582085955274}. Best is trial 20 with value: 0.8999977089963753.[0m
[32m[I 2021-03-25 19:19:01,961][0m Trial 21 finished with value: 0.900139339368168 and parameters: {'max_depth': 9, 'learning_rate': 0.02, 'n_estimators': 1461, 'min_child_weight': 32, 'gamma': 0.520530358214376, 'alpha': 0.0017997264178078855, 'lambda': 0.00013629961326197502, 'colsample_bytree': 0.4197378675830371, 'subsample': 0.7921655612018631}. Best is trial 21 with value: 0.900139339368168.[0m
[32m[I 2021-03-25 19:20:01,799][0m Trial 22 finished with value: 0.9002002712309669 and parameters: {'max_depth': 9, 'learning_rate': 0.02, 'n_estimators': 1461, 'min_child_weight': 

Number of finished trials: 30
Best trial: {'max_depth': 9, 'learning_rate': 0.02, 'n_estimators': 1924, 'min_child_weight': 21, 'gamma': 0.20774242085473657, 'alpha': 0.0003099308499578175, 'lambda': 0.0007266802253693671, 'colsample_bytree': 0.40545047613575835, 'subsample': 0.7026633887251847}
Best value: 0.9007919627829961
CPU times: user 19min 44s, sys: 39.4 s, total: 20min 24s
Wall time: 19min 17s


In [22]:
# optuna.visualization.plot_optimization_history(study)

In [23]:
# optuna.visualization.plot_param_importances(study)

# xgb and best params

In [24]:
paramsXGB = study.best_trial.params
paramsXGB['tree_method'] = 'gpu_hist'
paramsXGB['booster'] = 'gbtree'
paramsXGB['eval_metric'] = 'auc'
paramsXGB['random_state'] = 1
paramsXGB['use_label_encoder'] = False

In [25]:
%%time

folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)

predictions_xgb = np.zeros(len(test))
roc_auc_list_xgb = []

for fold, (train_idx, valid_idx) in enumerate(folds.split(x, y)):
    
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]

    model = XGBClassifier(**paramsXGB)
   
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric='auc', verbose=False, early_stopping_rounds=early_stopping_rounds) # was 222
    
    predictions_xgb += model.predict_proba(test)[:, 1] / folds.n_splits
    valid_preds_xgb = model.predict_proba(x_valid)[:, 1]
    
    roc_auc_list_xgb.append(roc_auc_score(y_valid, valid_preds_catb))
    
roc_auc_xgb = max(roc_auc_list_xgb)
roc_auc_xgb

CPU times: user 8min 1s, sys: 10.6 s, total: 8min 12s
Wall time: 6min 47s


In [26]:
submission_xgb = pd.DataFrame({'id': ID, 'target': predictions_xgb})
submission_xgb.to_csv('submissionXGB.csv', index = False)

# lgbm and optuna

In [27]:
def objective(trial, data=x, target=y):
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=test_size, random_state=1)

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.005, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'random_state': 1,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        'device': 'gpu'
    }
    
    model = LGBMClassifier(**params)  
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=early_stopping_rounds, verbose=False) # was 222
    y_pred= model.predict_proba(x_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred)

    return roc_auc

In [28]:
%%time
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=n_trials) # was 30
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

[32m[I 2021-03-25 19:34:41,894][0m A new study created in memory with name: no-name-f07d5e68-97cd-40fc-8c9a-39b1622fb8f1[0m
[32m[I 2021-03-25 19:35:36,316][0m Trial 0 finished with value: 0.9003077688609771 and parameters: {'reg_alpha': 3.1742110108843513, 'reg_lambda': 4.956898097285141, 'num_leaves': 215, 'min_child_samples': 69, 'max_depth': 11, 'learning_rate': 0.05, 'colsample_bytree': 0.2795642296872885, 'n_estimators': 2090}. Best is trial 0 with value: 0.9003077688609771.[0m
[32m[I 2021-03-25 19:37:39,091][0m Trial 1 finished with value: 0.8973236878830275 and parameters: {'reg_alpha': 9.537492668580397, 'reg_lambda': 7.856848730391431, 'num_leaves': 206, 'min_child_samples': 13, 'max_depth': 14, 'learning_rate': 0.005, 'colsample_bytree': 0.1780019474335216, 'n_estimators': 1954}. Best is trial 0 with value: 0.9003077688609771.[0m
[32m[I 2021-03-25 19:38:20,974][0m Trial 2 finished with value: 0.9000148542388238 and parameters: {'reg_alpha': 6.737406022203017, 'reg_

[32m[I 2021-03-25 19:54:34,094][0m Trial 23 finished with value: 0.8993191554816856 and parameters: {'reg_alpha': 1.384349588466033, 'reg_lambda': 8.874144979560706, 'num_leaves': 84, 'min_child_samples': 19, 'max_depth': 13, 'learning_rate': 0.1, 'colsample_bytree': 0.27577597607920334, 'n_estimators': 2134}. Best is trial 0 with value: 0.9003077688609771.[0m
[32m[I 2021-03-25 19:55:37,234][0m Trial 24 finished with value: 0.9000593033802526 and parameters: {'reg_alpha': 0.1463464134431458, 'reg_lambda': 5.2870719415006695, 'num_leaves': 20, 'min_child_samples': 57, 'max_depth': 11, 'learning_rate': 0.05, 'colsample_bytree': 0.2721419018400416, 'n_estimators': 2981}. Best is trial 0 with value: 0.9003077688609771.[0m
[32m[I 2021-03-25 19:57:07,117][0m Trial 25 finished with value: 0.8967603728831535 and parameters: {'reg_alpha': 4.030078889161007, 'reg_lambda': 8.140729365917027, 'num_leaves': 78, 'min_child_samples': 45, 'max_depth': 14, 'learning_rate': 0.005, 'colsample_byt

Number of finished trials: 30
Best trial: {'reg_alpha': 3.513934835282675, 'reg_lambda': 3.7744569735669833, 'num_leaves': 155, 'min_child_samples': 48, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.2706264252200076, 'n_estimators': 2216}
Best value: 0.900353560252902
CPU times: user 47min 23s, sys: 1min 1s, total: 48min 25s
Wall time: 25min 59s


# lgbm visuals

In [29]:
# optuna.visualization.plot_optimization_history(study)

In [30]:
# optuna.visualization.plot_param_importances(study)

# lgbm and best params

In [31]:
paramsLGBM = study.best_trial.params
paramsLGBM['boosting_type'] = 'gbdt'
paramsLGBM['metric'] = 'AUC'
paramsLGBM['random_state'] = 1

In [32]:
%%time

folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)

predictions_lgbm = np.zeros(len(test))
roc_auc_list_lgbm = []

for fold, (train_idx, valid_idx) in enumerate(folds.split(x, y)):
    
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]

    model = LGBMClassifier(**paramsLGBM)
   
    model.fit(x_train, y_train, eval_set = [(x_valid, y_valid)], eval_metric='auc', verbose=False, early_stopping_rounds=early_stopping_rounds) # was 222
    
    predictions_lgbm += model.predict_proba(test)[:, 1] / folds.n_splits
    valid_preds_lgbm = model.predict_proba(x_valid)[:, 1]
    
    roc_auc_list_lgbm.append(roc_auc_score(y_valid, valid_preds_catb))
    
roc_auc_lgbm = max(roc_auc_list_lgbm)
roc_auc_lgbm

CPU times: user 11min 51s, sys: 8.38 s, total: 12min
Wall time: 6min 25s


In [33]:
submission_lgbm = pd.DataFrame({'id': ID, 'target': predictions_lgbm})
submission_lgbm.to_csv('submissionLGBM.csv', index = False)

# results

In [34]:
print(f'roc auc catboost: {roc_auc_catb}')
print(f'roc auc xgb: {roc_auc_xgb}')
print(f'roc auc lgbm: {roc_auc_lgbm}')

roc auc catboost: 0.8942327916650583
roc auc xgb: 0.8915750866233396
roc auc lgbm: 0.8915750866233396


# final submission

In [37]:
final = pd.DataFrame({'id':ID, 'catboost': predictions_catb, 'xgb': predictions_xgb, 'lgbm': predictions_lgbm})
final.head()

In [42]:
final['target'] = final.loc[:, ['catboost', 'xgb', 'lgbm']].mean(axis=1)
final.drop(['catboost', 'xgb', 'lgbm'], 1, inplace=True)

In [43]:
final.head()

Unnamed: 0,id,target
0,5,0.131032
1,6,0.406565
2,8,0.015783
3,9,0.213555
4,11,0.115598


In [44]:
final.to_csv('submission.csv', index=False)