In [7]:
import time
import os
import pickle
import copy
import glob
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Helper functions

In [8]:
def change_dtype_ser(ser):
    if ser.dtype == int and (set(ser.unique()) == set([0, 1])):
        return ser.astype(np.bool)
    
    if ser.dtype == int:
        return ser.astype(np.int32)
    
    if ser.dtype == float and (set(ser.unique()) == set([0., 1.])):
        return ser.astype(np.bool)
    
    if ser.dtype == float:
        return ser.astype(np.float32)
    
    if (ser.dtype == "object") and (set(ser.unique()) == set(["Y", "N"])):
        ser = ser.map({"Y": 1, "N": 0})
        return ser.astype(np.bool)
    
    if (ser.dtype == "object") and (ser.nunique() < ser.shape[0]):
        return ser.astype("category")
    
    return ser
    


def change_dtype_df(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        df[col] = change_dtype_ser(df[col])

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtype_df(df)
    return df

In [9]:
class Standardizer(BaseEstimator, TransformerMixin):
    def __init__(self, to_array=False):
        self._to_array = to_array
        
    def fit(self, df_train):
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self
    
    def transform(self, df):
        for col in self._mean:
            if self._std[col] > 0:
                df[col] = (df[col] - self._mean[col]) / self._std[col]
                df[col] = df[col].astype("float32")
            else:
                print("WARNING: " + col + " has zero std.")
                df[col] = df[col] - self._mean[col]
                df[col] = df[col].astype("float32")
                
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df

In [10]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, to_array=False):
        self._to_array = to_array
    
    def fit(self, df_train):
        all_cols = df_train.columns.to_list()
        cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        
        self._cat_col_idx = [i for i, col in enumerate(all_cols) if col in cat_cols]
        
        self._label_maps = {}
        self._missing_imputers = {}
        for col in cat_cols:
            label = df_train[col].unique()
            self._label_maps[col] = {c: n for n, c in enumerate(label)}
            
            mode_label = df_train[col].mode().iloc[0]
            self._missing_imputers[col] = self._label_maps[col][mode_label]
        return self
    
    def transform(self, df):
        for col, label_map in self._label_maps.items():
            df[col] = df[col].map(label_map)
            if df[col].isnull().any():
                df[col] = df[col].astype(np.float32).fillna(self._missing_imputers[col])
                
        self._fatures = df.columns.to_list()
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df
        
    def get_cat_cols(self):
        return self._cat_col_idx
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, to_array=False):
        self._to_array = to_array
        
        
    def fit(self, train_df):
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        
        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self
    
    def transform(self, df):
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df
        
        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"
        
        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop cols that are present in test_df but absent in train_df
        cols_to_drop = [col for col in df_cat.columns if col not in self._cat_cols_ohe]
        df_cat = df_cat.drop(cols_to_drop, axis="columns")
        
        # change to float32
        for col in df_cat.columns:
            df_cat[col] = df_cat[col].astype("float32")
        
        # if some some colums are absent in test but present in train, make them all zero 
        cat_cols_ohe = df_cat.columns.to_list()
        for col in self._cat_cols_ohe:
            if col not in cat_cols_ohe:
                df_cat[col] = 0
                df_cat[col] = df_cat[col].astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in cat_cols]
        df_num = df[num_cols]
        
        df = pd.concat([df_num, df_cat], axis="columns")
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df

In [11]:
def roc_auc(estimator, X_eval, y_eval):
    """
    :param estimator: sklearn estimator that have predict_proba() method
    :param X_eval: test features
    :param y_eval: test target
    :return: float
    """
    proba = estimator.predict_proba(X_eval)
    return roc_auc_score(y_eval, proba[:, 1])


def write_submit_csv(estimator, X_test, id_test, out):
    """
    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    submit = id_test.copy()
    submit["TARGET"] = prob_test
    submit.to_csv(out, index=False)
    return None


def feature_importance_df(estimator, features):
    """
    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, list of feature names
    :return: feature_imp, dataframe
    """
    feature_imp = pd.DataFrame({"feature": features, "importance": estimator.feature_importances_})
    feature_imp = feature_imp.sort_values(by=["importance"], ascending=False)
    
    feature_imp["rank"] = np.arange(feature_imp.shape[0]) + 1
    return feature_imp

In [13]:
def run_hyperopt(classifier,
                 params_tuned, 
                 X_train, y_train,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)
        
        auc = cross_val_score(classifier, X_train, y_train, cv=5, scoring="roc_auc").mean()
        return {"loss": -auc, "status": STATUS_OK}
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
        
    trials = Trials()
    best_params = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    best_params = whole_to_int(best_params)
    best_model = classifier.set_params(**params_fixed, **best_params)
    best_model.fit(X_train, y_train)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_params, best_model



"""
def hyperopt_lr(params_tuned, 
                X_train, y_train, 
                X_val, y_val, 
                num_eval,
                params_fixed=None,
                rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20}
    
    def objective(params):
        estimator = LogisticRegression(**params_fixed, **params)
        estimator.fit(X_train, y_train)
        
        auc = roc_auc(estimator, X_val, y_val)
        return {"loss": -auc, "status": STATUS_OK}
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
        
    trials = Trials()
    best_param = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param



def hyperopt_rf(params_tuned, 
                X_train, y_train, 
                X_val, y_val, 
                num_eval, 
                params_fixed=None,
                rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        estimator = RandomForestClassifier(**params_fixed, **params)
        (estimator.get_params())
        estimator.fit(X_train, y_train)
        
        auc = roc_auc(estimator, X_val, y_val)
        return {"loss": -auc, "status": STATUS_OK}
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
    
    trials = Trials()
    best_param = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param


def hyperopt_xgb(params_tuned, 
                 X_train, y_train, 
                 X_val, y_val, 
                 num_eval, 
                 params_fixed=None,
                 rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        estimator = XGBClassifier(**params_fixed, **params)
        estimator.fit(X_train, y_train)
        
        auc = roc_auc(estimator, X_val, y_val)
        return {"loss": -auc, "status": STATUS_OK}

    if rstate is not None:
        rstate = np.random.RandomState(rstate)
    trials = Trials()
    best_param = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param


def hyperopt_lgbm(params_tuned, 
                  X_train, y_train, 
                  X_val, y_val, 
                  num_eval, 
                  params_fixed=None,
                  rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        estimator = LGBMClassifier(**params_fixed, **params)
        estimator.fit(X_train, y_train)
        
        auc = roc_auc(estimator, X_val, y_val)
        return {"loss": -auc, "status": STATUS_OK}

    if rstate is not None:
        rstate = np.random.RandomState(rstate)
    trials = Trials()
    best_param = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param
"""
pass

In [14]:
def whole_to_int(a_dict):
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        if np.isclose(np.round(v), v):
            new_dict[k] = int(new_dict[k])
    return new_dict


def averaging_y_hat(submit_csv_files):
    y_hats = [pd.read_csv(f) for f in submit_csv_files]
    result = y_hats[0][["SK_ID_CURR"]]
    result["TARGET"] = 0.
    for y in y_hats:
        result["TARGET"] = result["TARGET"] + y["TARGET"]
    
    result["TARGET"] = result["TARGET"] / len(y_hats)
    return result

In [13]:
IN_DIR = "data/data3_"
SUB_DIR = "data/submit_"
MODELS_DIR = "data/models_"

# Logistic regression

In [None]:
time_start = time.time()

X_full_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)
print("X_full_train.isnull().sum().sum:", X_full_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

y_full_train = X_full_train["APPL_TARGET"].values
X_full_train = X_full_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [9]:
time_start = time.time()

X_full_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)
print("X_full_train.isnull().sum().sum:", X_full_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

y_full_train = X_full_train["APPL_TARGET"].values
X_full_train = X_full_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 5223.69 MB
Memory usage after changing types 2568.97 MB
Memory usage before changing types 827.62 MB
Memory usage after changing types 407.03 MB
X_train.shape (307511, 2147)
X_test.shape (48744, 2146)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2145)
X_test.shape (48744, 2146)
X_train.shape (307511, 2145)
X_test.shape (48744, 2145)
Elapsed Time 1352.9275920391083


In [10]:
ohe = OneHotEncoder()
ohe.fit(X_full_train)
X_full_train = ohe.transform(X_full_train)
X_test = ohe.transform(X_test)

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()

X_train.shape (307511, 2477)
X_test.shape (48744, 2477)


In [11]:
scaler = Standardizer(to_array=True)
scaler.fit(X_full_train)
X_full_train = scaler.transform(X_full_train)
X_test = scaler.transform(X_test)

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 2477)
X_test.shape (48744, 2477)


In [12]:
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 2477), (246008,), (61503, 2477), (61503,))

## Baseline (not tuned) model

In [13]:
lr = LogisticRegression(max_iter=100, n_jobs=20)
lr.fit(X_train, y_train)

LogisticRegression(n_jobs=20)

In [14]:
auc_lr_train = roc_auc(lr, X_train, y_train)
print("AUC of Logistic regression model on the train set: %0.5f" % auc_lr_train)

auc_lr_val = roc_auc(lr, X_val, y_val)
print("AUC of Logistic regression model on the evaluation set: %0.5f" % auc_lr_val)

AUC of Logistic regression model on the train set: 0.79900
AUC of Logistic regression model on the evaluation set: 0.77225


In [None]:
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
write_submit_csv(lr, X_test, sk_id_test, os.path.join(SUB_DIR, "lr_data3_baseline.csv"))

## Tuning using `hyperopt`

In [None]:
params = {"C": hp.loguniform('C', np.log(0.00001), np.log(100))}
num_eval = 10

lr = LogisticRegression()
trials, best_params, best_model = run_hyperopt(lr, params, X_train, y_train, X_val, y_val, num_eval)
best_params

In [None]:
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lr_data3_tuned.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lr_data3_tuned.pickle")
pickle.dump(best_model, open(out_model, "wb"))

# Random forest

## Baseline model

In [16]:
rf = RandomForestClassifier(n_jobs=20)
rf.fit(X_train, y_train)

RandomForestClassifier(n_jobs=20)

In [17]:
auc_rf_train = roc_auc(rf, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

auc_rf_val = roc_auc(rf, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

AUC of Random Forest model on the train set: 1.00000
AUC of Random Forest model on the evaluation set: 0.72982


## Tuning using `hyperopt`

In [None]:
params_rf = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    #"min_samples_split": scope.int(hp.quniform("min_samples_split", 20, 400, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 20, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 200, 1)),
}

params_fixed_rf = {
    "n_jobs": 20,
    "n_estimators": 100
}


num_eval = 60

trials_rf, best_params_rf = hyperopt_rf(params_rf, 
                                        X_train, y_train, X_val, y_val, 
                                        num_eval,
                                        params_fixed=params_fixed_rf,
                                        rstate=32003)
best_params_rf

In [None]:
best_params_rf = {s: int(best_params_rf[s]) for s in best_params_rf}
best_params_rf

In [None]:
rf_best = RandomForestClassifier(n_estimators=500, n_jobs=20, **best_params_rf)
rf_best.fit(X_train, y_train)

auc_rf_train = roc_auc(rf_best, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

auc_rf_val = roc_auc(rf_best, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

In [None]:
rf_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "rf_data3_tuned.csv")
write_submit_csv(rf_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "rf_data3_tuned.pickle")
pickle.dump(rf_best, open(out_model, "wb"))

# XGBOOST

## One-hot encoding

In [18]:
time_start = time.time()

X_full_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)
print("X_full_train.isnull().sum().sum:", X_full_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

y_full_train = X_full_train["APPL_TARGET"].values
X_full_train = X_full_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 5223.69 MB
Memory usage after changing types 2568.97 MB
Memory usage before changing types 827.62 MB
Memory usage after changing types 407.03 MB
X_train.shape (307511, 2147)
X_test.shape (48744, 2146)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2145)
X_test.shape (48744, 2146)
X_train.shape (307511, 2145)
X_test.shape (48744, 2145)
Elapsed Time 1537.468981742859


In [19]:
ohe = OneHotEncoder()
ohe.fit(X_full_train)
X_full_train = ohe.transform(X_full_train)
X_test = ohe.transform(X_test)

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 2477)
X_test.shape (48744, 2477)


In [20]:
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 2477), (246008,), (61503, 2477), (61503,))

### Baseline model

In [21]:
time_start = time.time()

xgb = XGBClassifier(tree_method="gpu_hist")
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of XGBOOST model on the train set: 0.91127
AUC of XGBOOST model on the validation set: 0.77461
Time elapsed: 234.91595 s


### Tuning using `hyperopt`

In [23]:
params_xgb = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 16, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(100)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(0.5)),
    "gamma": hp.uniform("gamma", 0., 2.),
}

params_fixed_xgb = {
    "booster": "gbtree",
    "tree_method": "gpu_hist" ,
    "n_estimators": 500
}

num_eval = 200

trials_xgb, best_params_xgb = hyperopt_xgb(params_xgb, 
                                           X_train, y_train, X_val, y_val, 
                                           num_eval,
                                           params_fixed=params_fixed_xgb,
                                           rstate=40292)
best_params_xgb

100%|██████████| 200/200 [27:37:37<00:00, 497.29s/trial, best loss: -0.7928299059136121]   
Time elapsed: 99458.38221 s


{'colsample_bytree': 0.43432432470594506,
 'gamma': 0.3354020590533102,
 'learning_rate': 0.05779942715385299,
 'max_depth': 8.0,
 'min_child_weight': 3.0,
 'reg_lambda': 380.36755231614643,
 'subsample': 0.88624379991172}

In [24]:
{'colsample_bytree': 0.43432432470594506,
 'gamma': 0.3354020590533102,
 'learning_rate': 0.05779942715385299,
 'max_depth': 8.0,
 'min_child_weight': 3.0,
 'reg_lambda': 380.36755231614643,
 'subsample': 0.88624379991172}

{'colsample_bytree': 0.43432432470594506,
 'gamma': 0.3354020590533102,
 'learning_rate': 0.05779942715385299,
 'max_depth': 8.0,
 'min_child_weight': 3.0,
 'reg_lambda': 380.36755231614643,
 'subsample': 0.88624379991172}

In [25]:
best_params_xgb = whole_to_int(best_params_xgb)

xgb_best = XGBClassifier(**params_fixed_xgb, **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)


xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_ohe_tuned_01.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_ohe_tuned_01.pickle")
pickle.dump(xgb_best, open(out_model, "wb"))

AUC of XGBoost model on the train set: 0.88741
AUC of XGBoost model on the evaluation set: 0.79283


In [None]:
feat_impt = feature_importance_df(xgb_best, features)
feat_impt.head(30)

In [None]:
feat_impt[feat_impt["feature"].str.startswith("APPL_")].head(30)

## Label encoding

In [14]:
time_start = time.time()

X_full_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)
print("X_full_train.isnull().sum().sum:", X_full_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

y_full_train = X_full_train["APPL_TARGET"].values
X_full_train = X_full_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 5223.69 MB
Memory usage after changing types 2568.97 MB
Memory usage before changing types 827.62 MB
Memory usage after changing types 407.03 MB
X_train.shape (307511, 2147)
X_test.shape (48744, 2146)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2145)
X_test.shape (48744, 2146)
X_train.shape (307511, 2145)
X_test.shape (48744, 2145)
Elapsed Time 1402.5415875911713


In [15]:
le = LabelEncoder(to_array=True)
X_full_train = le.fit_transform(X_full_train)
X_test = le.transform(X_test)
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

cat_col_idx = le.get_cat_cols()

X_train.shape (307511, 2145)
X_test.shape (48744, 2145)


In [17]:
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=4112015)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 2145), (246008,), (61503, 2145), (61503,))

### Baseline model

In [18]:
time_start = time.time()

xgb = XGBClassifier(tree_method="gpu_hist")
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of XGBOOST model on the train set: 0.91190
AUC of XGBOOST model on the validation set: 0.77176
Time elapsed: 69.47420 s


### Tuning using `hyperopt`

In [19]:
params_xgb = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 16, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(100)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
    #"gamma": hp.uniform("gamma", 0., 2.),
}

params_fixed_xgb = {
    "booster": "gbtree",
    "tree_method": "gpu_hist" ,
    "n_estimators": 500
}

num_eval = 100

trials_xgb, best_params_xgb = hyperopt_xgb(params_xgb, 
                                           X_train, y_train, X_val, y_val, 
                                           num_eval,
                                           params_fixed=params_fixed_xgb,
                                           rstate=63259)
best_params_xgb

100%|██████████| 100/100 [7:47:08<00:00, 280.28s/trial, best loss: -0.7895740700307722]  
Time elapsed: 28028.48750 s


{'colsample_bytree': 0.88063766163331,
 'learning_rate': 0.039936412196415395,
 'max_depth': 6.0,
 'min_child_weight': 2.0,
 'reg_lambda': 5.753974775049421,
 'subsample': 0.6284760348606167}

In [20]:
{'colsample_bytree': 0.88063766163331,
 'learning_rate': 0.039936412196415395,
 'max_depth': 6.0,
 'min_child_weight': 2.0,
 'reg_lambda': 5.753974775049421,
 'subsample': 0.6284760348606167}

{'colsample_bytree': 0.88063766163331,
 'learning_rate': 0.039936412196415395,
 'max_depth': 6.0,
 'min_child_weight': 2.0,
 'reg_lambda': 5.753974775049421,
 'subsample': 0.6284760348606167}

In [21]:
best_params_xgb = whole_to_int(best_params_xgb)

xgb_best = XGBClassifier(**params_fixed_xgb, **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)


xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_le_tuned_01.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_le_tuned_01.pickle")
pickle.dump(xgb_best, open(out_model, "wb"))

AUC of XGBoost model on the train set: 0.88458
AUC of XGBoost model on the evaluation set: 0.78957


# LightGBM

In [None]:
time_start = time.time()

X_full_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)
print("X_full_train.isnull().sum().sum:", X_full_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

y_full_train = X_full_train["APPL_TARGET"].values
X_full_train = X_full_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

features = X_full_train.columns.to_list()
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")

print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [None]:
le = LabelEncoder(to_array=True)
X_full_train = le.fit_transform(X_full_train)
X_test = le.transform(X_test)
print("X_full_train.shape", X_full_train.shape)
print("X_test.shape", X_test.shape)

cat_col_idx = le.get_cat_cols()

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=4112015)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

## Baseline model

In [22]:
time_start = time.time()

lgbm = LGBMClassifier(device="gpu", categorical_feature=cat_col_idx)
lgbm.fit(X_train, y_train)

auc_lgbm_train = roc_auc(lgbm, X_train, y_train)
print("AUC of LightGBM model on the train set: %0.5f" % auc_lgbm_train)

auc_lgbm_val = roc_auc(lgbm, X_val, y_val)
print("AUC of LightGBM model on the validation set: %0.5f" % auc_lgbm_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


AUC of LightGBM model on the train set: 0.84583
AUC of LightGBM model on the validation set: 0.78232
Time elapsed: 77.44414 s


## Tuning using `hyperopt`

### `gbtree`

In [23]:
params_lgbm = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 10, 200, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 500, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed_lgbm = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500,
    "categorical_feature": cat_col_idx
}

num_eval = 100

trials_lgbm, best_params_lgbm = hyperopt_lgbm(params_lgbm, 
                                              X_train, y_train, X_val, y_val, 
                                              num_eval,
                                              params_fixed=params_fixed_lgbm,
                                              rstate=31029)
best_params_lgbm

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  1%|          | 1/100 [01:36<2:39:27, 96.64s/trial, best loss: -0.7223299129849375]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  2%|▏         | 2/100 [02:40<2:21:36, 86.70s/trial, best loss: -0.7861111262512284]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  3%|▎         | 3/100 [03:40<2:07:24, 78.81s/trial, best loss: -0.7861111262512284]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  4%|▍         | 4/100 [06:44<2:56:42, 110.44s/trial, best loss: -0.7861111262512284]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  5%|▌         | 5/100 [09:06<3:09:36, 119.75s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  6%|▌         | 6/100 [10:21<2:46:45, 106.45s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  7%|▋         | 7/100 [12:22<2:51:41, 110.77s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  8%|▊         | 8/100 [14:19<2:52:53, 112.76s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



  9%|▉         | 9/100 [15:19<2:27:00, 96.93s/trial, best loss: -0.7879672476161173] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 10%|█         | 10/100 [16:02<2:01:09, 80.77s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 11%|█         | 11/100 [17:34<2:04:22, 83.85s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 12%|█▏        | 12/100 [19:06<2:06:41, 86.38s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 13%|█▎        | 13/100 [20:58<2:16:21, 94.05s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 14%|█▍        | 14/100 [22:46<2:20:52, 98.29s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 15%|█▌        | 15/100 [24:47<2:28:44, 104.99s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 16%|█▌        | 16/100 [25:51<2:09:56, 92.81s/trial, best loss: -0.7879672476161173] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 17%|█▋        | 17/100 [27:17<2:05:25, 90.67s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 18%|█▊        | 18/100 [29:33<2:22:30, 104.28s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 19%|█▉        | 19/100 [30:19<1:57:23, 86.95s/trial, best loss: -0.7879672476161173] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 20%|██        | 20/100 [33:16<2:31:58, 113.98s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 21%|██        | 21/100 [34:48<2:21:27, 107.44s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 22%|██▏       | 22/100 [37:07<2:31:40, 116.67s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 23%|██▎       | 23/100 [40:10<2:55:21, 136.64s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 24%|██▍       | 24/100 [41:50<2:39:14, 125.71s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 25%|██▌       | 25/100 [43:28<2:26:42, 117.37s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 26%|██▌       | 26/100 [46:16<2:43:27, 132.53s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 27%|██▋       | 27/100 [47:46<2:25:51, 119.89s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 28%|██▊       | 28/100 [49:23<2:15:23, 112.83s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 29%|██▉       | 29/100 [50:50<2:04:38, 105.33s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 30%|███       | 30/100 [52:16<1:56:07, 99.53s/trial, best loss: -0.7879672476161173] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 31%|███       | 31/100 [53:57<1:54:41, 99.74s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 32%|███▏      | 32/100 [55:28<1:50:17, 97.32s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 33%|███▎      | 33/100 [57:06<1:48:42, 97.35s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 34%|███▍      | 34/100 [58:53<1:50:21, 100.32s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 35%|███▌      | 35/100 [59:56<1:36:37, 89.19s/trial, best loss: -0.7879672476161173] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 36%|███▌      | 36/100 [1:01:05<1:28:43, 83.19s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 37%|███▋      | 37/100 [1:02:52<1:34:40, 90.17s/trial, best loss: -0.7879672476161173]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 38%|███▊      | 38/100 [1:03:50<1:23:22, 80.69s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 39%|███▉      | 39/100 [1:04:41<1:12:54, 71.71s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 40%|████      | 40/100 [1:05:31<1:05:06, 65.11s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 41%|████      | 41/100 [1:06:21<59:39, 60.67s/trial, best loss: -0.7880748600064615]  

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 42%|████▏     | 42/100 [1:07:24<59:21, 61.41s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 43%|████▎     | 43/100 [1:08:33<1:00:23, 63.56s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 44%|████▍     | 44/100 [1:09:49<1:02:52, 67.36s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 45%|████▌     | 45/100 [1:10:45<58:26, 63.76s/trial, best loss: -0.7880748600064615]  

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 46%|████▌     | 46/100 [1:11:28<52:00, 57.78s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 47%|████▋     | 47/100 [1:13:01<1:00:19, 68.29s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 48%|████▊     | 48/100 [1:13:54<55:09, 63.64s/trial, best loss: -0.7880748600064615]  

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 49%|████▉     | 49/100 [1:14:46<51:14, 60.28s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 50%|█████     | 50/100 [1:15:51<51:15, 61.52s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 51%|█████     | 51/100 [1:17:11<54:47, 67.09s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 52%|█████▏    | 52/100 [1:18:20<54:14, 67.79s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 53%|█████▎    | 53/100 [1:19:03<47:12, 60.26s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 54%|█████▍    | 54/100 [1:19:45<42:00, 54.80s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 55%|█████▌    | 55/100 [1:20:44<41:58, 55.97s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 56%|█████▌    | 56/100 [1:22:11<48:01, 65.48s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 57%|█████▋    | 57/100 [1:23:28<49:12, 68.66s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 58%|█████▊    | 58/100 [1:24:45<49:49, 71.17s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 59%|█████▉    | 59/100 [1:25:42<45:46, 66.99s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 60%|██████    | 60/100 [1:26:41<43:07, 64.69s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 61%|██████    | 61/100 [1:28:30<50:39, 77.93s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 62%|██████▏   | 62/100 [1:29:21<44:14, 69.85s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 63%|██████▎   | 63/100 [1:31:02<48:45, 79.07s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 64%|██████▍   | 64/100 [1:33:20<58:10, 96.96s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 65%|██████▌   | 65/100 [1:34:40<53:38, 91.94s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 66%|██████▌   | 66/100 [1:36:25<54:14, 95.73s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 67%|██████▋   | 67/100 [1:38:06<53:28, 97.24s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 68%|██████▊   | 68/100 [1:40:00<54:29, 102.18s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 69%|██████▉   | 69/100 [1:41:51<54:11, 104.88s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 70%|███████   | 70/100 [1:43:54<55:12, 110.41s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 71%|███████   | 71/100 [1:45:25<50:36, 104.71s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 72%|███████▏  | 72/100 [1:46:52<46:18, 99.24s/trial, best loss: -0.7880748600064615] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 73%|███████▎  | 73/100 [1:48:22<43:29, 96.64s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 74%|███████▍  | 74/100 [1:50:46<47:56, 110.63s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 75%|███████▌  | 75/100 [1:52:40<46:30, 111.62s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 76%|███████▌  | 76/100 [1:54:26<44:02, 110.09s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 77%|███████▋  | 77/100 [1:55:58<40:04, 104.54s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 78%|███████▊  | 78/100 [1:57:18<35:40, 97.28s/trial, best loss: -0.7880748600064615] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 79%|███████▉  | 79/100 [1:58:12<29:28, 84.22s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 80%|████████  | 80/100 [1:58:58<24:18, 72.94s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 81%|████████  | 81/100 [2:01:09<28:33, 90.18s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 82%|████████▏ | 82/100 [2:03:32<31:48, 106.02s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 83%|████████▎ | 83/100 [2:05:39<31:50, 112.36s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 84%|████████▍ | 84/100 [2:07:58<32:07, 120.48s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 85%|████████▌ | 85/100 [2:08:47<24:43, 98.92s/trial, best loss: -0.7880748600064615] 

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 86%|████████▌ | 86/100 [2:09:30<19:10, 82.21s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 87%|████████▋ | 87/100 [2:11:24<19:50, 91.60s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 88%|████████▊ | 88/100 [2:12:34<17:03, 85.31s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 89%|████████▉ | 89/100 [2:13:31<14:04, 76.74s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 90%|█████████ | 90/100 [2:14:44<12:34, 75.43s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 91%|█████████ | 91/100 [2:15:46<10:43, 71.48s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 92%|█████████▏| 92/100 [2:17:32<10:55, 81.91s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 93%|█████████▎| 93/100 [2:18:17<08:14, 70.70s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 94%|█████████▍| 94/100 [2:20:33<09:02, 90.47s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 95%|█████████▌| 95/100 [2:21:48<07:08, 85.67s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 96%|█████████▌| 96/100 [2:23:20<05:51, 87.78s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 97%|█████████▋| 97/100 [2:25:15<04:47, 95.69s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 98%|█████████▊| 98/100 [2:26:17<02:51, 85.73s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



 99%|█████████▉| 99/100 [2:27:39<01:24, 84.69s/trial, best loss: -0.7880748600064615]

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))



100%|██████████| 100/100 [2:28:33<00:00, 89.13s/trial, best loss: -0.7880748600064615]
Time elapsed: 8913.33204 s


{'colsample_bytree': 0.667434542409391,
 'learning_rate': 0.09146817439402177,
 'max_depth': 4.0,
 'min_child_samples': 380.0,
 'num_leaves': 160.0,
 'reg_lambda': 197.79685074014847,
 'subsample': 0.504377097169451}

In [24]:
{'colsample_bytree': 0.667434542409391,
 'learning_rate': 0.09146817439402177,
 'max_depth': 4.0,
 'min_child_samples': 380.0,
 'num_leaves': 160.0,
 'reg_lambda': 197.79685074014847,
 'subsample': 0.504377097169451}

{'colsample_bytree': 0.667434542409391,
 'learning_rate': 0.09146817439402177,
 'max_depth': 4.0,
 'min_child_samples': 380.0,
 'num_leaves': 160.0,
 'reg_lambda': 197.79685074014847,
 'subsample': 0.504377097169451}

In [25]:
best_params_lgbm = whole_to_int(best_params_lgbm)

lgbm_best = LGBMClassifier(**params_fixed_lgbm, **best_params_lgbm)
lgbm_best.fit(X_train, y_train)

auc_lgbm_train = roc_auc(lgbm_best, X_train, y_train)
print("AUC of LightGBM model on the train set: %0.5f" % auc_lgbm_train)

auc_lgbm_val = roc_auc(lgbm_best, X_val, y_val)
print("AUC of LightGBM model on the evaluation set: %0.5f" % auc_lgbm_val)


lgbm_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_data5_le_tuned_01.csv")
write_submit_csv(lgbm_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_data5_le_tuned_01.pickle")
pickle.dump(lgbm_best, open(out_model, "wb"))

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


AUC of LightGBM model on the train set: 0.83661
AUC of LightGBM model on the evaluation set: 0.78808


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
