In [1]:
import warnings
warnings.filterwarnings('ignore')

import time
import os
import pickle
import copy
import glob
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Helper functions

In [2]:
def change_dtype_ser(ser):
    """
    Downcast a single column to a smaller dtype to reduce memory usage.

    - columns of the default integer dtype become int32, but only when every
      value fits in int32; otherwise the column is returned unchanged instead
      of silently wrapping around;
    - columns of the default float dtype become float32 (precision is reduced);
    - object columns that contain repeated values become "category";
    - any other dtype is returned unchanged.

    :param ser: pandas Series
    :return: pandas Series, downcast when possible, otherwise unchanged
    :raises TypeError: if an object column has no repeated value
    """
    if ser.dtype == int:
        int32_info = np.iinfo(np.int32)
        # An empty column has no min/max to inspect and is trivially safe to cast.
        if ser.empty or (ser.min() >= int32_info.min and ser.max() <= int32_info.max):
            return ser.astype(np.int32)
        return ser

    if ser.dtype == float:
        return ser.astype(np.float32)

    if ser.dtype == "object":
        if ser.nunique() < ser.shape[0]:
            return ser.astype("category")
        # str() so that an unnamed series (name is None) still gives the intended message
        raise TypeError(str(ser.name) + ": type is object but are all distinct")

    return ser
    


def change_dtype_df(df):
    """
    Downcast every column of a dataframe and report memory before and after.

    The columns are replaced on the dataframe that was passed in, so the
    caller's object is modified.

    :param df: dataframe
    :return df: the same dataframe, with downcast columns
    """
    def _megabytes(frame):
        return frame.memory_usage().sum() / 10**6

    print("Memory usage before changing types %0.2f MB" % _megabytes(df))

    for name in df.columns:
        df[name] = change_dtype_ser(df[name])

    print("Memory usage after changing types %0.2f MB" % _megabytes(df))
    return df


def load_csv(filename):
    """
    Read a csv file and downcast its columns to reduce memory usage.

    :param filename: str, path of the csv file
    :return: dataframe
    """
    return change_dtype_df(pd.read_csv(filename))

In [3]:
class Standardizer(BaseEstimator, TransformerMixin):
    """
    Standardize the numeric columns of a dataframe with the mean and std
    learned on the train set.

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name, which is what
        # BaseEstimator.get_params() (and therefore clone() and repr()) reads.
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """
        Learn the mean and std of every numeric column.

        :param df_train: dataframe
        :param y: ignored, accepted for compatibility with the sklearn fit(X, y) convention
        :return: self
        """
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """
        Center and scale the columns seen in fit(), casting them to float32.

        Columns whose std is not positive are only centered, and a warning is
        printed. NOTE: the columns of `df` are overwritten in place, so the
        caller's dataframe is modified.

        :param df: dataframe containing the numeric columns seen in fit()
        :return: dataframe, or float32 array if to_array is True
        """
        for col, mean in self._mean.items():
            std = self._std[col]
            if std > 0:
                df[col] = ((df[col] - mean) / std).astype("float32")
            else:
                print("WARNING: " + col + " has zero std.")
                df[col] = (df[col] - mean).astype("float32")

        if self.to_array:
            return df.values.astype(np.float32)
        return df

In [4]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    """
    Replace every categorical column by integer codes learned on the train set.

    Labels that cannot be mapped at transform time are replaced by the code of
    the most frequent train label of that column.

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name, which is what
        # BaseEstimator.get_params() (and therefore clone() and repr()) reads.
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """
        Learn, for every categorical column, the label -> code map and the
        code used to impute unmapped labels.

        :param df_train: dataframe
        :param y: ignored, accepted for compatibility with the sklearn fit(X, y) convention
        :return: self
        """
        self._train_cols = df_train.columns.to_list()

        cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()

        # positions of the categorical columns in the train column order
        cat_col_set = set(cat_cols)
        self._cat_col_idx = [i for i, col in enumerate(self._train_cols) if col in cat_col_set]

        self._label_maps = {}
        self._missing_imputers = {}
        for col in cat_cols:
            label = df_train[col].unique()
            self._label_maps[col] = {c: n for n, c in enumerate(label)}

            # unmapped labels are imputed with the code of the most frequent train label
            mode_label = df_train[col].mode().iloc[0]
            self._missing_imputers[col] = self._label_maps[col][mode_label]
        return self

    def transform(self, df):
        """
        Encode the categorical columns and put the columns in train order.

        NOTE: the categorical columns of `df` are overwritten in place before
        the reordered frame is built. The output column names are stored in
        `self._features`.

        :param df: dataframe with the same set of columns as the train set
        :return: dataframe, or float32 array if to_array is True
        """
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "do not have the same set of columns as train"

        for col, label_map in self._label_maps.items():
            df[col] = df[col].map(label_map)
            if df[col].isnull().any():
                df[col] = df[col].astype(np.float32).fillna(self._missing_imputers[col])

        # align columns
        df = df[self._train_cols]

        self._features = df.columns.to_list()
        if self.to_array:
            return df.values.astype(np.float32)
        return df

    def get_cat_cols(self):
        """
        :return: list of int, positions of the categorical columns in the train column order
        """
        return self._cat_col_idx
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode the categorical columns of a dataframe, keeping exactly the
    dummy columns produced on the train set (first level of each column dropped).

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name, which is what
        # BaseEstimator.get_params() (and therefore clone() and repr()) reads.
        self.to_array = to_array

    def fit(self, train_df, y=None):
        """
        Record the train columns and the dummy columns they produce.

        :param train_df: dataframe
        :param y: ignored, accepted for compatibility with the sklearn fit(X, y) convention
        :return: self
        """
        self._cols_before = train_df.columns.to_list()

        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()

        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []

        num_cols = [col for col in train_df.columns if col not in self._cat_cols]
        self._cols_after = num_cols + self._cat_cols_ohe

        return self

    def _output(self, df):
        """Record the output feature names and return `df` in the configured form."""
        self._features = df.columns.to_list()
        if self.to_array:
            return df.values.astype(np.float32)
        return df

    def transform(self, df):
        """
        One-hot encode `df` so that it has exactly the train columns, in train order.

        Dummy columns unknown to the train set are dropped; train dummy columns
        absent from `df` are added as all-zero float32 columns. The output
        column names are stored in `self._features`.

        :param df: dataframe with the same set of columns as the train set
        :return: dataframe, or float32 array if to_array is True
        """
        cols_before = df.columns.to_list()
        assert set(cols_before) == set(self._cols_before), "Do not have the same columns as train before transformed"

        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            # Nothing to encode, but still align the columns, record the
            # feature names and honour to_array like the regular path does.
            return self._output(df[self._cols_after])

        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"

        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop cols that are present in test_df but absent in train_df
        # (this also removes the first level of each column, dropped at fit time)
        train_dummies = set(self._cat_cols_ohe)
        cols_to_drop = [col for col in df_cat.columns if col not in train_dummies]
        df_cat = df_cat.drop(cols_to_drop, axis="columns")

        # change to float32
        df_cat = df_cat.astype("float32")

        # if some columns are absent in test but present in train, make them all zero
        # (float32, like every other dummy column)
        present = set(df_cat.columns)
        for col in self._cat_cols_ohe:
            if col not in present:
                df_cat[col] = np.zeros(df_cat.shape[0], dtype=np.float32)

        num_cols = [col for col in df.columns if col not in cat_cols]
        cols_after = num_cols + df_cat.columns.to_list()
        assert set(cols_after) == set(self._cols_after), "Do not have the same columns as train after transformed"

        df_num = df[num_cols]

        df = pd.concat([df_num, df_cat], axis="columns")
        # align columns
        df = df[self._cols_after]

        return self._output(df)

In [5]:
def roc_auc(estimator, X_eval, y_eval):
    """
    ROC AUC of a fitted classifier, scored with the second column of predict_proba().

    :param estimator: sklearn estimator that has a predict_proba() method
    :param X_eval: evaluation features
    :param y_eval: evaluation target
    :return: float
    """
    second_class_scores = estimator.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, second_class_scores)


def write_submit_csv(estimator, X_test, id_test, out):
    """
    Write a submission csv: the id columns plus a "TARGET" column holding the
    second column of predict_proba(). `id_test` itself is left untouched.

    :param estimator: a sklearn estimator that has a predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    scores = estimator.predict_proba(X_test)[:, 1]
    # assign() works on a copy, so the caller's id frame gets no TARGET column
    id_test.assign(TARGET=scores).to_csv(out, index=False)
    return None


def feature_importance_df(estimator, features):
    """
    Table of feature importances, most important first, with a 1-based rank.

    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, list of feature names
    :return: feature_imp, dataframe with columns "feature", "importance", "rank"
    """
    ranked = pd.DataFrame(
        {"feature": features, "importance": estimator.feature_importances_}
    ).sort_values(by=["importance"], ascending=False)

    ranked["rank"] = np.arange(1, ranked.shape[0] + 1)
    return ranked

In [6]:
def run_hyperopt(classifier,
                 params_tuned,
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    """
    Tune a classifier with hyperopt's TPE algorithm, maximizing validation ROC AUC.

    The classifier object is reconfigured and refitted in place for every
    trial; the returned best model is that same object, set to the best
    parameters and fitted on the train split.

    :param classifier: estimator with set_params(), fit() and predict_proba()
    :param params_tuned: dict, hyperopt search space keyed by parameter name
    :param X_train: train features
    :param y_train: train target
    :param X_val: validation features
    :param y_val: validation target
    :param num_eval: int, number of hyperopt evaluations
    :param params_fixed: dict of parameters held constant,
                         defaults to {"n_jobs": 20, "n_estimators": 100}
    :param rstate: int seed for the search, or None
    :return: (trials, best_params, best_model)
    """
    # hyperopt is already a dependency of this notebook; imported here because
    # the top-level import cell does not bring in space_eval.
    from hyperopt import space_eval

    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}

    def objective(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)

        auc = roc_auc(classifier, X_val, y_val)
        # hyperopt minimizes the loss, so the AUC is negated
        return {"loss": -auc, "status": STATUS_OK}

    if rstate is not None:
        # NOTE(review): recent hyperopt releases appear to expect a
        # np.random.Generator (np.random.default_rng) here - confirm against
        # the installed version.
        rstate = np.random.RandomState(rstate)

    trials = Trials()
    best_assignment = fmin(objective,
                           params_tuned,
                           algo=tpe.suggest,
                           max_evals=num_eval,
                           trials=trials,
                           rstate=rstate)

    # fmin returns raw label values (floats for quniform, indices for
    # hp.choice). Evaluating the search space at that point yields exactly
    # what the objective received for the best trial.
    best_params = space_eval(params_tuned, best_assignment)
    best_model = classifier.set_params(**params_fixed, **best_params)
    best_model.fit(X_train, y_train)

    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)

    return trials, best_params, best_model


In [7]:
def whole_to_int(a_dict):
    """
    Return a deep copy of a dict in which numeric values that are whole numbers
    (up to np.isclose tolerance) are replaced by the nearest Python int.

    Non-numeric values (str, None, ...), booleans and non-finite numbers are
    left unchanged. The input dict is not modified.

    :param a_dict: dict
    :return: dict
    """
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        # bool is a subclass of int, but a flag must stay a flag
        if isinstance(v, (bool, np.bool_)):
            continue
        if not isinstance(v, (int, float, np.integer, np.floating)):
            continue
        # nan / inf cannot be converted to int
        if not np.isfinite(v):
            continue
        nearest = np.round(v)
        if np.isclose(nearest, v):
            # convert the rounded value: int(v) would truncate 2.9999999999 to 2
            new_dict[k] = int(nearest)
    return new_dict


def averaging_y_hat(submit_csv_files):
    """
    Average the "TARGET" predictions of several submission csv files.

    Predictions are matched on "SK_ID_CURR", so the files may list the ids in
    different orders. The result keeps the ids and the row order of the first
    file; an id missing from any file gets a NaN average.

    :param submit_csv_files: list of str, csv files with columns "SK_ID_CURR" and "TARGET"
    :return: dataframe with columns "SK_ID_CURR" and "TARGET"
    :raises ValueError: if no file is given
    """
    if len(submit_csv_files) == 0:
        raise ValueError("submit_csv_files must contain at least one file")

    y_hats = [pd.read_csv(f) for f in submit_csv_files]
    ids = y_hats[0]["SK_ID_CURR"]

    total = np.zeros(len(ids), dtype=float)
    for y in y_hats:
        # align on the id rather than on the row position
        aligned = y.set_index("SK_ID_CURR")["TARGET"].reindex(ids)
        total += aligned.to_numpy(dtype=float)

    # explicit copy: do not assign into a slice of the first dataframe
    result = y_hats[0][["SK_ID_CURR"]].copy()
    result["TARGET"] = total / len(y_hats)
    return result

In [8]:
def check_same_cols(df1, df2):
    """
    Check that two dataframes have the same columns in the same order.

    :param df1: dataframe
    :param df2: dataframe
    :return: None
    :raises AssertionError: if the column sets differ, or the column lists
                            differ in order or length
    """
    cols1 = df1.columns.to_list()
    cols2 = df2.columns.to_list()

    if set(cols1) != set(cols2):
        raise AssertionError("dataframes do not have the same set of columns")
    # Compare the full lists: a pairwise zip() would stop at the shorter list
    # and miss extra (duplicated) columns.
    if cols1 != cols2:
        raise AssertionError("dataframes do not have their columns in the same order")

    return None

In [9]:
# Directory the input csv files (X_y_train.csv, X_test.csv) are read from.
IN_DIR = "data/data_"
# Directory the submission csv files are written to.
SUB_DIR = "data/submit_"
# Directory the fitted models are pickled to.
MODELS_DIR = "data/models_"

# Logistic regression

In [10]:
# Data preparation for logistic regression: load, one-hot encode, standardize, split.
time_start = time.time()

# The train file holds the features together with the target column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target and drop the id column, which is not a feature.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

# Keep the test ids aside: they are needed to write the submission file.
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# One-hot encode as dataframes (to_array defaults to False) so that the
# Standardizer below can still select the numeric columns.
ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Column names after encoding (set by the last transform() call).
features = ohe._features


# NOTE(review): the scaler is fitted before the train/validation split below,
# so the validation rows contribute to the mean/std used for scaling.
scaler = Standardizer(to_array=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)


# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=40293)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
(246008, 2364) (246008,) (61503, 2364) (61503,)
Time elapsed: 1753.14431 s


In [11]:
# Baseline: logistic regression with otherwise default settings, fitted on the train split.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning from the 100-iteration limit would not be visible here.
lr = LogisticRegression(max_iter=100, n_jobs=20)
lr.fit(X_train, y_train)

LogisticRegression(n_jobs=20)

In [12]:
# Score the model fitted on the train split.
auc_train = roc_auc(lr, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(lr, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation before predicting the test set.
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

# Submitting this gives public AUC = 0.77271 and private AUC = 0.77530
write_submit_csv(lr, X_test, sk_id_test, os.path.join(SUB_DIR, "lr.csv"))

AUC of the train set: 0.79685
AUC of the validation set: 0.78364


# Random forest

In [14]:
# Data preparation for the random forest: load, one-hot encode to arrays, split.
# No standardization is applied in this cell.
time_start = time.time()

# The train file holds the features together with the target column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target and drop the id column, which is not a feature.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

# Keep the test ids aside: they are needed to write the submission file.
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# Encode straight to float32 arrays.
ohe = OneHotEncoder(to_array=True)
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Column names after encoding (set by the last transform() call).
features = ohe._features


# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=40293)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
(246008, 2364) (246008,) (61503, 2364) (61503,)
Time elapsed: 1872.57600 s


In [16]:
# Search space for the random forest: every parameter is drawn on an integer
# grid (hp.quniform) and cast to int (scope.int). Labels match the dict keys.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 20, 1)),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 20, 400, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 20, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 200, 1)),
}

# Parameters held constant across all trials.
params_fixed = {
    "n_jobs": 20,
    "n_estimators": 100
}


# NOTE(review): neither rstate nor a classifier random_state is set, so the
# search is not guaranteed to be reproducible from run to run.
num_eval = 100
rf = RandomForestClassifier()
trials, best_params, best_model = run_hyperopt(rf, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 100/100 [1:50:24<00:00, 66.24s/trial, best loss: -0.7625559894891252]
Time elapsed: 6698.88609 s


{'max_depth': 10,
 'max_features': 83,
 'min_samples_leaf': 110,
 'min_samples_split': 20}

In [None]:
# Best random-forest hyperparameters, as displayed by the search cell; kept for reference.
{'max_depth': 10,
 'max_features': 83,
 'min_samples_leaf': 110,
 'min_samples_split': 20}

In [18]:
# Fit the tuned model on the train split and score it.
best_model.fit(X_train, y_train)

auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation before saving the model and predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_model = os.path.join(MODELS_DIR, "rf.pickle")
# Context manager: the file is flushed and closed even if pickling fails.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)


out_sub = os.path.join(SUB_DIR, "rf.csv")
# Submitting this gives public AUC = 0.74792 and private AUC = 0.74145
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

AUC of the train set: 0.79322
AUC of the validation set: 0.76269


# XGBoost

## One-hot encoding

In [19]:
# Data preparation for XGBoost with one-hot encoding: load, encode to arrays, split.
time_start = time.time()

# The train file holds the features together with the target column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target and drop the id column, which is not a feature.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

# Keep the test ids aside: they are needed to write the submission file.
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# Encode straight to float32 arrays.
ohe = OneHotEncoder(to_array=True)
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Column names after encoding (set by the last transform() call).
features = ohe._features


# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=40293)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
(246008, 2364) (246008,) (61503, 2364) (61503,)
Time elapsed: 1850.51061 s


In [20]:
# Search space for XGBoost on one-hot features. Tree-shape parameters are
# integers; the regularization strengths and the learning rate are sampled
# log-uniformly between the given bounds.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 20, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    "gamma": hp.uniform("gamma", 0., 5.),
}

# Parameters held constant across all trials (GPU training and prediction).
# NOTE(review): "gpu_hist" / "predictor" may be deprecated in newer XGBoost
# releases - confirm against the installed version.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

# NOTE(review): no rstate is passed, so the search is not guaranteed to be
# reproducible from run to run.
num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [20:38:48<00:00, 371.64s/trial, best loss: -0.7986753448393237]   
Time elapsed: 74747.66016 s


{'colsample_bytree': 0.5921658797363964,
 'gamma': 2.81393985204685,
 'learning_rate': 0.037238669538423745,
 'max_depth': 7,
 'min_child_weight': 8,
 'reg_alpha': 13.49762560054903,
 'reg_lambda': 3.7989514085640046,
 'subsample': 0.828082657230721}

In [22]:
# Best XGBoost (one-hot) hyperparameters, as displayed by the search cell; kept for reference.
{'colsample_bytree': 0.5921658797363964,
 'gamma': 2.81393985204685,
 'learning_rate': 0.037238669538423745,
 'max_depth': 7,
 'min_child_weight': 8,
 'reg_alpha': 13.49762560054903,
 'reg_lambda': 3.7989514085640046,
 'subsample': 0.828082657230721}

{'colsample_bytree': 0.5921658797363964,
 'gamma': 2.81393985204685,
 'learning_rate': 0.037238669538423745,
 'max_depth': 7,
 'min_child_weight': 8,
 'reg_alpha': 13.49762560054903,
 'reg_lambda': 3.7989514085640046,
 'subsample': 0.828082657230721}

In [21]:
# Score the tuned model (already fitted on the train split by run_hyperopt).
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation before saving the model and predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_model = os.path.join(MODELS_DIR, "xgb_ohe.pickle")
# Context manager: the file is flushed and closed even if pickling fails.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)


# Submitting this gives public AUC = 0.79186 and private AUC = 0.79108
out_sub = os.path.join(SUB_DIR, "xgb_ohe.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

AUC of the train set: 0.87957
AUC of the validation set: 0.79802


## Label encoding

In [10]:
# Data preparation for XGBoost with label encoding: load, encode to arrays, split.
time_start = time.time()

# The train file holds the features together with the target column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target and drop the id column, which is not a feature.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

# Keep the test ids aside: they are needed to write the submission file.
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# Replace categorical columns by integer codes; the column count is unchanged.
le = LabelEncoder(to_array=True)
X_train = le.fit_transform(X_train)
X_test = le.transform(X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Positions of the categorical columns, and the column names after encoding
# (set by the last transform() call).
cat_col_idx = le.get_cat_cols()
features = le._features


# Hold out 20% of the rows for validation, stratified on the target.
# This cell uses a different random_state than the one-hot cells, so the
# validation rows are not the same.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=52094)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
(246008, 2044) (246008,) (61503, 2044) (61503,)
Time elapsed: 1341.80533 s


In [24]:
# Search space for XGBoost on label-encoded features. Compared with the
# one-hot search, gamma is not tuned (left commented out) and the lower
# bounds of the regularization strengths are smaller.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

# Parameters held constant across all trials (GPU training and prediction).
# NOTE(review): "gpu_hist" / "predictor" may be deprecated in newer XGBoost
# releases - confirm against the installed version.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

# NOTE(review): no rstate is passed, so the search is not guaranteed to be
# reproducible from run to run.
num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [18:41:32<00:00, 336.46s/trial, best loss: -0.7901599783150772]  
Time elapsed: 67704.15255 s


{'colsample_bytree': 0.6701944459124302,
 'learning_rate': 0.09002824038085307,
 'max_depth': 8,
 'min_child_weight': 8,
 'reg_alpha': 1.1615938969390502,
 'reg_lambda': 1488.7791790041074,
 'subsample': 0.9508741452720249}

In [21]:
# Best XGBoost (label-encoded) hyperparameters, as displayed by the search cell; kept for reference.
{'colsample_bytree': 0.6701944459124302,
 'learning_rate': 0.09002824038085307,
 'max_depth': 8,
 'min_child_weight': 8,
 'reg_alpha': 1.1615938969390502,
 'reg_lambda': 1488.7791790041074,
 'subsample': 0.9508741452720249}

{'colsample_bytree': 0.6701944459124302,
 'learning_rate': 0.09002824038085307,
 'max_depth': 8,
 'min_child_weight': 8,
 'reg_alpha': 1.1615938969390502,
 'reg_lambda': 1488.7791790041074,
 'subsample': 0.9508741452720249}

In [17]:
# Score the tuned model (already fitted on the train split by run_hyperopt).
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation before saving the model and predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_model = os.path.join(MODELS_DIR, "xgb_le.pickle")
# Context manager: the file is flushed and closed even if pickling fails.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

# Submitting this gives public AUC = 0.79088 and private AUC = 0.79006
out_sub = os.path.join(SUB_DIR, "xgb_le.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

AUC of the train set: 0.87326
AUC of the validation set: 0.78880


# LightGBM

## Label encoding

In [18]:
# Data preparation for LightGBM with label encoding: load, encode to arrays, split.
time_start = time.time()

# The train file holds the features together with the target column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target and drop the id column, which is not a feature.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

# Keep the test ids aside: they are needed to write the submission file.
sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# Replace categorical columns by integer codes; the column count is unchanged.
le = LabelEncoder(to_array=True)
X_train = le.fit_transform(X_train)
X_test = le.transform(X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Positions of the categorical columns (passed to LightGBM in the search
# cell) and the column names after encoding.
cat_col_idx = le.get_cat_cols()
features = le._features


# Hold out 20% of the rows for validation, stratified on the target.
# This cell uses its own random_state, so the validation rows differ from
# those of the other model sections.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=16039)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
(246008, 2044) (246008,) (61503, 2044) (61503,)
Time elapsed: 1457.93122 s


In [19]:
# Search space for LightGBM on label-encoded features. Tree-shape parameters
# are integers; the regularization strengths and the learning rate are
# sampled log-uniformly between the given bounds.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 10, 200, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 500, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.1), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# Parameters held constant across all trials. The categorical columns are
# identified by their positions in the encoded array.
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500,
    "categorical_feature": cat_col_idx
}

# NOTE(review): no rstate is passed, so the search is not guaranteed to be
# reproducible from run to run.
num_eval = 200

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [5:09:48<00:00, 92.94s/trial, best loss: -0.7950186734642588]   
Time elapsed: 18653.15159 s


{'colsample_bytree': 0.4661559641143767,
 'learning_rate': 0.05414702094866219,
 'max_depth': 7,
 'min_child_samples': 390,
 'num_leaves': 75,
 'reg_alpha': 0.2621812996397479,
 'reg_lambda': 191.60321812828326,
 'subsample': 0.6990613159642727}

In [20]:
# Score the tuned model (already fitted on the train split by run_hyperopt).
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation before saving the model and predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_model = os.path.join(MODELS_DIR, "lgb_le.pickle")
# Context manager: the file is flushed and closed even if pickling fails.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

# Submitting this gives public AUC = 0.79186 and private AUC = 0.79108
# NOTE(review): these two scores are identical to the ones recorded for the
# XGBoost one-hot submission - confirm they were not copied from that cell.
out_sub = os.path.join(SUB_DIR, "lgbm_le.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

AUC of the train set: 0.87621
AUC of the validation set: 0.79502
