In [1]:
import time
import os
import pickle
import copy
import glob
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Helper functions

In [2]:
def change_dtype_ser(ser):
    """
    narrow the dtype of a single column to reduce memory size
    :param ser: series
    :return: series, int -> int32, float -> float32, object -> category,
             any other dtype is returned unchanged
    :raises TypeError: if an object column has no repeated value
    """
    current = ser.dtype

    # Columns whose dtype equals the builtin int/float are narrowed to 32 bits.
    for wide, narrow in ((int, np.int32), (float, np.float32)):
        if current == wide:
            return ser.astype(narrow)

    if current != "object":
        return ser

    # An object column where every value is distinct is rejected rather than
    # turned into a category.
    if ser.nunique() >= ser.shape[0]:
        raise TypeError(ser.name + ": type is object but are all distinct")
    return ser.astype("category")
    


def change_dtype_df(df):
    """
    Narrow the dtype of every column of a dataframe (in place) and report the
    memory footprint before and after.
    :param df: dataframe, modified in place
    :return df: the same dataframe object
    """
    def _megabytes():
        # memory_usage() is in bytes; report in units of 10**6 bytes
        return df.memory_usage().sum() / 10**6

    print("Memory usage before changing types %0.2f MB" % _megabytes())

    for name in df.columns:
        narrowed = change_dtype_ser(df[name])
        df[name] = narrowed

    print("Memory usage after changing types %0.2f MB" % _megabytes())
    return df


def load_csv(filename):
    """
    Read a csv file and narrow the column dtypes of the result.
    :param filename: str, path of the csv file
    :return: dataframe
    """
    return change_dtype_df(pd.read_csv(filename))

In [3]:
class Standardizer(BaseEstimator, TransformerMixin):
    """
    Standardize the numeric columns of a dataframe using the mean and standard
    deviation learned in fit().

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # (used by clone() and repr()) looks the attribute up by that name.
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """
        :param df_train: dataframe, the statistics are computed on its numeric columns
        :param y: ignored, accepted so the transformer works with fit_transform(X, y)
        :return: self
        """
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """
        :param df: dataframe containing the columns seen in fit(); it is not modified
        :return: dataframe, or float32 array if to_array is True
        """
        # Work on a copy so the caller's frame is untouched and transforming the
        # same frame twice does not standardize it twice. This costs one extra
        # copy of the frame in memory.
        df = df.copy()
        for col, mean in self._mean.items():
            std = self._std[col]
            if std > 0:
                df[col] = ((df[col] - mean) / std).astype("float32")
            else:
                # Reached when std is not a positive number: only center the column.
                print("WARNING: " + col + " has zero std.")
                df[col] = (df[col] - mean).astype("float32")

        if self.to_array:
            return df.values.astype(np.float32)
        return df

In [4]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    """
    Replace the values of categorical/object columns by integer codes learned
    in fit(). Values that have no code are replaced by the code of the most
    frequent training value of that column.

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # (used by clone() and repr()) looks the attribute up by that name.
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """
        :param df_train: dataframe
        :param y: ignored, accepted so the transformer works with fit_transform(X, y)
        :return: self
        """
        all_cols = df_train.columns.to_list()
        cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()

        # positional indices of the categorical columns within the frame
        self._cat_col_idx = [i for i, col in enumerate(all_cols) if col in cat_cols]

        self._label_maps = {}
        self._missing_imputers = {}
        for col in cat_cols:
            labels = df_train[col].unique()
            self._label_maps[col] = {c: n for n, c in enumerate(labels)}

            # code used in transform() for values that have no code
            mode_label = df_train[col].mode().iloc[0]
            self._missing_imputers[col] = self._label_maps[col][mode_label]
        return self

    def transform(self, df):
        """
        :param df: dataframe containing the categorical columns seen in fit();
                   it is not modified
        :return: dataframe, or float32 array if to_array is True
        """
        # Work on a copy: encoding in place would corrupt the caller's frame and
        # a second transform() of the same frame would map the already encoded
        # values to NaN, i.e. impute every row with the mode.
        df = df.copy()
        for col, label_map in self._label_maps.items():
            encoded = df[col].map(label_map)
            if encoded.isnull().any():
                encoded = encoded.astype(np.float32).fillna(self._missing_imputers[col])
            df[col] = encoded

        self._features = df.columns.to_list()
        if self.to_array:
            return df.values.astype(np.float32)
        return df

    def get_cat_cols(self):
        """
        :return: list of int, positional indices of the categorical columns seen in fit()
        """
        return self._cat_col_idx
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode the object/category columns of a dataframe. The set and the
    order of the dummy columns are fixed in fit() (first level of each column
    dropped), so every transformed frame has the same column layout.

    :param to_array: bool, if True transform() returns a float32 numpy array
                     instead of a dataframe
    """
    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # (used by clone() and repr()) looks the attribute up by that name.
        self.to_array = to_array

    def fit(self, train_df, y=None):
        """
        :param train_df: dataframe
        :param y: ignored, accepted so the transformer works with fit_transform(X, y)
        :return: self
        """
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()

        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self

    def transform(self, df):
        """
        :param df: dataframe with the same categorical columns as the frame used in fit()
        :return: dataframe (non-categorical columns followed by the dummy columns),
                 or float32 array if to_array is True
        """
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            result = df
        else:
            df_cat = df.select_dtypes(["object", "category"])
            cat_cols = df_cat.columns.to_list()
            assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"

            # one-hot encode
            df_cat = pd.get_dummies(df_cat)
            # Align with the columns learned in fit(): levels unknown to fit() are
            # dropped, levels absent from df become all-zero columns, and the
            # columns are put in the training order. Appending the absent levels
            # at the end instead would shift columns between train and test.
            df_cat = df_cat.reindex(columns=self._cat_cols_ohe, fill_value=0)
            df_cat = df_cat.astype("float32")

            num_cols = [col for col in df.columns if col not in cat_cols]
            result = pd.concat([df[num_cols], df_cat], axis="columns")

        self._features = result.columns.to_list()
        if self.to_array:
            return result.values.astype(np.float32)
        return result

In [5]:
def roc_auc(estimator, X_eval, y_eval):
    """
    ROC AUC of an estimator, scored on the second column of predict_proba().
    :param estimator: sklearn estimator that have predict_proba() method
    :param X_eval: features to score
    :param y_eval: true target of X_eval
    :return: float
    """
    positive_scores = estimator.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, positive_scores)


def write_submit_csv(estimator, X_test, id_test, out):
    """
    Predict on X_test and write the ids together with the second column of
    predict_proba() (stored as "TARGET") to a csv file without the index.
    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR", left unmodified
    :param out: str, csv output file name
    :return: None
    """
    positive_proba = estimator.predict_proba(X_test)[:, 1]
    submission = id_test.copy().assign(TARGET=positive_proba)
    submission.to_csv(out, index=False)


def feature_importance_df(estimator, features):
    """
    Table of feature importances, most important first, with a 1-based rank.
    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, list of feature names
    :return: feature_imp, dataframe with columns "feature", "importance", "rank"
    """
    ranked = pd.DataFrame(
        {"feature": features, "importance": estimator.feature_importances_}
    ).sort_values(by=["importance"], ascending=False)

    # rank follows the sorted row order, starting at 1
    ranked["rank"] = np.arange(1, ranked.shape[0] + 1)
    return ranked

In [6]:
def run_hyperopt(classifier,
                 params_tuned,
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    """
    Tune a classifier with hyperopt's TPE algorithm, maximizing the validation ROC AUC.

    :param classifier: estimator with set_params(), get_params(), fit() and
                       predict_proba(); it is reconfigured and refitted in place
    :param params_tuned: dict, hyperopt search space
    :param X_train: train features
    :param y_train: train target
    :param X_val: validation features
    :param y_val: validation target
    :param num_eval: int, number of hyperopt evaluations
    :param params_fixed: dict, parameters set on every trial. If None,
                         {"n_jobs": 20, "n_estimators": 100} restricted to the
                         parameters the classifier actually has
    :param rstate: seed, or an already built numpy random state/generator, or None
    :return: (trials, best_params, best_model); best_model is `classifier`
             itself, fitted on the train set with the best parameters
    """
    time_start = time.time()
    if params_fixed is None:
        # set_params() raises on unknown names (e.g. n_estimators for a linear
        # model), so only keep the defaults this classifier supports.
        supported = classifier.get_params()
        defaults = {"n_jobs": 20, "n_estimators": 100}
        params_fixed = {k: v for k, v in defaults.items() if k in supported}

    def objective(params):
        # Merge into one dict so a name present in both does not raise a
        # duplicate-keyword TypeError; the tuned value wins.
        classifier.set_params(**{**params_fixed, **params})
        classifier.fit(X_train, y_train)

        auc = roc_auc(classifier, X_val, y_val)
        # hyperopt minimizes the loss, hence the negated AUC
        return {"loss": -auc, "status": STATUS_OK}

    # Seeds are wrapped; ready-made random state objects are passed through.
    # NOTE(review): recent hyperopt releases appear to expect a
    # np.random.Generator here rather than a RandomState; confirm against the
    # installed version.
    if rstate is not None and not isinstance(rstate, (np.random.RandomState, np.random.Generator)):
        rstate = np.random.RandomState(rstate)

    trials = Trials()
    best_params = fmin(objective,
                       params_tuned,
                       algo=tpe.suggest,
                       max_evals=num_eval,
                       trials=trials,
                       rstate=rstate)

    # fmin reports quantized values as floats; turn whole numbers back into ints
    best_params = whole_to_int(best_params)
    best_model = classifier.set_params(**{**params_fixed, **best_params})
    best_model.fit(X_train, y_train)

    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)

    return trials, best_params, best_model


In [7]:
def whole_to_int(a_dict):
    """
    Return a deep copy of a dict in which every whole-number numeric value is
    a python int (e.g. 7.0 -> 7). Non-whole floats, inf/nan, booleans and
    non-numeric values are kept as they are.

    :param a_dict: dict, left unmodified
    :return: new_dict, dict
    """
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        if isinstance(v, (bool, np.bool_)):
            continue
        if isinstance(v, (int, np.integer)):
            new_dict[k] = int(v)
        # Exact check on purpose: a tolerance-based comparison would truncate
        # values such as 9999.95 or 1e-9 to an int.
        elif isinstance(v, (float, np.floating)) and np.isfinite(v) and float(v).is_integer():
            new_dict[k] = int(v)
    return new_dict


def averaging_y_hat(submit_csv_files):
    """
    Average the "TARGET" column of several submission csv files.

    Rows are matched on "SK_ID_CURR", so the files may list the ids in a
    different order; the result follows the row order of the first file.

    :param submit_csv_files: iterable of str, csv files having columns
                             "SK_ID_CURR" and "TARGET"; ids are expected to be
                             unique within each file
    :return: result, dataframe with columns "SK_ID_CURR" and "TARGET"
    :raises ValueError: if no file is given
    """
    submit_csv_files = list(submit_csv_files)
    if not submit_csv_files:
        raise ValueError("submit_csv_files must contain at least one file")

    y_hats = [pd.read_csv(f) for f in submit_csv_files]

    # Explicit copy: adding a column to a column slice triggers pandas'
    # SettingWithCopyWarning.
    result = y_hats[0][["SK_ID_CURR"]].copy()
    ids = result["SK_ID_CURR"]

    total = np.zeros(len(result), dtype=float)
    for y in y_hats:
        # Align on the id rather than on the row position.
        aligned = y.set_index("SK_ID_CURR")["TARGET"].reindex(ids)
        total += aligned.to_numpy(dtype=float)

    result["TARGET"] = total / len(y_hats)
    return result

In [8]:
# Directory the input csv files (X_y_train.csv, X_test.csv) are read from.
IN_DIR = "data/data5_"
# Directory the submission csv files are written to.
SUB_DIR = "data/submit_"
# Directory the fitted models are pickled to.
MODELS_DIR = "data/models_"

# Load data

In [11]:
# Load the train and test tables, then separate the target and the id column
# from the features. The X_org_* frames are kept as the untouched starting
# point of the logistic regression / random forest sections.
time_start = time.time()

X_org_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_org_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_org_train.shape", X_org_train.shape)
print("X_org_test.shape", X_org_test.shape)
# total number of missing values in each table
print("X_org_train.isnull().sum().sum:", X_org_train.isnull().sum().sum())
print("X_org_test.isnull().sum().sum:", X_org_test.isnull().sum().sum())

# target as an array; target and id are removed from the features
y_org_train = X_org_train["APPL_TARGET"].values
X_org_train = X_org_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_org_train.shape", X_org_train.shape)

# the test ids are kept as a one-column dataframe for write_submit_csv()
sk_id_test = X_org_test[["SK_ID_CURR"]]
X_org_test = X_org_test.drop(["SK_ID_CURR"], axis="columns")
print("X_org_test.shape", X_org_test.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 6478.33 MB
Memory usage after changing types 3193.52 MB
Memory usage before changing types 1026.50 MB
Memory usage after changing types 506.03 MB
X_org_train.shape (307511, 2657)
X_org_test.shape (48744, 2656)
X_org_train.isnull().sum().sum: 0
X_org_test.isnull().sum().sum: 0
X_org_train.shape (307511, 2655)
X_org_test.shape (48744, 2655)
Elapsed Time 2182.02224111557


# Logistic regression

In [None]:
# One-hot encode the categorical columns; the encoder is fit on the train table only.
ohe = OneHotEncoder()
ohe.fit(X_org_train)
X_train = ohe.transform(X_org_train)
X_test = ohe.transform(X_org_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# column names of the frame produced by the last transform() call
features = ohe._features


# Standardize the numeric columns and convert the frames to float32 arrays.
# NOTE(review): the scaler is fit before the train/validation split below, so
# the validation rows contribute to the mean/std used for scaling.
scaler = Standardizer(to_array=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)


# Hold out 20% of the train rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_org_train, test_size=0.2, 
                                                  stratify=y_org_train, random_state=21083)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

## Baseline (not tuned) model

In [None]:
# Baseline: only max_iter and n_jobs are set, every other hyperparameter keeps
# its library default.
lr = LogisticRegression(max_iter=100, n_jobs=20)
lr.fit(X_train, y_train)

In [None]:
# ROC AUC of the baseline on the rows it was fit on and on the held-out rows.
auc_train = roc_auc(lr, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(lr, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

In [None]:
# Refit the baseline on train + validation rows before predicting the test set.
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
write_submit_csv(lr, X_test, sk_id_test, os.path.join(SUB_DIR, "lr_data3_baseline.csv"))

## Tuning using `hyperopt`

In [None]:
# Search space: inverse regularization strength C, log-uniform in [1e-5, 100].
params = {"C": hp.loguniform('C', np.log(0.00001), np.log(100))}
num_eval = 10

# The fixed parameters are passed explicitly: run_hyperopt's default set
# contains n_estimators, which LogisticRegression.set_params() rejects.
params_fixed_lr = {"n_jobs": 20}

lr = LogisticRegression()
trials, best_params, best_model = run_hyperopt(lr, params, X_train, y_train, X_val, y_val, num_eval,
                                               params_fixed=params_fixed_lr)
best_params

In [None]:
# ROC AUC of the tuned model on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

In [None]:
# Write the test predictions of the tuned model and persist the model itself.
out_sub = os.path.join(SUB_DIR, "lr_data3_tuned.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lr_data3_tuned.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

# Random forest

## Baseline model

In [None]:
# Baseline random forest: only n_jobs is set, every other hyperparameter keeps
# its library default. It reuses X_train/y_train as left by the cells above.
rf = RandomForestClassifier(n_jobs=20)
rf.fit(X_train, y_train)

In [None]:
# ROC AUC of the baseline forest on the train and validation rows.
auc_train = roc_auc(rf, X_train, y_train)
# print the value computed above (the cell used an undefined name here)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(rf, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

## Tuning using `hyperopt`

In [None]:
# Search space of the random forest; quniform draws are cast to int.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    #"min_samples_split": scope.int(hp.quniform("min_samples_split", 20, 400, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 20, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 200, 1)),
}

# Parameters held constant on every trial.
params_fixed_rf = {
    "n_jobs": 20,
    "n_estimators": 100
}


num_eval = 60
rf = RandomForestClassifier()
# Pass the dict defined in this cell; `params_fixed` is not defined here and,
# if left over from another section, would hold another model's settings.
trials, best_params, best_model = run_hyperopt(rf, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed_rf)
best_params

In [None]:
# NOTE(review): run_hyperopt() already returns best_model fitted on
# X_train/y_train, so this refit looks redundant. Confirm before removing.
best_model.fit(X_train, y_train)

# ROC AUC of the tuned forest on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

In [None]:
# Refit the tuned forest on train + validation rows before predicting the test
# set. The tuned model is `best_model` (returned by run_hyperopt); no `rf_best`
# is defined in this notebook.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "rf_data3_tuned.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "rf_data3_tuned.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

# XGBOOST

## One-hot encoding

In [11]:
# Reload the data from disk and rebuild one-hot encoded float32 arrays for
# XGBoost (no standardization), then split off a validation set.
time_start = time.time()

X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# separate the target and the id column from the features
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# encoder is fit on the train table only and returns arrays
ohe = OneHotEncoder(to_array=True)
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# column names matching the array columns (set by the last transform() call)
features = ohe._features


# 80/20 stratified split; the seed differs between the tuning runs of this section
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=30192)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
Time elapsed: 1816.44950 s


### Baseline model

In [13]:
# Baseline XGBoost on the one-hot encoded arrays: GPU histogram tree method and
# GPU predictor, every other hyperparameter at its library default.
time_start = time.time()

xgb = XGBClassifier(tree_method="gpu_hist", predictor="gpu_predictor")
xgb.fit(X_train, y_train)

auc_train = roc_auc(xgb, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(xgb, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of XGBOOST model on the train set: 0.91061
AUC of XGBOOST model on the validation set: 0.76960
Time elapsed: 99.20997 s


### Tuning using `hyperopt`

In [14]:
# Search space, run 01: tree shape, row/column subsampling, L2 regularization
# and learning rate. reg_alpha and gamma are not tuned in this run.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 16, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(100)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(0.5)),
    #"gamma": hp.uniform("gamma", 0., 2.),
}

# Parameters held constant on every trial.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [25:50:23<00:00, 465.12s/trial, best loss: -0.7910676942424485]   
Time elapsed: 93494.71568 s


{'colsample_bytree': 0.8277609897889766,
 'learning_rate': 0.23944672331396236,
 'max_depth': 7,
 'min_child_weight': 12,
 'reg_lambda': 5589.365573544125,
 'subsample': 0.941505700131026}

In [15]:
# ROC AUC of the tuned model on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation rows before predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_ohe_tuned_01.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_ohe_tuned_01.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

AUC of the train set: 0.86169
AUC of the validation set: 0.79121


In [9]:
# Same preparation as the first one-hot encoded XGBoost run (reload, one-hot
# encode to float32 arrays, 80/20 stratified split), with another split seed.
time_start = time.time()

X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# separate the target and the id column from the features
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# encoder is fit on the train table only and returns arrays
ohe = OneHotEncoder(to_array=True)
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# column names matching the array columns (set by the last transform() call)
features = ohe._features


X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=60392)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
(246008, 2364) (246008,) (61503, 2364) (61503,)
Time elapsed: 1911.26998 s


In [10]:
# Search space, run 02: wider max_depth / min_child_weight / learning_rate
# ranges than run 01, and reg_alpha and gamma are tuned as well.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 20, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    "gamma": hp.uniform("gamma", 0., 5.),
}

# Parameters held constant on every trial.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [24:58:07<00:00, 449.44s/trial, best loss: -0.7957121763270056]  
Time elapsed: 90352.94079 s


{'colsample_bytree': 0.727760707935723,
 'gamma': 1.8480869489155598,
 'learning_rate': 0.10043251062182912,
 'max_depth': 9,
 'min_child_weight': 7,
 'reg_alpha': 0.007754632616862692,
 'reg_lambda': 1232.6160000051507,
 'subsample': 0.886220423354186}

In [11]:
# ROC AUC of the tuned model on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation rows before predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_ohe_tuned_02.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_ohe_tuned_02.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

AUC of the train set: 0.86691
AUC of the validation set: 0.79492


In [9]:
# Same preparation as the previous one-hot encoded XGBoost runs (reload,
# one-hot encode to float32 arrays, 80/20 stratified split), with another
# split seed.
time_start = time.time()

X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# separate the target and the id column from the features
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# encoder is fit on the train table only and returns arrays
ohe = OneHotEncoder(to_array=True)
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# column names matching the array columns (set by the last transform() call)
features = ohe._features


X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=26802)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2364)
X_test.shape (48744, 2364)
(246008, 2364) (246008,) (61503, 2364) (61503,)
Time elapsed: 1808.58785 s


In [13]:
# Search space, run 04: as run 02 but with lower bounds of 0.001 for
# reg_lambda and 0.0001 for reg_alpha.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 20, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    "gamma": hp.uniform("gamma", 0., 5.),
}

# Parameters held constant on every trial.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [21:17:03<00:00, 383.12s/trial, best loss: -0.7926602920717406]  
Time elapsed: 76942.01761 s


{'colsample_bytree': 0.5191534522752808,
 'gamma': 4.623319349053346,
 'learning_rate': 0.10403041813785364,
 'max_depth': 6,
 'min_child_weight': 18,
 'reg_alpha': 3.264299697048531,
 'reg_lambda': 452.41246910139256,
 'subsample': 0.9181638806764917}

In [12]:
# ROC AUC of the tuned model on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation rows before predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_ohe_tuned_04.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_ohe_tuned_04.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

AUC of the train set: 0.84528
AUC of the validation set: 0.79053


## Label encoding

In [18]:
# Reload the data and label-encode the categorical columns instead of one-hot
# encoding them, then split off a validation set.
time_start = time.time()

X_train = load_csv(os.path.join(IN_DIR, "X_y_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_test.csv"))

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# separate the target and the id column from the features
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET", "SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)

sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_test.shape", X_test.shape)


# codes are learned on the train table only; both results are float32 arrays
le = LabelEncoder(to_array=True)
X_train = le.fit_transform(X_train)
X_test = le.transform(X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# positional indices of the encoded categorical columns (used by the LightGBM cells)
cat_col_idx = le.get_cat_cols()
# column names matching the array columns (set by the last transform() call)
features = le._features

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=35039)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

Memory usage before changing types 4977.37 MB
Memory usage after changing types 2449.34 MB
Memory usage before changing types 788.58 MB
Memory usage after changing types 388.07 MB
X_train.shape (307511, 2046)
X_test.shape (48744, 2045)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
X_train.shape (307511, 2044)
X_test.shape (48744, 2044)
(246008, 2044) (246008,) (61503, 2044) (61503,)
Time elapsed: 1497.16492 s


### Baseline model

In [15]:
# Baseline XGBoost on the label-encoded arrays: GPU histogram tree method,
# every other hyperparameter at its library default.
time_start = time.time()

xgb = XGBClassifier(tree_method="gpu_hist")
xgb.fit(X_train, y_train)

auc_train = roc_auc(xgb, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(xgb, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of the train set: 0.91210
AUC of the validation set: 0.77463
Time elapsed: 68.57388 s


### Tuning using `hyperopt`

In [19]:
# Search space for the label-encoded features; gamma is not tuned in this run.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

# Parameters held constant on every trial.
params_fixed = {
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200

xgb = XGBClassifier()
trials, best_params, best_model = run_hyperopt(xgb, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [19:48:54<00:00, 356.67s/trial, best loss: -0.7983844283788208]   
Time elapsed: 71585.33025 s


{'colsample_bytree': 0.6054492781459379,
 'learning_rate': 0.060075903200569394,
 'max_depth': 6,
 'min_child_weight': 3,
 'reg_alpha': 0.004351104122044269,
 'reg_lambda': 231.86577481386294,
 'subsample': 0.9034672972914196}

In [None]:
# ROC AUC of the tuned model on the train and validation rows.
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)


# Refit on train + validation rows before predicting the test set.
best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data5_le_tuned_02.csv")
write_submit_csv(best_model, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data5_le_tuned_02.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(best_model, model_file)

# LightGBM

## Baseline model

In [None]:
# Baseline LightGBM on the GPU.
# NOTE(review): this cell relies on X_train/X_val and cat_col_idx left behind
# by the label-encoding cell of the XGBoost section. Confirm that cell was the
# last data preparation executed.
time_start = time.time()

lgbm = LGBMClassifier(device="gpu", categorical_feature=cat_col_idx)
lgbm.fit(X_train, y_train)

auc_lgbm_train = roc_auc(lgbm, X_train, y_train)
print("AUC of LightGBM model on the train set: %0.5f" % auc_lgbm_train)

auc_lgbm_val = roc_auc(lgbm, X_val, y_val)
print("AUC of LightGBM model on the validation set: %0.5f" % auc_lgbm_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

## Tuning using `hyperopt`

### `gbdt`

In [None]:
# Search space of the LightGBM model; quniform draws are cast to int.
params_lgbm = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 10, 200, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 500, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# Parameters held constant on every trial, including the indices of the
# label-encoded categorical columns.
params_fixed_lgbm = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500,
    "categorical_feature": cat_col_idx
}

num_eval = 100

# No `hyperopt_lgbm` helper is defined in this notebook; the generic
# run_hyperopt() does the same job and additionally returns the fitted model.
trials_lgbm, best_params_lgbm, best_model_lgbm = run_hyperopt(LGBMClassifier(), params_lgbm,
                                                              X_train, y_train, X_val, y_val,
                                                              num_eval,
                                                              params_fixed=params_fixed_lgbm,
                                                              rstate=31029)
best_params_lgbm

In [None]:
# NOTE(review): bare dict literal, it is displayed but not assigned to any
# name. Presumably the best parameters pasted from an earlier tuning run
# (whole numbers still stored as floats) and kept for reference; confirm.
{'colsample_bytree': 0.667434542409391,
 'learning_rate': 0.09146817439402177,
 'max_depth': 4.0,
 'min_child_samples': 380.0,
 'num_leaves': 160.0,
 'reg_lambda': 197.79685074014847,
 'subsample': 0.504377097169451}

In [None]:
# Whole-number parameters must be ints before they are passed to LightGBM.
best_params_lgbm = whole_to_int(best_params_lgbm)

lgbm_best = LGBMClassifier(**params_fixed_lgbm, **best_params_lgbm)
lgbm_best.fit(X_train, y_train)

auc_lgbm_train = roc_auc(lgbm_best, X_train, y_train)
print("AUC of LightGBM model on the train set: %0.5f" % auc_lgbm_train)

auc_lgbm_val = roc_auc(lgbm_best, X_val, y_val)
print("AUC of LightGBM model on the evaluation set: %0.5f" % auc_lgbm_val)


# Refit on train + validation rows before predicting the test set.
lgbm_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_data5_le_tuned_01.csv")
write_submit_csv(lgbm_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_data5_le_tuned_01.pickle")
# The context manager closes (and flushes) the file even if dump() raises.
with open(out_model, "wb") as model_file:
    pickle.dump(lgbm_best, model_file)