In [34]:
import time
import os
import pickle
import copy
import glob

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Helper functions

In [52]:
def change_dtypes(df):
    """
    Downcast column dtypes in place to reduce the memory used by a dataframe.

    - ``object`` columns with at least one repeated value become ``category``.
    - ``float64`` columns become ``float32`` (precision is reduced).
    - ``int64`` columns become ``int32``, but only when every value fits in the
      int32 range; otherwise the column is left as it is, so that large values
      are never silently wrapped around.

    Memory usage (as reported by ``df.memory_usage()``) is printed in MB before
    and after the conversion.

    :param df: dataframe; its columns are converted in place
    :return df: the same dataframe object
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_info = np.iinfo(np.int32)

    for col in df.columns:
        col_dtype = df[col].dtype

        if (col_dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        # Explicit 64-bit dtypes: the builtin ``float`` / ``int`` aliases map to
        # platform-dependent numpy types.
        elif col_dtype == np.float64:
            df[col] = df[col].astype(np.float32)

        elif col_dtype == np.int64:
            series = df[col]
            # astype(int32) does not check for overflow, so verify the range first.
            fits_int32 = series.empty or (
                series.min() >= int32_info.min and series.max() <= int32_info.max
            )
            if fits_int32:
                df[col] = series.astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a csv file into a dataframe and shrink its column dtypes.

    :param filename: path of the csv file, handed to ``pd.read_csv``
    :return: the dataframe returned by ``change_dtypes()``
    """
    return change_dtypes(pd.read_csv(filename))

In [3]:
class Standardizer(BaseEstimator, TransformerMixin):
    """
    Standardize the numeric columns of a dataframe: (x - mean) / std.

    Means and standard deviations (pandas' ``Series.std``) are learned per
    numeric column in ``fit()`` and applied in ``transform()``. Columns whose
    learned std is not positive are left unscaled and reported with a printed
    warning.

    :param to_array: if True, ``transform()`` returns ``df.values`` instead of
        a dataframe
    """

    def __init__(self, to_array=False):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() / repr() can read it.
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """
        Learn mean and std of every numeric column of ``df_train``.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be used where
            ``fit(X, y)`` is called (e.g. ``fit_transform(X, y)``)
        :return: self
        """
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """
        Standardize the columns seen in ``fit()``; other columns pass through.

        :param df: dataframe containing the columns seen in ``fit()``
        :return: a new dataframe, or ``df.values`` of it when ``to_array`` is True
        """
        # Work on a copy so the caller's dataframe is not modified.
        df = df.copy()
        for col in self._mean:
            if self._std[col] > 0:
                df[col] = (df[col] - self._mean[col]) / self._std[col]
                df[col] = df[col].astype("float32")
            else:
                print("WARNING: " + col + " has zero std.")
        if self.to_array:
            return df.values
        else:
            return df

In [4]:
def roc_auc(estimator, X_eval, y_eval):
    """
    Area under the ROC curve of a fitted classifier on an evaluation set.

    The score passed to ``roc_auc_score`` is column 1 of
    ``estimator.predict_proba(X_eval)``.

    :param estimator: sklearn estimator that has a predict_proba() method
    :param X_eval: evaluation features
    :param y_eval: evaluation target
    :return: float
    """
    positive_scores = estimator.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, positive_scores)


def write_submit_csv(estimator, X_test, id_test, out):
    """
    Predict on the test set and write a submission csv file.

    The file contains the columns of ``id_test`` plus a "TARGET" column holding
    column 1 of ``estimator.predict_proba(X_test)``. ``id_test`` itself is not
    modified.

    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    # assign() builds a new dataframe, so the caller's id frame does not gain
    # a "TARGET" column as a side effect of writing a submission.
    submit = id_test.assign(TARGET=prob_test)
    submit.to_csv(out, index=False)
    return None

In [5]:
def _hyperopt_search(estimator_cls,
                     params_tuned,
                     X_train, y_train,
                     X_val, y_val,
                     num_eval,
                     params_fixed,
                     rstate):
    """
    Shared TPE search used by hyperopt_lr / hyperopt_rf / hyperopt_xgb.

    For every trial an ``estimator_cls(**params_fixed, **params)`` is fitted on
    the train set and scored with roc_auc() on the validation set.

    :return: (trials, best_param) where best_param is the dict returned by fmin()
    """
    time_start = time.time()

    def objective(params):
        estimator = estimator_cls(**params_fixed, **params)
        estimator.fit(X_train, y_train)

        auc = roc_auc(estimator, X_val, y_val)
        # fmin() minimises the loss, so the AUC is negated.
        return {"loss": -auc, "status": STATUS_OK}

    if rstate is not None:
        # NOTE(review): which random-state type fmin() accepts depends on the
        # installed hyperopt version (legacy RandomState vs. numpy Generator)
        # - confirm against the environment in use.
        rstate = np.random.RandomState(rstate)

    trials = Trials()
    best_param = fmin(objective,
                      params_tuned,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials,
                      rstate=rstate)

    time_elapse = time.time() - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param


def hyperopt_lr(params_tuned,
                X_train, y_train,
                X_val, y_val,
                num_eval,
                params_fixed=None,
                rstate=None):
    """
    Tune a LogisticRegression with hyperopt (TPE), maximising validation ROC AUC.

    :param params_tuned: hyperopt search space (dict of hp.* expressions)
    :param X_train: train features
    :param y_train: train target
    :param X_val: validation features
    :param y_val: validation target
    :param num_eval: number of trials (max_evals of fmin)
    :param params_fixed: dict of constructor arguments kept fixed in every trial;
        defaults to {"n_jobs": 20}
    :param rstate: optional int seed for the search
    :return: (trials, best_param)
    """
    if params_fixed is None:
        params_fixed = {"n_jobs": 20}
    return _hyperopt_search(LogisticRegression, params_tuned,
                            X_train, y_train, X_val, y_val,
                            num_eval, params_fixed, rstate)


def hyperopt_rf(params_tuned,
                X_train, y_train,
                X_val, y_val,
                num_eval,
                params_fixed=None,
                rstate=None):
    """
    Tune a RandomForestClassifier with hyperopt (TPE), maximising validation ROC AUC.

    :param params_tuned: hyperopt search space (dict of hp.* expressions)
    :param X_train: train features
    :param y_train: train target
    :param X_val: validation features
    :param y_val: validation target
    :param num_eval: number of trials (max_evals of fmin)
    :param params_fixed: dict of constructor arguments kept fixed in every trial;
        defaults to {"n_jobs": 20, "n_estimators": 100}
    :param rstate: optional int seed for the search
    :return: (trials, best_param)
    """
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    return _hyperopt_search(RandomForestClassifier, params_tuned,
                            X_train, y_train, X_val, y_val,
                            num_eval, params_fixed, rstate)


def hyperopt_xgb(params_tuned,
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    """
    Tune an XGBClassifier with hyperopt (TPE), maximising validation ROC AUC.

    :param params_tuned: hyperopt search space (dict of hp.* expressions)
    :param X_train: train features
    :param y_train: train target
    :param X_val: validation features
    :param y_val: validation target
    :param num_eval: number of trials (max_evals of fmin)
    :param params_fixed: dict of constructor arguments kept fixed in every trial;
        defaults to {"n_jobs": 20, "n_estimators": 100}
    :param rstate: optional int seed for the search
    :return: (trials, best_param)
    """
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    return _hyperopt_search(XGBClassifier, params_tuned,
                            X_train, y_train, X_val, y_val,
                            num_eval, params_fixed, rstate)

In [40]:
def whole_to_int(a_dict):
    """
    Return a copy of a dict in which whole-number values are Python ints.

    A float counts as whole when it lies within 1e-8 (absolute) of the nearest
    integer; it is then replaced by that nearest integer. Integer values
    (including numpy integers) become plain ``int``. Booleans, non-numeric
    values, NaN and infinities are copied unchanged. The input dict is not
    modified.

    :param a_dict: dict, e.g. the best parameters returned by hyperopt's fmin()
    :return: new dict
    """
    new_dict = copy.deepcopy(a_dict)
    for key, value in new_dict.items():
        # bool is a subclass of int; leave flags such as True/False alone.
        if isinstance(value, (bool, np.bool_)):
            continue
        if isinstance(value, (int, np.integer)):
            new_dict[key] = int(value)
            continue
        if not isinstance(value, (float, np.floating)):
            continue
        if not np.isfinite(value):
            continue
        nearest = np.round(value)
        # Absolute tolerance only: a relative tolerance would treat large
        # non-whole values (e.g. 100000.4) as whole. Convert the rounded value,
        # since int(value) would truncate 6.9999999999 down to 6.
        if np.isclose(nearest, value, rtol=0.0, atol=1e-8):
            new_dict[key] = int(nearest)
    return new_dict


def averaging_y_hat(submit_csv_files):
    """
    Average the "TARGET" predictions of several submission csv files.

    Every file must contain the columns "SK_ID_CURR" and "TARGET". Rows are
    matched on "SK_ID_CURR": files whose id column is identical to the first
    file's are added row by row, files with the same ids in another order are
    re-ordered first.

    :param submit_csv_files: iterable of csv file names
    :return: dataframe with columns "SK_ID_CURR" (order of the first file)
        and "TARGET" (mean over all files)
    :raises ValueError: if no file is given, or a file does not hold exactly
        the ids of the first file
    """
    submit_csv_files = list(submit_csv_files)
    if not submit_csv_files:
        raise ValueError("averaging_y_hat() needs at least one submission file")

    y_hats = [pd.read_csv(f) for f in submit_csv_files]
    ids = y_hats[0]["SK_ID_CURR"].to_numpy()

    total = np.zeros(len(ids), dtype=float)
    for f, y in zip(submit_csv_files, y_hats):
        if np.array_equal(y["SK_ID_CURR"].to_numpy(), ids):
            target = y["TARGET"].to_numpy()
        else:
            # Same ids in a different order: align on the id, not on the row position.
            by_id = y.set_index("SK_ID_CURR")["TARGET"]
            same_ids = (
                len(y) == len(ids)
                and by_id.index.is_unique
                and by_id.index.isin(ids).all()
            )
            if not same_ids:
                raise ValueError(
                    "SK_ID_CURR values of %s do not match those of %s"
                    % (f, submit_csv_files[0])
                )
            target = by_id.reindex(ids).to_numpy()
        total = total + target

    # Built as a fresh dataframe rather than by assigning into a slice of the first file.
    result = pd.DataFrame({"SK_ID_CURR": ids, "TARGET": total / len(y_hats)})
    return result

In [6]:
# Relative directories used by the cells below.
IN_DIR = "data/data1_"  # input csv files read with load_csv()
SUB_DIR = "data/submit_"  # submission csv files written by write_submit_csv()
MODELS_DIR = "data/models_"  # pickled fitted models

# Load data

In [7]:
# Time the whole loading step (the recorded run took about 460 s).
time_start = time.time()

# The train csv holds the features together with the "APPL_TARGET" column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_sel_xgb_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_sel_xgb_test.csv"))

# Sanity checks: shapes and total number of missing values.
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Split the target off the training frame; X_train keeps only the features.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Ids of the test rows, used later when writing submission files.
sk_id_test = load_csv(os.path.join(IN_DIR, "sk_id_test.csv"))

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 2691.34 MB
Memory usage after changing types 1346.90 MB
Memory usage before changing types 426.22 MB
Memory usage after changing types 213.30 MB
X_train.shape (307511, 1101)
X_test.shape (48744, 1100)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 1100)
X_test.shape (48744, 1100)
Memory usage before changing types 0.39 MB
Memory usage after changing types 0.20 MB
Elapsed Time 460.78423500061035


In [53]:
# NOTE(review): this cell is identical to the previous load cell and reloads
# the same files (about 460 s in the recorded run); it looks like a re-run
# that was kept in the notebook - consider removing one of the two.
time_start = time.time()

# The train csv holds the features together with the "APPL_TARGET" column.
X_train = load_csv(os.path.join(IN_DIR, "X_y_sel_xgb_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_sel_xgb_test.csv"))

# Sanity checks: shapes and total number of missing values.
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Split the target off the training frame; X_train keeps only the features.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Ids of the test rows, used later when writing submission files.
sk_id_test = load_csv(os.path.join(IN_DIR, "sk_id_test.csv"))

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 2691.34 MB
Memory usage after changing types 1346.90 MB
Memory usage before changing types 426.22 MB
Memory usage after changing types 213.30 MB
X_train.shape (307511, 1101)
X_test.shape (48744, 1100)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 1100)
X_test.shape (48744, 1100)
Memory usage before changing types 0.39 MB
Memory usage after changing types 0.20 MB
Elapsed Time 460.2423806190491


# Preprocessing

# Standardization

In [8]:
# Learn the per-column mean/std on the training frame and apply them to both
# train and test. With to_array=True, transform() returns df.values, so
# X_train and X_test are rebound to arrays from here on.
# NOTE(review): the scaler is fitted before the train/validation split in the
# next cell, so the rows that later form the validation set contribute to the
# statistics used for scaling.
scaler = Standardizer(to_array=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1100)
X_test.shape (48744, 1100)


# Split into train and validation sets for model selection

In [9]:
# Stratified 80/20 split with a fixed seed. X_train / y_train are rebound to
# the training part, so running this cell a second time without re-running the
# cells above would split the already reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

# Display the resulting shapes.
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 1100), (246008,), (61503, 1100), (61503,))

# Logistic regression

## Baseline (not tuned) model

In [11]:
# Baseline logistic regression with default settings apart from max_iter.
# NOTE(review): the recorded output shows the solver stopping at the iteration
# limit, i.e. this fit did not converge within max_iter=100.
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [12]:
# ROC AUC of the baseline on the rows it was fitted on.
auc_lr_train = roc_auc(lr, X_train, y_train)
print("AUC of Logistic regression model on the train set: %0.5f" % auc_lr_train)

AUC of Logistic regression model on the train set: 0.78777


In [13]:
# ROC AUC of the baseline on the held-out validation rows.
auc_lr_val = roc_auc(lr, X_val, y_val)
print("AUC of Logistic regression model on the evaluation set: %0.5f" % auc_lr_val)

AUC of Logistic regression model on the evaluation set: 0.77466


In [14]:
# Refit the same estimator object on train + validation before predicting the
# test set. After this cell `lr` has seen the validation rows, so scoring it on
# X_val again would no longer be a held-out evaluation.
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
write_submit_csv(lr, X_test, sk_id_test, os.path.join(SUB_DIR, "lr_data1_sel_xgb_baseline.csv"))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Tuning using `hyperopt`

In [22]:
# Search space: inverse regularisation strength C, log-uniform on [1e-4, 10].
params_lr = {"C": hp.loguniform('C', np.log(0.0001), np.log(10))}
num_eval = 5

# Seed the search like the random-forest and XGBoost tuning calls do, so the
# sampled trials are reproducible.
trials_lr, best_params_lr = hyperopt_lr(params_lr, X_train, y_train, X_val, y_val, num_eval,
                                        rstate=21083)

100%|██████████| 5/5 [1:00:34<00:00, 726.91s/trial, best loss: -0.7750106132221244]
Time elapsed: 3634.56129 s


In [31]:
# Fit the tuned model on train + validation, write its submission and pickle it.
lr_best = LogisticRegression(**best_params_lr)
lr_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lr_data1_sel_xgb_tuned.csv")
write_submit_csv(lr_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lr_data1_sel_xgb_tuned.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(lr_best, model_file)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Random forest

## Baseline model

In [33]:
# Baseline random forest: 1000 trees, at least 40 samples per leaf, fixed seed.
rf = RandomForestClassifier(n_estimators=1000, min_samples_leaf=40, n_jobs=16, random_state=21083)
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=40, n_estimators=1000, n_jobs=16,
                       random_state=21083)

In [34]:
# ROC AUC of the baseline forest on the rows it was fitted on.
auc_rf_train = roc_auc(rf, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

AUC of Random Forest model on the train set: 0.93251


In [35]:
# ROC AUC of the baseline forest on the held-out validation rows.
auc_rf_val = roc_auc(rf, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

AUC of Random Forest model on the evaluation set: 0.75218


## Tuning using `hyperopt`

In [55]:
# Search space: three integer-valued tree-size / feature-sampling parameters,
# each sampled with quniform (step 1) and cast to int for the estimator.
params_rf = {
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 20, 400, 1)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 10, 200, 1)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 200, 1)),
}

# Constructor arguments kept fixed in every trial (100 trees during the search).
params_fixed_rf = {
    "n_jobs": 20,
    "n_estimators": 100
}


num_eval = 60

# Seeded TPE search; about two hours in the recorded run.
trials_rf, best_params_rf = hyperopt_rf(params_rf, 
                                        X_train, y_train, X_val, y_val, 
                                        num_eval,
                                        params_fixed=params_fixed_rf,
                                        rstate=21083)
best_params_rf

100%|██████████| 60/60 [2:04:55<00:00, 124.93s/trial, best loss: -0.7583419783402278]  
Time elapsed: 7495.69767 s


In [60]:
# All tuned random-forest parameters are integer-valued: cast each one to int.
best_params_rf = {name: int(value) for name, value in best_params_rf.items()}
best_params_rf

{'max_features': 98, 'min_samples_leaf': 91, 'min_samples_split': 174}

In [62]:
# Tuned forest, fitted with 500 trees (the search itself used 100 trees).
# NOTE(review): no random_state is passed here, unlike the baseline forest, so
# the scores below may differ slightly between runs.
rf_best = RandomForestClassifier(n_estimators=500, n_jobs=20, **best_params_rf)
rf_best.fit(X_train, y_train)

auc_rf_train = roc_auc(rf_best, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

auc_rf_val = roc_auc(rf_best, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

AUC of Random Forest model on the train set: 0.87597
AUC of Random Forest model on the evaluation set: 0.75972


In [77]:
# Refit the tuned forest on train + validation, write its submission and pickle it.
rf_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "rf_data1_sel_xgb_tuned.csv")
write_submit_csv(rf_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "rf_data1_sel_xgb_tuned.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(rf_best, model_file)

# XGBoost

## Baseline model

In [83]:
# Baseline XGBoost with library defaults (GPU histogram tree method); the cell
# times fitting plus both evaluations.
time_start = time.time()

xgb = XGBClassifier(tree_method="gpu_hist")
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of XGBOOST model on the train set: 0.89709
AUC of XGBOOST model on the validation set: 0.77105
Time elapsed: 245.69354 s


## Tuning using `hyperopt`

In [15]:

# Search space of the first XGBoost tuning run: tree size (max_depth,
# min_child_weight), row/column subsampling, L2/L1 regularisation and the
# learning rate. gamma is not searched here (its line is commented out).
params_xgb = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 8, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.2, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.2, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.0001), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(10000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(1)),
#    "gamma": hp.uniform("gamma", 0., 0.4),
}

# Constructor arguments kept fixed in every trial.
params_fixed_xgb = {
    "tree_method": "gpu_hist" ,
    "n_estimators": 500
}

num_eval = 200

# Seeded TPE search; about 8.5 hours in the recorded run.
trials_xgb, best_params_xgb = hyperopt_xgb(params_xgb, 
                                           X_train, y_train, X_val, y_val, 
                                           num_eval,
                                           params_fixed=params_fixed_xgb,
                                           rstate=42)
best_params_xgb

100%|██████████| 200/200 [8:25:49<00:00, 151.75s/trial, best loss: -0.7882966092870476]  
Time elapsed: 30349.15448 s


{'colsample_bytree': 0.47671365847278224,
 'learning_rate': 0.08050806097226641,
 'max_depth': 7.0,
 'min_child_weight': 5.0,
 'reg_alpha': 0.0014314593128521287,
 'reg_lambda': 529.4881903912216,
 'subsample': 0.7017064695318056}

{'colsample_bytree': 0.47671365847278224,
 'learning_rate': 0.08050806097226641,
 'max_depth': 7.0,
 'min_child_weight': 5.0,
 'reg_alpha': 0.0014314593128521287,
 'reg_lambda': 529.4881903912216,
 'subsample': 0.7017064695318056}

In [18]:
# fmin() reported the two quantised parameters as floats (7.0, 5.0 in the
# output above); turn them back into ints before building the estimator.
best_params_xgb.update(
    max_depth=int(best_params_xgb["max_depth"]),
    min_child_weight=int(best_params_xgb["min_child_weight"]),
)
best_params_xgb

{'colsample_bytree': 0.47671365847278224,
 'learning_rate': 0.08050806097226641,
 'max_depth': 7,
 'min_child_weight': 5,
 'reg_alpha': 0.0014314593128521287,
 'reg_lambda': 529.4881903912216,
 'subsample': 0.7017064695318056}

In [20]:
# Fit the tuned XGBoost model on the training part and score it on train and
# validation (same n_estimators / tree_method as used during the search).
xgb_best = XGBClassifier(n_estimators=500, tree_method="gpu_hist", **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)

AUC of XGBoost model on the train set: 0.86774
AUC of XGBoost model on the evaluation set: 0.78830


In [22]:
# Refit the tuned model on train + validation, write its submission and pickle it.
xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned_01.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data1_sel_xgb_tuned_01.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

In [24]:
# try different gamma
# try different gamma
# Best parameters of the first tuning run, with gamma=2 added by hand.
best_params_xgb = {"colsample_bytree": 0.47671365847278224,
                   "learning_rate": 0.08050806097226641,
                   "max_depth": 7,
                   "min_child_weight": 5,
                   "reg_alpha": 0.0014314593128521287,
                   "reg_lambda": 529.4881903912216,
                   "subsample": 0.7017064695318056,
                   "gamma": 2.}

xgb_best = XGBClassifier(n_estimators=500, tree_method="gpu_hist", **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)

# The refit on train + validation and the submission / pickle of this gamma=2
# model are produced by the next cell under the "_02" names. Writing them here
# under the "_01" names would overwrite the outputs of the first tuned model
# (without gamma) saved earlier.

AUC of XGBoost model on the train set: 0.85349
AUC of XGBoost model on the evaluation set: 0.78812


In [25]:
# Refit the current xgb_best on train + validation, write its submission and pickle it.
xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned_02.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data1_sel_xgb_tuned_02.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

### Second tuning run (wider `max_depth` and `learning_rate` ranges)

In [29]:
# Search space of the second XGBoost tuning run. Compared with the first run,
# max_depth may go up to 12 (was 8) and the learning rate down to 0.005
# (was 0.01); the other ranges are unchanged.
params_xgb = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 12, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.2, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.2, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.0001), np.log(10000)),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(10000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
#    "gamma": hp.uniform("gamma", 0., 0.4),
}

# Constructor arguments kept fixed in every trial.
params_fixed_xgb = {
    "tree_method": "gpu_hist" ,
    "n_estimators": 500
}

num_eval = 200

# Seeded TPE search; about 12.6 hours in the recorded run.
trials_xgb, best_params_xgb = hyperopt_xgb(params_xgb, 
                                           X_train, y_train, X_val, y_val, 
                                           num_eval,
                                           params_fixed=params_fixed_xgb,
                                           rstate=42)
best_params_xgb

100%|██████████| 200/200 [12:38:34<00:00, 227.57s/trial, best loss: -0.7882748627352448]  
Time elapsed: 45514.27978 s


{'colsample_bytree': 0.8617867706122423,
 'learning_rate': 0.058376323447823765,
 'max_depth': 8.0,
 'min_child_weight': 1.0,
 'reg_alpha': 0.6654125785836109,
 'reg_lambda': 450.87992024647934,
 'subsample': 0.6704436939093745}

{'colsample_bytree': 0.8617867706122423,
 'learning_rate': 0.058376323447823765,
 'max_depth': 8.0,
 'min_child_weight': 1.0,
 'reg_alpha': 0.6654125785836109,
 'reg_lambda': 450.87992024647934,
 'subsample': 0.6704436939093745}

In [26]:
# Best parameters of the second tuning run, copied from its output with the
# two quantised values written as ints.
best_params_xgb = {"colsample_bytree": 0.8617867706122423,
                   "learning_rate": 0.058376323447823765,
                   "max_depth": 8,
                   "min_child_weight": 1,
                   "reg_alpha": 0.6654125785836109,
                   "reg_lambda": 450.87992024647934,
                   "subsample": 0.6704436939093745,}

xgb_best = XGBClassifier(n_estimators=500, tree_method="gpu_hist", **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)


# Refit on train + validation before predicting the test set.
xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned_03.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data1_sel_xgb_tuned_03.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

AUC of XGBoost model on the train set: 0.87398
AUC of XGBoost model on the evaluation set: 0.78794


In [27]:
# try different gamma

# Best parameters of the second tuning run, with gamma=2.5 added by hand.
best_params_xgb = {"colsample_bytree": 0.8617867706122423,
                   "learning_rate": 0.058376323447823765,
                   "max_depth": 8,
                   "min_child_weight": 1,
                   "reg_alpha": 0.6654125785836109,
                   "reg_lambda": 450.87992024647934,
                   "subsample": 0.6704436939093745,
                   "gamma": 2.5}

xgb_best = XGBClassifier(n_estimators=500, tree_method="gpu_hist", **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)


# Refit on train + validation before predicting the test set.
xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned_04.csv")
write_submit_csv(xgb_best, X_test, sk_id_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_data1_sel_xgb_tuned_04.pickle")
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

AUC of XGBoost model on the train set: 0.85323
AUC of XGBoost model on the evaluation set: 0.78810


In [44]:
out_sub = os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned_ensemble.csv")

# The glob pattern also matches the ensemble file written below, so on a re-run
# the previous ensemble would be averaged in as if it were another model:
# exclude it explicitly. Sorting makes the file order deterministic.
sub_csv_files = sorted(
    f for f in glob.glob(os.path.join(SUB_DIR, "xgb_data1_sel_xgb_tuned*csv"))
    if os.path.abspath(f) != os.path.abspath(out_sub)
)

# Average the TARGET predictions of the individual tuned XGBoost submissions.
averg_pred = averaging_y_hat(sub_csv_files)

averg_pred.to_csv(out_sub, index=False)