In [None]:
import os
import time
import copy
import pickle

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
def change_dtypes(df):
    """
    Downcast column dtypes in place to reduce memory size.

    - object columns with at least one repeated value become "category"
    - float columns become float32 (this reduces precision)
    - int columns become int32, but only when every value fits in int32

    :param df: dataframe (its columns are reassigned in place)
    :return df: the same dataframe object
    """
    # deep=True also counts the Python objects held by object columns;
    # without it only the pointers are counted and the saving from the
    # category conversion is under-reported.
    memory = df.memory_usage(deep=True).sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_limits = np.iinfo(np.int32)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            # astype(np.int32) wraps around silently on overflow, so wide
            # values (e.g. large ids or amounts) keep their original dtype.
            fits_int32 = df[col].empty or (
                df[col].min() >= int32_limits.min and df[col].max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = df[col].astype(np.int32)

    memory = df.memory_usage(deep=True).sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a CSV file and shrink its column dtypes with change_dtypes().

    :param filename: path handed to pandas.read_csv
    :return: dataframe
    """
    return change_dtypes(pd.read_csv(filename))

In [None]:
def merge_dfs(dfs):
    """
    Concatenate dataframes column-wise and return them as a float32 matrix.

    The frames are joined by pandas.concat along the column axis, so the
    caller is expected to pass frames whose indices match. Shapes and the
    total number of nulls are printed for inspection.

    :param dfs: list of dataframes
    :return: (features, X) - list of column names, np.float32 array of values
    """
    print("Shape of dfs")
    for shape in (frame.shape for frame in dfs):
        print(shape)

    merged = pd.concat(dfs, axis="columns")
    print("shape of concatenated df", merged.shape)
    null_count = merged.isnull().sum().sum()
    print("Number of nulls:", null_count)

    column_names = merged.columns.tolist()
    matrix = merged.values.astype(np.float32)
    return column_names, matrix

In [None]:
def roc_auc(estimator, X_eval, y_eval):
    """
    ROC AUC of a fitted classifier on an evaluation set.

    :param estimator: sklearn-style estimator exposing predict_proba()
    :param X_eval: evaluation features
    :param y_eval: evaluation target
    :return: float, AUC computed from column 1 of predict_proba()
    """
    positive_scores = estimator.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, positive_scores)


def write_submit_csv(estimator, X_test, id_test, out):
    """
    Predict on the test set and write a submission CSV.

    The output contains every column of id_test plus a "TARGET" column
    holding column 1 of estimator.predict_proba(X_test).

    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR" (left unmodified)
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    # Work on a copy: assigning to id_test directly would add (or overwrite)
    # a "TARGET" column on the caller's dataframe as a hidden side effect.
    submit = id_test.copy()
    submit["TARGET"] = prob_test
    submit.to_csv(out, index=False)
    return None


def feature_importance_df(estimator, features):
    """
    Tabulate feature importances, most important first.

    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, feature names in the same order as the importances
    :return: dataframe with columns "feature", "importance" and a 1-based "rank"
    """
    table = pd.DataFrame({"feature": features, "importance": estimator.feature_importances_})
    ranked = table.sort_values(by=["importance"], ascending=False)

    # Rank 1 is the row with the largest importance after sorting.
    ranked["rank"] = np.arange(1, ranked.shape[0] + 1)
    return ranked


In [None]:
def whole_to_int(a_dict):
    """
    Return a deep copy of a_dict in which whole-number values are ints.

    A value counts as whole when np.isclose() says it equals its rounded
    value; every other value is kept as is. The input dict is not modified.

    :param a_dict: dict with numeric values
    :return: dict
    """
    def _is_whole(number):
        return np.isclose(np.round(number), number)

    converted = copy.deepcopy(a_dict)
    for key in list(converted):
        value = converted[key]
        if _is_whole(value):
            converted[key] = int(value)
    return converted


def run_hyperopt(classifier,
                 params_tuned,
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    """
    Search hyperparameters with hyperopt (TPE) and refit the best setting.

    Each trial applies params_fixed plus the sampled parameters to
    `classifier`, fits it on the train set and scores it with roc_auc() on
    the validation set; the negative AUC is the loss handed to hyperopt.

    :param classifier: estimator with set_params(), fit() and predict_proba();
        it is reconfigured and refitted in place
    :param params_tuned: hyperopt search space passed to fmin()
    :param X_train: features used for fitting
    :param y_train: target used for fitting
    :param X_val: features used for scoring each trial
    :param y_val: target used for scoring each trial
    :param num_eval: int, passed to fmin() as max_evals
    :param params_fixed: dict of parameters applied on every trial;
        defaults to {"n_jobs": 20, "n_estimators": 100}
    :param rstate: optional seed, wrapped in np.random.RandomState for fmin()
    :return: (trials, best_params, best_model)
    """
    started_at = time.time()
    fixed = {"n_jobs": 20, "n_estimators": 100} if params_fixed is None else params_fixed

    def negative_auc(sampled):
        classifier.set_params(**fixed, **sampled)
        classifier.fit(X_train, y_train)
        # hyperopt minimises the loss, so the AUC is negated.
        return {"loss": -roc_auc(classifier, X_val, y_val), "status": STATUS_OK}

    # NOTE(review): some hyperopt releases expect a np.random.Generator for
    # rstate instead of a RandomState - confirm against the installed version.
    random_state = None if rstate is None else np.random.RandomState(rstate)

    trials = Trials()
    best_params = whole_to_int(
        fmin(negative_auc,
             params_tuned,
             algo=tpe.suggest,
             max_evals=num_eval,
             trials=trials,
             rstate=random_state)
    )

    # After the search the classifier holds the parameters of the last
    # trial, so it is set to the best parameters and fitted once more.
    best_model = classifier.set_params(**fixed, **best_params)
    best_model.fit(X_train, y_train)

    print("Time elapsed: %0.5f s" % (time.time() - started_at))

    return trials, best_params, best_model

In [None]:
def averaging_y_hat(submit_csv_files):
    """
    Average the "TARGET" column of several submission CSV files.

    The ids are taken from the first file; the other files are combined
    with it row by row (by dataframe index), so all files are expected to
    list the same ids in the same order.

    :param submit_csv_files: list of str, paths of CSV files that contain
        the columns "SK_ID_CURR" and "TARGET"
    :return: dataframe with columns "SK_ID_CURR" and the averaged "TARGET"
    :raises ValueError: if submit_csv_files is empty
    """
    y_hats = [pd.read_csv(f) for f in submit_csv_files]
    if not y_hats:
        raise ValueError("submit_csv_files must contain at least one file")

    # .copy() gives an independent frame; assigning a new column to the bare
    # column subset of y_hats[0] is the chained-assignment pattern that
    # pandas warns about (SettingWithCopyWarning).
    result = y_hats[0][["SK_ID_CURR"]].copy()

    total = 0.
    for y in y_hats:
        total = total + y["TARGET"]

    result["TARGET"] = total / len(y_hats)
    return result

In [None]:
# Directories used by the cells below:
# input CSVs (feature tables, target, test ids) are read from INP_DIR,
INP_DIR = "data/data_"
# submission CSVs are written to SUB_DIR,
SUB_DIR = "data/submit_"
# and pickled fitted models are written to MODELS_DIR.
MODELS_DIR = "data/models_"

# Load data

## `X_org`

In [None]:
# Load the train/test "X_org" feature tables (load_csv also downcasts dtypes).
X_org_train = load_csv(os.path.join(INP_DIR, "X_org_train.csv"))
X_org_test = load_csv(os.path.join(INP_DIR, "X_org_test.csv"))

# Show both shapes as the cell output.
X_org_train.shape, X_org_test.shape

In [None]:
# Preview the first rows of the "X_org" train table.
X_org_train.head()

In [None]:
# Preview the first rows of the "X_org" test table.
X_org_test.head()

## `X_q10`

In [None]:
# Load the train/test "X_q10" feature tables (load_csv also downcasts dtypes).
X_q10_train = load_csv(os.path.join(INP_DIR, "X_q10_train.csv"))
X_q10_test = load_csv(os.path.join(INP_DIR, "X_q10_test.csv"))

# Show both shapes as the cell output.
X_q10_train.shape, X_q10_test.shape

In [None]:
# Preview the first rows of the "X_q10" train table.
X_q10_train.head()

In [None]:
# Preview the first rows of the "X_q10" test table.
X_q10_test.head()

## `X_valcount`

In [None]:
# Load the train/test "X_valcount" feature tables (load_csv also downcasts dtypes).
X_valcount_train = load_csv(os.path.join(INP_DIR, "X_valcount_train.csv"))
X_valcount_test = load_csv(os.path.join(INP_DIR, "X_valcount_test.csv"))

# Show both shapes as the cell output.
X_valcount_train.shape, X_valcount_test.shape

In [None]:
# Preview the first rows of the "X_valcount" train table.
X_valcount_train.head()

In [None]:
# Preview the first rows of the "X_valcount" test table.
X_valcount_test.head()

## `X_target_mean`

In [None]:
# Load the train/test "X_target_mean" feature tables (load_csv also downcasts dtypes).
X_target_mean_train = load_csv(os.path.join(INP_DIR, "X_target_mean_train.csv"))
X_target_mean_test = load_csv(os.path.join(INP_DIR, "X_target_mean_test.csv"))

# Show both shapes as the cell output.
X_target_mean_train.shape, X_target_mean_test.shape

In [None]:
# Preview the first rows of the "X_target_mean" train table.
X_target_mean_train.head()

In [None]:
# Preview the first rows of the "X_target_mean" test table.
X_target_mean_test.head()

## `X_woe`

In [None]:
# Load the train/test "X_woe" feature tables (load_csv also downcasts dtypes).
X_woe_train = load_csv(os.path.join(INP_DIR, "X_woe_train.csv"))
X_woe_test = load_csv(os.path.join(INP_DIR, "X_woe_test.csv"))

# Show both shapes as the cell output.
X_woe_train.shape, X_woe_test.shape

In [None]:
# Preview the first rows of the "X_woe" train table.
X_woe_train.head()

In [None]:
# Preview the first rows of the "X_woe" test table.
X_woe_test.head()

## `id_code_test`

In [None]:
# Test-set id table; it is passed to write_submit_csv() as `id_test` when
# submissions are written further down.
id_code_test = load_csv(os.path.join(INP_DIR, "id_code_test.csv"))
id_code_test.shape

# XGBOOST

## Use `X_org`

In [None]:
print("Merge train")

# Feature tables used in this section; merge_dfs() turns them into one
# float32 matrix and returns the column names alongside it.
dfs_train = [X_org_train]
features, X_train = merge_dfs(dfs_train)

# The target is read from y_train.csv and reduced to the "target" column's values.
y_train = load_csv(os.path.join(INP_DIR, "y_train.csv"))
y_train = y_train["target"].values

print("y_train.shape:", y_train.shape)

# Hold out 20% of the rows for validation. stratify keeps the class ratio
# in both parts and the fixed random_state makes the split reproducible.
# X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

print("after train-validation split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

print("")
print("Merge test")
# The test matrix must be built from the same tables, in the same order.
dfs_test = [X_org_test]
_, X_test = merge_dfs(dfs_test)

### Baseline model

In [None]:
# Baseline: XGBClassifier with only n_jobs set explicitly, fitted on the
# current X_train / y_train and scored on both train and validation sets.
time_start = time.time()

xgb = XGBClassifier(n_jobs=2)
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

# The elapsed time covers fitting plus both AUC evaluations.
time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

### Tuning using `hyperopt`

In [None]:
# Search space: one hyperopt expression per tuned XGBClassifier parameter.
# The hp labels equal the parameter names, so the dict returned by the
# search can be passed straight back to the classifier.
params_xgb = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 20, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 20, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.0001), np.log(100)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 2.),
}

# Parameters held constant on every trial.
params_fixed_xgb = {
    "booster": "gbtree",
    "n_jobs": 2,
    "n_estimators": 200
}

num_eval = 10

# Use the run_hyperopt() helper defined in this notebook (no function named
# hyperopt_xgb exists here, so the previous call failed on a fresh kernel).
# run_hyperopt() takes the classifier as first argument and returns
# (trials, best_params, best_model); the fitted model is discarded because
# the tuned model is rebuilt from best_params_xgb afterwards.
trials_xgb, best_params_xgb, _ = run_hyperopt(XGBClassifier(),
                                              params_xgb,
                                              X_train, y_train, X_val, y_val,
                                              num_eval,
                                              params_fixed=params_fixed_xgb,
                                              rstate=30918)
best_params_xgb

In [None]:
# Cast whole-number values to int (e.g. max_depth, which is sampled with
# quniform) before handing the parameters to XGBClassifier.
best_params_xgb = whole_to_int(best_params_xgb)

# Rebuild the model from the fixed + tuned parameters and fit it on the train split.
xgb_best = XGBClassifier(**params_fixed_xgb, **best_params_xgb)
xgb_best.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb_best, X_train, y_train)
print("AUC of XGBoost model on the train set: %0.5f" % auc_xgb_train)

# Scored on X_val / y_val, i.e. the validation split created above.
auc_xgb_val = roc_auc(xgb_best, X_val, y_val)
print("AUC of XGBoost model on the evaluation set: %0.5f" % auc_xgb_val)

In [None]:
# Refit the tuned model on train + validation rows before predicting the test set.
xgb_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "xgb_org_tuned_01.csv")
write_submit_csv(xgb_best, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_org_tuned_01.pickle")
# The context manager closes the file even if pickling fails; a bare
# open() passed to pickle.dump() leaves the handle to the garbage collector.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

In [None]:
# NOTE(review): this cell writes the same submission and model files as the
# cell before it; running it overwrites them with identical content.
out_sub = os.path.join(SUB_DIR, "xgb_org_tuned_01.csv")
write_submit_csv(xgb_best, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "xgb_org_tuned_01.pickle")
# The context manager closes the file even if pickling fails; a bare
# open() passed to pickle.dump() leaves the handle to the garbage collector.
with open(out_model, "wb") as model_file:
    pickle.dump(xgb_best, model_file)

## Use `X_org` and `X_q10`

In [None]:
print("Merge train")

# Feature tables used in this section; merge_dfs() turns them into one
# float32 matrix and returns the column names alongside it.
dfs_train = [X_org_train, X_q10_train]

features, X_train = merge_dfs(dfs_train)

# The target is read from y_train.csv and reduced to the "target" column's values.
y_train = load_csv(os.path.join(INP_DIR, "y_train.csv"))
y_train = y_train["target"].values

print("y_train.shape:", y_train.shape)

# Hold out 20% of the rows for validation. stratify keeps the class ratio
# in both parts and the fixed random_state makes the split reproducible.
# X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

print("after train-validation split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

print("")
print("Merge test")
# The test matrix must be built from the same tables, in the same order.
dfs_test = [X_org_test, X_q10_test]
_, X_test = merge_dfs(dfs_test)

### Baseline model

In [None]:
# Baseline: XGBClassifier with only n_jobs set explicitly, fitted on the
# current X_train / y_train and scored on both train and validation sets.
time_start = time.time()

xgb = XGBClassifier(n_jobs=2)
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

# The elapsed time covers fitting plus both AUC evaluations.
time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

## Use `X_org` and `X_valcount`

In [None]:
print("Merge train")

# Feature tables used in this section; merge_dfs() turns them into one
# float32 matrix and returns the column names alongside it.
dfs_train = [X_org_train, X_valcount_train]

features, X_train = merge_dfs(dfs_train)

# The target is read from y_train.csv and reduced to the "target" column's values.
y_train = load_csv(os.path.join(INP_DIR, "y_train.csv"))
y_train = y_train["target"].values

print("y_train.shape:", y_train.shape)

# Hold out 20% of the rows for validation. stratify keeps the class ratio
# in both parts and the fixed random_state makes the split reproducible.
# X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

print("after train-validation split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

print("")
print("Merge test")
# The test matrix must be built from the same tables, in the same order.
dfs_test = [X_org_test, X_valcount_test]
_, X_test = merge_dfs(dfs_test)

### Baseline model

In [None]:
# Baseline: XGBClassifier with only n_jobs set explicitly, fitted on the
# current X_train / y_train and scored on both train and validation sets.
time_start = time.time()

xgb = XGBClassifier(n_jobs=2)
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

# The elapsed time covers fitting plus both AUC evaluations.
time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

## Use `X_org`, `X_q10` and `X_valcount`

In [None]:
print("Merge train")

# Feature tables used in this section; merge_dfs() turns them into one
# float32 matrix and returns the column names alongside it.
dfs_train = [X_org_train, X_q10_train, X_valcount_train]

features, X_train = merge_dfs(dfs_train)

# The target is read from y_train.csv and reduced to the "target" column's values.
y_train = load_csv(os.path.join(INP_DIR, "y_train.csv"))
y_train = y_train["target"].values

print("y_train.shape:", y_train.shape)

# Hold out 20% of the rows for validation. stratify keeps the class ratio
# in both parts and the fixed random_state makes the split reproducible.
# X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

print("after train-validation split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

print("")
print("Merge test")
# The test matrix must be built from the same tables, in the same order.
dfs_test = [X_org_test, X_q10_test, X_valcount_test]
_, X_test = merge_dfs(dfs_test)

### Baseline model

In [None]:
# Baseline: XGBClassifier with only n_jobs set explicitly, fitted on the
# current X_train / y_train and scored on both train and validation sets.
time_start = time.time()

xgb = XGBClassifier(n_jobs=2)
xgb.fit(X_train, y_train)

auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the validation set: %0.5f" % auc_xgb_val)

# The elapsed time covers fitting plus both AUC evaluations.
time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)