In [1]:
import os
import time
import copy
import pickle

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtypes(df)
    return df

In [3]:
def merge_dfs(dfs):
    """assume that indices match"""
    print("Shape of dfs")
    for df in dfs:
        print(df.shape)
        
    df_concat = pd.concat(dfs, axis="columns")
    print("shape of concatenated df", df_concat.shape)
    print("Number of nulls:", df_concat.isnull().sum().sum())
    
    features = df_concat.columns.to_list()
    return features, df_concat.values.astype(np.float32)

In [4]:
def roc_auc(estimator, X_eval, y_eval):
    """
    :param estimator: sklearn estimator that have predict_proba() method
    :param X_eval: test features
    :param y_eval: test target
    :return: float
    """
    proba = estimator.predict_proba(X_eval)
    return roc_auc_score(y_eval, proba[:, 1])


def write_submit_csv(estimator, X_test, id_test, out):
    """
    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    submit = id_test
    submit["TARGET"] = prob_test
    submit.to_csv(out, index=False)
    return None


def feature_importance_df(estimator, features):
    """
    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, list of feature names
    :return: feature_imp, dataframe
    """
    feature_imp = pd.DataFrame({"feature": features, "importance": estimator.feature_importances_})
    feature_imp = feature_imp.sort_values(by=["importance"], ascending=False)
    
    feature_imp["rank"] = np.arange(feature_imp.shape[0]) + 1
    return feature_imp


In [5]:
def whole_to_int(a_dict):
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        if np.isclose(np.round(v), v):
            new_dict[k] = int(new_dict[k])
    return new_dict


def run_hyperopt(classifier,
                 params_tuned, 
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)
        
        auc = roc_auc(classifier, X_val, y_val)
        return {"loss": -auc, "status": STATUS_OK}
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
        
    trials = Trials()
    best_params = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    best_params = whole_to_int(best_params)
    best_model = classifier.set_params(**params_fixed, **best_params)
    best_model.fit(X_train, y_train)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    
    return trials, best_params, best_model

In [6]:
def averaging_y_hat(submit_csv_files):
    y_hats = [pd.read_csv(f) for f in submit_csv_files]
    result = y_hats[0][["SK_ID_CURR"]]
    result["TARGET"] = 0.
    for y in y_hats:
        result["TARGET"] = result["TARGET"] + y["TARGET"]
    
    result["TARGET"] = result["TARGET"] / len(y_hats)
    return result

In [7]:
INP_DIR = "data/data_"
SUB_DIR = "data/submit_"
MODELS_DIR = "data/models_"

# Load data

## `X_org`

In [8]:
X_org_train = load_csv(os.path.join(INP_DIR, "X_org_train.csv"))
X_org_test = load_csv(os.path.join(INP_DIR, "X_org_test.csv"))

X_org_train.shape, X_org_test.shape

Memory usage before changing types 320.00 MB
Memory usage after changing types 160.00 MB
Memory usage before changing types 320.00 MB
Memory usage after changing types 160.00 MB


((200000, 200), (200000, 200))

In [9]:
X_org_train.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.522699,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356001,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [10]:
X_org_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.631599,-4.4131,5.9739,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.129299,-20.976
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.895599,-23.179399
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,4.2259,9.1723,1.2835,3.3778,19.554199,-0.286,-5.1612,7.2882,13.926,-9.1846


## `X_q10`

In [11]:
X_q10_train = load_csv(os.path.join(INP_DIR, "X_q10_train.csv"))
X_q10_test = load_csv(os.path.join(INP_DIR, "X_q10_test.csv"))

X_q10_train.shape, X_q10_test.shape

Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB
Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB


((200000, 199), (200000, 199))

In [12]:
X_q10_train.head()

Unnamed: 0,var_0_10QCUT,var_1_10QCUT,var_2_10QCUT,var_3_10QCUT,var_4_10QCUT,var_5_10QCUT,var_6_10QCUT,var_7_10QCUT,var_8_10QCUT,var_9_10QCUT,...,var_190_10QCUT,var_191_10QCUT,var_192_10QCUT,var_193_10QCUT,var_194_10QCUT,var_195_10QCUT,var_196_10QCUT,var_197_10QCUT,var_198_10QCUT,var_199_10QCUT
0,4,2,7,3,6,4,4,8,1,1,...,7,2,8,4,6,1,9,4,2,6
1,7,3,9,3,8,10,6,6,8,7,...,9,6,7,10,3,10,9,5,8,7
2,3,5,7,7,4,4,10,4,1,2,...,5,8,5,4,9,10,1,3,4,7
3,6,5,3,6,9,7,7,4,1,7,...,7,3,3,4,10,3,3,10,8,4
4,5,6,8,5,8,9,8,8,10,6,...,2,8,1,10,1,2,6,8,8,4


In [13]:
X_q10_test.head()

Unnamed: 0,var_0_10QCUT,var_1_10QCUT,var_2_10QCUT,var_3_10QCUT,var_4_10QCUT,var_5_10QCUT,var_6_10QCUT,var_7_10QCUT,var_8_10QCUT,var_9_10QCUT,...,var_190_10QCUT,var_191_10QCUT,var_192_10QCUT,var_193_10QCUT,var_194_10QCUT,var_195_10QCUT,var_196_10QCUT,var_197_10QCUT,var_198_10QCUT,var_199_10QCUT
0,6,10,8,9,6,7,7,7,7,9,...,2,10,1,5,1,10,7,10,5,4
1,3,8,6,3,2,6,8,8,1,2,...,10,7,3,10,3,7,3,9,9,1
2,1,1,5,6,4,10,3,9,7,7,...,3,9,6,4,1,10,1,1,10,1
3,3,6,7,5,1,9,4,9,8,5,...,10,8,5,6,3,10,6,7,2,5
4,7,7,9,7,2,4,10,1,8,4,...,6,8,4,5,7,5,1,1,3,4


## `X_valcount`

In [14]:
X_valcount_train = load_csv(os.path.join(INP_DIR, "X_valcount_train.csv"))
X_valcount_test = load_csv(os.path.join(INP_DIR, "X_valcount_test.csv"))

X_valcount_train.shape, X_valcount_test.shape

Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB
Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB


((200000, 199), (200000, 199))

In [15]:
X_valcount_train.head()

Unnamed: 0,var_0_VALCOUNT,var_1_VALCOUNT,var_2_VALCOUNT,var_3_VALCOUNT,var_4_VALCOUNT,var_5_VALCOUNT,var_6_VALCOUNT,var_7_VALCOUNT,var_8_VALCOUNT,var_9_VALCOUNT,...,var_190_VALCOUNT,var_191_VALCOUNT,var_192_VALCOUNT,var_193_VALCOUNT,var_194_VALCOUNT,var_195_VALCOUNT,var_196_VALCOUNT,var_197_VALCOUNT,var_198_VALCOUNT,var_199_VALCOUNT
0,0.02318,0.01442,0.02018,0.018425,0.02553,0.01847,0.02502,0.020355,0.010525,0.00934,...,0.027125,0.01731,0.023575,0.02611,0.020655,0.00892,0.01803,0.02501,0.014545,0.0214
1,0.022535,0.018725,0.011875,0.019795,0.02262,0.007615,0.023985,0.021825,0.02077,0.016915,...,0.01823,0.022005,0.027595,0.004885,0.020115,0.00779,0.01722,0.023465,0.020685,0.02316
2,0.023055,0.02124,0.02071,0.02234,0.024285,0.019355,0.005935,0.02251,0.010525,0.011855,...,0.02661,0.01838,0.03231,0.02611,0.015245,0.005835,0.009335,0.02207,0.02433,0.022425
3,0.023185,0.02186,0.02313,0.02253,0.0202,0.021585,0.02492,0.022655,0.00575,0.017435,...,0.02624,0.02019,0.024865,0.025925,0.00934,0.02032,0.01641,0.010905,0.0203,0.019065
4,0.025325,0.02179,0.01782,0.0218,0.02262,0.01995,0.02422,0.018585,0.005105,0.02236,...,0.01844,0.02115,0.012755,0.0121,0.00854,0.019105,0.01931,0.019985,0.0203,0.019065


In [16]:
X_valcount_test.head()

Unnamed: 0,var_0_VALCOUNT,var_1_VALCOUNT,var_2_VALCOUNT,var_3_VALCOUNT,var_4_VALCOUNT,var_5_VALCOUNT,var_6_VALCOUNT,var_7_VALCOUNT,var_8_VALCOUNT,var_9_VALCOUNT,...,var_190_VALCOUNT,var_191_VALCOUNT,var_192_VALCOUNT,var_193_VALCOUNT,var_194_VALCOUNT,var_195_VALCOUNT,var_196_VALCOUNT,var_197_VALCOUNT,var_198_VALCOUNT,var_199_VALCOUNT
0,0.023185,0.002985,0.01782,0.01328,0.02587,0.02111,0.02492,0.02117,0.02107,0.019705,...,0.015905,0.009195,0.00174,0.0275,0.011735,0.004765,0.019855,0.004135,0.02347,0.019065
1,0.02258,0.0197,0.02007,0.019175,0.016155,0.020885,0.02129,0.020355,0.01238,0.011855,...,0.008525,0.02373,0.02548,0.00802,0.020115,0.024435,0.016955,0.01714,0.01703,0.008395
2,0.006135,0.002035,0.023045,0.02201,0.02212,0.003415,0.024355,0.01639,0.020705,0.01853,...,0.02134,0.012415,0.031905,0.027055,0.007435,0.00664,0.006805,0.00404,0.01307,0.005505
3,0.02258,0.02179,0.02071,0.0218,0.013615,0.01898,0.024275,0.01489,0.01923,0.017765,...,0.011155,0.023625,0.03231,0.02771,0.018975,0.005835,0.01931,0.021705,0.01613,0.021035
4,0.02217,0.021175,0.011185,0.022225,0.015815,0.01914,0.00703,0.007215,0.020765,0.01797,...,0.027125,0.02243,0.029515,0.02807,0.01965,0.02464,0.01242,0.00487,0.02091,0.019065


## `X_target_mean`

In [17]:
X_target_mean_train = load_csv(os.path.join(INP_DIR, "X_target_mean_train.csv"))
X_target_mean_test = load_csv(os.path.join(INP_DIR, "X_target_mean_test.csv"))

X_target_mean_train.shape, X_target_mean_test.shape

Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB
Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB


((200000, 199), (200000, 199))

In [18]:
X_target_mean_train.head()

Unnamed: 0,var_0_TARGETMEANNUM,var_1_TARGETMEANNUM,var_2_TARGETMEANNUM,var_3_TARGETMEANNUM,var_4_TARGETMEANNUM,var_5_TARGETMEANNUM,var_6_TARGETMEANNUM,var_7_TARGETMEANNUM,var_8_TARGETMEANNUM,var_9_TARGETMEANNUM,...,var_190_TARGETMEANNUM,var_191_TARGETMEANNUM,var_192_TARGETMEANNUM,var_193_TARGETMEANNUM,var_194_TARGETMEANNUM,var_195_TARGETMEANNUM,var_196_TARGETMEANNUM,var_197_TARGETMEANNUM,var_198_TARGETMEANNUM,var_199_TARGETMEANNUM
0,0.090655,0.088354,0.097395,0.099495,0.09599,0.093895,0.085241,0.099345,0.089687,0.138179,...,0.0973,0.089546,0.082596,0.1025,0.103895,0.09445,0.10335,0.1021,0.108989,0.10165
1,0.091691,0.088396,0.1104,0.099495,0.100155,0.133,0.088813,0.101345,0.100825,0.088596,...,0.11245,0.091986,0.08915,0.09525,0.103105,0.12715,0.10335,0.099525,0.087559,0.099745
2,0.090545,0.099805,0.097395,0.10045,0.09686,0.093895,0.1591,0.097945,0.089687,0.101355,...,0.090945,0.1052,0.1022,0.1025,0.094919,0.12715,0.09295,0.099895,0.098195,0.099745
3,0.0928,0.099805,0.089713,0.103805,0.100555,0.09755,0.106261,0.097945,0.089687,0.088596,...,0.0973,0.089654,0.108655,0.1025,0.09285,0.09804,0.093791,0.089204,0.087559,0.099595
4,0.0882,0.103795,0.115706,0.09897,0.100155,0.1005,0.109167,0.099345,0.118512,0.111606,...,0.089904,0.1052,0.12705,0.09525,0.1195,0.09335,0.10286,0.094276,0.087559,0.099595


In [19]:
X_target_mean_test.head()

Unnamed: 0,var_0_TARGETMEANNUM,var_1_TARGETMEANNUM,var_2_TARGETMEANNUM,var_3_TARGETMEANNUM,var_4_TARGETMEANNUM,var_5_TARGETMEANNUM,var_6_TARGETMEANNUM,var_7_TARGETMEANNUM,var_8_TARGETMEANNUM,var_9_TARGETMEANNUM,...,var_190_TARGETMEANNUM,var_191_TARGETMEANNUM,var_192_TARGETMEANNUM,var_193_TARGETMEANNUM,var_194_TARGETMEANNUM,var_195_TARGETMEANNUM,var_196_TARGETMEANNUM,var_197_TARGETMEANNUM,var_198_TARGETMEANNUM,var_199_TARGETMEANNUM
0,0.0928,0.14365,0.115706,0.099545,0.09599,0.09755,0.106261,0.10191,0.0976,0.086787,...,0.089904,0.143,0.12705,0.098,0.1195,0.12715,0.098895,0.089204,0.093755,0.099595
1,0.090545,0.1014,0.086663,0.099495,0.101795,0.10325,0.109167,0.099345,0.089687,0.101355,...,0.1515,0.104505,0.108655,0.09525,0.103105,0.096157,0.093791,0.091218,0.089146,0.086
2,0.0901,0.083746,0.085791,0.103805,0.09686,0.133,0.086,0.09575,0.0976,0.088596,...,0.0851,0.107411,0.094155,0.1025,0.1195,0.12715,0.09295,0.133637,0.091205,0.086
3,0.090545,0.103795,0.097395,0.09897,0.0974,0.1005,0.085241,0.09575,0.100825,0.106127,...,0.1515,0.1052,0.1022,0.098685,0.103105,0.12715,0.10286,0.092514,0.108989,0.101555
4,0.091691,0.101355,0.1104,0.10045,0.101795,0.093895,0.1591,0.10235,0.100825,0.099725,...,0.098705,0.1052,0.10895,0.098,0.09545,0.09559,0.09295,0.133637,0.106061,0.099595


## `X_woe`

In [20]:
X_woe_train = load_csv(os.path.join(INP_DIR, "X_woe_train.csv"))
X_woe_test = load_csv(os.path.join(INP_DIR, "X_woe_test.csv"))

X_woe_train.shape, X_woe_test.shape

Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB
Memory usage before changing types 318.40 MB
Memory usage after changing types 159.20 MB


((200000, 199), (200000, 199))

In [21]:
X_woe_train.head()

Unnamed: 0,var_0_WOENUM,var_1_WOENUM,var_2_WOENUM,var_3_WOENUM,var_4_WOENUM,var_5_WOENUM,var_6_WOENUM,var_7_WOENUM,var_8_WOENUM,var_9_WOENUM,...,var_190_WOENUM,var_191_WOENUM,var_192_WOENUM,var_193_WOENUM,var_194_WOENUM,var_195_WOENUM,var_196_WOENUM,var_197_WOENUM,var_198_WOENUM,var_199_WOENUM
0,230.56691,233.389511,222.650864,220.284805,224.259186,226.697449,237.3172,220.452332,231.746826,183.04953,...,222.759125,231.919678,240.758835,216.975037,215.467896,226.047195,216.054428,217.410599,210.11087,217.90242
1,229.316238,233.338425,208.666183,220.284805,219.550339,187.468979,232.821106,218.236938,218.809357,233.090485,...,206.595612,228.962097,232.405792,225.115372,216.318924,192.639618,216.054428,220.251221,234.381363,220.006104
2,230.69928,219.939316,222.650864,219.22345,223.261459,226.697449,166.493988,222.026825,231.746826,218.225815,...,230.214523,214.0737,217.301559,216.975037,225.500076,192.639618,227.813583,219.839172,221.744202,220.006104
3,227.991623,219.939316,231.713867,215.564178,219.107269,222.474823,212.951935,222.026825,231.746826,233.090485,...,222.759125,231.786102,210.454941,216.975037,227.932251,221.919235,226.820557,232.338745,234.381363,220.173248
4,233.581375,215.575348,203.373917,220.872406,219.550339,219.168137,209.927567,220.452332,200.659866,207.444473,...,231.480164,214.0737,192.729752,225.115372,199.717361,227.340057,216.584,226.25029,234.381363,220.173248


In [22]:
X_woe_test.head()

Unnamed: 0,var_0_WOENUM,var_1_WOENUM,var_2_WOENUM,var_3_WOENUM,var_4_WOENUM,var_5_WOENUM,var_6_WOENUM,var_7_WOENUM,var_8_WOENUM,var_9_WOENUM,...,var_190_WOENUM,var_191_WOENUM,var_192_WOENUM,var_193_WOENUM,var_194_WOENUM,var_195_WOENUM,var_196_WOENUM,var_197_WOENUM,var_198_WOENUM,var_199_WOENUM
0,227.991623,178.529938,203.373917,220.229019,224.259186,222.474823,212.951935,217.617813,222.418045,235.351257,...,231.480164,179.059326,192.729752,221.964706,199.717361,192.639618,220.956253,232.338745,226.862839,220.173248
1,230.69928,218.176483,235.507797,220.284805,217.743835,216.162384,209.927567,220.452332,231.746826,218.225815,...,172.288452,214.813934,210.454941,225.115372,216.318924,224.067657,226.820557,229.88501,232.41127,236.348328
2,231.241455,239.250763,236.613968,215.564178,223.261459,187.468979,236.348328,224.53653,222.418045,233.090485,...,237.498779,211.746628,226.392929,216.975037,199.717361,192.639618,227.813583,186.917999,229.90152,236.348328
3,230.69928,215.575348,222.650864,220.872406,222.645325,219.168137,237.3172,224.53653,218.809357,213.093216,...,172.288452,214.0737,217.301559,221.191956,216.318924,192.639618,216.584,228.331955,210.11087,218.006409
4,229.316238,218.225815,208.666183,219.22345,217.743835,226.697449,166.493988,217.138199,218.809357,220.028305,...,221.169769,214.0737,210.151154,221.964706,224.883499,224.720947,227.813583,186.917999,213.162735,220.173248


In [23]:
y_full_train = load_csv(os.path.join(INP_DIR, "y_train.csv"))
y_full_train = y_full_train["target"].values

y_full_train.shape

Memory usage before changing types 1.60 MB
Memory usage after changing types 0.80 MB


(200000,)

## `id_code_test`

In [24]:
id_code_test = load_csv(os.path.join(INP_DIR, "id_code_test.csv"))
id_code_test.shape

Memory usage before changing types 1.60 MB
Memory usage after changing types 1.60 MB


(200000, 1)

# `LGBMClassifier`

## Use `X_org`

In [36]:
print("Merge train")

dfs_train = [X_org_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test]
_, X_test = merge_dfs(dfs_test)

Merge train
Shape of dfs
(200000, 200)
shape of concatenated df (200000, 200)
Number of nulls: 0
after train-validatin split
(160000, 200) (160000,) (40000, 200) (40000,)

Merge test
Shape of dfs
(200000, 200)
shape of concatenated df (200000, 200)
Number of nulls: 0


### Baseline model

In [37]:
time_start = time.time()

lgbm = lgbm = LGBMClassifier(device="gpu")
lgbm.fit(X_train, y_train)

auc_train = roc_auc(lgbm, X_train, y_train)
print("AUC of  the train set: %0.5f" % auc_train)

auc_val = roc_auc(lgbm, X_val, y_val)
print("AUC of the validation set: %0.5f" % auc_val)

time_end = time.time()
time_elapse = time_end - time_start
print("Time elapsed: %0.5f s" % time_elapse)

AUC of  the train set: 0.92498
AUC of the validation set: 0.86636
Time elapsed: 3.65565 s


### Tuning using `hyperopt`

In [38]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 300, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 100

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 100/100 [11:43<00:00,  7.03s/trial, best loss: -0.893388145431818]
Time elapsed: 707.90110 s


{'colsample_bytree': 0.5731504313939707,
 'learning_rate': 0.11792112591849245,
 'max_depth': 7,
 'min_child_samples': 210,
 'num_leaves': 10,
 'reg_lambda': 0.019709267367751634,
 'subsample': 0.4817237141688642}

In [39]:
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the evaluation set: %0.5f" % auc_val)


best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_org_tuned_01.csv")
write_submit_csv(best_model, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_org_tuned_01.pickle")
pickle.dump(lgbm, open(out_model, "wb"))

AUC of the train set: 0.93973
AUC of the evaluation set: 0.89339


## Use `X_org` and `X_q10`

In [40]:
print("Merge train")

dfs_train = [X_org_train, X_q10_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test, X_q10_test]
_, X_test = merge_dfs(dfs_test)

Merge train
Shape of dfs
(200000, 200)
(200000, 199)
shape of concatenated df (200000, 399)
Number of nulls: 0
after train-validatin split
(160000, 399) (160000,) (40000, 399) (40000,)

Merge test
Shape of dfs
(200000, 200)
(200000, 199)
shape of concatenated df (200000, 399)
Number of nulls: 0


In [41]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 400, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 100

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 100/100 [16:40<00:00, 10.01s/trial, best loss: -0.8949313604296473]
Time elapsed: 1006.87983 s


{'colsample_bytree': 0.45506888874984014,
 'learning_rate': 0.2142907257779847,
 'max_depth': 5,
 'min_child_samples': 30,
 'num_leaves': 5,
 'reg_lambda': 0.01964205870738078,
 'subsample': 0.5955495929550342}

In [42]:
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the evaluation set: %0.5f" % auc_val)


best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_org_q10_tuned_01.csv")
write_submit_csv(best_model, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_org_q10_tuned_01.pickle")
pickle.dump(lgbm, open(out_model, "wb"))

AUC of the train set: 0.92550
AUC of the evaluation set: 0.89493


## Use `X_org` and  `X_valcount`

In [43]:
print("Merge train")

dfs_train = [X_org_train, X_valcount_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test, X_valcount_test]
_, X_test = merge_dfs(dfs_test)

Merge train
Shape of dfs
(200000, 200)
(200000, 199)
shape of concatenated df (200000, 399)
Number of nulls: 0
after train-validatin split
(160000, 399) (160000,) (40000, 399) (40000,)

Merge test
Shape of dfs
(200000, 200)
(200000, 199)
shape of concatenated df (200000, 399)
Number of nulls: 0


In [44]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 400, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 200

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [35:10<00:00, 10.55s/trial, best loss: -0.8938862455371834]
Time elapsed: 2116.44064 s


{'colsample_bytree': 0.6876462443291219,
 'learning_rate': 0.2153616828802118,
 'max_depth': 9,
 'min_child_samples': 210,
 'num_leaves': 5,
 'reg_lambda': 1.3014094628213906,
 'subsample': 0.8033241520875941}

In [45]:
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the evaluation set: %0.5f" % auc_val)


best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_org_valcount_tuned_01.csv")
write_submit_csv(best_model, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_org_valcount_tuned_01.pickle")
pickle.dump(lgbm, open(out_model, "wb"))

AUC of the train set: 0.92709
AUC of the evaluation set: 0.89389


## Use `X_org`,  `X_valcount` and `X_target_mean`

In [47]:
print("Merge train")

dfs_train = [X_org_train, X_valcount_train, X_target_mean_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test, X_valcount_test, X_target_mean_test]
_, X_test = merge_dfs(dfs_test)

Merge train
Shape of dfs
(200000, 200)
(200000, 199)
(200000, 199)
shape of concatenated df (200000, 598)
Number of nulls: 0
after train-validatin split
(160000, 598) (160000,) (40000, 598) (40000,)

Merge test
Shape of dfs
(200000, 200)
(200000, 199)
(200000, 199)
shape of concatenated df (200000, 598)
Number of nulls: 0


In [48]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 400, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 200

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [46:13<00:00, 13.87s/trial, best loss: -0.8933248225243986]
Time elapsed: 2784.12794 s


{'colsample_bytree': 0.8895404893244775,
 'learning_rate': 0.11940750339427914,
 'max_depth': 8,
 'min_child_samples': 350,
 'num_leaves': 10,
 'reg_lambda': 0.1123393750039746,
 'subsample': 0.9844517256389533}

In [49]:
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the evaluation set: %0.5f" % auc_val)


best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_org_valcount_target_mean_tuned_01.csv")
write_submit_csv(best_model, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_org_valcount_tuned_target_mean_01.pickle")
pickle.dump(lgbm, open(out_model, "wb"))

AUC of the train set: 0.94304
AUC of the evaluation set: 0.89332


## Use `X_org`,  `X_valcount`,  `X_target_mean` and `X_woe`

In [50]:
print("Merge train")

dfs_train = [X_org_train, X_valcount_train, X_target_mean_train, X_woe_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test, X_valcount_test, X_target_mean_test, X_woe_test]
_, X_test = merge_dfs(dfs_test)

Merge train
Shape of dfs
(200000, 200)
(200000, 199)
(200000, 199)
(200000, 199)
shape of concatenated df (200000, 797)
Number of nulls: 0
after train-validatin split
(160000, 797) (160000,) (40000, 797) (40000,)

Merge test
Shape of dfs
(200000, 200)
(200000, 199)
(200000, 199)
(200000, 199)
shape of concatenated df (200000, 797)
Number of nulls: 0


In [51]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 400, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 200

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 200/200 [56:29<00:00, 16.95s/trial, best loss: -0.8930499323836626] 
Time elapsed: 3401.18754 s


{'colsample_bytree': 0.9619837276989326,
 'learning_rate': 0.2086694232326363,
 'max_depth': 10,
 'min_child_samples': 200,
 'num_leaves': 5,
 'reg_lambda': 0.04141993319888672,
 'subsample': 0.5314150101983962}

In [52]:
auc_train = roc_auc(best_model, X_train, y_train)
print("AUC of the train set: %0.5f" % auc_train)

auc_val = roc_auc(best_model, X_val, y_val)
print("AUC of the evaluation set: %0.5f" % auc_val)


best_model.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

out_sub = os.path.join(SUB_DIR, "lgbm_org_valcount_target_mean_woe_tuned_01.csv")
write_submit_csv(best_model, X_test, id_code_test, out_sub)

out_model = os.path.join(MODELS_DIR, "lgbm_org_valcount_tuned_target_mean_woe_01.pickle")
pickle.dump(lgbm, open(out_model, "wb"))

AUC of the train set: 0.92802
AUC of the evaluation set: 0.89305


## Use `X_org` and `X_interact`

In [53]:
X_interact_train = load_csv(os.path.join(INP_DIR, "X_interact_0_train.csv"))
X_interact_test = load_csv(os.path.join(INP_DIR, "X_interact_0_test.csv"))


print("Merge train")

dfs_train = [X_org_train, X_interact_train]
features, X_train = merge_dfs(dfs_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_full_train, test_size=0.2, 
                                                  stratify=y_full_train, random_state=21083)

print("after train-validatin split")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)


print("\nMerge test")
dfs_test = [X_org_test, X_interact_train]
_, X_test = merge_dfs(dfs_test)

Memory usage before changing types 1000.00 MB
Memory usage after changing types 500.00 MB
Memory usage before changing types 1000.00 MB
Memory usage after changing types 500.00 MB
Merge train
Shape of dfs
(200000, 200)
(200000, 625)
shape of concatenated df (200000, 825)
Number of nulls: 0
after train-validatin split
(160000, 825) (160000,) (40000, 825) (40000,)

Merge test
Shape of dfs
(200000, 200)
(200000, 625)
shape of concatenated df (200000, 825)
Number of nulls: 0


In [54]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 5, 100, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 400, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500
}

num_eval = 100

lgbm = LGBMClassifier()
trials, best_params, best_model = run_hyperopt(lgbm, params, 
                                               X_train, y_train, X_val, y_val, 
                                               num_eval,
                                               params_fixed=params_fixed)
best_params

100%|██████████| 100/100 [32:37<00:00, 19.58s/trial, best loss: -0.8909490139629812]
Time elapsed: 1969.33530 s


{'colsample_bytree': 0.5136490874102092,
 'learning_rate': 0.49112375034053984,
 'max_depth': 2,
 'min_child_samples': 40,
 'num_leaves': 55,
 'reg_lambda': 418.67783601550366,
 'subsample': 0.572003252273251}