In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import pickle
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

from _preprocessing import merge_tables
from _preprocessing import onehot_encoding
from _preprocessing import GeneralLabelEncoder
from _model_tunning import roc_auc
from _model_tunning import grid_search_stepwise
from _input_output import feature_importance_df
from _input_output import write_submit_csv

# Directory the input CSV tables (application_train.csv, bureau.csv, ...) are read from.
INP_DIR = "data/data_"
# Directory where tuning results and fitted models are stored as pickle files.
OUT_DIR = "data/tuning_"
# Directory where the Kaggle submission CSV files are written.
SUBMIT_DIR = "data/submit_"

# Value passed as `n_jobs` to every XGBClassifier created in this notebook.
N_JOBS = 8

# Tuning XGBoost with `application` data set

## Loading `application`

In [2]:
# Locations of the main `application` tables inside INP_DIR.
main_csv_train = os.path.join(INP_DIR, "application_train.csv")
main_csv_test = os.path.join(INP_DIR, "application_test.csv")

# This section works with `application` alone, so no additional tables
# (and no column prefixes for them) are handed to merge_tables.
add_files, prefixes = [], []

other_csv_files = [os.path.join(INP_DIR, csv_name) for csv_name in add_files]
print(other_csv_files)

X_train = merge_tables(
    main_csv_train, other_csv_files=other_csv_files, prefixes=prefixes
)
X_test = merge_tables(
    main_csv_test, other_csv_files=other_csv_files, prefixes=prefixes
)

# Pull the label out first, then remove the ID and label columns so that
# only feature columns remain in the training frame.
y_train = X_train["TARGET"]
X_train = X_train.drop(columns=["SK_ID_CURR", "TARGET"])

# The test IDs are kept (as a one-column frame) for the submission file.
id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(columns=["SK_ID_CURR"])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("id_test shape:", id_test.shape)

[]
Loading data/data_/application_train.csv
Memory usage before changing types 294.90 MB
Memory usage after changing types 133.77 MB
Memory usage before changing types 133.77 MB
Memory usage after changing types 133.77 MB
Loading data/data_/application_test.csv
Memory usage before changing types 46.70 MB
Memory usage after changing types 21.16 MB
Memory usage before changing types 21.16 MB
Memory usage after changing types 21.16 MB
X_train shape: (307511, 124)
y_train shape: (307511,)
X_test shape: (48744, 124)
id_test shape: (48744, 1)


## One-hot encoding

In [3]:
# One hot encoding, done by the project helper on train and test together
X_train_xgb, X_test_xgb = onehot_encoding(X_train, X_test)
print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

# Encoded column names, reused later to label the feature importances.
features = X_train_xgb.columns.tolist()

# Missing values are left as they are: XGBoost handles them automatically.
# Features are not standardized either.

# Hold-out split kept for reference only; it is disabled in this section,
# where the tuning cell scores models with cross-validation (cv=5).
#X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(X_train_xgb, y_train, test_size=0.2, 
#                                                           random_state=123, stratify=y_train)

X_train_xgb shape: (307511, 245)
X_test_xgb shape: (48744, 245)


In [None]:
# Path handed to grid_search_stepwise as `pkl_out`; the next cell reads the
# tuning results back from this file.
pickle_out = os.path.join(OUT_DIR, "xgb_Ap_ohe_1.pkl")

xgb = XGBClassifier(n_jobs=N_JOBS)

# One parameter grid per tuning step, listed in the order they are passed on.
# Steps 1-3: learning rate (at 150 trees), tree depth, minimum child weight.
step_1 = {"n_estimators": [150], "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.5]}
step_2 = {"max_depth": [2, 4, 6, 8, 10]}
step_3 = {"min_child_weight": [0, 1, 3, 5, 7, 9]}

# Steps 4-5: row and column subsampling.
step_4 = {"subsample": [0.6, 0.8, 1.0]}
step_5 = {"colsample_bytree": [0.6, 0.8, 1.0]}

# Steps 6-7: L2 and L1 regularization strength.
step_6 = {"reg_lambda": [0, 1, 10, 100, 1000]}
step_7 = {"reg_alpha": [0, 1, 10, 100, 1000]}

# Step 8: learning rate searched again, this time with 500 trees.
step_8 = {"n_estimators": [500], "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3]}

params_grid_steps = [
    step_1, step_2, step_3,
    step_4, step_5,
    step_6, step_7,
    step_8,
]
print("params_grid_steps:\n", params_grid_steps)

results = grid_search_stepwise(
    xgb,
    X_train_xgb,
    y_train,
    params_grid_steps,
    scoring="roc_auc",
    cv=5,
    random_state=123,
    pkl_out=pickle_out,
)

Running the above tuning code on a Linux machine using 8 CPUs gives **the best CV AUC score of 0.76210.**

In [5]:
# Read back the tuning results stored in OUT_DIR. The file is opened in a
# `with` block so the handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_ohe_1.pkl"), "rb") as pkl_file:
    results = pickle.load(pkl_file)

xgb = results["best_estimator"]

print("Best params:\n", xgb.get_params())
# The last entry of "best_scores" is reported as the best CV AUC.
print("\nBest CV AUC: %0.5f \n" % results["best_scores"][-1])

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "xgb_Ap_ohe_1.csv"))

# Show the first 20 rows of the feature-importance table.
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

Best params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 9, 'missing': nan, 'n_estimators': 500, 'n_jobs': 8, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1000, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1.0, 'verbosity': 1}

Best CV AUC: 0.76210 



Unnamed: 0,feature,importance
131,NAME_EDUCATION_TYPE_Higher education,0.079447
111,CODE_GENDER_M,0.065218
112,FLAG_OWN_CAR_N,0.049986
129,NAME_INCOME_TYPE_Working,0.045835
28,EXT_SOURCE_2,0.042087
29,EXT_SOURCE_3,0.031847
79,FLAG_DOCUMENT_3,0.027294
134,NAME_EDUCATION_TYPE_Secondary / secondary special,0.022799
126,NAME_INCOME_TYPE_State servant,0.019847
37,FLOORSMAX_AVG,0.019825


## Label encoding

In [6]:
# Label encoding
lbe = GeneralLabelEncoder()
lbe.fit(X_train)
X_train_xgb = lbe.transform(X_train)
X_test_xgb = lbe.transform(X_test)

print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

features = list(X_train_xgb.columns)

# No need to impute missing values, since XGBoost can handle missing values automatically
# no need to standardize features

# train test split
#X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(X_train_xgb, y_train, test_size=0.2, 
#                                                           random_state=123, stratify=y_train)

X_train_xgb shape: (307511, 124)
X_test_xgb shape: (48744, 124)


In [None]:
# Path handed to grid_search_stepwise as `pkl_out`; the next cell reads the
# tuning results back from this file.
pickle_out = os.path.join(OUT_DIR, "xgb_Ap_lbe_1.pkl")

xgb = XGBClassifier(n_jobs=N_JOBS)

# One parameter grid per tuning step, listed in the order they are passed on.
# Steps 1-3: learning rate (at 150 trees), tree depth, minimum child weight.
step_1 = {"n_estimators": [150], "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.5]}
step_2 = {"max_depth": [2, 4, 6, 8, 10]}
step_3 = {"min_child_weight": [0, 1, 3, 5, 7, 9]}

# Steps 4-5: row and column subsampling.
step_4 = {"subsample": [0.6, 0.8, 1.0]}
step_5 = {"colsample_bytree": [0.6, 0.8, 1.0]}

# Steps 6-7: L2 and L1 regularization strength.
step_6 = {"reg_lambda": [0, 1, 10, 100, 1000]}
step_7 = {"reg_alpha": [0, 1, 10, 100, 1000]}

# Step 8: learning rate searched again, this time with 500 trees.
step_8 = {"n_estimators": [500], "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3]}

params_grid_steps = [
    step_1, step_2, step_3,
    step_4, step_5,
    step_6, step_7,
    step_8,
]
print("params_grid_steps:\n", params_grid_steps)

results = grid_search_stepwise(
    xgb,
    X_train_xgb,
    y_train,
    params_grid_steps,
    scoring="roc_auc",
    cv=5,
    random_state=123,
    pkl_out=pickle_out,
)

Running the above tuning code on a Linux machine using 8 CPUs gives the **best CV AUC score of 0.76182.**

In [7]:
# Read back the tuning results stored in OUT_DIR. The file is opened in a
# `with` block so the handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_lbe_1.pkl"), "rb") as pkl_file:
    results = pickle.load(pkl_file)

xgb = results["best_estimator"]

print("Best params:\n", xgb.get_params())
# The last entry of "best_scores" is reported as the best CV AUC.
print("\nBest CV AUC: %0.5f \n" % results["best_scores"][-1])

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "xgb_Ap_lbe_1.csv"))

# Show the first 20 rows of the feature-importance table.
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

Best params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 9, 'missing': nan, 'n_estimators': 500, 'n_jobs': 8, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 10, 'reg_lambda': 100, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1.0, 'verbosity': 1}

Best CV AUC: 0.76182 



Unnamed: 0,feature,importance
40,EXT_SOURCE_2,0.078506
11,NAME_EDUCATION_TYPE,0.077557
41,EXT_SOURCE_3,0.071016
1,CODE_GENDER,0.035532
10,NAME_INCOME_TYPE,0.035077
39,EXT_SOURCE_1,0.026591
95,FLAG_DOCUMENT_3,0.025782
123,CREDIT_TO_GOODS,0.023116
19,OWN_CAR_AGE,0.022792
8,AMT_GOODS_PRICE,0.020646


# Training `XGBoost` on the combined `application`, `bureau` and `bureau_balance`

In [8]:
# Loading application, bureau and bureau_balance

main_csv_train = os.path.join(INP_DIR, "application_train.csv")
main_csv_test = os.path.join(INP_DIR, "application_test.csv")

# Additional tables to merge in, each paired by position with the prefix
# passed to merge_tables for it.
add_files = ["bureau.csv", "bureau_balance.csv"]
prefixes = ["bu_", "bb_"]

other_csv_files = [os.path.join(INP_DIR, csv_name) for csv_name in add_files]
print(other_csv_files)

X_train = merge_tables(
    main_csv_train, other_csv_files=other_csv_files, prefixes=prefixes
)
X_test = merge_tables(
    main_csv_test, other_csv_files=other_csv_files, prefixes=prefixes
)

# Pull the label out first, then remove the ID and label columns so that
# only feature columns remain in the training frame.
y_train = X_train["TARGET"]
X_train = X_train.drop(columns=["SK_ID_CURR", "TARGET"])

# The test IDs are kept (as a one-column frame) for the submission file.
id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(columns=["SK_ID_CURR"])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("id_test shape:", id_test.shape)

['data/data_/bureau.csv', 'data/data_/bureau_balance.csv']
Loading data/data_/application_train.csv
Memory usage before changing types 294.90 MB
Memory usage after changing types 133.77 MB
Loading  data/data_/bureau.csv
Memory usage before changing types 339.14 MB
Memory usage after changing types 167.59 MB
Loading  data/data_/bureau_balance.csv
Memory usage before changing types 327.83 MB
Memory usage after changing types 163.91 MB
Memory usage before changing types 552.91 MB
Memory usage after changing types 467.12 MB
Loading data/data_/application_test.csv
Memory usage before changing types 46.70 MB
Memory usage after changing types 21.16 MB
Loading  data/data_/bureau.csv
Memory usage before changing types 339.14 MB
Memory usage after changing types 167.59 MB
Loading  data/data_/bureau_balance.csv
Memory usage before changing types 327.83 MB
Memory usage after changing types 163.91 MB
Memory usage before changing types 87.60 MB
Memory usage after changing types 74.00 MB
X_train shap

In [9]:
# One hot encoding, done by the project helper on train and test together
X_train_xgb, X_test_xgb = onehot_encoding(X_train, X_test)
print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

# Encoded column names, reused later to label the feature importances.
features = X_train_xgb.columns.tolist()

# Missing values are left as they are: XGBoost handles them automatically.
# Features are not standardized either.

# Hold out 20% of the training rows for evaluation, stratified on the label
# and with a fixed seed so the split is reproducible.
X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(
    X_train_xgb,
    y_train,
    test_size=0.2,
    random_state=123,
    stratify=y_train,
)

X_train_xgb shape: (307511, 541)
X_test_xgb shape: (48744, 541)


In [None]:
# Reuse the estimator stored in xgb_Ap_ohe_1.pkl, i.e. the hyper-parameters
# found by the tuning run on the `application`-only data.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_ohe_1.pkl"), "rb") as pkl_file:
    results = pickle.load(pkl_file)

xgb = results["best_estimator"]

# First fit on the training part of the split and score the hold-out part.
xgb.fit(X_train_xgb_1, y_train_1)

auc_xgb = roc_auc(xgb, X_eval, y_eval)
print("AUC of XGBoost model on an evaluation set: %0.5f" % auc_xgb)

# fit on the whole set
xgb.fit(X_train_xgb, y_train)

# dump model to pickle file; the `with` block guarantees the file is flushed
# and closed, so a later cell never reads a partially written pickle
pickle_out = os.path.join(OUT_DIR, "xgb_Ap_ohe_2.pkl")
with open(pickle_out, "wb") as pkl_file:
    pickle.dump(xgb, pkl_file)

Running the above training code on a Linux machine using 8 CPUs gives **the AUC score of 0.76610** for the holdout evaluation set.

In [12]:
# Load the model that was refit on the whole training set. The file is opened
# in a `with` block so the handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_ohe_2.pkl"), "rb") as pkl_file:
    xgb = pickle.load(pkl_file)

print("Model params:\n", xgb.get_params())

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "xgb_Ap_ohe_2.csv"))

# Show the first 20 rows of the feature-importance table.
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

Model params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 9, 'missing': nan, 'n_estimators': 500, 'n_jobs': 8, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1000, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1.0, 'verbosity': 1}


Unnamed: 0,feature,importance
396,NAME_INCOME_TYPE_Working,0.071418
398,NAME_EDUCATION_TYPE_Higher education,0.057739
401,NAME_EDUCATION_TYPE_Secondary / secondary special,0.036896
378,CODE_GENDER_M,0.029307
29,EXT_SOURCE_3,0.026515
28,EXT_SOURCE_2,0.023176
220,bu_CREDIT_TYPE_Microloan_sum,0.02101
79,FLAG_DOCUMENT_3,0.016679
422,OCCUPATION_TYPE_Low-skill Laborers,0.016268
507,WALLSMATERIAL_MODE_Panel,0.014053


# Training XGBoost on all data

In [13]:
# Loading application together with all additional tables: bureau,
# bureau_balance, previous_application, POS_CASH_balance,
# credit_card_balance and installments_payments

main_csv_train = os.path.join(INP_DIR, "application_train.csv")
main_csv_test = os.path.join(INP_DIR, "application_test.csv")

# Additional tables to merge in, each paired by position with the prefix
# passed to merge_tables for it.
add_files = [
    "bureau.csv",
    "bureau_balance.csv",
    "previous_application.csv",
    "POS_CASH_balance.csv",
    "credit_card_balance.csv",
    "installments_payments.csv",
]
prefixes = ["bu_", "bb_", "pa_", "po_", "cc_", "ip_"]

other_csv_files = [os.path.join(INP_DIR, csv_name) for csv_name in add_files]
print(other_csv_files)

X_train = merge_tables(
    main_csv_train, other_csv_files=other_csv_files, prefixes=prefixes
)
X_test = merge_tables(
    main_csv_test, other_csv_files=other_csv_files, prefixes=prefixes
)

# Pull the label out first, then remove the ID and label columns so that
# only feature columns remain in the training frame.
y_train = X_train["TARGET"]
X_train = X_train.drop(columns=["SK_ID_CURR", "TARGET"])

# The test IDs are kept (as a one-column frame) for the submission file.
id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(columns=["SK_ID_CURR"])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("id_test shape:", id_test.shape)

['data/data_/bureau.csv', 'data/data_/bureau_balance.csv', 'data/data_/previous_application.csv', 'data/data_/POS_CASH_balance.csv', 'data/data_/credit_card_balance.csv', 'data/data_/installments_payments.csv']
Loading data/data_/application_train.csv
Memory usage before changing types 294.90 MB
Memory usage after changing types 133.77 MB
Loading  data/data_/bureau.csv
Memory usage before changing types 339.14 MB
Memory usage after changing types 167.59 MB
Loading  data/data_/bureau_balance.csv
Memory usage before changing types 327.83 MB
Memory usage after changing types 163.91 MB
Loading  data/data_/previous_application.csv
Memory usage before changing types 1068.08 MB
Memory usage after changing types 534.04 MB
Loading  data/data_/POS_CASH_balance.csv
Memory usage before changing types 571.99 MB
Memory usage after changing types 286.00 MB
Loading  data/data_/credit_card_balance.csv
Memory usage before changing types 1233.44 MB
Memory usage after changing types 616.72 MB
Loading  dat

In [14]:
# One hot encoding, done by the project helper on train and test together
X_train_xgb, X_test_xgb = onehot_encoding(X_train, X_test)
print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb shape:", X_test_xgb.shape)

# Encoded column names, reused later to label the feature importances.
features = X_train_xgb.columns.tolist()

# Missing values are left as they are: XGBoost handles them automatically.
# Features are not standardized either.

# Hold out 20% of the training rows for evaluation, stratified on the label
# and with a fixed seed so the split is reproducible.
X_train_xgb_1, X_eval, y_train_1, y_eval = train_test_split(
    X_train_xgb,
    y_train,
    test_size=0.2,
    random_state=123,
    stratify=y_train,
)

X_train_xgb shape: (307511, 1743)
X_test_xgb shape: (48744, 1743)


In [None]:
# Reuse the estimator stored in xgb_Ap_ohe_1.pkl, i.e. the hyper-parameters
# found by the tuning run on the `application`-only data.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_ohe_1.pkl"), "rb") as pkl_file:
    results = pickle.load(pkl_file)

xgb = results["best_estimator"]

# First fit on the training part of the split and score the hold-out part.
xgb.fit(X_train_xgb_1, y_train_1)

auc_xgb = roc_auc(xgb, X_eval, y_eval)
print("AUC of XGBoost model on an evaluation set: %0.5f" % auc_xgb)

# fit on the whole set
xgb.fit(X_train_xgb, y_train)

# dump model to pickle file; the `with` block guarantees the file is flushed
# and closed, so a later cell never reads a partially written pickle
pickle_out = os.path.join(OUT_DIR, "xgb_Ap_ohe_3.pkl")
with open(pickle_out, "wb") as pkl_file:
    pickle.dump(xgb, pkl_file)

Running the above training code on a Linux machine using 8 CPUs gives **the AUC score of 0.78634 for the holdout evaluation set.**

In [15]:
# Load the model that was refit on the whole training set. The file is opened
# in a `with` block so the handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
with open(os.path.join(OUT_DIR, "xgb_Ap_ohe_3.pkl"), "rb") as pkl_file:
    xgb = pickle.load(pkl_file)

print("Model params:\n", xgb.get_params())

# write the Kaggle submission file
write_submit_csv(xgb, X_test_xgb, id_test, os.path.join(SUBMIT_DIR, "xgb_Ap_ohe_3.csv"))

# Show the first 20 rows of the feature-importance table.
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

Model params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 9, 'missing': nan, 'n_estimators': 500, 'n_jobs': 8, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1000, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1.0, 'verbosity': 1}


Unnamed: 0,feature,importance
1600,NAME_EDUCATION_TYPE_Higher education,0.021471
1598,NAME_INCOME_TYPE_Working,0.020976
1603,NAME_EDUCATION_TYPE_Secondary / secondary special,0.017266
29,EXT_SOURCE_3,0.014058
28,EXT_SOURCE_2,0.013907
1020,cc_AMT_BALANCE_min_mean,0.011936
1580,CODE_GENDER_M,0.010242
393,pa_AMT_DOWN_PAYMENT_sum,0.00918
887,po_SK_DPD_DEF_mean_min,0.008918
550,pa_NAME_CONTRACT_STATUS_Refused_mean,0.008498
