In [None]:
import numpy as np
import pandas as pd
from warnings import filterwarnings as f_w
f_w('ignore')
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500
from homecredit import HomeCreditDataTable
from lightgbm import LGBMClassifier
from tqdm import tqdm_notebook
from bayes_opt import BayesianOptimization
from matplotlib import pyplot as plt

In [None]:
def read_table(table):
    """Create a HomeCreditDataTable from the train/test CSVs plus one feature table.

    Parameters
    ----------
    table
        Row label looked up in the module-level ``locator`` frame; the
        ``'feat_eng'`` entry of that row is used as the path of the CSV to join.

    Returns
    -------
    HomeCreditDataTable
        New object with ``train`` and ``test`` assigned from ``'train.csv'`` and
        ``'test.csv'``, and ``data`` replaced by ``data`` joined with the
        feature CSV. All three CSVs are indexed by ``SK_ID_CURR``.

    Notes
    -----
    ``data`` is read here before this function ever assigns it, so it has to be
    provided by HomeCreditDataTable itself (presumably derived from
    ``train``/``test`` -- confirm against the class).
    """
    table_obj = HomeCreditDataTable()
    table_obj.train = pd.read_csv('train.csv', index_col='SK_ID_CURR')
    table_obj.test = pd.read_csv('test.csv', index_col='SK_ID_CURR')

    base = table_obj.data
    feature_path = locator.loc[table, 'feat_eng']
    features = pd.read_csv(feature_path, index_col='SK_ID_CURR')
    table_obj.data = base.join(features)
    return table_obj

In [None]:
def add_table(hc, table):
    """Join one more feature table onto ``hc.data`` and return ``hc``.

    Parameters
    ----------
    hc
        Object whose ``data`` attribute is a frame supporting ``.join``;
        its ``data`` attribute is reassigned by this call.
    table
        Row label looked up in the module-level ``locator`` frame; the
        ``'feat_eng'`` entry of that row is the path of the CSV to read.

    Returns
    -------
    The same ``hc`` object that was passed in.
    """
    feature_path = locator.loc[table, 'feat_eng']
    features = pd.read_csv(feature_path, index_col='SK_ID_CURR')
    hc.data = hc.data.join(features)
    return hc

In [None]:
# Lookup table loaded from 'locator.xlsx' (first column used as the index).
# read_table / add_table read its 'feat_eng' column to get the CSV path for a table label.
locator = pd.read_excel('locator.xlsx', index_col = 0)

In [None]:
# Random seed; passed as random_state to hc.cv_split further down.
seed = 8

In [None]:
# Build the base table object: train/test CSVs plus the feature CSV listed under 'appl' in locator.
hc = read_table('appl')

In [None]:
# Join each remaining feature table onto hc.data, one at a time, in the order listed.
for table in ['prev', 'inst', 'pos', 'card', 'buro', 'debt']:
    hc = add_table(hc, table)

In [None]:
# One-hot encode the joined frame; dummy_na=True also adds an indicator column for missing values.
hc.data = pd.get_dummies(hc.data,
                         dummy_na = True)

In [None]:
# Ask hc to set up its cross-validation split with stratification switched off,
# seeded with `seed`. The fold logic itself lives in HomeCreditDataTable.
hc.cv_split(stratified=False,
            random_state = seed)

In [None]:
# LightGBM classifier attached to hc. The long decimal hyperparameters are kept verbatim.
# NOTE(review): the bayes_opt import suggests they came from a Bayesian search -- confirm.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect when
# bagging_freq is non-zero, and no bagging_freq is passed here -- confirm whether
# row bagging was meant to be active.
hc.model = LGBMClassifier(n_estimators = 10000, 
                          learning_rate = 0.02,
                          n_jobs = 6,
                          num_leaves = 5,
                          max_depth = 5,
                          lambda_l1 = 22.648736497717117,
                          lambda_l2 = 98.67287726505594,
                          min_gain_to_split = 0.29019888246738307,
                          min_sum_hessian_in_leaf = 66.7839604637625,
                          bagging_fraction = 0.5325536786661328,
                          feature_fraction = 0.5885253688928079)
# Stored on hc; presumably the early-stopping patience used during validation -- confirm.
hc.early_stop_rounds = 500

In [None]:
# Run hc's validation routine. hc.feat_imp is read right after this step,
# so it is presumably populated here -- confirm against HomeCreditDataTable.
hc.validate()

In [None]:
# Feature-importance table: one row per column of hc.data and one 'split<n>'
# column per entry of hc.feat_imp (element [1] of each entry).
gi_kf = pd.DataFrame(index=hc.data.columns)
for n, imp in enumerate(hc.feat_imp):
    gi_kf['split%d' % n] = imp[1]

# Append the row-wise mean over all 'split<n>' columns and rank features by it,
# highest first.
gi_kf = (
    gi_kf
    .assign(mean_gain=gi_kf.mean(axis=1))
    .sort_values(by='mean_gain', ascending=False)
)

In [None]:
# Number of top-ranked features (by mean_gain) to keep for the final model.
num = 650
# gi_kf is already sorted by mean_gain in descending order, so head(num)
# yields the `num` highest-ranked feature names.
cols = gi_kf.head(num).index
# Predict using only the selected feature columns, then write the submission
# that hc exposes afterwards to 'sub.csv'.
hc.predict(cols)
hc.submission.to_csv('sub.csv')