In [1]:
import os, math, subprocess
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn import metrics

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)

# Load data

In [2]:
# load train/test data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)

data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)

# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                      .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                      .drop(columns=["tvt_code"]))
pdf_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100002,1
1,100003,0
2,100004,0
3,100006,0
4,100007,0


In [3]:
# load bureau + balance
data_path = "home-credit-default-risk/bureau.csv"
pdf_bureau = pd.read_csv(data_path)

data_path = "home-credit-default-risk/bureau_balance.csv"
pdf_bureau_balance = pd.read_csv(data_path)

# Preprocess data

In [4]:
def my_auc(y_score,y_true,flexible_sign=True):
    # filter NaN
    idx  = np.isfinite(y_score)
    xxx  = y_score[idx]
    yyy  = y_true[idx]
    
    # if label not only 1s/0s
    if yyy.std() > 0.0:
        auc  = metrics.roc_auc_score(y_score=xxx,y_true=yyy)
    else:
        auc  = 0.5
        
    # for evaluation only
    if (auc < 0.5) & (flexible_sign):
        auc = 1.0 - auc
    return auc

def feature_evaluate(pdf_train, pdf_feat, ls_feat=None):
    out_res = {
        "name": [],
        "auc": [],
        "corr": [],
        "coverage": []
    }
    pdf_eval = pdf_train.merge(pdf_feat, on="SK_ID_CURR")
    if ls_feat is None:
        ls_feat = [cname for cname in pdf_feat.columns if cname != "SK_ID_CURR"]
        
    # calculate correlation
    pdf_corr = pdf_eval.corr()
    
    for feat in ls_feat:
        out_res["name"].append(feat)
        out_res["auc"].append(my_auc(pdf_eval[feat], pdf_eval["TARGET"]))
        out_res["corr"].append(pdf_corr.loc[feat, "TARGET"])
        out_res["coverage"].append((~pdf_eval[feat].isna()).mean())
        
    pdf_res = pd.DataFrame(out_res)
    pdf_res = pdf_res[["name", "auc", "corr", "coverage"]].sort_values(by="auc", ascending=False)
    return pdf_res

In [5]:
def gen_one_hot_feat(pdf_input, dict_feat, main_key="SK_ID_CURR"):

    pdf_data = pdf_input.copy()
    select_features = []    

    for cname in dict_feat:
        ls_vals = dict_feat[cname]
        for val in ls_vals:
            try:
                new_name = "{}_{}".format(cname, val.replace(" ", "_")\
                                                      .replace(":", "_")\
                                                      .replace("/", "_")\
                                                      .replace("-", "_"))

                select_features.append(new_name)
                pdf_data[new_name] = pdf_data[cname].apply(lambda x: int(x == val))
            except Exception as err:
                print("One hot for {}-{}. Error: {}".format(cname, val, err))                        
            
                
    return pdf_data[[main_key] + select_features]

In [6]:
def agg_common_data(pdf_input, ls_func, main_key="SK_ID_CURR"):
    ls_agg_name = [cname for cname in pdf_input.columns if cname != main_key]

    # define agg
    dict_agg = {}
    for name in ls_agg_name:
        dict_agg[name] = ls_func
    display(dict_agg)

    # do agg
    pdf_agg = pdf_input.groupby(main_key).agg(dict_agg)
    print("After agg: {}".format(pdf_agg.shape))

    # rename columns
    name01 = pdf_agg.columns.get_level_values(0)
    name02 = pdf_agg.columns.get_level_values(1)
    rename_cols = ["{}_{}".format(tpl[0], tpl[1]) for tpl in zip(name01, name02)]
    pdf_agg.columns = rename_cols

    return pdf_agg

# Feature engineering

- status: binary, frequency
- count: number bureau, number bureau transaction

In [7]:
pdf_bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [8]:
pdf_bureau_balance["STATUS"].value_counts().index.tolist()

['C', '0', 'X', '1', '5', '2', '3', '4']

In [9]:
%%time
dict_onehot = {
    "STATUS": ['C', '0', 'X', '1', '5', '2', '3', '4'],
}
pdf_onehot = gen_one_hot_feat(pdf_bureau_balance, dict_onehot, main_key="SK_ID_BUREAU")
display(pdf_onehot.head())

Unnamed: 0,SK_ID_BUREAU,STATUS_C,STATUS_0,STATUS_X,STATUS_1,STATUS_5,STATUS_2,STATUS_3,STATUS_4
0,5715448,1,0,0,0,0,0,0,0
1,5715448,1,0,0,0,0,0,0,0
2,5715448,1,0,0,0,0,0,0,0
3,5715448,1,0,0,0,0,0,0,0
4,5715448,1,0,0,0,0,0,0,0


CPU times: user 1min 19s, sys: 7.43 s, total: 1min 27s
Wall time: 1min 8s


In [10]:
%%time
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum"], main_key="SK_ID_BUREAU")
pdf_agg01.head()

{'STATUS_0': ['max', 'sum'],
 'STATUS_1': ['max', 'sum'],
 'STATUS_2': ['max', 'sum'],
 'STATUS_3': ['max', 'sum'],
 'STATUS_4': ['max', 'sum'],
 'STATUS_5': ['max', 'sum'],
 'STATUS_C': ['max', 'sum'],
 'STATUS_X': ['max', 'sum']}

After agg: (817395, 16)
CPU times: user 17.4 s, sys: 1.13 s, total: 18.5 s
Wall time: 4.07 s


In [11]:
pdf_agg02 = pdf_bureau_balance.groupby("SK_ID_BUREAU").size().to_frame("bureau_num_trans")
pdf_agg02.head()

Unnamed: 0_level_0,bureau_num_trans
SK_ID_BUREAU,Unnamed: 1_level_1
5001709,97
5001710,83
5001711,4
5001712,19
5001713,22


In [12]:
pdf_agg = pdf_agg01.join(pdf_agg02).reset_index()
pdf_agg.head()

Unnamed: 0,SK_ID_BUREAU,STATUS_C_max,STATUS_C_sum,STATUS_X_max,STATUS_X_sum,STATUS_5_max,STATUS_5_sum,STATUS_4_max,STATUS_4_sum,STATUS_3_max,STATUS_3_sum,STATUS_2_max,STATUS_2_sum,STATUS_1_max,STATUS_1_sum,STATUS_0_max,STATUS_0_sum,bureau_num_trans
0,5001709,1,86,1,11,0,0,0,0,0,0,0,0,0,0,0,0,97
1,5001710,1,48,1,30,0,0,0,0,0,0,0,0,0,0,1,5,83
2,5001711,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,3,4
3,5001712,1,9,0,0,0,0,0,0,0,0,0,0,0,0,1,10,19
4,5001713,0,0,1,22,0,0,0,0,0,0,0,0,0,0,0,0,22


# Join application to bureau balance

In [13]:
# join 
pdf_ids = pd.concat([pdf_train[["SK_ID_CURR"]], pdf_test[["SK_ID_CURR"]]])
pdf_data = (pdf_ids.merge(pdf_bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], on="SK_ID_CURR")
            .merge(pdf_agg, on="SK_ID_BUREAU"))
print(pdf_data.shape)
pdf_data.head()

(774354, 19)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,STATUS_C_max,STATUS_C_sum,STATUS_X_max,STATUS_X_sum,STATUS_5_max,STATUS_5_sum,STATUS_4_max,STATUS_4_sum,STATUS_3_max,STATUS_3_sum,STATUS_2_max,STATUS_2_sum,STATUS_1_max,STATUS_1_sum,STATUS_0_max,STATUS_0_sum,bureau_num_trans
0,100002,6158904,1,2,1,1,0,0,0,0,0,0,0,0,1,1,1,18,22
1,100002,6158905,1,13,0,0,0,0,0,0,0,0,0,0,0,0,1,3,16
2,100002,6158906,1,2,1,3,0,0,0,0,0,0,0,0,1,6,1,5,16
3,100002,6158907,1,2,1,3,0,0,0,0,0,0,0,0,1,6,1,5,16
4,100002,6158908,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,2,4


In [14]:
# statistics on status_sum, bureau_num_trans
ls_cols = [cname for cname in pdf_data.columns if "_sum" in cname] + ["bureau_num_trans"]
pdf_stats = pdf_data[["SK_ID_CURR"] + ls_cols]
pdf_stats.head()

Unnamed: 0,SK_ID_CURR,STATUS_C_sum,STATUS_X_sum,STATUS_5_sum,STATUS_4_sum,STATUS_3_sum,STATUS_2_sum,STATUS_1_sum,STATUS_0_sum,bureau_num_trans
0,100002,2,1,0,0,0,0,1,18,22
1,100002,13,0,0,0,0,0,0,3,16
2,100002,2,3,0,0,0,0,6,5,16
3,100002,2,3,0,0,0,0,6,5,16
4,100002,0,0,0,0,0,0,2,2,4


In [15]:
pdf_agg03 = agg_common_data(pdf_stats, ["min", "max", "mean", "std"])
pdf_agg03.head()

{'STATUS_0_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_1_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_2_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_3_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_4_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_5_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_C_sum': ['min', 'max', 'mean', 'std'],
 'STATUS_X_sum': ['min', 'max', 'mean', 'std'],
 'bureau_num_trans': ['min', 'max', 'mean', 'std']}

After agg: (134542, 36)


Unnamed: 0_level_0,STATUS_C_sum_min,STATUS_C_sum_max,STATUS_C_sum_mean,STATUS_C_sum_std,STATUS_4_sum_min,STATUS_4_sum_max,STATUS_4_sum_mean,STATUS_4_sum_std,bureau_num_trans_min,bureau_num_trans_max,bureau_num_trans_mean,bureau_num_trans_std,STATUS_2_sum_min,STATUS_2_sum_max,STATUS_2_sum_mean,STATUS_2_sum_std,STATUS_5_sum_min,STATUS_5_sum_max,STATUS_5_sum_mean,STATUS_5_sum_std,STATUS_0_sum_min,STATUS_0_sum_max,STATUS_0_sum_mean,STATUS_0_sum_std,STATUS_1_sum_min,STATUS_1_sum_max,STATUS_1_sum_mean,STATUS_1_sum_std,STATUS_3_sum_min,STATUS_3_sum_max,STATUS_3_sum_mean,STATUS_3_sum_std,STATUS_X_sum_min,STATUS_X_sum_max,STATUS_X_sum_mean,STATUS_X_sum_std
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
100001,0,44,15.7143,16.9972,0,0,0.0,0.0,2,52,24.5714,16.0505,0,0,0.0,0.0,0,0,0.0,0.0,1,12,4.4286,4.8941,0,1,0.1429,0.378,0,0,0.0,0.0,0,9,4.2857,3.8173
100002,0,13,2.875,4.1897,0,0,0.0,0.0,4,22,13.75,6.364,0,0,0.0,0.0,0,0,0.0,0.0,2,18,5.625,5.1807,0,6,3.375,2.8754,0,0,0.0,0.0,0,3,1.875,1.3562
100005,0,5,1.6667,2.8868,0,0,0.0,0.0,3,13,7.0,5.2915,0,0,0.0,0.0,0,0,0.0,0.0,2,7,4.6667,2.5166,0,0,0.0,0.0,0,0,0.0,0.0,0,1,0.6667,0.5774
100010,26,26,26.0,0.0,0,0,0.0,0.0,36,36,36.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,10,10,10.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0
100013,0,44,25.75,20.7264,0,0,0.0,0.0,40,69,57.5,13.1276,0,0,0.0,0.0,0,0,0.0,0.0,0,34,19.75,14.523,0,3,1.75,1.2583,0,0,0.0,0.0,0,40,10.25,19.8389


In [16]:
feature_evaluate(pdf_train_filtered, pdf_agg03.reset_index())


Unnamed: 0,name,auc,corr,coverage
10,bureau_num_trans_mean,0.5928,-0.0785,1.0
9,bureau_num_trans_max,0.5728,-0.0668,1.0
2,STATUS_C_sum_mean,0.5717,-0.0618,1.0
8,bureau_num_trans_min,0.5708,-0.0459,1.0
1,STATUS_C_sum_max,0.5625,-0.0591,1.0
3,STATUS_C_sum_std,0.5596,-0.0542,0.8679
11,bureau_num_trans_std,0.5523,-0.0458,0.8679
22,STATUS_0_sum_mean,0.5426,-0.0387,1.0
26,STATUS_1_sum_mean,0.5418,0.0316,1.0
25,STATUS_1_sum_max,0.5403,0.0241,1.0


In [17]:
# keep binary values
ls_cols = [cname for cname in pdf_data.columns if "_max" in cname]
pdf_binary = pdf_data[["SK_ID_CURR"] + ls_cols]
pdf_binary.head()

Unnamed: 0,SK_ID_CURR,STATUS_C_max,STATUS_X_max,STATUS_5_max,STATUS_4_max,STATUS_3_max,STATUS_2_max,STATUS_1_max,STATUS_0_max
0,100002,1,1,0,0,0,0,1,1
1,100002,1,0,0,0,0,0,0,1
2,100002,1,1,0,0,0,0,1,1
3,100002,1,1,0,0,0,0,1,1
4,100002,0,0,0,0,0,0,1,1


In [18]:
pdf_agg04 = agg_common_data(pdf_binary, ["max"])
pdf_agg04.head()

{'STATUS_0_max': ['max'],
 'STATUS_1_max': ['max'],
 'STATUS_2_max': ['max'],
 'STATUS_3_max': ['max'],
 'STATUS_4_max': ['max'],
 'STATUS_5_max': ['max'],
 'STATUS_C_max': ['max'],
 'STATUS_X_max': ['max']}

After agg: (134542, 8)


Unnamed: 0_level_0,STATUS_4_max_max,STATUS_C_max_max,STATUS_2_max_max,STATUS_0_max_max,STATUS_3_max_max,STATUS_X_max_max,STATUS_1_max_max,STATUS_5_max_max
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,0,1,0,1,0,1,1,0
100002,0,1,0,1,0,1,1,0
100005,0,1,0,1,0,1,0,0
100010,0,1,0,1,0,0,0,0
100013,0,1,0,1,0,1,1,0


In [19]:
feature_evaluate(pdf_train_filtered, pdf_agg04.reset_index())


Unnamed: 0,name,auc,corr,coverage
6,STATUS_1_max_max,0.5381,0.0446,1.0
1,STATUS_C_max_max,0.5246,-0.0335,1.0
2,STATUS_2_max_max,0.505,0.0115,1.0
7,STATUS_5_max_max,0.5045,0.0161,1.0
4,STATUS_3_max_max,0.5044,0.0143,1.0
0,STATUS_4_max_max,0.5042,0.0158,1.0
5,STATUS_X_max_max,0.502,0.0027,1.0
3,STATUS_0_max_max,0.5001,0.0003,1.0


# save features

In [20]:
pdf_feat = pdf_agg03.join(pdf_agg04)
print(pdf_feat.shape)
pdf_feat.head()

(134542, 44)


Unnamed: 0_level_0,STATUS_C_sum_min,STATUS_C_sum_max,STATUS_C_sum_mean,STATUS_C_sum_std,STATUS_4_sum_min,STATUS_4_sum_max,STATUS_4_sum_mean,STATUS_4_sum_std,bureau_num_trans_min,bureau_num_trans_max,bureau_num_trans_mean,bureau_num_trans_std,STATUS_2_sum_min,STATUS_2_sum_max,STATUS_2_sum_mean,STATUS_2_sum_std,STATUS_5_sum_min,STATUS_5_sum_max,STATUS_5_sum_mean,STATUS_5_sum_std,STATUS_0_sum_min,STATUS_0_sum_max,STATUS_0_sum_mean,STATUS_0_sum_std,STATUS_1_sum_min,STATUS_1_sum_max,STATUS_1_sum_mean,STATUS_1_sum_std,STATUS_3_sum_min,STATUS_3_sum_max,STATUS_3_sum_mean,STATUS_3_sum_std,STATUS_X_sum_min,STATUS_X_sum_max,STATUS_X_sum_mean,STATUS_X_sum_std,STATUS_4_max_max,STATUS_C_max_max,STATUS_2_max_max,STATUS_0_max_max,STATUS_3_max_max,STATUS_X_max_max,STATUS_1_max_max,STATUS_5_max_max
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
100001,0,44,15.7143,16.9972,0,0,0.0,0.0,2,52,24.5714,16.0505,0,0,0.0,0.0,0,0,0.0,0.0,1,12,4.4286,4.8941,0,1,0.1429,0.378,0,0,0.0,0.0,0,9,4.2857,3.8173,0,1,0,1,0,1,1,0
100002,0,13,2.875,4.1897,0,0,0.0,0.0,4,22,13.75,6.364,0,0,0.0,0.0,0,0,0.0,0.0,2,18,5.625,5.1807,0,6,3.375,2.8754,0,0,0.0,0.0,0,3,1.875,1.3562,0,1,0,1,0,1,1,0
100005,0,5,1.6667,2.8868,0,0,0.0,0.0,3,13,7.0,5.2915,0,0,0.0,0.0,0,0,0.0,0.0,2,7,4.6667,2.5166,0,0,0.0,0.0,0,0,0.0,0.0,0,1,0.6667,0.5774,0,1,0,1,0,1,0,0
100010,26,26,26.0,0.0,0,0,0.0,0.0,36,36,36.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,10,10,10.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,1,0,1,0,0,0,0
100013,0,44,25.75,20.7264,0,0,0.0,0.0,40,69,57.5,13.1276,0,0,0.0,0.0,0,0,0.0,0.0,0,34,19.75,14.523,0,3,1.75,1.2583,0,0,0.0,0.0,0,40,10.25,19.8389,0,1,0,1,0,1,1,0


In [21]:
%%time
fname = "bureau_balance"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")


Store features completed!
CPU times: user 2.72 s, sys: 46.1 ms, total: 2.76 s
Wall time: 1.33 s
