In [1]:
import os, subprocess
import pandas as pd
import numpy as np
from IPython.display import display
from lib_feature_engineering import *

# Combine features

In [2]:
# Check which feature files are available.
# Uses os.listdir instead of shelling out to `ls`, so the cell does not depend
# on an external binary. The order is a plain string sort, which can differ
# from the locale-aware order that `ls` prints.
sorted(os.listdir("features"))

['baseline_extend.pkl.bz2',
 'baseline.pkl.bz2',
 'bureau_balance.pkl.bz2',
 'bureau.pkl.bz2',
 'mean_encoding_feat_cat.pkl.bz2',
 'pdf_features_label.pkl.bz2']

In [3]:
# Feature sets to join, in merge order. Every file shares the ".pkl.bz2"
# suffix, so only the stems are spelled out here.
ls_feat_file = list(
    stem + ".pkl.bz2"
    for stem in (
        "baseline",
        "baseline_extend",
        "bureau",
        "bureau_balance",
        "mean_encoding_feat_cat",
    )
)

In [4]:
%%time
# Build one wide feature table keyed by SK_ID_CURR.
# The first feature set is the base table; its columns keep their original names.
feat_path = os.path.join("features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")
n_rows_base = len(pdf_combined)

# Left-join every remaining feature set onto the base table.
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("features", fname)
    # Pass the compression explicitly, the same way the base table is read.
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")

    # Prefix each feature column with its file stem (e.g. "bureau_") so columns
    # from different feature sets do not collide with each other; the join key
    # keeps its name.
    tbl_prefix = fname.split(".")[0]
    rename_col = {cname: "{}_{}".format(tbl_prefix, cname) for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
    pdf_feat = pdf_feat.rename(columns=rename_col)

    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

    # A left join never drops rows, so a changed row count means the feature
    # set repeats SK_ID_CURR and has duplicated base rows. Fail loudly here
    # instead of carrying the duplicates into the saved feature file.
    assert len(pdf_combined) == n_rows_base, (
        "joining {} changed the row count from {} to {}; "
        "SK_ID_CURR is not unique in that feature set"
        .format(fname, n_rows_base, len(pdf_combined)))

print("rows, columns", pdf_combined.shape)
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())

('rows, columns', (356255, 446))


Unnamed: 0,SK_ID_CURR,is_FLAG_EMP_PHONE,is_FLAG_WORK_PHONE,is_FLAG_PHONE,is_FLAG_EMAIL,is_REG_REGION_NOT_LIVE_REGION,is_REG_REGION_NOT_WORK_REGION,is_LIVE_REGION_NOT_WORK_REGION,is_REG_CITY_NOT_WORK_CITY,is_LIVE_CITY_NOT_WORK_CITY,...,mean_encoding_feat_cat_CREDIT_CURRENCY_mean_encoding_max,mean_encoding_feat_cat_CREDIT_CURRENCY_mean_encoding_sum,mean_encoding_feat_cat_CREDIT_CURRENCY_mean_encoding_min,mean_encoding_feat_cat_CREDIT_CURRENCY_mean_encoding_mean,mean_encoding_feat_cat_CREDIT_CURRENCY_mean_encoding_std,mean_encoding_feat_cat_CREDIT_TYPE_mean_encoding_max,mean_encoding_feat_cat_CREDIT_TYPE_mean_encoding_sum,mean_encoding_feat_cat_CREDIT_TYPE_mean_encoding_min,mean_encoding_feat_cat_CREDIT_TYPE_mean_encoding_mean,mean_encoding_feat_cat_CREDIT_TYPE_mean_encoding_std
0,100002,1,0,1,0,0,0,0,0,0,...,0.078385,40.132965,0.078385,0.078385,0.0,0.08879,41.896807,0.07487,0.08183,0.006967
1,100003,1,0,1,0,0,0,0,0,0,...,0.078385,5.016621,0.078385,0.078385,0.0,0.08879,5.237101,0.07487,0.08183,0.007015
2,100004,1,1,1,0,0,0,0,0,0,...,0.078385,0.627078,0.078385,0.078385,0.0,0.07487,0.598959,0.07487,0.07487,0.0
3,100006,1,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4,100007,1,0,0,0,0,0,0,1,1,...,0.078385,0.078385,0.078385,0.078385,,0.07487,0.07487,0.07487,0.07487,


CPU times: user 22.9 s, sys: 3.01 s, total: 26 s
Wall time: 17 s


In [5]:
%%time
def filter_feat_low_auc(pdf_label, pdf_input, threshold=0.501):
    """Return the names of the features whose AUC is above ``threshold``.

    Scoring is delegated to ``feature_evaluate``, which is not defined in this
    notebook and is expected to come from the ``lib_feature_engineering`` star
    import. Its result must expose a ``name`` and an ``auc`` column.

    Parameters
    ----------
    pdf_label : DataFrame
        Labelled rows, passed through to ``feature_evaluate`` unchanged.
    pdf_input : DataFrame
        Candidate features, passed through to ``feature_evaluate`` unchanged.
    threshold : float
        Features are kept only when their ``auc`` is strictly greater.

    Returns
    -------
    list
        The ``name`` values of the features that pass the threshold.
    """
    pdf_eval = feature_evaluate(pdf_label, pdf_input)
    # Compare numerically instead of formatting the threshold into a query
    # string: str() of a float can round it (12 significant digits on Python 2).
    is_kept = pdf_eval["auc"] > threshold
    return pdf_eval.loc[is_kept, "name"].tolist()


if True:  # set to False to keep every feature and skip the AUC scoring
    # Only the application ids are used from the raw training file, so the
    # other columns are not parsed; pdf_train holds SK_ID_CURR only.
    data_path = "home-credit-default-risk/application_train.csv"
    pdf_train = pd.read_csv(data_path, usecols=["SK_ID_CURR"])

    # Rows of the 'train' split that also appear in the training file.
    # The pickle name has no ".bz2" suffix, so the compression is given explicitly.
    pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
    pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                          .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                          .drop(columns=["tvt_code"]))

    # Keep the join key plus the features that passed the AUC threshold.
    ls_filtered_feat = filter_feat_low_auc(pdf_train_filtered, pdf_combined, threshold=0.501)
    pdf_combined = pdf_combined[["SK_ID_CURR"] + ls_filtered_feat]
    print("After filtered: {}".format(pdf_combined.shape))

After filtered: (356255, 309)
CPU times: user 2min 33s, sys: 4.24 s, total: 2min 37s
Wall time: 2min 26s


# Join with label

In [6]:
# Load the table that is joined with the features below and preview it.
# The file name has no ".bz2" suffix, so the compression is given explicitly.
tvt_path = "pdf_tvt_extend.pkl"
pdf_tvt = pd.read_pickle(tvt_path, compression="bz2")
print(pdf_tvt.shape)
display(pdf_tvt.head())

(356255, 3)


Unnamed: 0,SK_ID_CURR,TARGET,tvt_code
0,100002,1,train
1,100003,0,train
2,100004,0,train
3,100006,0,train
4,100007,0,train


In [7]:
# Number of rows per tvt_code value.
split_sizes = pdf_tvt["tvt_code"].value_counts()
split_sizes

train          216948
kaggle_test     48744
test            46127
val             44436
Name: tvt_code, dtype: int64

In [8]:
# Attach the combined features to every row of pdf_tvt; the left join keeps
# rows that have no matching features.
pdf_features_label = pd.merge(pdf_tvt, pdf_combined, how="left", on="SK_ID_CURR")
print(pdf_features_label.shape)
# Transposed preview: one line per column, which reads better for a wide table.
display(pdf_features_label.head().T)

(356255, 311)


Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100002,100003,100004,100006,100007
TARGET,1,0,0,0,0
tvt_code,train,train,train,train,train
baseline_extend_EXT_SOURCE_3,0.139376,,0.729567,,
baseline_extend_EXT_SOURCE_1,0.083037,0.311267,,,
baseline_extend_EXT_SOURCE_2,0.262949,0.622246,0.555912,0.650442,0.322738
bureau_DAYS_ENDDATE_FACT_TO_YEARS_mean,2.39452,3.83767,2.37534,,3.14795
bureau_DAYS_CREDIT_TO_YEARS_mean,2.39452,3.83767,2.37534,,3.14795
bureau_DAYS_CREDIT_UPDATE_TO_YEARS_mean,2.39452,3.83767,2.37534,,3.14795
bureau_DAYS_CREDIT_ENDDATE_TO_YEARS_mean,2.39452,3.83767,2.37534,,3.14795


In [9]:
%%time
# Persist the combined features together with the label as a bz2-compressed pickle.
out_path = os.path.join("features", "pdf_features_label.pkl.bz2")
pdf_features_label.to_pickle(out_path, compression="bz2")

CPU times: user 1min 12s, sys: 709 ms, total: 1min 13s
Wall time: 1min 12s
