In [1]:
import os, subprocess, pickle
import pandas as pd
import numpy as np
from IPython.display import display
from lib_feature_engineering import *

# Combine features

In [2]:
# List the feature files available for joining.
# os.listdir is used instead of shelling out to `ls`: it works on platforms
# without `ls` and returns text names (check_output returns bytes on Python 3).
# sorted() keeps the displayed order deterministic.
sorted(os.listdir("features"))

['baseline_extend.pkl.bz2',
 'baseline.pkl.bz2',
 'bureau_balance.pkl.bz2',
 'bureau.pkl.bz2',
 'credit_card_balance.pkl.bz2',
 'installments_payments.pkl.bz2',
 'mean_encoding_feat_cat.pkl.bz2',
 'pdf_features_label.pkl.bz2',
 'pos_cash.pkl.bz2',
 'prev_app.pkl.bz2']

In [18]:
# Feature sets to join, in merge order: the first entry is read as the base
# table and every following entry is left-joined onto it on SK_ID_CURR.
ls_feat_file = [
    "baseline.pkl.bz2",
    "baseline_extend.pkl.bz2",
    "bureau_balance.pkl.bz2",
    "bureau.pkl.bz2",
    "credit_card_balance.pkl.bz2",
    "installments_payments.pkl.bz2",
    "pos_cash.pkl.bz2",
    "prev_app.pkl.bz2",
]

In [20]:
%%time
# Use the first feature set as the base table; its columns keep their
# original (unprefixed) names.
feat_path = os.path.join("features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")

# Every join below is a left join on SK_ID_CURR, so the row count must stay
# equal to the base table's; remember it to verify each merge.
n_rows_base = len(pdf_combined)

# Join the remaining feature sets one by one
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("features", fname)
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    print(fname, pdf_feat.shape)

    # Prefix every feature column with the table name (file name up to the
    # first dot) so columns coming from different sources cannot collide.
    tbl_prefix = fname.split(".")[0]
    rename_col = {cname: "{}_{}".format(tbl_prefix, cname) for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
    pdf_feat = pdf_feat.rename(columns=rename_col)

    # Left join: ids missing from this feature set get NaN for its columns
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

    # A left join only gains rows when a key matches several right-hand rows,
    # i.e. SK_ID_CURR is duplicated in this feature set. Fail loudly instead
    # of silently duplicating samples.
    assert len(pdf_combined) == n_rows_base, (
        "{}: join changed row count from {} to {}; SK_ID_CURR is not unique in this feature set"
        .format(fname, n_rows_base, len(pdf_combined))
    )

print("rows, columns", pdf_combined.shape)
# All columns except the join key
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())

('baseline_extend.pkl.bz2', (356255, 73))
('bureau_balance.pkl.bz2', (134542, 53))
('bureau.pkl.bz2', (305811, 89))
('credit_card_balance.pkl.bz2', (103558, 101))
('installments_payments.pkl.bz2', (339587, 30))
('pos_cash.pkl.bz2', (337252, 26))
('prev_app.pkl.bz2', (338857, 374))
('rows, columns', (356255, 889))


Unnamed: 0,SK_ID_CURR,is_FLAG_EMP_PHONE,is_FLAG_WORK_PHONE,is_FLAG_PHONE,is_FLAG_EMAIL,is_REG_REGION_NOT_LIVE_REGION,is_REG_REGION_NOT_WORK_REGION,is_LIVE_REGION_NOT_WORK_REGION,is_REG_CITY_NOT_WORK_CITY,is_LIVE_CITY_NOT_WORK_CITY,...,prev_app_AMT_ANNUITY_max,prev_app_AMT_ANNUITY_min,prev_app_AMT_ANNUITY_sum,prev_app_AMT_ANNUITY_mean,prev_app_AMT_ANNUITY_std,prev_app_RATE_DOWN_PAYMENT_max,prev_app_RATE_DOWN_PAYMENT_min,prev_app_RATE_DOWN_PAYMENT_sum,prev_app_RATE_DOWN_PAYMENT_mean,prev_app_RATE_DOWN_PAYMENT_std
0,100002,1,0,1,0,0,0,0,0,0,...,9251.775,9251.775,9251.775,9251.775,,0.0,0.0,0.0,0.0,
1,100003,1,0,1,0,0,0,0,0,0,...,98356.995,6737.31,169661.97,56553.99,46332.557777,0.100061,0.0,0.100061,0.05003,0.070754
2,100004,1,1,1,0,0,0,0,0,0,...,5357.25,5357.25,5357.25,5357.25,,0.212008,0.212008,0.212008,0.212008,
3,100006,1,0,0,0,0,0,0,0,0,...,39954.51,2482.92,141907.05,23651.175,13623.580119,0.21783,0.108994,0.326824,0.163412,0.076958
4,100007,1,0,0,0,0,0,0,1,1,...,22678.785,1834.29,73672.83,12278.805,8063.586466,0.21889,0.100143,0.319033,0.159516,0.083967


CPU times: user 1min 11s, sys: 6.24 s, total: 1min 17s
Wall time: 37.9 s


In [None]:
%%time
if False:
    # Optional step, currently disabled: keep only the features whose
    # evaluation "auc" exceeds a threshold.
    def filter_feat_low_auc(pdf_label, pdf_input, threshold=0.501):
        """Return the names of features whose evaluated auc is above `threshold`.

        `pdf_label` and `pdf_input` are passed straight to `feature_evaluate`,
        whose result is expected to expose `auc` and `name` columns.
        NOTE(review): `feature_evaluate` is not defined in this notebook;
        presumably it comes from lib_feature_engineering - confirm.
        """
        auc_filter = "auc > {}".format(threshold)
        return feature_evaluate(pdf_label, pdf_input).query(auc_filter)["name"].tolist()

    # Load the raw training applications
    data_path = "home-credit-default-risk/application_train.csv"
    pdf_train = pd.read_csv(data_path)

    # Keep the rows tagged 'train' whose id also appears in application_train,
    # then drop the split column
    pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
    pdf_train_filtered = (
        pdf_tvt_extend
        .query("tvt_code == 'train'")
        .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
        .drop(columns=["tvt_code"])
    )

    # Restrict the combined frame to the join key plus the surviving features
    ls_filtered_feat = filter_feat_low_auc(pdf_train_filtered, pdf_combined, threshold=0.501)
    pdf_combined = pdf_combined[["SK_ID_CURR"] + ls_filtered_feat]
    print("After filtered: {}".format(pdf_combined.shape))

# Join with label

In [21]:
# Load the label / split table (SK_ID_CURR, TARGET, tvt_code).
# The file name carries no .bz2 suffix, so the compression is passed explicitly.
pdf_tvt = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
print(pdf_tvt.shape)
display(pdf_tvt.head(n=5))

(356255, 3)


Unnamed: 0,SK_ID_CURR,TARGET,tvt_code
0,100002,1,train
1,100003,0,train
2,100004,0,train
3,100006,0,train
4,100007,0,train


In [22]:
# Number of rows in each split, largest first
pdf_tvt.loc[:, "tvt_code"].value_counts()

train          216948
kaggle_test     48744
test            46127
val             44436
Name: tvt_code, dtype: int64

In [23]:
# Attach the combined features to the label / split table.
# Left join: every row of pdf_tvt is kept; ids without a feature row get NaN.
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")

# A left join gains rows only when SK_ID_CURR is duplicated in pdf_combined;
# that would duplicate labelled samples, so stop here if it happens.
assert len(pdf_features_label) == len(pdf_tvt), (
    "join changed row count from {} to {}; SK_ID_CURR is not unique in pdf_combined"
    .format(len(pdf_tvt), len(pdf_features_label))
)

print(pdf_features_label.shape)
# Transposed so the first feature columns are readable as rows
display(pdf_features_label.head().T)

(356255, 891)


Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100002,100003,100004,100006,100007
TARGET,1,0,0,0,0
tvt_code,train,train,train,train,train
is_FLAG_EMP_PHONE,1,1,1,1,1
is_FLAG_WORK_PHONE,0,0,1,0,0
is_FLAG_PHONE,1,1,1,0,0
is_FLAG_EMAIL,0,0,0,0,0
is_REG_REGION_NOT_LIVE_REGION,0,0,0,0,0
is_REG_REGION_NOT_WORK_REGION,0,0,0,0,0
is_LIVE_REGION_NOT_WORK_REGION,0,0,0,0,0


In [30]:
%%time
# Persist the combined features together with the label.
# Pickle variant kept for reference:
# pdf_features_label.to_pickle(os.path.join("features", "pdf_features_label.pkl.bz2"), compression="bz2")
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an unnamed first column - confirm downstream readers expect it.
csv_path = os.path.join("features", "pdf_features_label.csv.bz2")
pdf_features_label.to_csv(csv_path, compression="bz2")

CPU times: user 5min 7s, sys: 4.19 s, total: 5min 11s
Wall time: 5min 12s
