In [1]:
# Full width: inject a CSS rule so the notebook container spans the whole page.
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

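# helper libraries (star imports); presumably the source of gen_one_hot_feat, agg_common_data and feature_evaluate used below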
from lib_modeling import *
from lib_feature_engineering import *

# Display settings for Pandas results: wide console, up to 500 rows/columns,
# 4 digits of precision.
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# None means "never truncate cell contents". The former sentinel -1 is
# deprecated since pandas 1.0 and rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

# Load data

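- Load the training applications and the `tvt_code` split table; rows with `tvt_code == 'train'` form `pdf_train_filtered`, which is used for feature evaluation
- Load the train/test application tables for applying mean encoding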

In [4]:
# Read the raw application tables (train first, then test).
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)

data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)

# Restrict the split table to the rows tagged 'train', keep only the ids that
# also appear in the training applications, and drop the split tag.
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (
    pdf_tvt_extend.loc[pdf_tvt_extend["tvt_code"] == "train"]
    .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
    .drop(columns=["tvt_code"])
)
pdf_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100002,1
1,100003,0
2,100004,0
3,100006,0
4,100007,0


In [5]:
# load the credit card balance table (credit_card_balance.csv)
data_path = "home-credit-default-risk/credit_card_balance.csv"
pdf_data = pd.read_csv(data_path)
# report the (rows, columns) shape, then preview the first rows
print(pdf_data.shape)
pdf_data.head()

(3840312, 23)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [6]:
# load the column-level metadata report for credit_card_balance;
# later cells read its `name` and `sub_type` columns to split the
# attributes into categorical / integer / float groups
meta_path = "../02_pandas/reports/report_credit_card_balance.csv"
pdf_meta = pd.read_csv(meta_path)

# Categorical features

In [7]:
# Names of the attributes whose reported sub_type is 'object' (categoricals).
ls_cate = pdf_meta.loc[pdf_meta["sub_type"] == "object", "name"].tolist()
ls_cate

['NAME_CONTRACT_STATUS']

In [8]:
# Map every categorical column to the list of its observed values,
# ordered as returned by value_counts().
dict_onehot = {
    cate: pdf_data[cate].value_counts().index.tolist()
    for cate in ls_cate
}


### One-hot encoding

In [9]:
%%time
# Build one-hot columns for the categorical attributes listed in dict_onehot,
# keyed by SK_ID_CURR. gen_one_hot_feat is not defined in this notebook;
# presumably it comes from one of the star-imported helper libraries -- TODO confirm.
pdf_onehot = gen_one_hot_feat(pdf_data, dict_onehot, main_key="SK_ID_CURR")
# Sanity check on the resulting (rows, columns) shape.
print(pdf_onehot.shape)

(3840312, 8)
CPU times: user 23.7 s, sys: 1.46 s, total: 25.2 s
Wall time: 10.4 s


In [10]:
%%time
# Aggregate the one-hot columns with max / sum / mean, using SK_ID_CURR as the key
# (presumably one output row per SK_ID_CURR -- verify in agg_common_data).
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"], main_key="SK_ID_CURR")
# Score the aggregated features against the filtered training set; the displayed
# result has name / auc / corr / coverage columns.
eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)
display(eval_agg01)

{'NAME_CONTRACT_STATUS_Active': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Approved': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Completed': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Demand': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Refused': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Sent_proposal': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Signed': ['max', 'sum', 'mean']}

After agg: (103558, 21)


Unnamed: 0,name,auc,corr,coverage
1,NAME_CONTRACT_STATUS_Active_sum,0.5549,-0.0591,1.0
2,NAME_CONTRACT_STATUS_Active_mean,0.521,0.0235,1.0
7,NAME_CONTRACT_STATUS_Completed_sum,0.5155,-0.0216,1.0
8,NAME_CONTRACT_STATUS_Completed_mean,0.5155,-0.0235,1.0
6,NAME_CONTRACT_STATUS_Completed_max,0.5154,-0.0264,1.0
19,NAME_CONTRACT_STATUS_Signed_sum,0.5058,-0.0066,1.0
18,NAME_CONTRACT_STATUS_Signed_max,0.5058,-0.015,1.0
20,NAME_CONTRACT_STATUS_Signed_mean,0.5056,-0.0053,1.0
11,NAME_CONTRACT_STATUS_Sent_proposal_mean,0.5016,-0.0127,1.0
10,NAME_CONTRACT_STATUS_Sent_proposal_sum,0.5016,-0.0126,1.0


CPU times: user 3.2 s, sys: 98.6 ms, total: 3.3 s
Wall time: 1.43 s


In [11]:
# Shape of the evaluation rows whose AUC does not exceed 0.501.
eval_agg01[eval_agg01["auc"] <= 0.501].shape

(10, 4)

In [12]:
# Keep only the aggregated one-hot features whose AUC is above 0.501.
sel_feat = eval_agg01.loc[eval_agg01["auc"] > 0.501, "name"].tolist()
pdf_agg01 = pdf_agg01.loc[:, sel_feat]
print(pdf_agg01.shape)

(103558, 11)


# Numerical features

In [13]:
# Names of the int64 attributes, leaving out every column whose name contains "SK".
ls_num = [
    col
    for col in pdf_meta.loc[pdf_meta["sub_type"] == "int64", "name"].tolist()
    if "SK" not in col
]
ls_num

['MONTHS_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'CNT_DRAWINGS_CURRENT']

In [14]:
# Independent copy of the two id columns plus the integer attributes.
pdf_num = pdf_data.loc[:, ["SK_ID_PREV", "SK_ID_CURR", *ls_num]].copy()
pdf_num.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,CNT_DRAWINGS_CURRENT
0,2562384,378907,-6,135000,1
1,2582071,363914,-1,45000,1
2,1740877,371185,-7,450000,0
3,1389973,337855,-4,225000,1
4,1891521,126868,-1,450000,1


In [15]:
# Flip the sign of MONTHS_BALANCE (the previewed rows hold negative values).
# The negation is computed from the untouched pdf_data column instead of from
# pdf_num itself, so re-running this cell is idempotent and never flips the
# sign back. pdf_num is a copy of pdf_data columns, so both share the same index.
pdf_num["MONTHS_BALANCE"] = pdf_data["MONTHS_BALANCE"] * -1

In [16]:
%%time
# Aggregate the integer attributes with max / min / sum / mean / std, using
# SK_ID_CURR as the key; SK_ID_PREV is left out of the input on purpose.
pdf_agg02 = agg_common_data(pdf_num[["SK_ID_CURR"] + ls_num], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
# Score the aggregated features against the filtered training set and show the table.
eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)
display(eval_agg02)

{'AMT_CREDIT_LIMIT_ACTUAL': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'MONTHS_BALANCE': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (103558, 15)


Unnamed: 0,name,auc,corr,coverage
14,CNT_DRAWINGS_CURRENT_std,0.6257,0.1087,0.9929
13,CNT_DRAWINGS_CURRENT_mean,0.625,0.0843,1.0
10,CNT_DRAWINGS_CURRENT_max,0.6153,0.1011,1.0
12,CNT_DRAWINGS_CURRENT_sum,0.5969,0.0504,1.0
3,MONTHS_BALANCE_mean,0.5627,-0.0622,1.0
0,MONTHS_BALANCE_max,0.5606,-0.0613,1.0
2,MONTHS_BALANCE_sum,0.5597,-0.0589,1.0
4,MONTHS_BALANCE_std,0.5582,-0.0605,0.9929
7,AMT_CREDIT_LIMIT_ACTUAL_sum,0.547,-0.0426,1.0
1,MONTHS_BALANCE_min,0.5259,-0.0309,1.0


CPU times: user 2.74 s, sys: 185 ms, total: 2.93 s
Wall time: 1.22 s


# Continuous features

In [17]:
# Names of the attributes whose reported sub_type is 'float64' (continuous).
ls_con = pdf_meta.loc[pdf_meta["sub_type"] == "float64", "name"].tolist()
ls_con

['AMT_BALANCE',
 'AMT_DRAWINGS_ATM_CURRENT',
 'AMT_DRAWINGS_CURRENT',
 'AMT_DRAWINGS_OTHER_CURRENT',
 'AMT_DRAWINGS_POS_CURRENT',
 'AMT_INST_MIN_REGULARITY',
 'AMT_PAYMENT_CURRENT',
 'AMT_PAYMENT_TOTAL_CURRENT',
 'AMT_RECEIVABLE_PRINCIPAL',
 'AMT_RECIVABLE',
 'AMT_TOTAL_RECEIVABLE',
 'CNT_DRAWINGS_ATM_CURRENT',
 'CNT_DRAWINGS_OTHER_CURRENT',
 'CNT_DRAWINGS_POS_CURRENT',
 'CNT_INSTALMENT_MATURE_CUM']

In [18]:
# Independent copy of the two id columns plus the continuous attributes.
pdf_con = pdf_data.loc[:, ["SK_ID_PREV", "SK_ID_CURR", *ls_con]].copy()
print(pdf_con.shape)

(3840312, 17)


In [19]:
%%time
# Aggregate the continuous attributes with max / min / sum / mean / std, using
# SK_ID_CURR as the key; SK_ID_PREV is left out of the input on purpose.
pdf_agg03 = agg_common_data(pdf_con[["SK_ID_CURR"] + ls_con], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
# Score the aggregated features against the filtered training set and show the table.
eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)
display(eval_agg03)

{'AMT_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_INST_MIN_REGULARITY': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_PAYMENT_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_PAYMENT_TOTAL_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_RECEIVABLE_PRINCIPAL': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_RECIVABLE': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_TOTAL_RECEIVABLE': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_INSTALMENT_MATURE_CUM': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (103558, 75)


Unnamed: 0,name,auc,corr,coverage
3,AMT_DRAWINGS_CURRENT_mean,0.6091,0.063,1.0
63,AMT_BALANCE_mean,0.6077,0.0894,1.0
53,AMT_TOTAL_RECEIVABLE_mean,0.6069,0.0887,1.0
13,AMT_RECIVABLE_mean,0.6069,0.0887,1.0
18,AMT_RECEIVABLE_PRINCIPAL_mean,0.6068,0.0882,1.0
68,CNT_DRAWINGS_ATM_CURRENT_mean,0.6066,0.1092,0.703
4,AMT_DRAWINGS_CURRENT_std,0.6019,0.0711,0.9929
69,CNT_DRAWINGS_ATM_CURRENT_std,0.5971,0.1073,0.6979
48,AMT_INST_MIN_REGULARITY_mean,0.5925,0.0752,1.0
60,AMT_BALANCE_max,0.5843,0.0716,1.0


CPU times: user 7.8 s, sys: 382 ms, total: 8.18 s
Wall time: 5.36 s


In [20]:
# Shape of the evaluation rows whose AUC does not exceed 0.501.
eval_agg03[eval_agg03["auc"] <= 0.501].shape

(2, 4)

# Save features

In [21]:
# Combine the three aggregated feature groups column-wise; DataFrame.join aligns
# on the index and keeps the rows of the left-most frame (pdf_agg01).
# NOTE(review): only pdf_agg01 was pruned by the auc > 0.501 rule; pdf_agg02 and
# pdf_agg03 are joined unfiltered, although the check on eval_agg03 reports
# 2 features with auc <= 0.501 -- confirm this is intentional.
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)
print(pdf_feat.shape)

(103558, 101)


In [22]:
%%time
fname = "credit_card_balance"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")

Store features completed!
CPU times: user 3.61 s, sys: 68 ms, total: 3.67 s
Wall time: 3.69 s
