In [1]:
# Full width: stretch the notebook container across the whole browser window.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Enable the autoreload extension in mode 2, so that edits to imported .py
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import math
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

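# helper modules (star imports; presumably the source of gen_one_hot_feat, agg_common_data and feature_evaluate used below)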
from lib_modeling import *
from lib_feature_engineering import *

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# None means "never truncate cell contents"; the former -1 spelling has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Load data

- Load train and keep only the rows tagged `tvt_code == 'train'` (= `train_filtered`) for feature evaluation
- Load train/test for applying mean encoding

In [4]:
# Read the raw application tables (train and test).
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)

data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)

# Keep only the rows of the train/valid/test split table that are tagged
# 'train' and whose SK_ID_CURR also appears in application_train; the split
# tag itself is dropped afterwards.
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (
    pdf_tvt_extend.loc[pdf_tvt_extend["tvt_code"] == "train"]
    .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
    .drop("tvt_code", axis=1)
)
pdf_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100002,1
1,100003,0
2,100004,0
3,100006,0
4,100007,0


In [5]:
# load previous application
# NOTE(review): presumably one row per SK_ID_PREV and several rows per
# SK_ID_CURR -- the aggregations further down collapse these rows to one row
# per SK_ID_CURR; confirm against the data description.
data_path = "home-credit-default-risk/previous_application.csv"
pdf_prev_app = pd.read_csv(data_path)
print(pdf_prev_app.shape)
pdf_prev_app.head()

(1670214, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.1828,0.8673,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [6]:
# load meta data
# The report is queried below through its `sub_type` and `name` columns to
# split the previous_application columns into object / int64 / float64 groups.
meta_path = "../02_pandas/reports/report_previous_application.csv"
pdf_meta = pd.read_csv(meta_path)

# Categorical features

In [7]:
# Categorical attributes = columns the metadata report lists with sub_type 'object'.
ls_cate = pdf_meta.loc[pdf_meta["sub_type"] == "object", "name"].tolist()
ls_cate

['NAME_CONTRACT_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'FLAG_LAST_APPL_PER_CONTRACT',
 'NAME_CASH_LOAN_PURPOSE',
 'NAME_CONTRACT_STATUS',
 'NAME_PAYMENT_TYPE',
 'CODE_REJECT_REASON',
 'NAME_TYPE_SUITE',
 'NAME_CLIENT_TYPE',
 'NAME_GOODS_CATEGORY',
 'NAME_PORTFOLIO',
 'NAME_PRODUCT_TYPE',
 'CHANNEL_TYPE',
 'NAME_SELLER_INDUSTRY',
 'NAME_YIELD_GROUP',
 'PRODUCT_COMBINATION']

In [8]:
# Map every categorical column to the list of values observed in it, ordered
# from most to least frequent (value_counts() leaves NaN out by default).
dict_onehot = {
    col: pdf_prev_app[col].value_counts().index.tolist()
    for col in ls_cate
}


### one hot

In [9]:
%%time
# One-hot encode the categorical columns listed in dict_onehot using the
# gen_one_hot_feat helper (star-imported; its implementation is not shown here).
# NOTE(review): the printed shape still has one row per previous application,
# so main_key presumably only names the key column to carry along -- confirm
# against the helper.
pdf_onehot = gen_one_hot_feat(pdf_prev_app, dict_onehot, main_key="SK_ID_CURR")
print(pdf_onehot.shape)

(1670214, 144)
CPU times: user 5min 36s, sys: 12.8 s, total: 5min 49s
Wall time: 1min 35s


In [10]:
%%time
# Aggregate the one-hot indicators per SK_ID_CURR with max / sum / mean, then
# evaluate every aggregated column against the filtered training rows.
# NOTE(review): agg_common_data and feature_evaluate are helpers defined
# elsewhere; judging by the displayed table, feature_evaluate yields one row
# per feature with the columns name / auc / corr / coverage -- confirm.
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"], main_key="SK_ID_CURR")
eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)
display(eval_agg01)

{'CHANNEL_TYPE_AP+_(Cash_loan)': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Car_dealer': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Channel_of_corporate_sales': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Contact_center': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Country_wide': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Credit_and_cash_offices': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Regional___Local': ['max', 'sum', 'mean'],
 'CHANNEL_TYPE_Stone': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_CLIENT': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_HC': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_LIMIT': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_SCO': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_SCOFR': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_SYSTEM': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_VERIF': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_XAP': ['max', 'sum', 'mean'],
 'CODE_REJECT_REASON_XNA': ['max', 'sum', 'mean'],
 'FLAG_LAST_APPL_PER_CONTRACT_N': ['max', 'sum', 'mean'],
 'FLAG_LAST_APP

After agg: (338857, 429)
(338857, 429)


Unnamed: 0,name,auc,corr,coverage
347,NAME_CONTRACT_STATUS_Refused_mean,0.561,0.07868,1.0
35,CODE_REJECT_REASON_XAP_mean,0.5608,-0.074839,1.0
59,NAME_CONTRACT_STATUS_Approved_mean,0.5594,-0.063943,1.0
346,NAME_CONTRACT_STATUS_Refused_sum,0.5568,0.065433,1.0
163,NAME_PRODUCT_TYPE_walk_in_sum,0.5502,0.064161,1.0
164,NAME_PRODUCT_TYPE_walk_in_mean,0.5499,0.058756,1.0
345,NAME_CONTRACT_STATUS_Refused_max,0.5493,0.057058,1.0
162,NAME_PRODUCT_TYPE_walk_in_max,0.5468,0.059478,1.0
20,NAME_YIELD_GROUP_low_normal_mean,0.545,-0.040623,1.0
55,NAME_CLIENT_TYPE_New_sum,0.5397,0.047273,1.0


CPU times: user 4min 1s, sys: 6.49 s, total: 4min 8s
Wall time: 1min 23s


In [11]:
# How many aggregated one-hot features have an AUC of at most 0.501?
eval_agg01.loc[eval_agg01["auc"] <= 0.501].shape

(150, 4)

In [12]:
# Keep only the aggregated one-hot features whose AUC exceeds 0.501.
sel_feat = eval_agg01.loc[eval_agg01["auc"] > 0.501, "name"].tolist()
pdf_agg01 = pdf_agg01.loc[:, sel_feat]
print(pdf_agg01.shape)

(338857, 279)


# Numerical features

In [13]:
# Numerical attributes = columns reported as int64, minus the two id keys.
ls_num = [
    col
    for col in pdf_meta.loc[pdf_meta["sub_type"] == "int64", "name"].tolist()
    if col not in ("SK_ID_PREV", "SK_ID_CURR")
]
ls_num

['HOUR_APPR_PROCESS_START',
 'NFLAG_LAST_APPL_IN_DAY',
 'DAYS_DECISION',
 'SELLERPLACE_AREA']

In [14]:
# Independent working copy: the two id keys followed by the integer columns.
pdf_num = pdf_prev_app.loc[:, ["SK_ID_PREV", "SK_ID_CURR"] + ls_num].copy()
pdf_num.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,DAYS_DECISION,SELLERPLACE_AREA
0,2030495,271877,15,1,-73,35
1,2802425,108129,11,1,-164,-1
2,2523466,122040,11,1,-301,-1
3,2819243,176158,7,1,-512,-1
4,1784265,202054,9,1,-781,-1


In [15]:
# Convert DAYS_DECISION to years with the sign flipped (divide by -365).
pdf_num["YEARS_DECISION"] = pdf_num["DAYS_DECISION"].div(-365)

In [16]:
%%time
# Aggregate the integer columns per SK_ID_CURR (max / min / sum / mean / std)
# and evaluate the result against the filtered training rows.
# YEARS_DECISION is used here in place of the raw DAYS_DECISION column, and
# SK_ID_PREV is left out of the selection.
ls_agg = ["SK_ID_CURR", "HOUR_APPR_PROCESS_START", "YEARS_DECISION", "NFLAG_LAST_APPL_IN_DAY", "SELLERPLACE_AREA"]
pdf_agg02 = agg_common_data(pdf_num[ls_agg], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)
display(eval_agg02)

{'HOUR_APPR_PROCESS_START': ['max', 'min', 'sum', 'mean', 'std'],
 'NFLAG_LAST_APPL_IN_DAY': ['max', 'min', 'sum', 'mean', 'std'],
 'SELLERPLACE_AREA': ['max', 'min', 'sum', 'mean', 'std'],
 'YEARS_DECISION': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (338857, 20)


Unnamed: 0,name,auc,corr,coverage
14,YEARS_DECISION_std,0.5675,-0.0605,0.8192
13,YEARS_DECISION_mean,0.5582,-0.0472,1.0
10,YEARS_DECISION_max,0.5574,-0.0545,1.0
18,SELLERPLACE_AREA_mean,0.5382,-0.0027,1.0
8,HOUR_APPR_PROCESS_START_mean,0.5377,-0.0363,1.0
19,SELLERPLACE_AREA_std,0.5366,-0.0044,0.8192
15,SELLERPLACE_AREA_max,0.5351,-0.0032,1.0
6,HOUR_APPR_PROCESS_START_min,0.5333,-0.0317,1.0
5,HOUR_APPR_PROCESS_START_max,0.5333,-0.0319,1.0
17,SELLERPLACE_AREA_sum,0.5325,-0.004,1.0


CPU times: user 11 s, sys: 216 ms, total: 11.2 s
Wall time: 1.85 s


In [18]:
# How many aggregated numerical features have an AUC of at most 0.501?
eval_agg02.loc[eval_agg02["auc"] <= 0.501].shape

(2, 4)

# Continuous features

In [19]:
# Continuous attributes = columns the metadata report lists with sub_type 'float64'.
ls_con = pdf_meta.loc[pdf_meta["sub_type"] == "float64", "name"].tolist()
ls_con

['AMT_ANNUITY',
 'AMT_APPLICATION',
 'AMT_CREDIT',
 'AMT_DOWN_PAYMENT',
 'AMT_GOODS_PRICE',
 'RATE_DOWN_PAYMENT',
 'RATE_INTEREST_PRIMARY',
 'RATE_INTEREST_PRIVILEGED',
 'CNT_PAYMENT',
 'DAYS_FIRST_DRAWING',
 'DAYS_FIRST_DUE',
 'DAYS_LAST_DUE_1ST_VERSION',
 'DAYS_LAST_DUE',
 'DAYS_TERMINATION',
 'NFLAG_INSURED_ON_APPROVAL']

In [20]:
# Independent working copy: the two id keys followed by the float columns.
# NOTE(review): several DAYS_* columns show the value 365243.0 in the preview;
# it looks like a placeholder rather than a real day offset and would dominate
# max / sum / mean / std aggregates -- confirm whether it should become NaN.
pdf_con = pdf_prev_app.loc[:, ["SK_ID_PREV", "SK_ID_CURR"] + ls_con].copy()
pdf_con.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,1730.43,17145.0,17145.0,0.0,17145.0,0.0,0.1828,0.8673,12.0,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,25188.615,607500.0,679671.0,,607500.0,,,,36.0,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,15060.735,112500.0,136444.5,,112500.0,,,,12.0,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,47041.335,450000.0,470790.0,,450000.0,,,,12.0,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,31924.395,337500.0,404055.0,,337500.0,,,,24.0,,,,,,


In [22]:
%%time
# Aggregate the float columns per SK_ID_CURR (max / min / sum / mean / std)
# and evaluate the result against the filtered training rows.
# NOTE(review): the two top-ranked features in the displayed table
# (RATE_INTEREST_*_std) have a coverage of 0.0006, so their AUC presumably
# rests on very few rows -- treat that ranking with care.
pdf_agg03 = agg_common_data(pdf_con[["SK_ID_CURR"] + ls_con], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")
eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)
display(eval_agg03)

{'AMT_ANNUITY': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_APPLICATION': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_CREDIT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_GOODS_PRICE': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],
 'DAYS_FIRST_DRAWING': ['max', 'min', 'sum', 'mean', 'std'],
 'DAYS_FIRST_DUE': ['max', 'min', 'sum', 'mean', 'std'],
 'DAYS_LAST_DUE': ['max', 'min', 'sum', 'mean', 'std'],
 'DAYS_LAST_DUE_1ST_VERSION': ['max', 'min', 'sum', 'mean', 'std'],
 'DAYS_TERMINATION': ['max', 'min', 'sum', 'mean', 'std'],
 'NFLAG_INSURED_ON_APPROVAL': ['max', 'min', 'sum', 'mean', 'std'],
 'RATE_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],
 'RATE_INTEREST_PRIMARY': ['max', 'min', 'sum', 'mean', 'std'],
 'RATE_INTEREST_PRIVILEGED': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (338857, 75)


Unnamed: 0,name,auc,corr,coverage
59,RATE_INTEREST_PRIVILEGED_std,0.618,0.064864,0.0006
14,RATE_INTEREST_PRIMARY_std,0.6037,-0.0029722,0.0006
26,DAYS_FIRST_DUE_min,0.5577,0.0075782,0.9956
1,DAYS_LAST_DUE_1ST_VERSION_min,0.557,0.013404,0.9956
21,DAYS_LAST_DUE_min,0.5567,0.026435,0.9956
17,DAYS_FIRST_DRAWING_sum,0.5558,-0.042224,1.0
46,DAYS_TERMINATION_min,0.5555,0.025585,0.9956
7,AMT_DOWN_PAYMENT_sum,0.5542,-0.027547,1.0
29,DAYS_FIRST_DUE_std,0.554,-0.014257,0.7213
19,DAYS_FIRST_DRAWING_std,0.5538,0.069697,0.7213


CPU times: user 35.2 s, sys: 989 ms, total: 36.2 s
Wall time: 7.76 s


In [23]:
# How many aggregated continuous features have an AUC of at most 0.501?
eval_agg03.loc[eval_agg03["auc"] <= 0.501].shape

(2, 4)

# Save features

In [24]:
# Combine the three aggregated tables column-wise; DataFrame.join aligns on
# the index and keeps the rows of pdf_agg01 (left join by default).
# NOTE(review): only pdf_agg01 was reduced to features with auc > 0.501; the
# low-AUC columns counted for eval_agg02 / eval_agg03 are still included here
# -- confirm that this is intended.
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)
print(pdf_feat.shape)

(338857, 374)


In [25]:
%%time
fname = "prev_app"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")

Store features completed!
CPU times: user 40 s, sys: 799 ms, total: 40.8 s
Wall time: 40.9 s
