In [1]:
# Full width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os, math, subprocess
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display

# 
from lib_modeling import *
from lib_feature_engineering import *

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)

# Load data

In [4]:
# load train data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)

# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                      .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                      .drop(columns=["tvt_code"]))
pdf_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100002,1
1,100003,0
2,100004,0
3,100006,0
4,100007,0


In [5]:
# load bureau
data_path = "home-credit-default-risk/bureau.csv"
pdf_data = pd.read_csv(data_path)

# Preprocess data

## onehot and frequency

- CREDIT_ACTIVE: Tình trạng của khoản vay
- CREDIT_CURRENCY

In [6]:
# những khoản vay bị bán (Sold) được xem là (Bad debt)
print("Before")
print(pdf_data["CREDIT_ACTIVE"].value_counts())

pdf_data.loc[pdf_data["CREDIT_ACTIVE"] == "Sold", "CREDIT_ACTIVE"] = "Bad debt"
print("After")
print(pdf_data["CREDIT_ACTIVE"].value_counts())

Before
Closed      1079273
Active      630607 
Sold        6527   
Bad debt    21     
Name: CREDIT_ACTIVE, dtype: int64
After
Closed      1079273
Active      630607 
Bad debt    6548   
Name: CREDIT_ACTIVE, dtype: int64


In [7]:
# những currency hiếm gặp sẽ đưa vào chung một loại currency 2
print("Before")
print(pdf_data["CREDIT_CURRENCY"].value_counts())

pdf_data.loc[pdf_data["CREDIT_CURRENCY"].isin(["currency 3", "currency 4"]), "CREDIT_CURRENCY"] = "currency 2"
print("After")
print(pdf_data["CREDIT_CURRENCY"].value_counts())

Before
currency 1    1715020
currency 2    1224   
currency 3    174    
currency 4    10     
Name: CREDIT_CURRENCY, dtype: int64
After
currency 1    1715020
currency 2    1408   
Name: CREDIT_CURRENCY, dtype: int64


In [8]:
pdf_distr_credit_type = pdf_data["CREDIT_TYPE"].value_counts().to_frame("freq")
pdf_distr_credit_type

Unnamed: 0,freq
Consumer credit,1251615
Credit card,402195
Car loan,27690
Mortgage,18391
Microloan,12413
Loan for business development,1975
Another type of loan,1017
Unknown type of loan,555
Loan for working capital replenishment,469
Cash loan (non-earmarked),56


In [9]:
pdf_check_credit_type = pdf_train_filtered.merge(pdf_data[["SK_ID_CURR", "CREDIT_TYPE"]]).groupby(["CREDIT_TYPE", "TARGET"]).size().to_frame("count")
pdf_check_credit_type = pdf_check_credit_type.reset_index().set_index("CREDIT_TYPE")

# 
pdf_pct = pdf_check_credit_type.query("TARGET == 1").join(pdf_distr_credit_type)
pdf_pct["pct"] = pdf_pct["count"] * 100.0 / pdf_pct["freq"]
pdf_pct.sort_values("pct")

Unnamed: 0_level_0,TARGET,count,freq,pct
CREDIT_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mortgage,1,545,18391,2.9634
Car loan,1,892,27690,3.2214
Loan for business development,1,76,1975,3.8481
Unknown type of loan,1,22,555,3.964
Another type of loan,1,45,1017,4.4248
Consumer credit,1,56552,1251615,4.5183
Credit card,1,21562,402195,5.3611
Loan for working capital replenishment,1,31,469,6.6098
Microloan,1,1357,12413,10.9321
Loan for the purchase of equipment,1,3,19,15.7895


In [10]:
# dựa vào phần trăm tín dụng xấu lớn hơn 5% và có phân bố nhỏ sẽ được xếp vào good/bad loan
ls_good_loan = [
    "Mobile operator loan",
    "Interbank credit",
    "Loan for purchase of shares (margin lending)",
    "Real estate loan"
    "Cash loan (non-earmarked)",
    "Another type of loan"
]

ls_bad_loan = [
    "Loan for the purchase of equipment",
    "Microloan",
    "Loan for working capital replenishment"
]

pdf_data.loc[pdf_data["CREDIT_TYPE"].isin(ls_good_loan), "CREDIT_TYPE"] = "Consumer credit"
pdf_data.loc[pdf_data["CREDIT_TYPE"].isin(ls_bad_loan), "CREDIT_TYPE"] = "Bad loan"

print("After")
print(pdf_data["CREDIT_TYPE"].value_counts())

After
Consumer credit                  1252638
Credit card                      402195 
Car loan                         27690  
Mortgage                         18391  
Bad loan                         12901  
Loan for business development    1975   
Unknown type of loan             555    
Cash loan (non-earmarked)        56     
Real estate loan                 27     
Name: CREDIT_TYPE, dtype: int64


In [11]:
%%time
dict_feat = {
    "CREDIT_ACTIVE": ['Closed', 'Active', 'Bad debt'],
    "CREDIT_CURRENCY": ['currency 1', 'currency 2'],
    "CREDIT_TYPE": ["Consumer credit", "Credit card", "Car loan", "Mortgage", "Loan for business development", "Unknown type of loan", "Bad loan"],
}
pdf_onehot = gen_one_hot_feat(pdf_data, dict_feat)

CPU times: user 30.8 s, sys: 1.2 s, total: 32 s
Wall time: 9.04 s


In [12]:
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"])
eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01.reset_index())
display(eval_agg01)

{'CREDIT_ACTIVE_Active': ['max', 'sum', 'mean'],
 'CREDIT_ACTIVE_Bad_debt': ['max', 'sum', 'mean'],
 'CREDIT_ACTIVE_Closed': ['max', 'sum', 'mean'],
 'CREDIT_CURRENCY_currency_1': ['max', 'sum', 'mean'],
 'CREDIT_CURRENCY_currency_2': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Bad_loan': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Car_loan': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Consumer_credit': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Credit_card': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Loan_for_business_development': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Mortgage': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Unknown_type_of_loan': ['max', 'sum', 'mean']}

After agg: (305811, 36)


Unnamed: 0,name,auc,corr,coverage
5,CREDIT_ACTIVE_Closed_mean,0.5881,-0.0812,1.0
23,CREDIT_ACTIVE_Active_mean,0.5856,0.0788,1.0
22,CREDIT_ACTIVE_Active_sum,0.56,0.0662,1.0
4,CREDIT_ACTIVE_Closed_sum,0.549,-0.0319,1.0
8,CREDIT_TYPE_Credit_card_mean,0.5377,0.0359,1.0
7,CREDIT_TYPE_Credit_card_sum,0.5306,0.0357,1.0
20,CREDIT_TYPE_Consumer_credit_mean,0.5305,-0.0271,1.0
3,CREDIT_ACTIVE_Closed_max,0.5303,-0.0487,1.0
21,CREDIT_ACTIVE_Active_max,0.5261,0.0368,1.0
19,CREDIT_TYPE_Consumer_credit_sum,0.5193,-0.0126,1.0


In [13]:
eval_agg01.query("auc <= 0.501").shape

(11, 4)

In [14]:
sel_feat = eval_agg01.query("auc > 0.501")["name"].tolist()
pdf_agg01 = pdf_agg01[sel_feat]
print(pdf_agg01.shape)

(305811, 25)


## days to years

- DAYS_CREDIT: Ngày vay so với hiện tại
- CREDIT_DAY_OVERDUE
- DAYS_CREDIT_UPDATE
- DAYS_CREDIT_ENDDATE
- DAYS_ENDDATE_FACT

In [15]:
def days_to_years(pdf_input, ls_cols):
    pdf_out = pdf_input[["SK_ID_CURR"] + ls_cols].copy()
    col_out = []
    for cname in ls_cols:
        new_name = "{}_TO_YEARS".format(cname)
        col_out.append(new_name)
        pdf_out[new_name] = pdf_out["DAYS_CREDIT"] / -365
        
    return pdf_out[["SK_ID_CURR"] + col_out]

In [16]:
ls_cols = ["DAYS_CREDIT", "CREDIT_DAY_OVERDUE", "DAYS_CREDIT_UPDATE", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT"]
pdf_years = days_to_years(pdf_data, ls_cols)
pdf_years.head()

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT_TO_YEARS,CREDIT_DAY_OVERDUE_TO_YEARS,DAYS_CREDIT_UPDATE_TO_YEARS,DAYS_CREDIT_ENDDATE_TO_YEARS,DAYS_ENDDATE_FACT_TO_YEARS
0,215354,1.3616,1.3616,1.3616,1.3616,1.3616
1,215354,0.5699,0.5699,0.5699,0.5699,0.5699
2,215354,0.5562,0.5562,0.5562,0.5562,0.5562
3,215354,0.5562,0.5562,0.5562,0.5562,0.5562
4,215354,1.7233,1.7233,1.7233,1.7233,1.7233


In [17]:
pdf_agg02 = agg_common_data(pdf_years, ["min", "max", "mean", "std", "median"])
pdf_agg02.head()

{'CREDIT_DAY_OVERDUE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_ENDDATE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_UPDATE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_ENDDATE_FACT_TO_YEARS': ['min', 'max', 'mean', 'std', 'median']}

After agg: (305811, 25)


Unnamed: 0_level_0,DAYS_CREDIT_UPDATE_TO_YEARS_min,DAYS_CREDIT_UPDATE_TO_YEARS_max,DAYS_CREDIT_UPDATE_TO_YEARS_mean,DAYS_CREDIT_UPDATE_TO_YEARS_std,DAYS_CREDIT_UPDATE_TO_YEARS_median,DAYS_CREDIT_ENDDATE_TO_YEARS_min,DAYS_CREDIT_ENDDATE_TO_YEARS_max,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_std,DAYS_CREDIT_ENDDATE_TO_YEARS_median,DAYS_ENDDATE_FACT_TO_YEARS_min,DAYS_ENDDATE_FACT_TO_YEARS_max,DAYS_ENDDATE_FACT_TO_YEARS_mean,DAYS_ENDDATE_FACT_TO_YEARS_std,DAYS_ENDDATE_FACT_TO_YEARS_median,DAYS_CREDIT_TO_YEARS_min,DAYS_CREDIT_TO_YEARS_max,DAYS_CREDIT_TO_YEARS_mean,DAYS_CREDIT_TO_YEARS_std,DAYS_CREDIT_TO_YEARS_median,CREDIT_DAY_OVERDUE_TO_YEARS_min,CREDIT_DAY_OVERDUE_TO_YEARS_max,CREDIT_DAY_OVERDUE_TO_YEARS_mean,CREDIT_DAY_OVERDUE_TO_YEARS_std,CREDIT_DAY_OVERDUE_TO_YEARS_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
100001,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479
100002,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562
100003,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027
100004,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753
100005,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753


In [18]:
eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02.reset_index())
display(eval_agg02)

Unnamed: 0,name,auc,corr,coverage
12,DAYS_ENDDATE_FACT_TO_YEARS_mean,0.6043,-0.0912,1.0
2,DAYS_CREDIT_UPDATE_TO_YEARS_mean,0.6043,-0.0912,1.0
22,CREDIT_DAY_OVERDUE_TO_YEARS_mean,0.6043,-0.0912,1.0
17,DAYS_CREDIT_TO_YEARS_mean,0.6043,-0.0912,1.0
7,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,0.6043,-0.0912,1.0
9,DAYS_CREDIT_ENDDATE_TO_YEARS_median,0.6042,-0.0877,1.0
19,DAYS_CREDIT_TO_YEARS_median,0.6042,-0.0877,1.0
14,DAYS_ENDDATE_FACT_TO_YEARS_median,0.6042,-0.0877,1.0
24,CREDIT_DAY_OVERDUE_TO_YEARS_median,0.6042,-0.0877,1.0
4,DAYS_CREDIT_UPDATE_TO_YEARS_median,0.6042,-0.0877,1.0


In [20]:
pdf_filtered021 = pdf_years.query("DAYS_CREDIT_TO_YEARS >= 1").set_index("SK_ID_CURR").add_suffix("_get_1year").reset_index()
pdf_agg021 = agg_common_data(pdf_filtered021, ["min", "max", "mean", "std", "median"])
eval_agg021 = feature_evaluate(pdf_train_filtered, pdf_agg021.reset_index())
display(eval_agg021)

{'CREDIT_DAY_OVERDUE_TO_YEARS_get_1year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_TO_YEARS_get_1year': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_UPDATE_TO_YEARS_get_1year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_ENDDATE_FACT_TO_YEARS_get_1year': ['min',
  'max',
  'mean',
  'std',
  'median']}

After agg: (286197, 25)


Unnamed: 0,name,auc,corr,coverage
12,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_mean,0.5724,-0.0604,1.0
2,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_mean,0.5724,-0.0604,1.0
22,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_mean,0.5724,-0.0604,1.0
17,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_mean,0.5724,-0.0604,1.0
7,DAYS_CREDIT_TO_YEARS_get_1year_mean,0.5724,-0.0604,1.0
9,DAYS_CREDIT_TO_YEARS_get_1year_median,0.5718,-0.0588,1.0
19,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_median,0.5718,-0.0588,1.0
14,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_median,0.5718,-0.0588,1.0
24,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_median,0.5718,-0.0588,1.0
4,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_median,0.5718,-0.0588,1.0


In [21]:
pdf_filtered022 = pdf_years.query("DAYS_CREDIT_TO_YEARS >= 2").set_index("SK_ID_CURR").add_suffix("_get_2year").reset_index()
pdf_agg022 = agg_common_data(pdf_filtered022, ["min", "max", "mean", "std", "median"])
eval_agg022 = feature_evaluate(pdf_train_filtered, pdf_agg022.reset_index())
display(eval_agg022)

{'CREDIT_DAY_OVERDUE_TO_YEARS_get_2year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_TO_YEARS_get_2year': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_UPDATE_TO_YEARS_get_2year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_ENDDATE_FACT_TO_YEARS_get_2year': ['min',
  'max',
  'mean',
  'std',
  'median']}

After agg: (257415, 25)


Unnamed: 0,name,auc,corr,coverage
1,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_max,0.5496,-0.0447,1.0
21,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_max,0.5496,-0.0447,1.0
6,DAYS_CREDIT_TO_YEARS_get_2year_max,0.5496,-0.0447,1.0
16,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_max,0.5496,-0.0447,1.0
11,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_max,0.5496,-0.0447,1.0
12,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_mean,0.5468,-0.0373,1.0
2,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_mean,0.5468,-0.0373,1.0
22,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_mean,0.5468,-0.0373,1.0
7,DAYS_CREDIT_TO_YEARS_get_2year_mean,0.5468,-0.0373,1.0
17,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_mean,0.5468,-0.0373,1.0


In [22]:
pdf_filtered023 = pdf_years.query("DAYS_CREDIT_TO_YEARS >= 3").set_index("SK_ID_CURR").add_suffix("_get_3year").reset_index()
pdf_agg023 = agg_common_data(pdf_filtered023, ["min", "max", "mean", "std", "median"])
eval_agg023 = feature_evaluate(pdf_train_filtered, pdf_agg023.reset_index())
display(eval_agg023)

{'CREDIT_DAY_OVERDUE_TO_YEARS_get_3year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_CREDIT_TO_YEARS_get_3year': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_UPDATE_TO_YEARS_get_3year': ['min',
  'max',
  'mean',
  'std',
  'median'],
 'DAYS_ENDDATE_FACT_TO_YEARS_get_3year': ['min',
  'max',
  'mean',
  'std',
  'median']}

After agg: (224608, 25)


Unnamed: 0,name,auc,corr,coverage
1,DAYS_CREDIT_TO_YEARS_get_3year_max,0.5379,-0.0332,1.0
21,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_max,0.5379,-0.0332,1.0
6,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_max,0.5379,-0.0332,1.0
16,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_max,0.5379,-0.0332,1.0
11,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_max,0.5379,-0.0332,1.0
12,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_mean,0.5315,-0.0251,1.0
2,DAYS_CREDIT_TO_YEARS_get_3year_mean,0.5315,-0.0251,1.0
22,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_mean,0.5315,-0.0251,1.0
7,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_mean,0.5315,-0.0251,1.0
17,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_mean,0.5315,-0.0251,1.0


## keep columns

In [23]:
ls_cols = [cname for cname in pdf_data.columns if "AMT" in cname] + ["CNT_CREDIT_PROLONG"]
pdf_amt = pdf_data[["SK_ID_CURR"] + ls_cols]
pdf_amt.head()

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,AMT_ANNUITY,CNT_CREDIT_PROLONG
0,215354,,91323.0,0.0,,0.0,,0
1,215354,,225000.0,171342.0,,0.0,,0
2,215354,,464323.5,,,0.0,,0
3,215354,,90000.0,,,0.0,,0
4,215354,77674.5,2700000.0,,,0.0,,0


In [24]:
pdf_agg03 = agg_common_data(pdf_amt, ["min", "max", "mean", "std", "median"])

{'AMT_ANNUITY': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_MAX_OVERDUE': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_DEBT': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_LIMIT': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_OVERDUE': ['min', 'max', 'mean', 'std', 'median'],
 'CNT_CREDIT_PROLONG': ['min', 'max', 'mean', 'std', 'median']}

After agg: (305811, 35)


In [25]:
eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03.reset_index())
display(eval_agg03)

Unnamed: 0,name,auc,corr,coverage
24,AMT_CREDIT_SUM_DEBT_median,0.5582,0.0022,0.9721
22,AMT_CREDIT_SUM_DEBT_mean,0.5578,-0.0004,0.9721
13,AMT_CREDIT_MAX_OVERDUE_std,0.5506,0.0004,0.4506
23,AMT_CREDIT_SUM_DEBT_std,0.5497,-0.0039,0.8123
12,AMT_CREDIT_MAX_OVERDUE_mean,0.5477,0.008,0.6988
11,AMT_CREDIT_MAX_OVERDUE_max,0.5456,0.0048,0.6988
21,AMT_CREDIT_SUM_DEBT_max,0.5452,-0.0029,0.9721
14,AMT_CREDIT_MAX_OVERDUE_median,0.5331,0.0077,0.6988
18,AMT_CREDIT_SUM_LIMIT_std,0.5309,-0.013,0.7283
34,AMT_ANNUITY_median,0.5283,-0.0011,0.3042


In [26]:
eval_agg03.query("auc <= 0.501").shape

(4, 4)

In [27]:
sel_feat = eval_agg03.query("auc > 0.501")["name"].tolist()
pdf_agg03 = pdf_agg03[sel_feat]
print(pdf_agg03.shape)

(305811, 31)


# save features

In [28]:
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03).join(pdf_agg021).join(pdf_agg022).join(pdf_agg023)
print(pdf_feat.shape)
pdf_feat.head()

(305811, 156)


Unnamed: 0_level_0,CREDIT_ACTIVE_Closed_mean,CREDIT_ACTIVE_Active_mean,CREDIT_ACTIVE_Active_sum,CREDIT_ACTIVE_Closed_sum,CREDIT_TYPE_Credit_card_mean,CREDIT_TYPE_Credit_card_sum,CREDIT_TYPE_Consumer_credit_mean,CREDIT_ACTIVE_Closed_max,CREDIT_ACTIVE_Active_max,CREDIT_TYPE_Consumer_credit_sum,CREDIT_TYPE_Credit_card_max,CREDIT_TYPE_Bad_loan_sum,CREDIT_TYPE_Bad_loan_mean,CREDIT_TYPE_Bad_loan_max,CREDIT_TYPE_Car_loan_mean,CREDIT_TYPE_Car_loan_sum,CREDIT_TYPE_Car_loan_max,CREDIT_TYPE_Mortgage_mean,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Mortgage_max,CREDIT_CURRENCY_currency_1_sum,CREDIT_TYPE_Consumer_credit_max,CREDIT_ACTIVE_Bad_debt_mean,CREDIT_ACTIVE_Bad_debt_max,CREDIT_ACTIVE_Bad_debt_sum,DAYS_CREDIT_UPDATE_TO_YEARS_min,DAYS_CREDIT_UPDATE_TO_YEARS_max,DAYS_CREDIT_UPDATE_TO_YEARS_mean,DAYS_CREDIT_UPDATE_TO_YEARS_std,DAYS_CREDIT_UPDATE_TO_YEARS_median,DAYS_CREDIT_ENDDATE_TO_YEARS_min,DAYS_CREDIT_ENDDATE_TO_YEARS_max,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_std,DAYS_CREDIT_ENDDATE_TO_YEARS_median,DAYS_ENDDATE_FACT_TO_YEARS_min,DAYS_ENDDATE_FACT_TO_YEARS_max,DAYS_ENDDATE_FACT_TO_YEARS_mean,DAYS_ENDDATE_FACT_TO_YEARS_std,DAYS_ENDDATE_FACT_TO_YEARS_median,DAYS_CREDIT_TO_YEARS_min,DAYS_CREDIT_TO_YEARS_max,DAYS_CREDIT_TO_YEARS_mean,DAYS_CREDIT_TO_YEARS_std,DAYS_CREDIT_TO_YEARS_median,CREDIT_DAY_OVERDUE_TO_YEARS_min,CREDIT_DAY_OVERDUE_TO_YEARS_max,CREDIT_DAY_OVERDUE_TO_YEARS_mean,CREDIT_DAY_OVERDUE_TO_YEARS_std,CREDIT_DAY_OVERDUE_TO_YEARS_median,AMT_CREDIT_SUM_DEBT_median,AMT_CREDIT_SUM_DEBT_mean,AMT_CREDIT_MAX_OVERDUE_std,AMT_CREDIT_SUM_DEBT_std,AMT_CREDIT_MAX_OVERDUE_mean,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_SUM_DEBT_max,AMT_CREDIT_MAX_OVERDUE_median,AMT_CREDIT_SUM_LIMIT_std,AMT_ANNUITY_median,AMT_CREDIT_SUM_DEBT_min,AMT_CREDIT_SUM_LIMIT_max,AMT_CREDIT_SUM_LIMIT_mean,AMT_ANNUITY_mean,AMT_CREDIT_SUM_median,AMT_CREDIT_SUM_min,AMT_ANNUITY_std,AMT_ANNUITY_min,AMT_CREDIT_SUM_mean,AMT_CREDIT_MAX_OVERDUE_min,AMT_ANNUITY_max,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_OVERDUE_std,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_max,AMT_CREDIT_SUM_LIMIT_median,AMT_CREDIT_SUM_std,CNT_CREDIT_PROLONG_std,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_mean,AMT_CREDIT_SUM_OVERDUE_median,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_min,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_max,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_mean,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_std,DAYS_ENDDATE_FACT_TO_YEARS_get_1year_median,DAYS_CREDIT_TO_YEARS_get_1year_min,DAYS_CREDIT_TO_YEARS_get_1year_max,DAYS_CREDIT_TO_YEARS_get_1year_mean,DAYS_CREDIT_TO_YEARS_get_1year_std,DAYS_CREDIT_TO_YEARS_get_1year_median,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_min,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_max,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_mean,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_std,CREDIT_DAY_OVERDUE_TO_YEARS_get_1year_median,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_min,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_max,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_std,DAYS_CREDIT_ENDDATE_TO_YEARS_get_1year_median,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_min,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_max,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_mean,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_std,DAYS_CREDIT_UPDATE_TO_YEARS_get_1year_median,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_min,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_max,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_mean,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_std,CREDIT_DAY_OVERDUE_TO_YEARS_get_2year_median,DAYS_CREDIT_TO_YEARS_get_2year_min,DAYS_CREDIT_TO_YEARS_get_2year_max,DAYS_CREDIT_TO_YEARS_get_2year_mean,DAYS_CREDIT_TO_YEARS_get_2year_std,DAYS_CREDIT_TO_YEARS_get_2year_median,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_min,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_max,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_mean,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_std,DAYS_ENDDATE_FACT_TO_YEARS_get_2year_median,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_min,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_max,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_mean,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_std,DAYS_CREDIT_UPDATE_TO_YEARS_get_2year_median,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_min,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_max,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_std,DAYS_CREDIT_ENDDATE_TO_YEARS_get_2year_median,DAYS_CREDIT_TO_YEARS_get_3year_min,DAYS_CREDIT_TO_YEARS_get_3year_max,DAYS_CREDIT_TO_YEARS_get_3year_mean,DAYS_CREDIT_TO_YEARS_get_3year_std,DAYS_CREDIT_TO_YEARS_get_3year_median,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_min,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_max,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_mean,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_std,CREDIT_DAY_OVERDUE_TO_YEARS_get_3year_median,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_min,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_max,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_mean,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_std,DAYS_ENDDATE_FACT_TO_YEARS_get_3year_median,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_min,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_max,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_mean,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_std,DAYS_CREDIT_UPDATE_TO_YEARS_get_3year_median,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_min,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_max,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_std,DAYS_CREDIT_ENDDATE_TO_YEARS_get_3year_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1
100001,0.5714,0.4286,3,4,0.0,0,1.0,1,1,7,0,0,0.0,0,0.0,0,0,0.0,0,0,7,1,0.0,0,0,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.1342,4.3068,2.0137,1.3423,2.3479,0.0,85240.9286,,137485.6311,,,373239.0,,0.0,0.0,0.0,0.0,0.0,3545.3571,168345.0,85500.0,4800.6075,0.0,207623.5714,,10822.5,378000.0,0.0,0.0,0.0,0.0,122544.5445,0.0,0,0.0,0.0,1.5315,4.3068,2.617,1.0205,2.4082,1.5315,4.3068,2.617,1.0205,2.4082,1.5315,4.3068,2.617,1.0205,2.4082,1.5315,4.3068,2.617,1.0205,2.4082,1.5315,4.3068,2.617,1.0205,2.4082,2.3479,4.3068,2.8884,0.9475,2.4493,2.3479,4.3068,2.8884,0.9475,2.4493,2.3479,4.3068,2.8884,0.9475,2.4493,2.3479,4.3068,2.8884,0.9475,2.4493,2.3479,4.3068,2.8884,0.9475,2.4493,4.3068,4.3068,4.3068,,4.3068,4.3068,4.3068,4.3068,,4.3068,4.3068,4.3068,4.3068,,4.3068,4.3068,4.3068,4.3068,,4.3068,4.3068,4.3068,4.3068,,4.3068
100002,0.75,0.25,2,6,0.5,4,0.5,1,1,4,1,0,0.0,0,0.0,0,0,0.0,0,0,8,1,0.0,0,0,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.2822,3.937,2.3945,1.1821,2.8562,0.0,49156.2,2363.2469,109916.6047,1681.029,5043.645,245781.0,40.5,15994.2825,0.0,0.0,31988.565,7997.1412,0.0,54130.5,0.0,0.0,0.0,108131.9456,0.0,0.0,450000.0,0.0,0.0,0.0,0.0,146075.5574,0.0,0,0.0,0.0,1.3041,3.937,2.6963,0.8833,2.8575,1.3041,3.937,2.6963,0.8833,2.8575,1.3041,3.937,2.6963,0.8833,2.8575,1.3041,3.937,2.6963,0.8833,2.8575,1.3041,3.937,2.6963,0.8833,2.8575,2.8548,3.937,3.1605,0.4478,3.0712,2.8548,3.937,3.1605,0.4478,3.0712,2.8548,3.937,3.1605,0.4478,3.0712,2.8548,3.937,3.1605,0.4478,3.0712,2.8548,3.937,3.1605,0.4478,3.0712,3.0712,3.937,3.3635,0.4967,3.0822,3.0712,3.937,3.3635,0.4967,3.0822,3.0712,3.937,3.3635,0.4967,3.0822,3.0712,3.937,3.3635,0.4967,3.0822,3.0712,3.937,3.3635,0.4967,3.0822
100003,0.75,0.25,1,3,0.5,2,0.5,1,1,2,1,0,0.0,0,0.0,0,0,0.0,0,0,4,1,0.0,0,0,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,405000.0,,0.0,810000.0,202500.0,,92576.25,22248.0,,,254350.125,0.0,,810000.0,0.0,0.0,0.0,0.0,372269.4655,0.0,0,0.0,0.0,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,1.6603,7.0849,3.8377,2.4927,3.3027,2.1233,7.0849,4.5635,2.4818,4.4822,2.1233,7.0849,4.5635,2.4818,4.4822,2.1233,7.0849,4.5635,2.4818,4.4822,2.1233,7.0849,4.5635,2.4818,4.4822,2.1233,7.0849,4.5635,2.4818,4.4822,4.4822,7.0849,5.7836,1.8404,5.7836,4.4822,7.0849,5.7836,1.8404,5.7836,4.4822,7.0849,5.7836,1.8404,5.7836,4.4822,7.0849,5.7836,1.8404,5.7836,4.4822,7.0849,5.7836,1.8404,5.7836
100004,1.0,0.0,0,2,0.0,0,1.0,1,0,2,0,0,0.0,0,0.0,0,0,0.0,0,0,2,1,0.0,0,0,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,94518.9,94500.0,,,94518.9,0.0,,94537.8,0.0,0.0,0.0,0.0,26.7286,0.0,0,0.0,0.0,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,1.1178,3.6329,2.3753,1.7784,2.3753,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329,3.6329,3.6329,3.6329,,3.6329
100005,0.3333,0.6667,2,1,0.3333,1,0.6667,1,1,2,1,0,0.0,0,0.0,0,0,0.0,0,0,3,1,0.0,0,0,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,0.1699,1.0219,0.5224,0.4446,0.3753,25321.5,189469.5,,306503.339,0.0,0.0,543087.0,0.0,0.0,0.0,0.0,0.0,0.0,1420.5,58500.0,29826.0,2460.3782,0.0,219042.0,0.0,4261.5,568800.0,0.0,0.0,0.0,0.0,303238.4268,0.0,0,0.0,0.0,1.0219,1.0219,1.0219,,1.0219,1.0219,1.0219,1.0219,,1.0219,1.0219,1.0219,1.0219,,1.0219,1.0219,1.0219,1.0219,,1.0219,1.0219,1.0219,1.0219,,1.0219,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
%%time
fname = "bureau"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")


Store features completed!
CPU times: user 20.9 s, sys: 376 ms, total: 21.2 s
Wall time: 21.3 s
