In [1]:
# Full width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

# 
from lib_modeling import *
from lib_feature_engineering import *

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)

# Load data

In [4]:
# load train/test data
data_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(data_path)

data_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(data_path)

# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                      .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                      .drop(columns=["tvt_code"]))
pdf_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100002,1
1,100003,0
2,100004,0
3,100006,0
4,100007,0


In [5]:
# load bureau + balance
data_path = "home-credit-default-risk/bureau.csv"
pdf_bureau = pd.read_csv(data_path)

data_path = "home-credit-default-risk/bureau_balance.csv"
pdf_bureau_balance = pd.read_csv(data_path)

# Mean encoding

## application

In [6]:
dict_onehot = {
    "NAME_TYPE_SUITE": ["Unaccompanied", "Family", "Spouse, partner", "Children", "Other_A", "Other_B", "Group of people"],
    "NAME_INCOME_TYPE": ["Working", "State servant", "Commercial associate", "Pensioner", "Unemployed", "Student", "Businessman", "Maternity leave"],
    "NAME_EDUCATION_TYPE": ["Secondary / secondary special", "Higher education", "Incomplete higher", "Lower secondary", "Academic degree"],
    "NAME_FAMILY_STATUS": ["Single / not married", "Married", "Civil marriage", "Widow", "Separated", "Unknown"],
    "NAME_HOUSING_TYPE": ["House / apartment", "Rented apartment", "With parents", "Municipal apartment", "Office apartment", "Co-op apartment"],
    "OCCUPATION_TYPE": ["Laborers", "Core staff", "Accountants", "Managers", "Drivers", "Sales staff", "Cleaning staff", "Cooking staff", "Private service staff", "Medicine staff", "Security staff", "High skill tech staff", "Waiters/barmen staff", "Low-skill Laborers", "Realty agents", "Secretaries", "IT staff", "HR staff"],
    "ORGANIZATION_TYPE": ["Business Entity Type 3", "School", "Government", "Religion", "Other", "XNA", "Electricity", "Medicine", "Business Entity Type 2", "Self-employed", "Transport: type 2", "Construction", "Housing", "Kindergarten", "Trade: type 7", "Industry: type 11", "Military", "Services", "Security Ministries", "Transport: type 4", "Industry: type 1", "Emergency", "Security", "Trade: type 2", "University", "Transport: type 3", "Police", "Business Entity Type 1", "Postal", "Industry: type 4", "Agriculture", "Restaurant", "Culture", "Hotel", "Industry: type 7", "Trade: type 3", "Industry: type 3", "Bank", "Industry: type 9", "Insurance", "Trade: type 6", "Industry: type 2", "Transport: type 1", "Industry: type 12", "Mobile", "Trade: type 1", "Industry: type 5", "Industry: type 10", "Legal Services", "Advertising", "Trade: type 5", "Cleaning", "Industry: type 13", "Trade: type 4", "Telecom", "Industry: type 8", "Realtor", "Industry: type 6"],
    "FONDKAPREMONT_MODE": ["reg oper account", "org spec account", "reg oper spec account", "not specified"],
    "HOUSETYPE_MODE": ["block of flats", "terraced house", "specific housing"],
    "WALLSMATERIAL_MODE": ["Stone, brick", "Block", "Panel", "Mixed", "Wooden", "Others", "Monolithic"],    
}

In [7]:
%%time
pdf_onehot = gen_one_hot_feat(pdf_train, dict_onehot, main_key="SK_ID_CURR")
display(pdf_onehot.head())

Unnamed: 0,SK_ID_CURR,NAME_INCOME_TYPE_Working,NAME_INCOME_TYPE_State_servant,NAME_INCOME_TYPE_Commercial_associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Maternity_leave,FONDKAPREMONT_MODE_reg_oper_account,FONDKAPREMONT_MODE_org_spec_account,FONDKAPREMONT_MODE_reg_oper_spec_account,FONDKAPREMONT_MODE_not_specified,NAME_HOUSING_TYPE_House___apartment,NAME_HOUSING_TYPE_Rented_apartment,NAME_HOUSING_TYPE_With_parents,NAME_HOUSING_TYPE_Municipal_apartment,NAME_HOUSING_TYPE_Office_apartment,NAME_HOUSING_TYPE_Co_op_apartment,NAME_EDUCATION_TYPE_Secondary___secondary_special,NAME_EDUCATION_TYPE_Higher_education,NAME_EDUCATION_TYPE_Incomplete_higher,NAME_EDUCATION_TYPE_Lower_secondary,NAME_EDUCATION_TYPE_Academic_degree,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Core_staff,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_Sales_staff,OCCUPATION_TYPE_Cleaning_staff,OCCUPATION_TYPE_Cooking_staff,OCCUPATION_TYPE_Private_service_staff,OCCUPATION_TYPE_Medicine_staff,OCCUPATION_TYPE_Security_staff,OCCUPATION_TYPE_High_skill_tech_staff,OCCUPATION_TYPE_Waiters_barmen_staff,OCCUPATION_TYPE_Low_skill_Laborers,OCCUPATION_TYPE_Realty_agents,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_IT_staff,OCCUPATION_TYPE_HR_staff,ORGANIZATION_TYPE_Business_Entity_Type_3,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_XNA,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Business_Entity_Type_2,ORGANIZATION_TYPE_Self_employed,ORGANIZATION_TYPE_Transport__type_2,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Trade__type_7,ORGANIZATION_TYPE_Industry__type_11,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Security_Ministries,ORGANIZATION_TYPE_Transport__type_4,ORGANIZATION_TYPE_Industry__type_1,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Trade__type_2,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_Transport__type_3,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Business_Entity_Type_1,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Industry__type_4,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Industry__type_7,ORGANIZATION_TYPE_Trade__type_3,ORGANIZATION_TYPE_Industry__type_3,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Industry__type_9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Trade__type_6,ORGANIZATION_TYPE_Industry__type_2,ORGANIZATION_TYPE_Transport__type_1,ORGANIZATION_TYPE_Industry__type_12,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Trade__type_1,ORGANIZATION_TYPE_Industry__type_5,ORGANIZATION_TYPE_Industry__type_10,ORGANIZATION_TYPE_Legal_Services,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Trade__type_5,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Industry__type_13,ORGANIZATION_TYPE_Trade__type_4,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Industry__type_8,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Industry__type_6,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Panel,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Monolithic,NAME_FAMILY_STATUS_Single___not_married,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Civil_marriage,NAME_FAMILY_STATUS_Widow,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Unknown,HOUSETYPE_MODE_block_of_flats,HOUSETYPE_MODE_terraced_house,HOUSETYPE_MODE_specific_housing,NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_Family,"NAME_TYPE_SUITE_Spouse,_partner",NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,NAME_TYPE_SUITE_Group_of_people
0,100002,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
1,100003,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,100004,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,100006,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,100007,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


CPU times: user 2min 48s, sys: 4.25 s, total: 2min 52s
Wall time: 18.5 s


In [8]:
feature_evaluate(pdf_train_filtered, pdf_onehot)


Unnamed: 0,name,auc,corr,coverage
0,NAME_INCOME_TYPE_Working,0.5541,0.0591,1.0
19,NAME_EDUCATION_TYPE_Higher_education,0.544,-0.056,1.0
18,NAME_EDUCATION_TYPE_Secondary___secondary_special,0.5406,0.0488,1.0
112,HOUSETYPE_MODE_block_of_flats,0.5379,-0.0414,1.0
3,NAME_INCOME_TYPE_Pensioner,0.5324,-0.0461,1.0
46,ORGANIZATION_TYPE_XNA,0.5322,-0.0458,1.0
23,OCCUPATION_TYPE_Laborers,0.5295,0.042,1.0
101,WALLSMATERIAL_MODE_Panel,0.5246,-0.0327,1.0
107,NAME_FAMILY_STATUS_Married,0.522,-0.0251,1.0
50,ORGANIZATION_TYPE_Self_employed,0.5181,0.03,1.0


In [9]:
%%time
ls_cat = dict_onehot.keys()
pdf_agg01_train, dict_encoding_map = agg_mean_encoding(pdf_train, ls_cat)
print(pdf_agg01_train.shape)
display(pdf_agg01_train.head())

Encoding NAME_INCOME_TYPE...
Encoding FONDKAPREMONT_MODE...
Encoding NAME_HOUSING_TYPE...
Encoding NAME_EDUCATION_TYPE...
Encoding OCCUPATION_TYPE...
Encoding ORGANIZATION_TYPE...
Encoding WALLSMATERIAL_MODE...
Encoding NAME_FAMILY_STATUS...
Encoding HOUSETYPE_MODE...
Encoding NAME_TYPE_SUITE...
(307511, 11)


Unnamed: 0,SK_ID_CURR,NAME_INCOME_TYPE_mean_encoding,FONDKAPREMONT_MODE_mean_encoding,NAME_HOUSING_TYPE_mean_encoding,NAME_EDUCATION_TYPE_mean_encoding,OCCUPATION_TYPE_mean_encoding,ORGANIZATION_TYPE_mean_encoding,WALLSMATERIAL_MODE_mean_encoding,NAME_FAMILY_STATUS_mean_encoding,HOUSETYPE_MODE_mean_encoding,NAME_TYPE_SUITE_mean_encoding
0,100002,0.0959,0.0698,0.078,0.0894,0.1058,0.093,0.0741,0.0981,0.0694,0.0818
1,100003,0.0575,0.0698,0.078,0.0536,0.063,0.0591,0.0702,0.0756,0.0694,0.0749
2,100004,0.0959,,0.078,0.0894,0.1058,0.0698,,0.0981,,0.0818
3,100006,0.0959,,0.078,0.0894,0.1058,0.093,,0.0994,,0.0818
4,100007,0.0959,,0.078,0.0894,0.063,0.0588,,0.0981,,0.0818


CPU times: user 1min 13s, sys: 603 ms, total: 1min 14s
Wall time: 52.1 s


In [10]:
%%time
pdf_agg01_test = mean_encode_mapping(pdf_test, dict_encoding_map)
print(pdf_agg01_test.shape)
display(pdf_agg01_test.head())

Encoding NAME_INCOME_TYPE...
Encoding FONDKAPREMONT_MODE...
Encoding NAME_EDUCATION_TYPE...
Encoding ORGANIZATION_TYPE...
Encoding NAME_FAMILY_STATUS...
Encoding WALLSMATERIAL_MODE...
Encoding NAME_HOUSING_TYPE...
Encoding OCCUPATION_TYPE...
Encoding HOUSETYPE_MODE...
Encoding NAME_TYPE_SUITE...
(48744, 11)


Unnamed: 0,SK_ID_CURR,NAME_INCOME_TYPE_mean_encoding,FONDKAPREMONT_MODE_mean_encoding,NAME_EDUCATION_TYPE_mean_encoding,ORGANIZATION_TYPE_mean_encoding,NAME_FAMILY_STATUS_mean_encoding,WALLSMATERIAL_MODE_mean_encoding,NAME_HOUSING_TYPE_mean_encoding,OCCUPATION_TYPE_mean_encoding,HOUSETYPE_MODE_mean_encoding,NAME_TYPE_SUITE_mean_encoding
0,100001,0.0959,,0.0536,0.0703,0.0756,0.0741,0.078,,0.0694,0.0818
1,100005,0.0959,,0.0894,0.1017,0.0756,,0.078,0.1715,,0.0818
2,100013,0.0959,,0.0536,0.1575,0.0756,,0.078,0.1133,,
3,100028,0.0959,0.0698,0.0894,0.093,0.0756,0.0635,0.078,0.0963,0.0694,0.0818
4,100038,0.0959,,0.0894,0.093,0.0756,,0.078,,,0.0818


CPU times: user 9.68 s, sys: 296 ms, total: 9.97 s
Wall time: 8.37 s


In [11]:
feature_evaluate(pdf_train_filtered, pdf_agg01_train)

Unnamed: 0,name,auc,corr,coverage
4,OCCUPATION_TYPE_mean_encoding,0.5752,0.0807,0.6858
5,ORGANIZATION_TYPE_mean_encoding,0.5751,0.073,1.0
0,NAME_INCOME_TYPE_mean_encoding,0.5623,0.0653,1.0
3,NAME_EDUCATION_TYPE_mean_encoding,0.5461,0.0571,1.0
7,NAME_FAMILY_STATUS_mean_encoding,0.5365,0.0418,1.0
6,WALLSMATERIAL_MODE_mean_encoding,0.5272,0.0281,0.4905
2,NAME_HOUSING_TYPE_mean_encoding,0.5183,0.0371,1.0
1,FONDKAPREMONT_MODE_mean_encoding,0.5106,0.0142,0.3165
9,NAME_TYPE_SUITE_mean_encoding,0.5062,0.0081,0.9959
8,HOUSETYPE_MODE_mean_encoding,0.5037,0.0141,0.4973


## bureau

In [12]:
%%time
dict_onehot = {
    "CREDIT_ACTIVE": ['Closed', 'Active', 'Sold', 'Bad debt'],
    "CREDIT_CURRENCY": ['currency 1', 'currency 2', 'currency 3', 'currency 4'],
    "CREDIT_TYPE": ['Consumer credit', 'Credit card', 'Car loan', 'Mortgage', 'Microloan', 'Loan for business development', 'Another type of loan', 'Unknown type of loan', 'Loan for working capital replenishment', 'Cash loan (non-earmarked)', 'Real estate loan', 'Loan for the purchase of equipment', 'Loan for purchase of shares (margin lending)', 'Interbank credit', 'Mobile operator loan'],
}
pdf_onehot = gen_one_hot_feat(pdf_bureau, dict_onehot, main_key="SK_ID_CURR")
display(pdf_onehot.head())

Unnamed: 0,SK_ID_CURR,CREDIT_CURRENCY_currency_1,CREDIT_CURRENCY_currency_2,CREDIT_CURRENCY_currency_3,CREDIT_CURRENCY_currency_4,CREDIT_TYPE_Consumer_credit,CREDIT_TYPE_Credit_card,CREDIT_TYPE_Car_loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Microloan,CREDIT_TYPE_Loan_for_business_development,CREDIT_TYPE_Another_type_of_loan,CREDIT_TYPE_Unknown_type_of_loan,CREDIT_TYPE_Loan_for_working_capital_replenishment,CREDIT_TYPE_Cash_loan_(non_earmarked),CREDIT_TYPE_Real_estate_loan,CREDIT_TYPE_Loan_for_the_purchase_of_equipment,CREDIT_TYPE_Loan_for_purchase_of_shares_(margin_lending),CREDIT_TYPE_Interbank_credit,CREDIT_TYPE_Mobile_operator_loan,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Active,CREDIT_ACTIVE_Sold,CREDIT_ACTIVE_Bad_debt
0,215354,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,215354,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,215354,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,215354,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,215354,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


CPU times: user 55.5 s, sys: 1.82 s, total: 57.3 s
Wall time: 16.1 s


In [13]:
pdf_agg01_bureau = agg_common_data(pdf_onehot, ["max", "sum"])
display(pdf_agg01_bureau.head())

{'CREDIT_ACTIVE_Active': ['max', 'sum'],
 'CREDIT_ACTIVE_Bad_debt': ['max', 'sum'],
 'CREDIT_ACTIVE_Closed': ['max', 'sum'],
 'CREDIT_ACTIVE_Sold': ['max', 'sum'],
 'CREDIT_CURRENCY_currency_1': ['max', 'sum'],
 'CREDIT_CURRENCY_currency_2': ['max', 'sum'],
 'CREDIT_CURRENCY_currency_3': ['max', 'sum'],
 'CREDIT_CURRENCY_currency_4': ['max', 'sum'],
 'CREDIT_TYPE_Another_type_of_loan': ['max', 'sum'],
 'CREDIT_TYPE_Car_loan': ['max', 'sum'],
 'CREDIT_TYPE_Cash_loan_(non_earmarked)': ['max', 'sum'],
 'CREDIT_TYPE_Consumer_credit': ['max', 'sum'],
 'CREDIT_TYPE_Credit_card': ['max', 'sum'],
 'CREDIT_TYPE_Interbank_credit': ['max', 'sum'],
 'CREDIT_TYPE_Loan_for_business_development': ['max', 'sum'],
 'CREDIT_TYPE_Loan_for_purchase_of_shares_(margin_lending)': ['max', 'sum'],
 'CREDIT_TYPE_Loan_for_the_purchase_of_equipment': ['max', 'sum'],
 'CREDIT_TYPE_Loan_for_working_capital_replenishment': ['max', 'sum'],
 'CREDIT_TYPE_Microloan': ['max', 'sum'],
 'CREDIT_TYPE_Mobile_operator_loan':

After agg: (305811, 46)


Unnamed: 0_level_0,CREDIT_CURRENCY_currency_4_max,CREDIT_CURRENCY_currency_4_sum,CREDIT_ACTIVE_Closed_max,CREDIT_ACTIVE_Closed_sum,CREDIT_TYPE_Credit_card_max,CREDIT_TYPE_Credit_card_sum,CREDIT_CURRENCY_currency_1_max,CREDIT_CURRENCY_currency_1_sum,CREDIT_CURRENCY_currency_2_max,CREDIT_CURRENCY_currency_2_sum,CREDIT_CURRENCY_currency_3_max,CREDIT_CURRENCY_currency_3_sum,CREDIT_TYPE_Mobile_operator_loan_max,CREDIT_TYPE_Mobile_operator_loan_sum,CREDIT_TYPE_Cash_loan_(non_earmarked)_max,CREDIT_TYPE_Cash_loan_(non_earmarked)_sum,CREDIT_TYPE_Real_estate_loan_max,CREDIT_TYPE_Real_estate_loan_sum,CREDIT_TYPE_Another_type_of_loan_max,CREDIT_TYPE_Another_type_of_loan_sum,CREDIT_TYPE_Interbank_credit_max,CREDIT_TYPE_Interbank_credit_sum,CREDIT_TYPE_Car_loan_max,CREDIT_TYPE_Car_loan_sum,CREDIT_ACTIVE_Sold_max,CREDIT_ACTIVE_Sold_sum,CREDIT_TYPE_Loan_for_purchase_of_shares_(margin_lending)_max,CREDIT_TYPE_Loan_for_purchase_of_shares_(margin_lending)_sum,CREDIT_TYPE_Mortgage_max,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Unknown_type_of_loan_max,CREDIT_TYPE_Unknown_type_of_loan_sum,CREDIT_TYPE_Microloan_max,CREDIT_TYPE_Microloan_sum,CREDIT_ACTIVE_Bad_debt_max,CREDIT_ACTIVE_Bad_debt_sum,CREDIT_ACTIVE_Active_max,CREDIT_ACTIVE_Active_sum,CREDIT_TYPE_Loan_for_the_purchase_of_equipment_max,CREDIT_TYPE_Loan_for_the_purchase_of_equipment_sum,CREDIT_TYPE_Consumer_credit_max,CREDIT_TYPE_Consumer_credit_sum,CREDIT_TYPE_Loan_for_business_development_max,CREDIT_TYPE_Loan_for_business_development_sum,CREDIT_TYPE_Loan_for_working_capital_replenishment_max,CREDIT_TYPE_Loan_for_working_capital_replenishment_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
100001,0,0,1,4,0,0,1,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,1,7,0,0,0,0
100002,0,0,1,6,1,4,1,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,1,4,0,0,0,0
100003,0,0,1,3,1,2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,2,0,0,0,0
100004,0,0,1,2,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0
100005,0,0,1,1,1,1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,1,2,0,0,0,0


In [14]:
feature_evaluate(pdf_train_filtered, pdf_agg01_bureau.reset_index())


Unnamed: 0,name,auc,corr,coverage
37,CREDIT_ACTIVE_Active_sum,0.56,0.066235,1.0
3,CREDIT_ACTIVE_Closed_sum,0.549,-0.031939,1.0
5,CREDIT_TYPE_Credit_card_sum,0.5306,0.035684,1.0
2,CREDIT_ACTIVE_Closed_max,0.5303,-0.048711,1.0
36,CREDIT_ACTIVE_Active_max,0.5261,0.036785,1.0
41,CREDIT_TYPE_Consumer_credit_sum,0.5192,-0.012555,1.0
4,CREDIT_TYPE_Credit_card_max,0.5179,0.020099,1.0
33,CREDIT_TYPE_Microloan_sum,0.5116,0.03669,1.0
32,CREDIT_TYPE_Microloan_max,0.5116,0.053957,1.0
23,CREDIT_TYPE_Car_loan_sum,0.5115,-0.023506,1.0


In [15]:
%%time
ls_cat = dict_onehot.keys()
pdf_bureau_join = pdf_train_filtered.merge(pdf_bureau, on="SK_ID_CURR")

# 
pdf_encoded_bureau, dict_encode_bureau = agg_mean_encoding(pdf_bureau_join, ls_cat)
print(pdf_encoded_bureau.shape)
display(pdf_encoded_bureau.head())

Encoding CREDIT_CURRENCY...
Encoding CREDIT_TYPE...
Encoding CREDIT_ACTIVE...
(118641250, 4)


Unnamed: 0,SK_ID_CURR,CREDIT_CURRENCY_mean_encoding,CREDIT_TYPE_mean_encoding,CREDIT_ACTIVE_mean_encoding
0,100002,0.0784,0.0888,0.0695
1,100002,0.0784,0.0888,0.0695
2,100002,0.0784,0.0888,0.0695
3,100002,0.0784,0.0888,0.0695
4,100002,0.0784,0.0888,0.0695


CPU times: user 1min 31s, sys: 5.12 s, total: 1min 36s
Wall time: 1min 11s


In [16]:
pdf_agg01_bureau = agg_common_data(pdf_encoded_bureau, ["max", "sum", "min", "mean", "std"])
display(pdf_agg01_bureau.head())

{'CREDIT_ACTIVE_mean_encoding': ['max', 'sum', 'min', 'mean', 'std'],
 'CREDIT_CURRENCY_mean_encoding': ['max', 'sum', 'min', 'mean', 'std'],
 'CREDIT_TYPE_mean_encoding': ['max', 'sum', 'min', 'mean', 'std']}

After agg: (185814, 15)


Unnamed: 0_level_0,CREDIT_ACTIVE_mean_encoding_max,CREDIT_ACTIVE_mean_encoding_sum,CREDIT_ACTIVE_mean_encoding_min,CREDIT_ACTIVE_mean_encoding_mean,CREDIT_ACTIVE_mean_encoding_std,CREDIT_CURRENCY_mean_encoding_max,CREDIT_CURRENCY_mean_encoding_sum,CREDIT_CURRENCY_mean_encoding_min,CREDIT_CURRENCY_mean_encoding_mean,CREDIT_CURRENCY_mean_encoding_std,CREDIT_TYPE_mean_encoding_max,CREDIT_TYPE_mean_encoding_sum,CREDIT_TYPE_mean_encoding_min,CREDIT_TYPE_mean_encoding_mean,CREDIT_TYPE_mean_encoding_std
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100002,0.0932,38.6076,0.0695,0.0754,0.0103,0.0784,40.133,0.0784,0.0784,0.0,0.0888,41.8968,0.0749,0.0818,0.007
100003,0.0932,4.8259,0.0695,0.0754,0.0103,0.0784,5.0166,0.0784,0.0784,0.0,0.0888,5.2371,0.0749,0.0818,0.007
100004,0.0695,0.5558,0.0695,0.0695,0.0,0.0784,0.6271,0.0784,0.0784,0.0,0.0749,0.599,0.0749,0.0749,0.0
100007,0.0695,0.0695,0.0695,0.0695,,0.0784,0.0784,0.0784,0.0784,,0.0749,0.0749,0.0749,0.0749,
100010,0.0932,0.6506,0.0695,0.0813,0.0127,0.0784,0.6271,0.0784,0.0784,0.0,0.0749,0.5503,0.0627,0.0688,0.0065


In [17]:
feature_evaluate(pdf_train_filtered, pdf_agg01_bureau.reset_index())


Unnamed: 0,name,auc,corr,coverage
3,CREDIT_ACTIVE_mean_encoding_mean,0.5873,0.0816,1.0
13,CREDIT_TYPE_mean_encoding_mean,0.5605,0.0621,1.0
2,CREDIT_ACTIVE_mean_encoding_min,0.5303,0.0489,1.0
0,CREDIT_ACTIVE_mean_encoding_max,0.5294,0.0396,1.0
10,CREDIT_TYPE_mean_encoding_max,0.5286,0.0578,1.0
14,CREDIT_TYPE_mean_encoding_std,0.5278,0.0498,0.8631
12,CREDIT_TYPE_mean_encoding_min,0.5254,0.0364,1.0
4,CREDIT_ACTIVE_mean_encoding_std,0.5249,0.0138,0.8631
8,CREDIT_CURRENCY_mean_encoding_mean,0.5065,0.0092,1.0
6,CREDIT_CURRENCY_mean_encoding_sum,0.5062,0.0099,1.0


# save features

In [18]:
pdf_agg_train_test = pd.concat([pdf_agg01_train, pdf_agg01_test], sort=False)
pdf_feat = pdf_agg_train_test.merge(pdf_agg01_bureau.reset_index(), on="SK_ID_CURR")
print(pdf_feat.shape)
pdf_feat.head()

(185814, 26)


Unnamed: 0,SK_ID_CURR,NAME_INCOME_TYPE_mean_encoding,FONDKAPREMONT_MODE_mean_encoding,NAME_HOUSING_TYPE_mean_encoding,NAME_EDUCATION_TYPE_mean_encoding,OCCUPATION_TYPE_mean_encoding,ORGANIZATION_TYPE_mean_encoding,WALLSMATERIAL_MODE_mean_encoding,NAME_FAMILY_STATUS_mean_encoding,HOUSETYPE_MODE_mean_encoding,NAME_TYPE_SUITE_mean_encoding,CREDIT_ACTIVE_mean_encoding_max,CREDIT_ACTIVE_mean_encoding_sum,CREDIT_ACTIVE_mean_encoding_min,CREDIT_ACTIVE_mean_encoding_mean,CREDIT_ACTIVE_mean_encoding_std,CREDIT_CURRENCY_mean_encoding_max,CREDIT_CURRENCY_mean_encoding_sum,CREDIT_CURRENCY_mean_encoding_min,CREDIT_CURRENCY_mean_encoding_mean,CREDIT_CURRENCY_mean_encoding_std,CREDIT_TYPE_mean_encoding_max,CREDIT_TYPE_mean_encoding_sum,CREDIT_TYPE_mean_encoding_min,CREDIT_TYPE_mean_encoding_mean,CREDIT_TYPE_mean_encoding_std
0,100002,0.0959,0.0698,0.078,0.0894,0.1058,0.093,0.0741,0.0981,0.0694,0.0818,0.0932,38.6076,0.0695,0.0754,0.0103,0.0784,40.133,0.0784,0.0784,0.0,0.0888,41.8968,0.0749,0.0818,0.007
1,100003,0.0575,0.0698,0.078,0.0536,0.063,0.0591,0.0702,0.0756,0.0694,0.0749,0.0932,4.8259,0.0695,0.0754,0.0103,0.0784,5.0166,0.0784,0.0784,0.0,0.0888,5.2371,0.0749,0.0818,0.007
2,100004,0.0959,,0.078,0.0894,0.1058,0.0698,,0.0981,,0.0818,0.0695,0.5558,0.0695,0.0695,0.0,0.0784,0.6271,0.0784,0.0784,0.0,0.0749,0.599,0.0749,0.0749,0.0
3,100007,0.0959,,0.078,0.0894,0.063,0.0588,,0.0981,,0.0818,0.0695,0.0695,0.0695,0.0695,,0.0784,0.0784,0.0784,0.0784,,0.0749,0.0749,0.0749,0.0749,
4,100010,0.0575,,0.078,0.0536,0.0621,0.0764,,0.0756,,0.0818,0.0932,0.6506,0.0695,0.0813,0.0127,0.0784,0.6271,0.0784,0.0784,0.0,0.0749,0.5503,0.0627,0.0688,0.0065


In [19]:
%%time
fname = "mean_encoding_feat_cat"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")


Store features completed!
CPU times: user 7.83 s, sys: 23.1 ms, total: 7.85 s
Wall time: 7.34 s
