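In this notebook we aggregate the auxiliary tables (such as `bureau_balance` and `bureau`) and merge the aggregated features into the application tables.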

In [6]:
%load_ext autoreload
%autoreload 2

import os
import gc
import pandas as pd
import numpy as np

from _preprocessing import aggregate
from _preprocessing import change_dtypes

# Directory the raw CSV tables are read from.
INP_DIR = "data/download"
# Directory the feature CSV files produced by this notebook are written to.
OUT_DIR = "data/data_"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before any later cell writes into it.
os.makedirs(OUT_DIR, exist_ok=True)

# Statistics passed as `num_stats` to every `aggregate` call (numeric columns).
NUM_STATS = ("count", "median", "min", "max", "sum")
# Pairs passed as `cat_stats` to every `aggregate` call (categorical columns).
# NOTE(review): presumably (output-column suffix, aggregation function) pairs;
# confirm against `_preprocessing.aggregate`.
CAT_STATS = (("count", "sum"), ("percent", "mean"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load `application_train.csv` and `application_test.csv`

In [9]:
# Read each application table and pass it through `change_dtypes` right away
# (train first, then test, so the helper's log output keeps the same order).
train_csv = os.path.join(INP_DIR, "application_train.csv")
application_train = change_dtypes(pd.read_csv(train_csv))

test_csv = os.path.join(INP_DIR, "application_test.csv")
application_test = change_dtypes(pd.read_csv(test_csv))

# Report the resulting table sizes.
for caption, shape in (
    ("application_train shape", application_train.shape),
    ("application_test shape", application_test.shape),
):
    print(caption, shape)

Memory usage before changing types 300.13 MB
Memory usage after changing types 129.78 MB
Memory usage before changing types 47.18 MB
Memory usage after changing types 20.53 MB
application_train shape (307511, 122)
application_test shape (48744, 121)


## Write the application features, target and test IDs to CSV files

In [10]:
# Split the application tables into feature matrices, the target column and
# the test identifiers.
X_train_0 = application_train.drop(columns=["SK_ID_CURR", "TARGET"])
y_train = application_train[["TARGET"]]

X_test_0 = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]

# Write every piece to its own CSV file (without the index column).
for frame, file_name in (
    (X_train_0, "X_train_0.csv"),
    (y_train, "y_train.csv"),
    (X_test_0, "X_test_0.csv"),
    (id_test, "id_test.csv"),
):
    frame.to_csv(os.path.join(OUT_DIR, file_name), index=False)

# Drop every reference to the temporary frames (including the loop variable)
# before asking the garbage collector to reclaim them.
gc.enable()
del frame, file_name, X_train_0, y_train, X_test_0, id_test
gc.collect()

5635

## Aggregate table `bureau_balance` by column `SK_ID_BUREAU`

In [11]:
# Load the monthly bureau balance table and pass it through `change_dtypes`.
bureau_balance = change_dtypes(
    pd.read_csv(os.path.join(INP_DIR, "bureau_balance.csv"))
)

# Aggregate by SK_ID_BUREAU; the produced feature columns carry the "bb_" prefix.
bureau_balance_agg = aggregate(
    bureau_balance,
    by=["SK_ID_BUREAU"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="bb_",
)

# The raw table is no longer needed once it has been aggregated.
gc.enable()
del bureau_balance
gc.collect()

bureau_balance_agg.head()

Memory usage before changing types 655.20 MB
Memory usage after changing types 245.70 MB


Unnamed: 0,SK_ID_BUREAU,bb_MONTHS_BALANCE_count,bb_MONTHS_BALANCE_median,bb_MONTHS_BALANCE_min,bb_MONTHS_BALANCE_max,bb_MONTHS_BALANCE_sum,bb_STATUS_0_count,bb_STATUS_0_percent,bb_STATUS_1_count,bb_STATUS_1_percent,bb_STATUS_2_count,bb_STATUS_2_percent,bb_STATUS_3_count,bb_STATUS_3_percent,bb_STATUS_4_count,bb_STATUS_4_percent,bb_STATUS_5_count,bb_STATUS_5_percent,bb_STATUS_C_count,bb_STATUS_C_percent,bb_STATUS_X_count,bb_STATUS_X_percent
0,5001709,97,-48.0,-96,0,-4656,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
1,5001710,83,-41.0,-82,0,-3403,5,0.060241,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
2,5001711,4,-1.5,-3,0,-6,3,0.75,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25
3,5001712,19,-9.0,-18,0,-171,10,0.526316,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.0
4,5001713,22,-10.5,-21,0,-231,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,22,1.0


## Merge table `bureau_balance_agg` with table `bureau` on column `SK_ID_BUREAU`

In [12]:
# Load the bureau table and pass it through `change_dtypes`.
bureau = change_dtypes(pd.read_csv(os.path.join(INP_DIR, "bureau.csv")))

# Attach the aggregated balance features to each bureau record.
# validate="many_to_one" makes pandas raise a MergeError if SK_ID_BUREAU is
# duplicated in `bureau_balance_agg`, instead of silently multiplying rows.
bureau_merge = bureau.merge(
    bureau_balance_agg,
    how="left",
    on="SK_ID_BUREAU",
    validate="many_to_one",
)

# The join key has served its purpose; drop it so it is not carried along as a
# column into the next step.
bureau_merge = bureau_merge.drop(columns=["SK_ID_BUREAU"])

print("bureau shape:", bureau.shape)
print("bureau_merge shape:", bureau_merge.shape)

# The un-merged table is no longer needed.
gc.enable()
del bureau
gc.collect()

bureau_merge.head()

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
bureau shape: (1716428, 17)
bureau_merge shape: (1716428, 37)


Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,bb_MONTHS_BALANCE_count,bb_MONTHS_BALANCE_median,bb_MONTHS_BALANCE_min,bb_MONTHS_BALANCE_max,bb_MONTHS_BALANCE_sum,bb_STATUS_0_count,bb_STATUS_0_percent,bb_STATUS_1_count,bb_STATUS_1_percent,bb_STATUS_2_count,bb_STATUS_2_percent,bb_STATUS_3_count,bb_STATUS_3_percent,bb_STATUS_4_count,bb_STATUS_4_percent,bb_STATUS_5_count,bb_STATUS_5_percent,bb_STATUS_C_count,bb_STATUS_C_percent,bb_STATUS_X_count,bb_STATUS_X_percent
0,215354,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,,,,,,,,,,,,,,,,,,,,,
1,215354,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,,,,,,,,,,,,,,,,,,,,,
2,215354,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,,,,,,,,,,,,,,,,,,,,,
3,215354,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,,,,,,,,,,,,,,,,,,,,,
4,215354,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,,,,,,,,,,,,,,,,,,,,,


## Aggregate table `bureau_merge` by column `SK_ID_CURR`

In [13]:
# Aggregate the merged bureau table by SK_ID_CURR; the produced feature
# columns carry the "bu_" prefix.
bureau_agg = aggregate(
    bureau_merge,
    by=["SK_ID_CURR"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="bu_",
)

# Show the table sizes before/after aggregation next to the train table size.
for caption, shape in (
    ("bureau_merge shape:", bureau_merge.shape),
    ("bureau_agg shape:", bureau_agg.shape),
    ("application_train shape:", application_train.shape),
):
    print(caption, shape)

# The un-aggregated table is no longer needed.
gc.enable()
del bureau_merge
gc.collect()

bureau_agg.head()

bureau_merge shape: (1716428, 37)
bureau_agg shape: (305811, 212)
application_train shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,bu_DAYS_CREDIT_count,bu_DAYS_CREDIT_median,bu_DAYS_CREDIT_min,bu_DAYS_CREDIT_max,bu_DAYS_CREDIT_sum,bu_CREDIT_DAY_OVERDUE_count,bu_CREDIT_DAY_OVERDUE_median,bu_CREDIT_DAY_OVERDUE_min,bu_CREDIT_DAY_OVERDUE_max,bu_CREDIT_DAY_OVERDUE_sum,bu_DAYS_CREDIT_ENDDATE_count,bu_DAYS_CREDIT_ENDDATE_median,bu_DAYS_CREDIT_ENDDATE_min,bu_DAYS_CREDIT_ENDDATE_max,bu_DAYS_CREDIT_ENDDATE_sum,bu_DAYS_ENDDATE_FACT_count,bu_DAYS_ENDDATE_FACT_median,bu_DAYS_ENDDATE_FACT_min,bu_DAYS_ENDDATE_FACT_max,bu_DAYS_ENDDATE_FACT_sum,bu_AMT_CREDIT_MAX_OVERDUE_count,bu_AMT_CREDIT_MAX_OVERDUE_median,bu_AMT_CREDIT_MAX_OVERDUE_min,bu_AMT_CREDIT_MAX_OVERDUE_max,bu_AMT_CREDIT_MAX_OVERDUE_sum,bu_CNT_CREDIT_PROLONG_count,bu_CNT_CREDIT_PROLONG_median,bu_CNT_CREDIT_PROLONG_min,bu_CNT_CREDIT_PROLONG_max,bu_CNT_CREDIT_PROLONG_sum,bu_AMT_CREDIT_SUM_count,bu_AMT_CREDIT_SUM_median,bu_AMT_CREDIT_SUM_min,bu_AMT_CREDIT_SUM_max,bu_AMT_CREDIT_SUM_sum,bu_AMT_CREDIT_SUM_DEBT_count,bu_AMT_CREDIT_SUM_DEBT_median,bu_AMT_CREDIT_SUM_DEBT_min,bu_AMT_CREDIT_SUM_DEBT_max,bu_AMT_CREDIT_SUM_DEBT_sum,bu_AMT_CREDIT_SUM_LIMIT_count,bu_AMT_CREDIT_SUM_LIMIT_median,bu_AMT_CREDIT_SUM_LIMIT_min,bu_AMT_CREDIT_SUM_LIMIT_max,bu_AMT_CREDIT_SUM_LIMIT_sum,bu_AMT_CREDIT_SUM_OVERDUE_count,bu_AMT_CREDIT_SUM_OVERDUE_median,bu_AMT_CREDIT_SUM_OVERDUE_min,bu_AMT_CREDIT_SUM_OVERDUE_max,bu_AMT_CREDIT_SUM_OVERDUE_sum,bu_DAYS_CREDIT_UPDATE_count,bu_DAYS_CREDIT_UPDATE_median,bu_DAYS_CREDIT_UPDATE_min,bu_DAYS_CREDIT_UPDATE_max,bu_DAYS_CREDIT_UPDATE_sum,bu_AMT_ANNUITY_count,bu_AMT_ANNUITY_median,...,bu_bb_STATUS_C_percent_max,bu_bb_STATUS_C_percent_sum,bu_bb_STATUS_X_count_count,bu_bb_STATUS_X_count_median,bu_bb_STATUS_X_count_min,bu_bb_STATUS_X_count_max,bu_bb_STATUS_X_count_sum,bu_bb_STATUS_X_percent_count,bu_bb_STATUS_X_percent_median,bu_bb_STATUS_X_percent_min,bu_bb_STATUS_X_percent_max,bu_bb_STATUS_X_percent_sum,bu_CREDIT_ACTIVE_Active_count,bu_CREDIT_ACTIVE_Active_percent,bu_CREDIT_ACTIVE_Bad debt_count,bu_CREDIT_ACTIVE_Bad 
debt_percent,bu_CREDIT_ACTIVE_Closed_count,bu_CREDIT_ACTIVE_Closed_percent,bu_CREDIT_ACTIVE_Sold_count,bu_CREDIT_ACTIVE_Sold_percent,bu_CREDIT_CURRENCY_currency 1_count,bu_CREDIT_CURRENCY_currency 1_percent,bu_CREDIT_CURRENCY_currency 2_count,bu_CREDIT_CURRENCY_currency 2_percent,bu_CREDIT_CURRENCY_currency 3_count,bu_CREDIT_CURRENCY_currency 3_percent,bu_CREDIT_CURRENCY_currency 4_count,bu_CREDIT_CURRENCY_currency 4_percent,bu_CREDIT_TYPE_Another type of loan_count,bu_CREDIT_TYPE_Another type of loan_percent,bu_CREDIT_TYPE_Car loan_count,bu_CREDIT_TYPE_Car loan_percent,bu_CREDIT_TYPE_Cash loan (non-earmarked)_count,bu_CREDIT_TYPE_Cash loan (non-earmarked)_percent,bu_CREDIT_TYPE_Consumer credit_count,bu_CREDIT_TYPE_Consumer credit_percent,bu_CREDIT_TYPE_Credit card_count,bu_CREDIT_TYPE_Credit card_percent,bu_CREDIT_TYPE_Interbank credit_count,bu_CREDIT_TYPE_Interbank credit_percent,bu_CREDIT_TYPE_Loan for business development_count,bu_CREDIT_TYPE_Loan for business development_percent,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_percent,bu_CREDIT_TYPE_Loan for the purchase of equipment_count,bu_CREDIT_TYPE_Loan for the purchase of equipment_percent,bu_CREDIT_TYPE_Loan for working capital replenishment_count,bu_CREDIT_TYPE_Loan for working capital replenishment_percent,bu_CREDIT_TYPE_Microloan_count,bu_CREDIT_TYPE_Microloan_percent,bu_CREDIT_TYPE_Mobile operator loan_count,bu_CREDIT_TYPE_Mobile operator loan_percent,bu_CREDIT_TYPE_Mortgage_count,bu_CREDIT_TYPE_Mortgage_percent,bu_CREDIT_TYPE_Real estate loan_count,bu_CREDIT_TYPE_Real estate loan_percent,bu_CREDIT_TYPE_Unknown type of loan_count,bu_CREDIT_TYPE_Unknown type of loan_percent
0,100001,7,-857.0,-1572,-49,-5145,7,0.0,0,0,0,7,-179.0,-1329.0,1778.0,577.0,4,-715.0,-1328.0,-544.0,-3302.0,0,,,,0.0,7,0.0,0,0,0,7,168345.0,85500.0,378000.0,1453365.0,7,0.0,0.0,373239.0,596686.5,6,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,7,-155.0,-155,-6,-652,7,0.0,...,0.966667,3.088683,7,6.0,0.0,9.0,30.0,7,0.241379,0.0,0.5,1.502129,3,0.428571,0,0.0,4,0.571429,0,0.0,7,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,7,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,100002,8,-1042.5,-1437,-103,-6992,8,0.0,0,0,0,6,-424.5,-1072.0,780.0,-2094.0,6,-939.0,-1185.0,-36.0,-4185.0,5,40.5,0.0,5043.64502,8405.144531,8,0.0,0,0,0,8,54130.5,0.0,450000.0,865055.6,5,0.0,0.0,245781.0,245781.0,4,0.0,0.0,31988.564453,31988.564453,8,0.0,0.0,0.0,0.0,8,-402.5,-1185,-7,-3999,7,0.0,...,0.8125,1.403409,8,2.5,0.0,3.0,15.0,8,0.1875,0.0,0.5,1.295455,2,0.25,0,0.0,6,0.75,0,0.0,8,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,4,0.5,4,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,100003,4,-1205.5,-2586,-606,-5603,4,0.0,0,0,0,4,-480.0,-2434.0,1216.0,-2178.0,3,-621.0,-2131.0,-540.0,-3292.0,4,0.0,0.0,0.0,0.0,4,0.0,0,0,0,4,92576.25,22248.0,810000.0,1017400.0,4,0.0,0.0,0.0,0.0,4,0.0,0.0,810000.0,810000.0,4,0.0,0.0,0.0,0.0,4,-545.0,-2131,-43,-3264,0,,...,,0.0,0,,,,0.0,0,,,,0.0,1,0.25,0,0.0,3,0.75,0,0.0,4,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,0.5,2,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,100004,2,-867.0,-1326,-408,-1734,2,0.0,0,0,0,2,-488.5,-595.0,-382.0,-977.0,2,-532.5,-683.0,-382.0,-1065.0,1,0.0,0.0,0.0,0.0,2,0.0,0,0,0,2,94518.898438,94500.0,94537.796875,189037.8,2,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,2,-532.0,-682,-382,-1064,0,,...,,0.0,0,,,,0.0,0,,,,0.0,0,0.0,0,0.0,2,1.0,0,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,100005,3,-137.0,-373,-62,-572,3,0.0,0,0,0,3,122.0,-128.0,1324.0,1318.0,1,-123.0,-123.0,-123.0,-123.0,1,0.0,0.0,0.0,0.0,3,0.0,0,0,0,3,58500.0,29826.0,568800.0,657126.0,3,25321.5,0.0,543087.0,568408.5,3,0.0,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,3,-31.0,-121,-11,-163,3,0.0,...,0.384615,0.384615,3,1.0,0.0,1.0,2.0,3,0.076923,0.0,0.333333,0.410256,2,0.666667,0,0.0,1,0.333333,0,0.0,3,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


## Merge table `bureau_agg` with `application_train` and `application_test`

In [14]:
# Attach the aggregated bureau features to the train and test application
# tables. A left join keeps every application row, including those with no
# bureau record. validate="many_to_one" makes pandas raise a MergeError if
# SK_ID_CURR is duplicated in `bureau_agg`, instead of silently multiplying
# application rows.
application_bureau_train = application_train.merge(
    bureau_agg, how="left", on="SK_ID_CURR", validate="many_to_one"
)
application_bureau_test = application_test.merge(
    bureau_agg, how="left", on="SK_ID_CURR", validate="many_to_one"
)

print("application_train shape:", application_train.shape)
print("application_bureau_train shape:", application_bureau_train.shape)

print("application_test shape:", application_test.shape)
print("application_bureau_test shape:", application_bureau_test.shape)

# The aggregated bureau table has been merged into both frames; release it.
gc.enable()
del bureau_agg
gc.collect()

application_train shape: (307511, 122)
application_bureau_train shape: (307511, 333)
application_test shape: (48744, 121)
application_bureau_test shape: (48744, 332)


14

In [15]:
# Preview the first five rows of the merged train table.
application_bureau_train.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,...,bu_bb_STATUS_C_percent_max,bu_bb_STATUS_C_percent_sum,bu_bb_STATUS_X_count_count,bu_bb_STATUS_X_count_median,bu_bb_STATUS_X_count_min,bu_bb_STATUS_X_count_max,bu_bb_STATUS_X_count_sum,bu_bb_STATUS_X_percent_count,bu_bb_STATUS_X_percent_median,bu_bb_STATUS_X_percent_min,bu_bb_STATUS_X_percent_max,bu_bb_STATUS_X_percent_sum,bu_CREDIT_ACTIVE_Active_count,bu_CREDIT_ACTIVE_Active_percent,bu_CREDIT_ACTIVE_Bad debt_count,bu_CREDIT_ACTIVE_Bad debt_percent,bu_CREDIT_ACTIVE_Closed_count,bu_CREDIT_ACTIVE_Closed_percent,bu_CREDIT_ACTIVE_Sold_count,bu_CREDIT_ACTIVE_Sold_percent,bu_CREDIT_CURRENCY_currency 1_count,bu_CREDIT_CURRENCY_currency 1_percent,bu_CREDIT_CURRENCY_currency 2_count,bu_CREDIT_CURRENCY_currency 2_percent,bu_CREDIT_CURRENCY_currency 3_count,bu_CREDIT_CURRENCY_currency 3_percent,bu_CREDIT_CURRENCY_currency 4_count,bu_CREDIT_CURRENCY_currency 4_percent,bu_CREDIT_TYPE_Another type of loan_count,bu_CREDIT_TYPE_Another type of loan_percent,bu_CREDIT_TYPE_Car 
loan_count,bu_CREDIT_TYPE_Car loan_percent,bu_CREDIT_TYPE_Cash loan (non-earmarked)_count,bu_CREDIT_TYPE_Cash loan (non-earmarked)_percent,bu_CREDIT_TYPE_Consumer credit_count,bu_CREDIT_TYPE_Consumer credit_percent,bu_CREDIT_TYPE_Credit card_count,bu_CREDIT_TYPE_Credit card_percent,bu_CREDIT_TYPE_Interbank credit_count,bu_CREDIT_TYPE_Interbank credit_percent,bu_CREDIT_TYPE_Loan for business development_count,bu_CREDIT_TYPE_Loan for business development_percent,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_percent,bu_CREDIT_TYPE_Loan for the purchase of equipment_count,bu_CREDIT_TYPE_Loan for the purchase of equipment_percent,bu_CREDIT_TYPE_Loan for working capital replenishment_count,bu_CREDIT_TYPE_Loan for working capital replenishment_percent,bu_CREDIT_TYPE_Microloan_count,bu_CREDIT_TYPE_Microloan_percent,bu_CREDIT_TYPE_Mobile operator loan_count,bu_CREDIT_TYPE_Mobile operator loan_percent,bu_CREDIT_TYPE_Mortgage_count,bu_CREDIT_TYPE_Mortgage_percent,bu_CREDIT_TYPE_Real estate loan_count,bu_CREDIT_TYPE_Real estate loan_percent,bu_CREDIT_TYPE_Unknown type of loan_count,bu_CREDIT_TYPE_Unknown type of loan_percent
0,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,True,True,0,True,True,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,...,0.8125,1.403409,8.0,2.5,0.0,3.0,15.0,8.0,0.1875,0.0,0.5,1.295455,2.0,0.25,0.0,0.0,6.0,0.75,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.5,4.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,False,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,True,True,0,True,True,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,...,,0.0,0.0,,,,0.0,0.0,,,,0.0,1.0,0.25,0.0,0.0,3.0,0.75,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.5,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,False,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,True,True,1,True,True,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,...,,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,False,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,True,True,0,True,False,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,100007,False,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,True,True,0,True,False,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,...,,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Preview the first five rows of the merged test table.
application_bureau_test.head(n=5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,...,bu_bb_STATUS_C_percent_max,bu_bb_STATUS_C_percent_sum,bu_bb_STATUS_X_count_count,bu_bb_STATUS_X_count_median,bu_bb_STATUS_X_count_min,bu_bb_STATUS_X_count_max,bu_bb_STATUS_X_count_sum,bu_bb_STATUS_X_percent_count,bu_bb_STATUS_X_percent_median,bu_bb_STATUS_X_percent_min,bu_bb_STATUS_X_percent_max,bu_bb_STATUS_X_percent_sum,bu_CREDIT_ACTIVE_Active_count,bu_CREDIT_ACTIVE_Active_percent,bu_CREDIT_ACTIVE_Bad debt_count,bu_CREDIT_ACTIVE_Bad debt_percent,bu_CREDIT_ACTIVE_Closed_count,bu_CREDIT_ACTIVE_Closed_percent,bu_CREDIT_ACTIVE_Sold_count,bu_CREDIT_ACTIVE_Sold_percent,bu_CREDIT_CURRENCY_currency 1_count,bu_CREDIT_CURRENCY_currency 1_percent,bu_CREDIT_CURRENCY_currency 2_count,bu_CREDIT_CURRENCY_currency 2_percent,bu_CREDIT_CURRENCY_currency 3_count,bu_CREDIT_CURRENCY_currency 3_percent,bu_CREDIT_CURRENCY_currency 4_count,bu_CREDIT_CURRENCY_currency 4_percent,bu_CREDIT_TYPE_Another type of loan_count,bu_CREDIT_TYPE_Another type of loan_percent,bu_CREDIT_TYPE_Car 
loan_count,bu_CREDIT_TYPE_Car loan_percent,bu_CREDIT_TYPE_Cash loan (non-earmarked)_count,bu_CREDIT_TYPE_Cash loan (non-earmarked)_percent,bu_CREDIT_TYPE_Consumer credit_count,bu_CREDIT_TYPE_Consumer credit_percent,bu_CREDIT_TYPE_Credit card_count,bu_CREDIT_TYPE_Credit card_percent,bu_CREDIT_TYPE_Interbank credit_count,bu_CREDIT_TYPE_Interbank credit_percent,bu_CREDIT_TYPE_Loan for business development_count,bu_CREDIT_TYPE_Loan for business development_percent,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count,bu_CREDIT_TYPE_Loan for purchase of shares (margin lending)_percent,bu_CREDIT_TYPE_Loan for the purchase of equipment_count,bu_CREDIT_TYPE_Loan for the purchase of equipment_percent,bu_CREDIT_TYPE_Loan for working capital replenishment_count,bu_CREDIT_TYPE_Loan for working capital replenishment_percent,bu_CREDIT_TYPE_Microloan_count,bu_CREDIT_TYPE_Microloan_percent,bu_CREDIT_TYPE_Mobile operator loan_count,bu_CREDIT_TYPE_Mobile operator loan_percent,bu_CREDIT_TYPE_Mortgage_count,bu_CREDIT_TYPE_Mortgage_percent,bu_CREDIT_TYPE_Real estate loan_count,bu_CREDIT_TYPE_Real estate loan_percent,bu_CREDIT_TYPE_Unknown type of loan_count,bu_CREDIT_TYPE_Unknown type of loan_percent
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.01885,-19241,-2329,-5170.0,-812,,True,True,0,True,0,True,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,...,0.966667,3.088683,7.0,6.0,0.0,9.0,30.0,7.0,0.241379,0.0,0.5,1.502129,3.0,0.428571,0.0,0.0,4.0,0.571429,0.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,,True,True,0,True,0,False,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,...,0.384615,0.384615,3.0,1.0,0.0,1.0,2.0,3.0,0.076923,0.0,0.333333,0.410256,2.0,0.666667,0.0,0.0,1.0,0.333333,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,True,True,0,True,0,False,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699787,0.610991,,,,,,,,,,,,,,,,...,0.666667,1.588142,4.0,0.5,0.0,40.0,41.0,4.0,0.009091,0.0,1.0,1.018182,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.5,0.0,0.0,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,,True,True,0,True,1,False,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,...,0.885714,4.346939,12.0,1.0,0.0,60.0,133.0,12.0,0.018451,0.0,1.0,3.125213,5.0,0.416667,0.0,0.0,7.0,0.583333,0.0,0.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.583333,5.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,True,True,1,True,0,False,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202145,0.425687,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Write the merged application and bureau features to CSV files

In [17]:
# Build the feature matrices by removing the identifier (and, for train, the
# target) from the merged tables.
X_train_1 = application_bureau_train.drop(columns=["SK_ID_CURR", "TARGET"])
X_test_1 = application_bureau_test.drop(columns=["SK_ID_CURR"])

# Write both matrices to CSV files (without the index column).
for frame, file_name in (
    (X_train_1, "X_train_1.csv"),
    (X_test_1, "X_test_1.csv"),
):
    frame.to_csv(os.path.join(OUT_DIR, file_name), index=False)

# Drop every reference to the temporary frames (including the loop variable)
# before asking the garbage collector to reclaim them.
gc.enable()
del frame, file_name, X_train_1, X_test_1
gc.collect()

118

## Aggregate table `POS_CASH_balance` by column `SK_ID_PREV`

In [18]:
# Load the POS/cash balance table and pass it through `change_dtypes`.
POS_CASH_balance = change_dtypes(
    pd.read_csv(os.path.join(INP_DIR, "POS_CASH_balance.csv"))
)

# Aggregate by SK_ID_PREV. SK_ID_CURR is removed first so that it is not
# treated as a column to aggregate; feature columns carry the "pc_" prefix.
POS_CASH_balance_agg = aggregate(
    POS_CASH_balance.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="pc_",
)

for caption, shape in (
    ("POS_CASH_balance shape:", POS_CASH_balance.shape),
    ("POS_CASH_balance_agg shape:", POS_CASH_balance_agg.shape),
):
    print(caption, shape)

# The raw table is no longer needed once it has been aggregated.
gc.enable()
del POS_CASH_balance
gc.collect()

POS_CASH_balance_agg.head()

Memory usage before changing types 640.09 MB
Memory usage after changing types 290.04 MB
POS_CASH_balance shape: (10001358, 8)
POS_CASH_balance_agg shape: (936325, 44)


Unnamed: 0,SK_ID_PREV,pc_MONTHS_BALANCE_count,pc_MONTHS_BALANCE_median,pc_MONTHS_BALANCE_min,pc_MONTHS_BALANCE_max,pc_MONTHS_BALANCE_sum,pc_CNT_INSTALMENT_count,pc_CNT_INSTALMENT_median,pc_CNT_INSTALMENT_min,pc_CNT_INSTALMENT_max,pc_CNT_INSTALMENT_sum,pc_CNT_INSTALMENT_FUTURE_count,pc_CNT_INSTALMENT_FUTURE_median,pc_CNT_INSTALMENT_FUTURE_min,pc_CNT_INSTALMENT_FUTURE_max,pc_CNT_INSTALMENT_FUTURE_sum,pc_SK_DPD_count,pc_SK_DPD_median,pc_SK_DPD_min,pc_SK_DPD_max,pc_SK_DPD_sum,pc_SK_DPD_DEF_count,pc_SK_DPD_DEF_median,pc_SK_DPD_DEF_min,pc_SK_DPD_DEF_max,pc_SK_DPD_DEF_sum,pc_NAME_CONTRACT_STATUS_Active_count,pc_NAME_CONTRACT_STATUS_Active_percent,pc_NAME_CONTRACT_STATUS_Amortized debt_count,pc_NAME_CONTRACT_STATUS_Amortized debt_percent,pc_NAME_CONTRACT_STATUS_Approved_count,pc_NAME_CONTRACT_STATUS_Approved_percent,pc_NAME_CONTRACT_STATUS_Canceled_count,pc_NAME_CONTRACT_STATUS_Canceled_percent,pc_NAME_CONTRACT_STATUS_Completed_count,pc_NAME_CONTRACT_STATUS_Completed_percent,pc_NAME_CONTRACT_STATUS_Demand_count,pc_NAME_CONTRACT_STATUS_Demand_percent,pc_NAME_CONTRACT_STATUS_Returned to the store_count,pc_NAME_CONTRACT_STATUS_Returned to the store_percent,pc_NAME_CONTRACT_STATUS_Signed_count,pc_NAME_CONTRACT_STATUS_Signed_percent,pc_NAME_CONTRACT_STATUS_XNA_count,pc_NAME_CONTRACT_STATUS_XNA_percent
0,1000001,3,-9.0,-10,-8,-27,3,12.0,2.0,12.0,26.0,3,11.0,0.0,12.0,23.0,3,0.0,0,0,0,3,0.0,0,0,0,2,0.666667,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0
1,1000002,5,-52.0,-54,-50,-260,5,6.0,4.0,6.0,26.0,5,2.0,0.0,4.0,10.0,5,0.0,0,0,0,5,0.0,0,0,0,4,0.8,0,0.0,0,0.0,0,0.0,1,0.2,0,0.0,0,0.0,0,0.0,0,0.0
2,1000003,4,-2.5,-4,-1,-10,4,12.0,12.0,12.0,48.0,4,10.5,9.0,12.0,42.0,4,0.0,0,0,0,4,0.0,0,0,0,4,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,1000004,8,-25.5,-29,-22,-204,8,10.0,7.0,10.0,77.0,8,6.5,0.0,10.0,49.0,8,0.0,0,0,0,8,0.0,0,0,0,7,0.875,0,0.0,0,0.0,0,0.0,1,0.125,0,0.0,0,0.0,0,0.0,0,0.0
4,1000005,11,-51.0,-56,-46,-561,11,10.0,10.0,10.0,110.0,11,5.0,0.0,10.0,55.0,11,0.0,0,0,0,11,0.0,0,0,0,10,0.909091,0,0.0,0,0.0,0,0.0,1,0.090909,0,0.0,0,0.0,0,0.0,0,0.0


## Aggregate table `installments_payments` by column `SK_ID_PREV`

In [19]:
# Load the installment payments table and pass it through `change_dtypes`.
installments_payments = change_dtypes(
    pd.read_csv(os.path.join(INP_DIR, "installments_payments.csv"))
)

# Aggregate by SK_ID_PREV. SK_ID_CURR is removed first so that it is not
# treated as a column to aggregate; feature columns carry the "ip_" prefix.
installments_payments_agg = aggregate(
    installments_payments.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="ip_",
)

for caption, shape in (
    ("installments_payments shape:", installments_payments.shape),
    ("installments_payments_agg shape:", installments_payments_agg.shape),
):
    print(caption, shape)

# The raw table is no longer needed once it has been aggregated.
gc.enable()
del installments_payments
gc.collect()

installments_payments_agg.head()

Memory usage before changing types 870.75 MB
Memory usage after changing types 435.37 MB
No categorical columns in df
installments_payments shape: (13605401, 8)
installments_payments_agg shape: (997752, 31)


Unnamed: 0,SK_ID_PREV,ip_NUM_INSTALMENT_VERSION_count,ip_NUM_INSTALMENT_VERSION_median,ip_NUM_INSTALMENT_VERSION_min,ip_NUM_INSTALMENT_VERSION_max,ip_NUM_INSTALMENT_VERSION_sum,ip_NUM_INSTALMENT_NUMBER_count,ip_NUM_INSTALMENT_NUMBER_median,ip_NUM_INSTALMENT_NUMBER_min,ip_NUM_INSTALMENT_NUMBER_max,ip_NUM_INSTALMENT_NUMBER_sum,ip_DAYS_INSTALMENT_count,ip_DAYS_INSTALMENT_median,ip_DAYS_INSTALMENT_min,ip_DAYS_INSTALMENT_max,ip_DAYS_INSTALMENT_sum,ip_DAYS_ENTRY_PAYMENT_count,ip_DAYS_ENTRY_PAYMENT_median,ip_DAYS_ENTRY_PAYMENT_min,ip_DAYS_ENTRY_PAYMENT_max,ip_DAYS_ENTRY_PAYMENT_sum,ip_AMT_INSTALMENT_count,ip_AMT_INSTALMENT_median,ip_AMT_INSTALMENT_min,ip_AMT_INSTALMENT_max,ip_AMT_INSTALMENT_sum,ip_AMT_PAYMENT_count,ip_AMT_PAYMENT_median,ip_AMT_PAYMENT_min,ip_AMT_PAYMENT_max,ip_AMT_PAYMENT_sum
0,1000001,2,1.5,1.0,2.0,3.0,2,1.5,1,2,3,2,-253.0,-268.0,-238.0,-506.0,2,-269.0,-294.0,-244.0,-538.0,2,34221.710938,6404.310059,62039.113281,68443.421875,2,34221.710938,6404.310059,62039.113281,68443.421875
1,1000002,4,1.0,1.0,2.0,5.0,4,2.5,1,4,10,4,-1555.0,-1600.0,-1510.0,-6220.0,4,-1567.0,-1611.0,-1554.0,-6299.0,4,6264.0,6264.0,18443.564453,37235.5625,4,6264.0,6264.0,18443.564453,37235.5625
2,1000003,3,1.0,1.0,1.0,3.0,3,2.0,1,3,6,3,-64.0,-94.0,-34.0,-192.0,3,-81.0,-108.0,-49.0,-238.0,3,4951.350098,4951.350098,4951.350098,14854.050781,3,4951.350098,4951.350098,4951.350098,14854.050781
3,1000004,7,1.0,1.0,2.0,8.0,7,4.0,1,7,28,7,-772.0,-862.0,-682.0,-5404.0,7,-830.0,-881.0,-695.0,-5591.0,7,3391.110107,3391.110107,13176.495117,33523.15625,7,3391.110107,3391.110107,13176.495117,33523.15625
4,1000005,11,1.0,1.0,1.0,11.0,11,6.0,1,10,64,11,-1538.0,-1688.0,-1418.0,-16978.0,11,-1545.0,-1687.0,-1433.0,-17071.0,11,14713.605469,14599.259766,14713.605469,161735.3125,11,14713.605469,2.79,14713.605469,147021.703125


## Aggregate table `credit_card_balance` by column `SK_ID_PREV`

In [20]:
# Load the credit card balance table and pass it through `change_dtypes`.
credit_card_balance = change_dtypes(
    pd.read_csv(os.path.join(INP_DIR, "credit_card_balance.csv"))
)

# Aggregate by SK_ID_PREV. SK_ID_CURR is removed first so that it is not
# treated as a column to aggregate; feature columns carry the "cc_" prefix.
credit_card_balance_agg = aggregate(
    credit_card_balance.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="cc_",
)

for caption, shape in (
    ("credit_card_balance shape:", credit_card_balance.shape),
    ("credit_card_balance_agg shape:", credit_card_balance_agg.shape),
):
    print(caption, shape)

# The raw table is no longer needed once it has been aggregated.
gc.enable()
del credit_card_balance
gc.collect()

credit_card_balance_agg.head()

Memory usage before changing types 706.62 MB
Memory usage after changing types 341.79 MB
credit_card_balance shape: (3840312, 23)
credit_card_balance_agg shape: (104307, 115)


Unnamed: 0,SK_ID_PREV,cc_MONTHS_BALANCE_count,cc_MONTHS_BALANCE_median,cc_MONTHS_BALANCE_min,cc_MONTHS_BALANCE_max,cc_MONTHS_BALANCE_sum,cc_AMT_BALANCE_count,cc_AMT_BALANCE_median,cc_AMT_BALANCE_min,cc_AMT_BALANCE_max,cc_AMT_BALANCE_sum,cc_AMT_CREDIT_LIMIT_ACTUAL_count,cc_AMT_CREDIT_LIMIT_ACTUAL_median,cc_AMT_CREDIT_LIMIT_ACTUAL_min,cc_AMT_CREDIT_LIMIT_ACTUAL_max,cc_AMT_CREDIT_LIMIT_ACTUAL_sum,cc_AMT_DRAWINGS_ATM_CURRENT_count,cc_AMT_DRAWINGS_ATM_CURRENT_median,cc_AMT_DRAWINGS_ATM_CURRENT_min,cc_AMT_DRAWINGS_ATM_CURRENT_max,cc_AMT_DRAWINGS_ATM_CURRENT_sum,cc_AMT_DRAWINGS_CURRENT_count,cc_AMT_DRAWINGS_CURRENT_median,cc_AMT_DRAWINGS_CURRENT_min,cc_AMT_DRAWINGS_CURRENT_max,cc_AMT_DRAWINGS_CURRENT_sum,cc_AMT_DRAWINGS_OTHER_CURRENT_count,cc_AMT_DRAWINGS_OTHER_CURRENT_median,cc_AMT_DRAWINGS_OTHER_CURRENT_min,cc_AMT_DRAWINGS_OTHER_CURRENT_max,cc_AMT_DRAWINGS_OTHER_CURRENT_sum,cc_AMT_DRAWINGS_POS_CURRENT_count,cc_AMT_DRAWINGS_POS_CURRENT_median,cc_AMT_DRAWINGS_POS_CURRENT_min,cc_AMT_DRAWINGS_POS_CURRENT_max,cc_AMT_DRAWINGS_POS_CURRENT_sum,cc_AMT_INST_MIN_REGULARITY_count,cc_AMT_INST_MIN_REGULARITY_median,cc_AMT_INST_MIN_REGULARITY_min,cc_AMT_INST_MIN_REGULARITY_max,cc_AMT_INST_MIN_REGULARITY_sum,cc_AMT_PAYMENT_CURRENT_count,cc_AMT_PAYMENT_CURRENT_median,cc_AMT_PAYMENT_CURRENT_min,cc_AMT_PAYMENT_CURRENT_max,cc_AMT_PAYMENT_CURRENT_sum,cc_AMT_PAYMENT_TOTAL_CURRENT_count,cc_AMT_PAYMENT_TOTAL_CURRENT_median,cc_AMT_PAYMENT_TOTAL_CURRENT_min,cc_AMT_PAYMENT_TOTAL_CURRENT_max,cc_AMT_PAYMENT_TOTAL_CURRENT_sum,cc_AMT_RECEIVABLE_PRINCIPAL_count,cc_AMT_RECEIVABLE_PRINCIPAL_median,cc_AMT_RECEIVABLE_PRINCIPAL_min,cc_AMT_RECEIVABLE_PRINCIPAL_max,cc_AMT_RECEIVABLE_PRINCIPAL_sum,cc_AMT_RECIVABLE_count,cc_AMT_RECIVABLE_median,cc_AMT_RECIVABLE_min,cc_AMT_RECIVABLE_max,cc_AMT_RECIVABLE_sum,cc_AMT_TOTAL_RECEIVABLE_count,cc_AMT_TOTAL_RECEIVABLE_median,cc_AMT_TOTAL_RECEIVABLE_min,cc_AMT_TOTAL_RECEIVABLE_max,cc_AMT_TOTAL_RECEIVABLE_sum,cc_CNT_DRAWINGS_ATM_CURRENT_count,cc_CNT_DRAWINGS_ATM_CURRENT_m
edian,cc_CNT_DRAWINGS_ATM_CURRENT_min,cc_CNT_DRAWINGS_ATM_CURRENT_max,cc_CNT_DRAWINGS_ATM_CURRENT_sum,cc_CNT_DRAWINGS_CURRENT_count,cc_CNT_DRAWINGS_CURRENT_median,cc_CNT_DRAWINGS_CURRENT_min,cc_CNT_DRAWINGS_CURRENT_max,cc_CNT_DRAWINGS_CURRENT_sum,cc_CNT_DRAWINGS_OTHER_CURRENT_count,cc_CNT_DRAWINGS_OTHER_CURRENT_median,cc_CNT_DRAWINGS_OTHER_CURRENT_min,cc_CNT_DRAWINGS_OTHER_CURRENT_max,cc_CNT_DRAWINGS_OTHER_CURRENT_sum,cc_CNT_DRAWINGS_POS_CURRENT_count,cc_CNT_DRAWINGS_POS_CURRENT_median,cc_CNT_DRAWINGS_POS_CURRENT_min,cc_CNT_DRAWINGS_POS_CURRENT_max,cc_CNT_DRAWINGS_POS_CURRENT_sum,cc_CNT_INSTALMENT_MATURE_CUM_count,cc_CNT_INSTALMENT_MATURE_CUM_median,cc_CNT_INSTALMENT_MATURE_CUM_min,cc_CNT_INSTALMENT_MATURE_CUM_max,cc_CNT_INSTALMENT_MATURE_CUM_sum,cc_SK_DPD_count,cc_SK_DPD_median,cc_SK_DPD_min,cc_SK_DPD_max,cc_SK_DPD_sum,cc_SK_DPD_DEF_count,cc_SK_DPD_DEF_median,cc_SK_DPD_DEF_min,cc_SK_DPD_DEF_max,cc_SK_DPD_DEF_sum,cc_NAME_CONTRACT_STATUS_Active_count,cc_NAME_CONTRACT_STATUS_Active_percent,cc_NAME_CONTRACT_STATUS_Approved_count,cc_NAME_CONTRACT_STATUS_Approved_percent,cc_NAME_CONTRACT_STATUS_Completed_count,cc_NAME_CONTRACT_STATUS_Completed_percent,cc_NAME_CONTRACT_STATUS_Demand_count,cc_NAME_CONTRACT_STATUS_Demand_percent,cc_NAME_CONTRACT_STATUS_Refused_count,cc_NAME_CONTRACT_STATUS_Refused_percent,cc_NAME_CONTRACT_STATUS_Sent proposal_count,cc_NAME_CONTRACT_STATUS_Sent proposal_percent,cc_NAME_CONTRACT_STATUS_Signed_count,cc_NAME_CONTRACT_STATUS_Signed_percent
0,1000018,5,-4.0,-6,-2,-20,5,44360.503906,38879.144531,136695.421875,374731.4375,5,45000,45000,135000,405000,5,0.0,0.0,13500.0,27000.0,5,22827.330078,2032.560059,69156.945312,147394.984375,5,0.0,0.0,0.0,0.0,5,22827.330078,2032.560059,55656.945312,120394.976562,5,2250.0,0.0,6206.669922,12970.439453,5,3900.870117,3190.63501,9000.0,27708.75,5,3900.870117,3190.63501,9000.0,27708.75,5,43376.761719,37542.644531,132903.0,361491.0,5,44360.503906,37542.644531,136024.921875,368012.9375,5,44360.503906,37542.644531,136024.921875,368012.9375,5,0.0,0.0,3.0,6.0,5,11.0,2,15,44,5,0.0,0.0,0.0,0.0,5,11.0,2.0,12.0,38.0,5,2.0,0.0,4.0,10.0,5,0.0,0,0,0,5,0.0,0,0,0,5,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,1000030,8,-4.5,-8,-1,-36,8,48036.667969,0.0,103027.273438,447928.5,8,78750,45000,135000,652500,7,0.0,0.0,4500.0,4500.0,8,13381.650391,0.0,46660.5,138059.5,7,0.0,0.0,0.0,0.0,7,20212.650391,1849.050049,46660.5,133559.5,8,2250.0,0.0,5348.52002,16625.789062,7,5022.765137,2371.814941,16067.25,43320.417969,8,444.959991,0.0,16067.25,21263.580078,8,47639.023438,0.0,101866.726562,443795.625,8,48036.667969,0.0,103027.273438,447483.0,8,48036.667969,0.0,103027.273438,447483.0,7,0.0,0.0,1.0,1.0,8,3.5,0,14,41,7,0.0,0.0,0.0,0.0,7,4.0,2.0,13.0,40.0,8,1.5,0.0,5.0,15.0,8,0.0,0,0,0,8,0.0,0,0,0,8,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1000031,16,-8.5,-16,-1,-136,16,2902.747559,0.0,154945.9375,838311.0,16,144000,45000,225000,2394000,13,0.0,0.0,90000.0,157500.0,16,3382.672363,0.0,155340.0,463353.84375,13,0.0,0.0,0.0,0.0,13,2862.0,0.0,155340.0,305853.84375,16,225.0,0.0,7780.814941,42804.808594,12,7200.0,394.065002,160606.796875,354519.09375,16,4105.665039,0.0,160606.796875,354519.09375,16,2902.747559,0.0,154945.9375,822446.0625,16,2902.747559,0.0,154945.9375,833599.5,16,2902.747559,0.0,154945.9375,833599.5,13,0.0,0.0,2.0,4.0,16,2.0,0,4,21,13,0.0,0.0,0.0,0.0,13,1.0,0.0,4.0,17.0,16,2.5,0.0,10.0,59.0,16,0.0,0,0,0,16,0.0,0,0,0,16,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,1000035,5,-4.0,-6,-2,-20,5,0.0,0.0,0.0,0.0,5,225000,225000,225000,1125000,0,,,,0.0,5,0.0,0.0,0.0,0.0,0,,,,0.0,0,,,,0.0,5,0.0,0.0,0.0,0.0,0,,,,0.0,5,0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,0,,,,0.0,5,0.0,0,0,0,0,,,,0.0,0,,,,0.0,5,0.0,0.0,0.0,0.0,5,0.0,0,0,0,5,0.0,0,0,0,5,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,1000077,11,-7.0,-12,-2,-77,11,0.0,0.0,0.0,0.0,11,135000,45000,135000,1035000,0,,,,0.0,11,0.0,0.0,0.0,0.0,0,,,,0.0,0,,,,0.0,11,0.0,0.0,0.0,0.0,0,,,,0.0,11,0.0,0.0,0.0,0.0,11,0.0,0.0,0.0,0.0,11,0.0,0.0,0.0,0.0,11,0.0,0.0,0.0,0.0,0,,,,0.0,11,0.0,0,0,0,0,,,,0.0,0,,,,0.0,11,0.0,0.0,0.0,0.0,11,0.0,0,0,0,11,0.0,0,0,0,11,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


## Merge tables `POS_CASH_balance_agg`, `installments_payments_agg`, `credit_card_balance_agg` with `previous_application`

In [21]:
# Load the raw previous-application table and pass it straight through
# change_dtypes (which reports the memory usage before/after the conversion).
previous_application = change_dtypes(
    pd.read_csv(os.path.join(INP_DIR, "previous_application.csv"))
)
print("previous_application shape:", previous_application.shape)

# Each aggregate table is attached with a left join on SK_ID_PREV, so every
# previous application is kept even when an aggregate has no matching row.

# 1) POS_CASH_balance_agg
previous_application_merge = pd.merge(
    previous_application,
    POS_CASH_balance_agg,
    how="left",
    on="SK_ID_PREV",
)
print("previous_application shape after merging with POS_CASH_balance_agg:", previous_application_merge.shape)

# 2) installments_payments_agg
previous_application_merge = pd.merge(
    previous_application_merge,
    installments_payments_agg,
    how="left",
    on="SK_ID_PREV",
)
print("previous_application shape after merging with installments_payments_agg:", previous_application_merge.shape)

# 3) credit_card_balance_agg
previous_application_merge = pd.merge(
    previous_application_merge,
    credit_card_balance_agg,
    how="left",
    on="SK_ID_PREV",
)
print("previous_application shape after merging with credit_card_balance_agg:", previous_application_merge.shape)

# The inputs are no longer needed once merged; drop the references and
# collect so the memory can be reclaimed before the next aggregation step.
del previous_application
del POS_CASH_balance_agg
del installments_payments_agg
del credit_card_balance_agg
gc.enable()
gc.collect()

# Preview the merged table as the cell output.
previous_application_merge.head()

Memory usage before changing types 494.38 MB
Memory usage after changing types 162.02 MB
previous_application shape: (1670214, 37)
previous_application shape after merging with POS_CASH_balance_agg: (1670214, 80)
previous_application shape after merging with installments_payments_agg: (1670214, 110)
previous_application shape after merging with credit_card_balance_agg: (1670214, 224)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,pc_MONTHS_BALANCE_count,pc_MONTHS_BALANCE_median,pc_MONTHS_BALANCE_min,pc_MONTHS_BALANCE_max,pc_MONTHS_BALANCE_sum,pc_CNT_INSTALMENT_count,pc_CNT_INSTALMENT_median,pc_CNT_INSTALMENT_min,pc_CNT_INSTALMENT_max,pc_CNT_INSTALMENT_sum,pc_CNT_INSTALMENT_FUTURE_count,pc_CNT_INSTALMENT_FUTURE_median,pc_CNT_INSTALMENT_FUTURE_min,pc_CNT_INSTALMENT_FUTURE_max,pc_CNT_INSTALMENT_FUTURE_sum,pc_SK_DPD_count,pc_SK_DPD_median,pc_SK_DPD_min,pc_SK_DPD_max,pc_SK_DPD_sum,pc_SK_DPD_DEF_count,...,cc_AMT_RECIVABLE_median,cc_AMT_RECIVABLE_min,cc_AMT_RECIVABLE_max,cc_AMT_RECIVABLE_sum,cc_AMT_TOTAL_RECEIVABLE_count,cc_AMT_TOTAL_RECEIVABLE_median,cc_AMT_TOTAL_RECEIVABLE_min,cc_AMT_TOTAL_RECEIVABLE_max,cc_AMT_TOTAL_RECEIVABLE_sum,cc_CNT_DRAWINGS_ATM_CURRENT_count,cc_CNT_DRAWINGS_ATM_CURRENT_median,cc_CNT_DRAWINGS_ATM_CURRENT_min,cc_CNT_DRAWINGS_ATM_CURRENT_max,cc_CNT_DRAWINGS_ATM_CURRENT_sum,cc_CNT_DRAWINGS_CURRENT_count,cc_CNT_DRAWINGS_CURRENT_median,cc_CNT_DRAWINGS_CURRENT_min,cc_CNT_DRAWINGS_CURRENT_max,cc_CNT_DRAWINGS_CURRENT_sum,cc_CNT_DRAWINGS_OTHER_CURRENT_count,cc_CNT_DRAWINGS_OTHER_CURRENT_median,cc_CNT_DRAWINGS_OTHER_CURRENT_min,cc_CNT_DRAWINGS_OTHER_CURRENT_max,cc_CNT_DRAWINGS_OTHER_CURRENT_sum,cc_CNT_DRAWINGS_POS_CURRENT_count,cc_CNT_DRAWINGS_POS_CURRENT_median,cc_CNT_DRAWINGS_POS_CURRENT_min,cc_CNT_DRAWING
S_POS_CURRENT_max,cc_CNT_DRAWINGS_POS_CURRENT_sum,cc_CNT_INSTALMENT_MATURE_CUM_count,cc_CNT_INSTALMENT_MATURE_CUM_median,cc_CNT_INSTALMENT_MATURE_CUM_min,cc_CNT_INSTALMENT_MATURE_CUM_max,cc_CNT_INSTALMENT_MATURE_CUM_sum,cc_SK_DPD_count,cc_SK_DPD_median,cc_SK_DPD_min,cc_SK_DPD_max,cc_SK_DPD_sum,cc_SK_DPD_DEF_count,cc_SK_DPD_DEF_median,cc_SK_DPD_DEF_min,cc_SK_DPD_DEF_max,cc_SK_DPD_DEF_sum,cc_NAME_CONTRACT_STATUS_Active_count,cc_NAME_CONTRACT_STATUS_Active_percent,cc_NAME_CONTRACT_STATUS_Approved_count,cc_NAME_CONTRACT_STATUS_Approved_percent,cc_NAME_CONTRACT_STATUS_Completed_count,cc_NAME_CONTRACT_STATUS_Completed_percent,cc_NAME_CONTRACT_STATUS_Demand_count,cc_NAME_CONTRACT_STATUS_Demand_percent,cc_NAME_CONTRACT_STATUS_Refused_count,cc_NAME_CONTRACT_STATUS_Refused_percent,cc_NAME_CONTRACT_STATUS_Sent proposal_count,cc_NAME_CONTRACT_STATUS_Sent proposal_percent,cc_NAME_CONTRACT_STATUS_Signed_count,cc_NAME_CONTRACT_STATUS_Signed_percent
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,True,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0,2.0,-1.5,-2.0,-1.0,-3.0,2.0,6.5,1.0,12.0,13.0,2.0,6.0,0.0,12.0,12.0,2.0,0.0,0.0,0.0,0.0,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2802425,108129,Cash loans,25188.615234,607500.0,679671.0,,607500.0,THURSDAY,11,Y,True,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0,5.0,-4.0,-6.0,-2.0,-20.0,5.0,36.0,36.0,36.0,180.0,5.0,34.0,32.0,36.0,170.0,5.0,0.0,0.0,0.0,0.0,5.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2523466,122040,Cash loans,15060.735352,112500.0,136444.5,,112500.0,TUESDAY,11,Y,True,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0,10.0,-5.5,-10.0,-1.0,-55.0,10.0,12.0,12.0,12.0,120.0,10.0,7.5,3.0,12.0,75.0,10.0,0.0,0.0,0.0,0.0,10.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2819243,176158,Cash loans,47041.335938,450000.0,470790.0,,450000.0,MONDAY,7,Y,True,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0,12.0,-11.5,-17.0,-6.0,-138.0,12.0,12.0,11.0,12.0,143.0,12.0,6.5,0.0,12.0,77.0,12.0,0.0,0.0,0.0,0.0,12.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1784265,202054,Cash loans,31924.394531,337500.0,404055.0,,337500.0,THURSDAY,9,Y,True,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Aggregate table `previous_application_merge` by column `SK_ID_CURR`

In [22]:
# Aggregate the merged table per client (SK_ID_CURR). SK_ID_PREV is only a
# row identifier of the previous application, so it is dropped before the
# statistics are computed. Resulting columns carry the "pa_" prefix.
# NOTE(review): the exact output layout is defined by `aggregate` in
# _preprocessing — confirm there if the column naming matters downstream.
previous_application_agg = aggregate(
    previous_application_merge.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="pa_",
)

# Report the table size before and after aggregation.
print("previous_application_merg shape:", previous_application_merge.shape)
print("previous_application_agg shape:", previous_application_agg.shape)

# The merged table is not used after this point; release it.
del previous_application_merge
gc.enable()
gc.collect()

# Preview the aggregated table as the cell output.
previous_application_agg.head()

previous_application_merg shape: (1670214, 224)
previous_application_agg shape: (338857, 1312)


Unnamed: 0,SK_ID_CURR,pa_AMT_ANNUITY_count,pa_AMT_ANNUITY_median,pa_AMT_ANNUITY_min,pa_AMT_ANNUITY_max,pa_AMT_ANNUITY_sum,pa_AMT_APPLICATION_count,pa_AMT_APPLICATION_median,pa_AMT_APPLICATION_min,pa_AMT_APPLICATION_max,pa_AMT_APPLICATION_sum,pa_AMT_CREDIT_count,pa_AMT_CREDIT_median,pa_AMT_CREDIT_min,pa_AMT_CREDIT_max,pa_AMT_CREDIT_sum,pa_AMT_DOWN_PAYMENT_count,pa_AMT_DOWN_PAYMENT_median,pa_AMT_DOWN_PAYMENT_min,pa_AMT_DOWN_PAYMENT_max,pa_AMT_DOWN_PAYMENT_sum,pa_AMT_GOODS_PRICE_count,pa_AMT_GOODS_PRICE_median,pa_AMT_GOODS_PRICE_min,pa_AMT_GOODS_PRICE_max,pa_AMT_GOODS_PRICE_sum,pa_HOUR_APPR_PROCESS_START_count,pa_HOUR_APPR_PROCESS_START_median,pa_HOUR_APPR_PROCESS_START_min,pa_HOUR_APPR_PROCESS_START_max,pa_HOUR_APPR_PROCESS_START_sum,pa_RATE_DOWN_PAYMENT_count,pa_RATE_DOWN_PAYMENT_median,pa_RATE_DOWN_PAYMENT_min,pa_RATE_DOWN_PAYMENT_max,pa_RATE_DOWN_PAYMENT_sum,pa_RATE_INTEREST_PRIMARY_count,pa_RATE_INTEREST_PRIMARY_median,pa_RATE_INTEREST_PRIMARY_min,pa_RATE_INTEREST_PRIMARY_max,pa_RATE_INTEREST_PRIMARY_sum,pa_RATE_INTEREST_PRIVILEGED_count,pa_RATE_INTEREST_PRIVILEGED_median,pa_RATE_INTEREST_PRIVILEGED_min,pa_RATE_INTEREST_PRIVILEGED_max,pa_RATE_INTEREST_PRIVILEGED_sum,pa_DAYS_DECISION_count,pa_DAYS_DECISION_median,pa_DAYS_DECISION_min,pa_DAYS_DECISION_max,pa_DAYS_DECISION_sum,pa_SELLERPLACE_AREA_count,pa_SELLERPLACE_AREA_median,pa_SELLERPLACE_AREA_min,pa_SELLERPLACE_AREA_max,pa_SELLERPLACE_AREA_sum,pa_CNT_PAYMENT_count,pa_CNT_PAYMENT_median,...,pa_NAME_SELLER_INDUSTRY_Consumer electronics_count,pa_NAME_SELLER_INDUSTRY_Consumer electronics_percent,pa_NAME_SELLER_INDUSTRY_Furniture_count,pa_NAME_SELLER_INDUSTRY_Furniture_percent,pa_NAME_SELLER_INDUSTRY_Industry_count,pa_NAME_SELLER_INDUSTRY_Industry_percent,pa_NAME_SELLER_INDUSTRY_Jewelry_count,pa_NAME_SELLER_INDUSTRY_Jewelry_percent,pa_NAME_SELLER_INDUSTRY_MLM partners_count,pa_NAME_SELLER_INDUSTRY_MLM 
partners_percent,pa_NAME_SELLER_INDUSTRY_Tourism_count,pa_NAME_SELLER_INDUSTRY_Tourism_percent,pa_NAME_SELLER_INDUSTRY_XNA_count,pa_NAME_SELLER_INDUSTRY_XNA_percent,pa_NAME_YIELD_GROUP_XNA_count,pa_NAME_YIELD_GROUP_XNA_percent,pa_NAME_YIELD_GROUP_high_count,pa_NAME_YIELD_GROUP_high_percent,pa_NAME_YIELD_GROUP_low_action_count,pa_NAME_YIELD_GROUP_low_action_percent,pa_NAME_YIELD_GROUP_low_normal_count,pa_NAME_YIELD_GROUP_low_normal_percent,pa_NAME_YIELD_GROUP_middle_count,pa_NAME_YIELD_GROUP_middle_percent,pa_PRODUCT_COMBINATION_Card Street_count,pa_PRODUCT_COMBINATION_Card Street_percent,pa_PRODUCT_COMBINATION_Card X-Sell_count,pa_PRODUCT_COMBINATION_Card X-Sell_percent,pa_PRODUCT_COMBINATION_Cash_count,pa_PRODUCT_COMBINATION_Cash_percent,pa_PRODUCT_COMBINATION_Cash Street: high_count,pa_PRODUCT_COMBINATION_Cash Street: high_percent,pa_PRODUCT_COMBINATION_Cash Street: low_count,pa_PRODUCT_COMBINATION_Cash Street: low_percent,pa_PRODUCT_COMBINATION_Cash Street: middle_count,pa_PRODUCT_COMBINATION_Cash Street: middle_percent,pa_PRODUCT_COMBINATION_Cash X-Sell: high_count,pa_PRODUCT_COMBINATION_Cash X-Sell: high_percent,pa_PRODUCT_COMBINATION_Cash X-Sell: low_count,pa_PRODUCT_COMBINATION_Cash X-Sell: low_percent,pa_PRODUCT_COMBINATION_Cash X-Sell: middle_count,pa_PRODUCT_COMBINATION_Cash X-Sell: middle_percent,pa_PRODUCT_COMBINATION_POS household with interest_count,pa_PRODUCT_COMBINATION_POS household with interest_percent,pa_PRODUCT_COMBINATION_POS household without interest_count,pa_PRODUCT_COMBINATION_POS household without interest_percent,pa_PRODUCT_COMBINATION_POS industry with interest_count,pa_PRODUCT_COMBINATION_POS industry with interest_percent,pa_PRODUCT_COMBINATION_POS industry without interest_count,pa_PRODUCT_COMBINATION_POS industry without interest_percent,pa_PRODUCT_COMBINATION_POS mobile with interest_count,pa_PRODUCT_COMBINATION_POS mobile with interest_percent,pa_PRODUCT_COMBINATION_POS mobile without interest_count,pa_PRODUCT_COMBINATION_POS 
mobile without interest_percent,pa_PRODUCT_COMBINATION_POS other with interest_count,pa_PRODUCT_COMBINATION_POS other with interest_percent,pa_PRODUCT_COMBINATION_POS others without interest_count,pa_PRODUCT_COMBINATION_POS others without interest_percent
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,24835.5,1,23787.0,23787.0,23787.0,23787.0,1,2520.0,2520.0,2520.0,2520.0,1,24835.5,24835.5,24835.5,24835.5,1,13.0,13,13,13,1,0.104326,0.104326,0.104326,0.104326,0,,,,0.0,0,,,,0.0,1,-1740.0,-1740,-1740,-1740,1,23.0,23,23,23,1,8.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0
1,100002,1,9251.775391,9251.775391,9251.775391,9251.775391,1,179055.0,179055.0,179055.0,179055.0,1,179055.0,179055.0,179055.0,179055.0,1,0.0,0.0,0.0,0.0,1,179055.0,179055.0,179055.0,179055.0,1,9.0,9,9,9,1,0.0,0.0,0.0,0.0,0,,,,0.0,0,,,,0.0,1,-606.0,-606,-606,-606,1,500.0,500,500,500,1,24.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0
2,100003,3,64567.664062,6737.310059,98356.992188,169661.96875,3,337500.0,68809.5,900000.0,1306309.5,3,348637.5,68053.5,1035882.0,1452573.0,2,3442.5,0.0,6885.0,6885.0,3,337500.0,68809.5,900000.0,1306309.5,3,15.0,12,17,44,2,0.05003,0.0,0.100061,0.100061,0,,,,0.0,0,,,,0.0,3,-828.0,-2341,-746,-3915,3,200.0,-1,1400,1599,3,12.0,...,1,0.333333,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,1,0.333333,2,0.666667,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,1,0.333333,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,24282.0,1,20106.0,20106.0,20106.0,20106.0,1,4860.0,4860.0,4860.0,4860.0,1,24282.0,24282.0,24282.0,24282.0,1,5.0,5,5,5,1,0.212008,0.212008,0.212008,0.212008,0,,,,0.0,0,,,,0.0,1,-815.0,-815,-815,-815,1,30.0,30,30,30,1,4.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0
4,100005,1,4813.200195,4813.200195,4813.200195,4813.200195,2,22308.75,0.0,44617.5,44617.5,2,20076.75,0.0,40153.5,40153.5,1,4464.0,4464.0,4464.0,4464.0,1,44617.5,44617.5,44617.5,44617.5,2,10.5,10,11,21,1,0.108964,0.108964,0.108964,0.108964,0,,,,0.0,0,,,,0.0,2,-536.0,-757,-315,-1072,2,18.0,-1,37,36,1,12.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,1,0.5,1,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,0,0.0


## Merge table `previous_application_agg` with `application_bureau_train` and `application_bureau_test`

In [23]:
# Train: attach the per-client previous-application aggregates. how="left"
# keeps every application row even when the client has no previous
# applications (the aggregate columns are then missing for that row).
application_bureau_previous_train = pd.merge(
    application_bureau_train,
    previous_application_agg,
    how="left",
    on="SK_ID_CURR",
)
print("application_bureau_train shape:", application_bureau_train.shape)
print("application_bureau_previous_train shape:", application_bureau_previous_train.shape)
# The un-merged train table is no longer needed.
del application_bureau_train

# Test: same left join against the same aggregate table.
application_bureau_previous_test = pd.merge(
    application_bureau_test,
    previous_application_agg,
    how="left",
    on="SK_ID_CURR",
)
print("application_bureau_test shape:", application_bureau_test.shape)
print("application_bureau_previous_test shape:", application_bureau_previous_test.shape)
# The un-merged test table is no longer needed either.
del application_bureau_test

# Reclaim the memory of the deleted tables; the collect() result is the
# value displayed for this cell.
gc.enable()
gc.collect()

application_bureau_train shape: (307511, 333)
application_bureau_previous_train shape: (307511, 1644)
application_bureau_test shape: (48744, 332)
application_bureau_previous_test shape: (48744, 1643)


0

## Write to csv files

In [24]:
# Train features: remove the client id and the label, then write the
# remaining columns to OUT_DIR without the index.
X_train_2 = application_bureau_previous_train.drop(columns=["SK_ID_CURR", "TARGET"])
X_train_2.to_csv(os.path.join(OUT_DIR, "X_train_2.csv"), index=False)

# Release both train frames before the test frame is copied.
del X_train_2, application_bureau_previous_train
gc.enable()
gc.collect()

# Test features: only the client id has to be removed here.
X_test_2 = application_bureau_previous_test.drop(columns=["SK_ID_CURR"])
X_test_2.to_csv(os.path.join(OUT_DIR, "X_test_2.csv"), index=False)

# Release the test frames; the collect() result is the value displayed for
# this cell.
del application_bureau_previous_test, X_test_2
gc.enable()
gc.collect()

0

## Summary
We have created three datasets in `data/data_`.

1. `X_train_0.csv` and `X_test_0.csv` are just the original Kaggle tables `application_train.csv` and `application_test.csv` with the `SK_ID_CURR` column (and, for the train table, the `TARGET` column) removed.

2. `X_train_1.csv` and `X_test_1.csv` are the data from item 1 merged with aggregated features from `bureau.csv` and `bureau_balance.csv`.

3. `X_train_2.csv` and `X_test_2.csv` are the data from item 2 merged with aggregated features from `previous_application.csv`, `POS_CASH_balance.csv`, `installments_payments.csv` and `credit_card_balance.csv`.