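In this notebook we aggregate the auxiliary tables (`bureau_balance`, `bureau`, `POS_CASH_balance`, `installments_payments`, `credit_card_balance`, `previous_application`) and merge them into `application_train` and `application_test`, writing the resulting feature tables to csv files after each stage.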

In [1]:
import os
import pandas as pd
import numpy as np

from preprocessing import aggregate

# Directory the input csv files are read from.
INP_DIR = "data/download"
# Directory the feature csv files produced by this notebook are written to.
OUT_DIR = "data/data_"

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output directory exists before any later cell writes into it.
os.makedirs(OUT_DIR, exist_ok=True)

# Statistics handed to `aggregate` as `num_stats`.
NUM_STATS = ("count", "median", "min", "max", "sum")
# Pairs handed to `aggregate` as `cat_stats`; presumably
# (column suffix, aggregation function) - confirm against `preprocessing.aggregate`.
CAT_STATS = (("count", "sum"), ("percent", "mean"))

## Load `application_train.csv` and `application_test.csv`

In [2]:
# Read both application tables from the input directory, train first.
application_train, application_test = (
    pd.read_csv(os.path.join(INP_DIR, csv_name))
    for csv_name in ("application_train.csv", "application_test.csv")
)

# Report the (rows, columns) shape of each table.
print("application_train shape", application_train.shape)
print("application_test shape", application_test.shape)

application_train shape (307511, 122)
application_test shape (48744, 121)


## Write to csv files

In [3]:
# Each table is derived and written in a single expression, so no temporary
# frames are left behind that would need deleting afterwards.

# Training features: every column except the row id and the label.
application_train.drop(columns=["SK_ID_CURR", "TARGET"]).to_csv(
    os.path.join(OUT_DIR, "X_train_0.csv"), index=False
)
# Training label, kept as a single-column frame.
application_train[["TARGET"]].to_csv(
    os.path.join(OUT_DIR, "y_train.csv"), index=False
)

# Test features: every column except the row id.
application_test.drop(columns=["SK_ID_CURR"]).to_csv(
    os.path.join(OUT_DIR, "X_test_0.csv"), index=False
)
# Test row ids, kept as a single-column frame.
application_test[["SK_ID_CURR"]].to_csv(
    os.path.join(OUT_DIR, "id_test.csv"), index=False
)

## Aggregate table `bureau_balance` by column `SK_ID_BUREAU`

In [4]:
# Read `bureau_balance.csv` and aggregate it by `SK_ID_BUREAU` in one step;
# the raw table is never bound to a name, so nothing has to be deleted.
bureau_balance_agg = aggregate(
    pd.read_csv(os.path.join(INP_DIR, "bureau_balance.csv")),
    by=["SK_ID_BUREAU"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="bb_",
)

# Preview the aggregated table.
bureau_balance_agg.head()

## Merge table `bureau_balance_agg` with table `bureau` on column `SK_ID_BUREAU`

In [5]:
bureau = pd.read_csv(os.path.join(INP_DIR, "bureau.csv"))

# Left-join the aggregated balances onto `bureau`, then discard the join key,
# which is not needed once the tables are combined.
bureau_merge = pd.merge(
    bureau, bureau_balance_agg, how="left", on="SK_ID_BUREAU"
).drop(columns=["SK_ID_BUREAU"])

print("bureau shape:", bureau.shape)
print("bureau_merge shape:", bureau_merge.shape)

# The un-merged table is no longer needed.
del bureau

bureau_merge.head()

bureau shape: (1716428, 17)
bureau_merge shape: (1716428, 37)


## Aggregate table `bureau_merge` by column `SK_ID_CURR`

In [6]:
# Aggregate the merged bureau table by `SK_ID_CURR`, the key it shares with
# the application tables.
bureau_agg = aggregate(
    bureau_merge,
    by=["SK_ID_CURR"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="bu_",
)

# Shapes before and after aggregation, next to the training table for comparison.
print("bureau_merge shape:", bureau_merge.shape)
print("bureau_agg shape:", bureau_agg.shape)
print("application_train shape:", application_train.shape)

# Only the aggregate is used from here on.
del bureau_merge

bureau_agg.head()

bureau_merge shape: (1716428, 37)
bureau_agg shape: (305811, 212)
application_train shape: (307511, 122)


## Merge table `bureau_agg` with `application_train` and `application_test`

In [7]:
# Left-join the bureau aggregates onto both application tables (train first),
# so every application row is kept whether or not it has a bureau match.
application_bureau_train, application_bureau_test = (
    pd.merge(applications, bureau_agg, how="left", on="SK_ID_CURR")
    for applications in (application_train, application_test)
)

print("application_train shape:", application_train.shape)
print("application_bureau_train shape:", application_bureau_train.shape)

print("application_test shape:", application_test.shape)
print("application_bureau_test shape:", application_bureau_test.shape)

# The aggregate has been merged into both tables and is no longer needed.
del bureau_agg

application_train shape: (307511, 122)
application_bureau_train shape: (307511, 333)
application_test shape: (48744, 121)
application_bureau_test shape: (48744, 332)


In [8]:
# Preview the first five rows of the merged training table.
application_bureau_train.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,bu_CREDIT_TYPE_Microloan_count,bu_CREDIT_TYPE_Microloan_percent,bu_CREDIT_TYPE_Mobile operator loan_count,bu_CREDIT_TYPE_Mobile operator loan_percent,bu_CREDIT_TYPE_Mortgage_count,bu_CREDIT_TYPE_Mortgage_percent,bu_CREDIT_TYPE_Real estate loan_count,bu_CREDIT_TYPE_Real estate loan_percent,bu_CREDIT_TYPE_Unknown type of loan_count,bu_CREDIT_TYPE_Unknown type of loan_percent
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,,,,,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Preview the first five rows of the merged test table.
application_bureau_test.head(n=5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,bu_CREDIT_TYPE_Microloan_count,bu_CREDIT_TYPE_Microloan_percent,bu_CREDIT_TYPE_Mobile operator loan_count,bu_CREDIT_TYPE_Mobile operator loan_percent,bu_CREDIT_TYPE_Mortgage_count,bu_CREDIT_TYPE_Mortgage_percent,bu_CREDIT_TYPE_Real estate loan_count,bu_CREDIT_TYPE_Real estate loan_percent,bu_CREDIT_TYPE_Unknown type of loan_count,bu_CREDIT_TYPE_Unknown type of loan_percent
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,,,,,,,,,,


## Write to csv files

In [10]:
# Derive each feature table and write it in a single expression, so no
# temporary frames are left behind that would need deleting afterwards.

# Training features: every column except the row id and the label.
application_bureau_train.drop(columns=["SK_ID_CURR", "TARGET"]).to_csv(
    os.path.join(OUT_DIR, "X_train_1.csv"), index=False
)

# Test features: every column except the row id.
application_bureau_test.drop(columns=["SK_ID_CURR"]).to_csv(
    os.path.join(OUT_DIR, "X_test_1.csv"), index=False
)

## Aggregate table `POS_CASH_balance` by column `SK_ID_PREV`

In [11]:
POS_CASH_balance = pd.read_csv(os.path.join(INP_DIR, "POS_CASH_balance.csv"))

# `SK_ID_CURR` is removed before aggregating so that only `SK_ID_PREV`
# identifies the groups and the id is not aggregated as a feature.
POS_CASH_balance_agg = aggregate(
    POS_CASH_balance.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="pc_",
)

print("POS_CASH_balance shape:", POS_CASH_balance.shape)
print("POS_CASH_balance_agg shape:", POS_CASH_balance_agg.shape)

# Only the aggregate is used from here on.
del POS_CASH_balance

POS_CASH_balance_agg.head()

POS_CASH_balance shape: (10001358, 8)
POS_CASH_balance_agg shape: (936325, 44)


## Aggregate table `installments_payments` by column `SK_ID_PREV`

In [12]:
installments_payments = pd.read_csv(
    os.path.join(INP_DIR, "installments_payments.csv")
)

# `SK_ID_CURR` is removed before aggregating so that only `SK_ID_PREV`
# identifies the groups and the id is not aggregated as a feature.
installments_payments_agg = aggregate(
    installments_payments.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="ip_",
)

print("installments_payments shape:", installments_payments.shape)
print("installments_payments_agg shape:", installments_payments_agg.shape)

# Only the aggregate is used from here on.
del installments_payments

installments_payments_agg.head()

No categorical columns in df
installments_payments shape: (13605401, 8)
installments_payments_agg shape: (997752, 31)


## Aggregate table `credit_card_balance` by column `SK_ID_PREV`

In [13]:
credit_card_balance = pd.read_csv(
    os.path.join(INP_DIR, "credit_card_balance.csv")
)

# `SK_ID_CURR` is removed before aggregating so that only `SK_ID_PREV`
# identifies the groups and the id is not aggregated as a feature.
credit_card_balance_agg = aggregate(
    credit_card_balance.drop(columns=["SK_ID_CURR"]),
    by=["SK_ID_PREV"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="cc_",
)

print("credit_card_balance shape:", credit_card_balance.shape)
print("credit_card_balance_agg shape:", credit_card_balance_agg.shape)

# Only the aggregate is used from here on.
del credit_card_balance

credit_card_balance_agg.head()

credit_card_balance shape: (3840312, 23)
credit_card_balance_agg shape: (104307, 115)


## Merge tables `POS_CASH_balance_agg`, `installments_payments_agg`, `credit_card_balance_agg` with `previous_application`

In [14]:
previous_application = pd.read_csv(
    os.path.join(INP_DIR, "previous_application.csv")
)
print("previous_application shape:", previous_application.shape)

# Each aggregate is left-joined on `SK_ID_PREV` in turn; the shape is printed
# after every join to show how many columns each one contributes.

# merge with POS_CASH_balance_agg
previous_application_merge = pd.merge(
    previous_application, POS_CASH_balance_agg, how="left", on="SK_ID_PREV"
)
print("previous_application shape after merging with POS_CASH_balance_agg:", previous_application_merge.shape)

# merge with installments_payments_agg
previous_application_merge = pd.merge(
    previous_application_merge, installments_payments_agg, how="left", on="SK_ID_PREV"
)
print("previous_application shape after merging with installments_payments_agg:", previous_application_merge.shape)

# merge with credit_card_balance_agg
previous_application_merge = pd.merge(
    previous_application_merge, credit_card_balance_agg, how="left", on="SK_ID_PREV"
)
print("previous_application shape after merging with credit_card_balance_agg:", previous_application_merge.shape)

# Only the merged table is used from here on.
del previous_application, POS_CASH_balance_agg, installments_payments_agg, credit_card_balance_agg

previous_application_merge.head()

previous_application shape: (1670214, 37)
previous_application shape after merging with POS_CASH_balance_agg: (1670214, 80)
previous_application shape after merging with installments_payments_agg: (1670214, 110)
previous_application shape after merging with credit_card_balance_agg: (1670214, 224)


## Aggregate table `previous_application_merge` by column `SK_ID_CURR`

In [15]:
# `SK_ID_PREV` is removed before aggregating so that only `SK_ID_CURR`
# identifies the groups and the id is not aggregated as a feature.
previous_application_agg = aggregate(
    previous_application_merge.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    num_stats=NUM_STATS,
    cat_stats=CAT_STATS,
    prefix="pa_",
)

# The first label previously read "previous_application_merg"; it now matches
# the variable name, like every other shape report in the notebook.
print("previous_application_merge shape:", previous_application_merge.shape)
print("previous_application_agg shape:", previous_application_agg.shape)

# Only the aggregate is used from here on.
del previous_application_merge

previous_application_agg.head()

previous_application_merg shape: (1670214, 224)
previous_application_agg shape: (338857, 1317)


## Merge table `previous_application_agg` with `application_bureau_train` and `application_bureau_test`

In [16]:
# Left-join the previous-application aggregates onto the training table, so
# every application row is kept whether or not it has a match.
application_bureau_previous_train = pd.merge(
    application_bureau_train, previous_application_agg, how="left", on="SK_ID_CURR"
)

print("application_bureau_train shape:", application_bureau_train.shape)
print("application_bureau_previous_train shape:", application_bureau_previous_train.shape)
# Drop the pre-merge training table before the test merge is built.
del application_bureau_train

# Same left join for the test table.
application_bureau_previous_test = pd.merge(
    application_bureau_test, previous_application_agg, how="left", on="SK_ID_CURR"
)

print("application_bureau_test shape:", application_bureau_test.shape)
print("application_bureau_previous_test shape:", application_bureau_previous_test.shape)
del application_bureau_test

application_bureau_train shape: (307511, 333)
application_bureau_previous_train shape: (307511, 1649)
application_bureau_test shape: (48744, 332)
application_bureau_previous_test shape: (48744, 1648)


## Write to csv files

In [17]:
# Derive each feature table and write it in a single expression, so no
# temporary frames are left behind that would need deleting afterwards.

# Training features: every column except the row id and the label.
application_bureau_previous_train.drop(columns=["SK_ID_CURR", "TARGET"]).to_csv(
    os.path.join(OUT_DIR, "X_train_2.csv"), index=False
)

# Test features: every column except the row id.
application_bureau_previous_test.drop(columns=["SK_ID_CURR"]).to_csv(
    os.path.join(OUT_DIR, "X_test_2.csv"), index=False
)
