In this notebook we will aggregate and merge tables

In [19]:
import os
import pandas as pd
import numpy as np

from preprocessing import aggregate

INP_DIR = "data/download"
OUT_DIR = "data/data_"

NUM_STATS = ("count", "mean", "median", "min", "max")
CAT_STATS = (("count", "sum"), ("percent", "mean"))

## Load `application_train.csv` and `application_test.csv`

In [2]:
application_train = pd.read_csv(os.path.join(INP_DIR, "application_train.csv"))
application_test = pd.read_csv(os.path.join(INP_DIR, "application_test.csv"))

print("application_train shape", application_train.shape)
print("application_test shape", application_test.shape)

application_train shape (307511, 122)
application_test shape (48744, 121)


In [3]:
X_train_0 = application_train.drop(["SK_ID_CURR", "TARGET"], axis=1)
y_train = application_train[["TARGET"]]

X_test_0 = application_test.drop(["SK_ID_CURR"], axis=1)
id_test =  application_test[["SK_ID_CURR"]]


# write to csv files
X_train_0.to_csv(os.path.join(OUT_DIR, "X_train_0.csv"), index=False)
y_train.to_csv(os.path.join(OUT_DIR, "y_train.csv"), index=False)

X_test_0.to_csv(os.path.join(OUT_DIR, "X_test_0.csv"), index=False)
id_test.to_csv(os.path.join(OUT_DIR, "id_test.csv"), index=False)

del X_train_0, y_train, X_test_0, id_test

## Load `bureau.csv` and `bureau_balance.csv`

In [55]:
bureau = pd.read_csv(os.path.join(INP_DIR, "bureau.csv"))
bureau_balance = pd.read_csv(os.path.join(INP_DIR, "bureau_balance.csv"))

print("bureau shape:", bureau.shape)
print("bureau_balance shape:", bureau_balance.shape)

print("Is SK_ID_BUREAU in bureau unique:", bureau["SK_ID_BUREAU"].is_unique)
print("Is SK_ID_BUREAU in bureau_balance unique:", bureau_balance["SK_ID_BUREAU"].is_unique)

bureau shape: (1716428, 17)
bureau_balance shape: (27299925, 3)
Is SK_ID_BUREAU in bureau unique: True
Is SK_ID_BUREAU in bureau_balance unique: False


In [50]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [51]:
bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


## Aggregate table `bureau_balance` by column `SK_ID_BUREAU`

In [52]:
bureau_balance_agg = aggregate(bureau_balance, by=["SK_ID_BUREAU"], 
                               num_stats=("mean", "median", "count", "min", "max"), 
                               cat_stats=("count", "mean"),
                               prefix="bb_")
bureau_balance_agg.head()

Unnamed: 0,SK_ID_BUREAU,bb_MONTHS_BALANCE_mean,bb_MONTHS_BALANCE_median,bb_MONTHS_BALANCE_count,bb_MONTHS_BALANCE_min,bb_MONTHS_BALANCE_max,bb_STATUS_0_count,bb_STATUS_0_mean,bb_STATUS_1_count,bb_STATUS_1_mean,...,bb_STATUS_3_count,bb_STATUS_3_mean,bb_STATUS_4_count,bb_STATUS_4_mean,bb_STATUS_5_count,bb_STATUS_5_mean,bb_STATUS_C_count,bb_STATUS_C_mean,bb_STATUS_X_count,bb_STATUS_X_mean
0,5001709,-48.0,-48.0,97,-96,0,97,0.0,97,0.0,...,97,0.0,97,0.0,97,0.0,97,0.886598,97,0.113402
1,5001710,-41.0,-41.0,83,-82,0,83,0.060241,83,0.0,...,83,0.0,83,0.0,83,0.0,83,0.578313,83,0.361446
2,5001711,-1.5,-1.5,4,-3,0,4,0.75,4,0.0,...,4,0.0,4,0.0,4,0.0,4,0.0,4,0.25
3,5001712,-9.0,-9.0,19,-18,0,19,0.526316,19,0.0,...,19,0.0,19,0.0,19,0.0,19,0.473684,19,0.0
4,5001713,-10.5,-10.5,22,-21,0,22,0.0,22,0.0,...,22,0.0,22,0.0,22,0.0,22,0.0,22,1.0


## Merge table `bureau_balance_agg` with table `bureau` on column `SK_ID_BUREAU`

In [48]:
bureau_merge = bureau.merge(bureau_balance_agg, how="left", on="SK_ID_BUREAU")

print("Is SK_ID_BUREAU in bureau_merge unique:", bureau_merge["SK_ID_BUREAU"].is_unique)

bureau_merge = bureau_merge.drop(["SK_ID_BUREAU"], axis=1)

print("bureau shape:", bureau.shape)
print("bureau_merge shape:", bureau_merge.shape)

bureau_merge.head()

Is SK_ID_BUREAU in bureau_merge unique: True
bureau shape: (1716428, 17)
bureau_merge shape: (1716428, 37)


Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,...,bb_STATUS_3_count,bb_STATUS_3_mean,bb_STATUS_4_count,bb_STATUS_4_mean,bb_STATUS_5_count,bb_STATUS_5_mean,bb_STATUS_C_count,bb_STATUS_C_mean,bb_STATUS_X_count,bb_STATUS_X_mean
0,215354,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,...,,,,,,,,,,
1,215354,Active,currency 1,-208,0,1075.0,,,0,225000.0,...,,,,,,,,,,
2,215354,Active,currency 1,-203,0,528.0,,,0,464323.5,...,,,,,,,,,,
3,215354,Active,currency 1,-203,0,,,,0,90000.0,...,,,,,,,,,,
4,215354,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,...,,,,,,,,,,


In [43]:
bureau_merge["SK_ID_BUREAU"].is_unique

True

In [44]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [34]:
bureau["SK_ID_CURR"].as

Object `as_type` not found.


In [None]:
bureau["SK_ID_CURR"].as_type

In [37]:
bureau["SK_ID_CURR"].astype?

Object `astype` not found.


In [None]:
bureau["SK_ID_CURR"].astype

In [None]:
bureau["SK_ID_CURR"].astype

In [32]:
bureau_merge["SK_ID_CURR"].dtype

dtype('float64')

In [20]:
tmp = pd.DataFrame(np.random.randn(100, 3))
tmp["cat"] = np.random.choice(["a", "b", "c"], size=100)
tmp["id"] = ["a"]*25 + ["b"]*25 + ["c"]*50
tmp.head()

Unnamed: 0,0,1,2,cat,id
0,0.430908,-0.517439,1.517234,a,a
1,0.80674,-1.188113,-1.183804,b,a
2,0.312035,1.0046,-0.537798,a,a
3,0.588283,0.666943,0.654772,b,a
4,0.407811,0.512189,-0.145822,c,a


In [21]:
tmp_agg = aggregate(tmp.drop([0, 1, 2], axis=1), ["id"], num_stats=NUM_STATS, 
                    cat_stats=CAT_STATS, prefix="tmp_")
tmp_agg.head()

No numerical columns in df


Unnamed: 0,id,tmp_cat_a_count,tmp_cat_a_percent,tmp_cat_b_count,tmp_cat_b_percent,tmp_cat_c_count,tmp_cat_c_percent
0,a,8,0.32,11,0.44,6,0.24
1,b,6,0.24,9,0.36,10,0.4
2,c,14,0.28,21,0.42,15,0.3


In [38]:
tmp_agg.astype?

In [13]:
tmp_agg.columns

Index(['id', 'tmp_cat_a_count', 'tmp_cat_a_mean', 'tmp_cat_b_count',
       'tmp_cat_b_mean', 'tmp_cat_c_count', 'tmp_cat_c_mean'],
      dtype='object')