In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load installments_payments.csv; the path is relative to the working directory.
df = pd.read_csv("data/installments_payments.csv")

In [3]:
def group_observations_by_month(var, interval=30):
    """Assign each observation to a consecutive fixed-width "month" bucket.

    Buckets are ``interval`` days wide and start just below the smallest
    value in ``var``. The smallest observation therefore always lands in
    bucket 1 and the labels do not depend on the order of the rows.

    Parameters
    ----------
    var : pandas.Series or array-like of numbers
        Day values to bucket (the notebook passes one credit's
        ``DAYS_INSTALMENT`` values at a time).
    interval : int or float, default 30
        Bucket width, in the same unit as ``var``. Must be positive.

    Returns
    -------
    numpy.ndarray
        1-based bucket index for every element of ``var``, in input order.
        Empty input yields an empty array.

    Raises
    ------
    ValueError
        If ``interval`` is not strictly positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    values = np.asarray(var, dtype=float)
    if values.size == 0:
        return np.zeros(values.shape, dtype=np.intp)

    # Span the bins from the minimum to the maximum value rather than from the
    # first to the last row: the rows are not guaranteed to be sorted, and an
    # unsorted group would otherwise get truncated (or entirely empty) bins.
    # The 0.1 margin keeps whole-day values strictly inside a bin instead of
    # exactly on an edge.
    start_date = np.nanmin(values) - 0.1
    end_date = np.nanmax(values) + 0.1

    bin_edges = np.arange(start_date, end_date, interval)

    return np.digitize(values, bins=bin_edges)

In [4]:
# Tag every row with its month bucket, computed independently for each
# (SK_ID_CURR, SK_ID_PREV) pair from that pair's DAYS_INSTALMENT values.
df["month_group"] = (
    df.groupby(["SK_ID_CURR", "SK_ID_PREV"])["DAYS_INSTALMENT"]
    .transform(group_observations_by_month)
)

In [None]:
# Preview the first five rows of df.
df.head(5)

In [17]:
# Group activities together by month: one row per
# (SK_ID_CURR, SK_ID_PREV, month_group).
# Aggregations are named by string instead of passing np.mean / np.sum:
# pandas (2.1+) deprecates NumPy callables in groupby.agg and emits a
# FutureWarning; the string names keep the current results.
df_month_level = df.groupby(["SK_ID_CURR",
                            "SK_ID_PREV",
                            "month_group",
                           ]).agg({
    "DAYS_INSTALMENT" : "mean",
    "DAYS_ENTRY_PAYMENT" : "mean",
    # Only the first instalment amount in each bucket is kept, while
    # payments in the same bucket are summed.
    "AMT_INSTALMENT" : "first",
    "AMT_PAYMENT" : "sum",
})

In [6]:
# Preview the first five rows of the month-level aggregation.
df_month_level.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
SK_ID_CURR,SK_ID_PREV,month_group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100001,1369693,1,-1709.0,-1715.0,3951.0,3951.0
100001,1369693,2,-1679.0,-1715.0,3951.0,3951.0
100001,1369693,3,-1634.0,-1644.0,21348.9,21348.9
100001,1851984,1,-2916.0,-2916.0,3982.05,3982.05
100001,1851984,2,-2871.0,-2865.5,7962.975,7962.975


In [7]:
# Group activities together by previous credit: one row per
# (SK_ID_CURR, SK_ID_PREV).
# String aggregator names replace np.mean / np.sum, which pandas (2.1+)
# deprecates inside groupby.agg; the results are unchanged.
df_prev_level = df.groupby(["SK_ID_CURR",
                            "SK_ID_PREV",
                           ]).agg({
    "DAYS_INSTALMENT" : "mean",
    "DAYS_ENTRY_PAYMENT" : "mean",
    "AMT_INSTALMENT" : "sum",
    "AMT_PAYMENT" : "sum",
})

In [8]:
# Preview the first five rows of the per-previous-credit aggregation.
df_prev_level.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
SK_ID_CURR,SK_ID_PREV,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001,1369693,-1664.0,-1679.5,29250.9,29250.9
100001,1851984,-2886.0,-2882.333333,11945.025,11945.025
100002,1038818,-295.0,-315.421053,219625.695,219625.695
100003,1810518,-626.0,-630.428571,1150977.33,1150977.33
100003,2396755,-2145.0,-2151.75,80773.38,80773.38


In [14]:
# Aggregate previous credit activities to the mortgage: one row per SK_ID_CURR.
# String aggregator names replace np.mean / np.sum / np.std, which pandas
# (2.1+) deprecates inside groupby.agg. "std" keeps the sample standard
# deviation (ddof=1) that groupby currently computes for np.std.
df_curr_level = df.groupby(["SK_ID_CURR"]).agg({
    "DAYS_ENTRY_PAYMENT" : "mean",
    "DAYS_INSTALMENT" : "mean",
    "AMT_INSTALMENT" : ["sum", "mean"],
    "AMT_PAYMENT" : ["std", "sum"],
})

# The aggregated frame has two-level columns (source column, statistic), so
# each statistic is read with a single tuple key rather than chained [..][..].

# Mean entry day minus mean instalment day; negative values mean payments
# were on average entered before the instalment day (assuming larger day
# values are later - TODO confirm).
df_curr_level["MEAN_DAYS_LATE"] = (
    df_curr_level[("DAYS_ENTRY_PAYMENT", "mean")]
    - df_curr_level[("DAYS_INSTALMENT", "mean")]
)

# Total instalments due minus total payments made.
df_curr_level["MONEY_OWED"] = (
    df_curr_level[("AMT_INSTALMENT", "sum")]
    - df_curr_level[("AMT_PAYMENT", "sum")]
)

# Money owed expressed in units of the average instalment (a plain ratio,
# not multiplied by 100).
df_curr_level["PERC_INSTALMENT_OWED"] = (
    df_curr_level["MONEY_OWED"]
    / df_curr_level[("AMT_INSTALMENT", "mean")]
)

In [15]:
# Preview the first five rows of the per-SK_ID_CURR aggregation and its derived columns.
df_curr_level.head(5)

Unnamed: 0_level_0,DAYS_ENTRY_PAYMENT,DAYS_INSTALMENT,AMT_INSTALMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_PAYMENT,MEAN_DAYS_LATE,MONEY_OWED,PERC_INSTALMENT_OWED
Unnamed: 0_level_1,mean,mean,sum,mean,std,sum,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SK_ID_CURR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
100001,-2195.0,-2187.714286,41195.925,5885.132143,5076.676624,41195.925,-7.285714,0.0,0.0
100002,-315.421053,-295.0,219625.695,11559.247105,10058.037722,219625.695,-20.421053,0.0,0.0
100003,-1385.32,-1378.16,1618864.65,64754.586,110542.5923,1618864.65,-7.16,0.0,0.0
100004,-761.666667,-754.0,21288.465,7096.155,3011.87181,21288.465,-7.666667,0.0,0.0
100005,-609.555556,-586.0,56161.845,6240.205,4281.015,56161.845,-23.555556,0.0,0.0
