In [1]:
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Absolute, machine-specific location of the instalment-payments CSV (raw string, so
# the Windows backslashes are not interpreted as escape sequences).
# NOTE(review): this path only resolves on the original author's machine -- consider
# deriving it from a configurable data directory.
ins_pay_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments.csv"

In [3]:
# Read the instalment-payments CSV into `df` and preview its first rows.
df = pd.read_csv(ins_pay_data_path)
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


- SK_ID_PREV: ID of the previous credit in Home Credit related to a loan in our sample. (One loan in our sample can have 0, 1, 2 or more previous loans in Home Credit.)

- SK_ID_CURR: ID of loan in our sample

- NUM_INSTALMENT_VERSION: Version of the installment calendar of the previous credit (0 is for credit card). A change of installment version from month to month signifies that some parameter of the payment calendar has changed.

- NUM_INSTALMENT_NUMBER: On which installment we observe payment

- DAYS_INSTALMENT: When the installment of previous credit was supposed to be paid (relative to application date of current loan)

- DAYS_ENTRY_PAYMENT: When the installment of the previous credit was actually paid (relative to application date of current loan)

- AMT_INSTALMENT: What was the prescribed installment amount of previous credit on this installment

- AMT_PAYMENT: What the client actually paid on previous credit on this installment

In [None]:
# Count the rows whose prescribed instalment amount is missing.
df["AMT_INSTALMENT"].isna().sum()

In [29]:
# Show every instalment row of previous credit 1308766, ordered by instalment number.
df.loc[df["SK_ID_PREV"].eq(1308766)].sort_values(by="NUM_INSTALMENT_NUMBER")

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
11910732,1308766,402199,0.0,1,-523.0,-539.0,6385.005,6385.005
13594881,1308766,402199,0.0,2,-539.0,-539.0,7114.995,7114.995
12969604,1308766,402199,0.0,3,-520.0,-520.0,3781.755,3781.755
12842367,1308766,402199,0.0,4,-492.0,-503.0,6658.515,6658.515
13193327,1308766,402199,0.0,5,-503.0,-503.0,6841.485,6841.485
11644235,1308766,402199,0.0,6,-484.0,-484.0,517.59,517.59
12969257,1308766,402199,0.0,7,-462.0,-476.0,6208.2,6208.2
12541881,1308766,402199,0.0,8,-476.0,-476.0,7291.8,7291.8
11972422,1308766,402199,0.0,9,-431.0,-447.0,17757.63,17757.63
11548451,1308766,402199,0.0,10,-447.0,-447.0,4742.37,4742.37


In [4]:
# THIS LINE IS VERY IMPORTANT -- MUST RUN!
# Rebinds `df` to the frame ordered by previous-credit id, then current-loan id, then
# instalment number.
# NOTE(review): the reason this ordering is mandatory is not visible in this notebook --
# confirm which downstream step depends on it.
df = df.sort_values(by=['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER'])

In [59]:
def group_rows_by_months(df_original, days_per_group=30):
    """Aggregate instalment rows into consecutive fixed-width bins of due days.

    Rows are binned on ``DAYS_INSTALMENT`` into left-open intervals that are
    ``days_per_group`` days wide. The first edge sits 0.1 day below the earliest
    due day so that the earliest row falls inside the first bin.

    Bins that contain no rows are kept in the result (day columns NaN, amount
    columns 0), so gaps in the payment calendar stay visible to the caller.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Frame with the columns ``DAYS_INSTALMENT``, ``DAYS_ENTRY_PAYMENT``,
        ``AMT_INSTALMENT`` and ``AMT_PAYMENT``. It is copied, not modified.
    days_per_group : int or float, default 30
        Width of each bin in days.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by the bin interval (index name ``group``), with
        the mean of the two day columns and the sum of the two amount columns.

    Raises
    ------
    ValueError
        If ``days_per_group`` is not positive, or if ``DAYS_INSTALMENT`` holds no
        non-missing value (no bins can be built).
    """
    if days_per_group <= 0:
        raise ValueError("days_per_group must be positive")

    df = df_original.copy()
    due_days = df['DAYS_INSTALMENT']

    # first & last instalment day (min/max skip NaN on their own)
    first_day = due_days.min() - 0.1
    last_day = due_days.max()
    if pd.isna(last_day):
        raise ValueError("DAYS_INSTALMENT has no non-missing values to build bins from")

    # bin edges from the first day onwards; the final edge always reaches last_day
    bin_edges = np.arange(first_day, last_day + days_per_group, days_per_group)

    # assign every row to its bin
    df['group'] = pd.cut(due_days, bins=bin_edges)

    # TODO: can do this outside of function, whole df at once (instead of on each id)
    # observed=False is spelled out on purpose: empty bins must survive the groupby
    # regardless of the pandas version's default for categorical keys.
    return df.groupby(['group'], observed=False).agg({"DAYS_INSTALMENT": "mean",
                                                      "DAYS_ENTRY_PAYMENT": "mean",
                                                      "AMT_INSTALMENT": "sum",
                                                      "AMT_PAYMENT": "sum"})

In [60]:
def extract_features(id_prev, df_grouped, min_na_rows=3, min_money_owed=10000,
                     min_owed_ratio=0.2, min_days_since_last=90):
    """Derive default / payment-regularity features for one previous credit.

    Parameters
    ----------
    id_prev :
        Identifier stored under ``"SK_ID_PREV"`` in the result.
    df_grouped : pandas.DataFrame
        Per-period aggregate with the columns ``DAYS_INSTALMENT``,
        ``DAYS_ENTRY_PAYMENT``, ``AMT_INSTALMENT`` and ``AMT_PAYMENT``
        (one row per period, oldest first).
    min_na_rows : int, default 3
        DEFAULT requires more than this many rows containing a NaN.
    min_money_owed : float, default 10000
        DEFAULT requires total instalments minus total payments above this.
    min_owed_ratio : float, default 0.2
        DEFAULT requires the amount owed, divided by the mean instalment per
        row, to exceed this.
    min_days_since_last : float, default 90
        DEFAULT requires the most recent instalment to be due more than this
        many days before the reference date (``DAYS_INSTALMENT`` is relative to
        the application date, so past days are negative).

    Returns
    -------
    dict
        Keys ``SK_ID_PREV``, ``DEFAULT`` (bool), ``STDEV_PAYMENTS`` and
        ``MEAN_DAYS_LATE`` (positive means paid after the due day; NaN when
        nothing has been paid).
    """
    features = {"SK_ID_PREV": id_prev}

    # zero or one period: there is no pattern to judge, so never flag a default
    if len(df_grouped) < 2:
        features["DEFAULT"] = False
        features["STDEV_PAYMENTS"] = 0

        # an empty frame, or a single period with nothing paid, has no lateness to
        # measure -- a payment that has not happened cannot be "late"
        nothing_paid = len(df_grouped) == 0 or df_grouped["AMT_PAYMENT"].values[0] == 0.0
        if nothing_paid:
            features["MEAN_DAYS_LATE"] = np.nan
        else:
            features["MEAN_DAYS_LATE"] = (df_grouped["DAYS_ENTRY_PAYMENT"].values[0]
                                          - df_grouped["DAYS_INSTALMENT"].values[0])
        return features

    # rows with any NaN; used to decide whether the person went broke -- one usual
    # pattern is more than 3 such rows at the end
    row_has_na = df_grouped.isna().any(axis=1)
    num_rows_with_na = row_has_na.sum()

    # how much the person owes the bank in total
    money_owed = df_grouped["AMT_INSTALMENT"].sum() - df_grouped["AMT_PAYMENT"].sum()

    # the same amount expressed relative to the mean instalment per row
    percentage_instalment_owed = money_owed / df_grouped["AMT_INSTALMENT"].mean()

    # most recent due day; max() skips NaN rows
    last_instalment_day = df_grouped["DAYS_INSTALMENT"].max()

    # All four conditions must hold. The last one compares DAYS, not amounts:
    # "more than N days ago" means a due day below -N. A NaN day compares False.
    features["DEFAULT"] = bool(
        num_rows_with_na > min_na_rows
        and money_owed > min_money_owed
        and percentage_instalment_owed > min_owed_ratio
        and last_instalment_day < -min_days_since_last
    )

    # STDEV_PAYMENTS and MEAN_DAYS_LATE only use rows without any NaN
    complete_rows = df_grouped[~row_has_na]
    features["STDEV_PAYMENTS"] = complete_rows["AMT_PAYMENT"].std()
    features["MEAN_DAYS_LATE"] = (complete_rows["DAYS_ENTRY_PAYMENT"].mean()
                                  - complete_rows["DAYS_INSTALMENT"].mean())

    return features

In [62]:
# Time each stage of the feature pipeline on a single previous credit.
# time.perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock meant for measuring short intervals.
sample_id_prev = 1308766

time_init = time.perf_counter()

# THIS IS THE SLOWEST LINE! TODO: another method of getting individual SK_ID_PREV
df_for_the_id = df[df["SK_ID_PREV"] == sample_id_prev]

time_fetch_id = time.perf_counter()
print("time_to_fetch_id", time_fetch_id - time_init)

cat = group_rows_by_months(df_for_the_id)

time_group = time.perf_counter()
print("time for row grouping", time_group - time_fetch_id)

id_dict = extract_features(sample_id_prev, cat)
time_features = time.perf_counter()

print("time for extracting features", time_features - time_group)

print(id_dict)

print("total_time", time_features - time_init)

time_to_fetch_id 0.026021957397460938
time for row grouping 0.009972095489501953
time for extracting features 0.0030007362365722656
{'SK_ID_PREV': 1308766, 'DEFAULT': False, 'STDEV_PAYMENTS': 14693.876403141649, 'MEAN_DAYS_LATE': -4.5098039215686185}
total_time 0.038994789123535156


In [55]:
# Every distinct SK_ID_PREV has to be processed -- nearly 8 hours at the current speed!
# Pre-splitting the dataframe by id, instead of filtering with
# df[df["SK_ID_PREV"] == SOME_SK_ID_PREV] for each one, should cut total_time by a
# factor of 4-5.
id_prev_to_preprocess = df["SK_ID_PREV"].unique()
print(len(id_prev_to_preprocess), id_prev_to_preprocess, sep="\n")

997752
[1000001 1000002 1000003 ... 2843497 2843498 2843499]


In [8]:
# Example ids worth spot-checking:
#1310347 (no broke but nans)
#2448869 (broke)
#1308766 (irregular paying pattern)
#1035136 (one row)

# Split the frame by SK_ID_PREV once. Filtering all of df with a boolean mask for
# every id re-scans the whole frame each time, which dominated the run time.
rows_by_id_prev = df.groupby("SK_ID_PREV", sort=False)

# one feature record (dict) per previous credit, in the order of id_prev_to_preprocess
id_mega_dict = []

for id_prev in id_prev_to_preprocess:
    # get_group raises KeyError for an id that is absent from df
    df_for_the_id = rows_by_id_prev.get_group(id_prev)
    cat = group_rows_by_months(df_for_the_id)
    id_dict = extract_features(id_prev, cat)

    id_mega_dict.append(id_dict)


KeyboardInterrupt: 

In [9]:
# Number of per-credit feature records collected so far.
print(len(id_mega_dict))

6304


In [None]:
# Turn the collected feature records into a frame: one row per record, one column per key.
df_cat = pd.DataFrame.from_records(id_mega_dict)

In [None]:
# Write the feature table to CSV, leaving the frame's index out of the file.
features_csv_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments_features.csv"
df_cat.to_csv(features_csv_path, index=False)