In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
ins_pay_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments.csv"

In [3]:
df = pd.read_csv(ins_pay_data_path)
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


- SK_ID_PREV: ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)

- SK_ID_CURR: ID of loan in our sample

- NUM_INSTALMENT_VERSION: Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed

- NUM_INSTALMENT_NUMBER: On which installment we observe payment

- DAYS_INSTALMENT: When the installment of previous credit was supposed to be paid (relative to application date of current loan)

- DAYS_ENTRY_PAYMENT: When was the installments of previous credit paid actually (relative to application date of current loan)

- AMT_INSTALMENT: What was the prescribed installment amount of previous credit on this installment

- AMT_PAYMENT: What the client actually paid on previous credit on this installment

In [166]:
df[df["SK_ID_PREV"] == 1035136].sort_values("NUM_INSTALMENT_NUMBER")

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
11463719,1035136,339127,6.0,1,-48.0,,31698.225,


In [5]:
# THIS LINE IS VERY IMPORTANT -- MUST RUN!
df = df.sort_values(by=['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER'])

In [177]:
# TODO: SOMEWHERE HERE RUNS VERY SLOW -- NEED TO CHECK!
def group_rows_by_months(df_original):
    df = df_original.copy()

    # exception: if "first_day" is later than "last_day"!!!
    first_day = np.nanmin(df['DAYS_ENTRY_PAYMENT']) - 0.1
    last_day = np.nanmax(df['DAYS_ENTRY_PAYMENT'])
   
    # ValueError: arange: cannot compute length
    try:
        groups = list(np.arange(first_day, last_day + 30, 30))
    except ValueError:
        return {}, 0

    df['group'] = pd.cut(df['DAYS_ENTRY_PAYMENT'], bins=groups)
    
    df_new = df.groupby(['group']).agg({"DAYS_INSTALMENT": "mean",
                                        "DAYS_ENTRY_PAYMENT": "mean",
                                        "AMT_INSTALMENT": "sum",
                                        "AMT_PAYMENT": "sum"})
    
    df_w_na = df[df.isna().any(axis=1)]

    if len(df_w_na) > 0:
        
        #cat.iloc[idx, cat.columns.get_loc('AMT_PAYMENT')] = val
        df_new.iloc[len(df_new)-1, df_new.columns.get_loc('DAYS_INSTALMENT')] = df_w_na["DAYS_INSTALMENT"].mean()
        df_new.iloc[len(df_new)-1, df_new.columns.get_loc('AMT_INSTALMENT')] = df_w_na["AMT_INSTALMENT"].sum()

    return df_new, len(df_w_na)

In [184]:
# TODO: SOMEWHERE HERE RUNS VERY SLOW -- NEED TO CHECK!
def extract_features(id_prev, df_grouped, rows_w_na):
    dict_for_this_sk_id = {}
    if len(df_grouped) == 0:
        dict_for_this_sk_id["SK_ID_PREV"] = id_prev
        dict_for_this_sk_id["DEFAULT"] = False
        dict_for_this_sk_id["STDEV_PAYMENTS"] = np.nan

        dict_for_this_sk_id["MEAN_DAYS_LATE"] = np.nan
        return dict_for_this_sk_id
        
        
    
    dict_for_this_sk_id["SK_ID_PREV"] = id_prev
    
    if len(df_grouped) < 2:
        dict_for_this_sk_id["DEFAULT"] = False
        dict_for_this_sk_id["STDEV_PAYMENTS"] = 0

        dict_for_this_sk_id["MEAN_DAYS_LATE"] = df_grouped["DAYS_ENTRY_PAYMENT"].values[0] - df_grouped["DAYS_INSTALMENT"].values[0]
        return dict_for_this_sk_id

    num_rows_with_na = rows_w_na
    money_owed = df_grouped["AMT_INSTALMENT"].sum() - df_grouped["AMT_PAYMENT"].sum()

    percentage_installment_owed = money_owed/df_grouped["AMT_INSTALMENT"].mean()
    
    # default conditions: 
    # rows_w_na > 3 & DAYS_INSTALMENT_MEAN (last one) > 90 
    # AMT_INSTALMENT_SUM > 10000 
    # percentage_owed > 20% of avg installment
    default_conditions_bool = (num_rows_with_na > 3) & \
                            (money_owed > 10000) & \
                            (percentage_installment_owed > 0.2) & \
                            (df_grouped["AMT_INSTALMENT"].values[-1] > 90)
    
    dict_for_this_sk_id["DEFAULT"] = default_conditions_bool
    dict_for_this_sk_id["STDEV_PAYMENTS"] = df_grouped["AMT_INSTALMENT"][df_grouped["AMT_INSTALMENT"] != 0].std()
    dict_for_this_sk_id["MEAN_DAYS_LATE"] = df_grouped["DAYS_ENTRY_PAYMENT"].mean() - df_grouped["DAYS_INSTALMENT"].mean()

    return dict_for_this_sk_id

In [160]:
id_prev_to_preprocess = df.SK_ID_PREV.unique()
print(len(id_prev_to_preprocess))

997752


In [174]:
#print(id_prev_to_preprocess[22857+4915])

1048805


In [189]:
#1310347 (no broke but nans)
#2448869 (broke)
#1308766 (irregular paying pattern)

id_mega_dict = []

# error at: print(id_prev_to_preprocess[22857]), print(id_prev_to_preprocess[22857+4915])
# cleaned_final1: len is 64110
# cleaned2: len = 63390

for id_prev in id_prev_to_preprocess[(64111 + 63390 + 1):]:
    #print(id_prev)
    df_for_the_id = df[df["SK_ID_PREV"] == id_prev]
    stdev = df_for_the_id["AMT_PAYMENT"].std()
    cat, na_rows = group_rows_by_months(df_for_the_id)
    id_dict = extract_features(id_prev, cat, na_rows)
    
    id_mega_dict.append(id_dict)

#print(id_mega_dict)
#print(df_cat)
    #print(id_dict)

  first_day = np.nanmin(df['DAYS_ENTRY_PAYMENT']) - 0.1
  last_day = np.nanmax(df['DAYS_ENTRY_PAYMENT'])
  percentage_installment_owed = money_owed/df_grouped["AMT_INSTALMENT"].mean()


KeyboardInterrupt: 

In [190]:
print(len(id_mega_dict))

63390


In [191]:
df_cat = pd.DataFrame.from_records(id_mega_dict)

In [192]:
df_cat.to_csv(path_or_buf = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments_cleaned_final2.csv",index=False)