In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
ins_pay_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments.csv"

In [3]:
df = pd.read_csv(ins_pay_data_path)
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


- SK_ID_PREV: ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)

- SK_ID_CURR: ID of loan in our sample

- NUM_INSTALMENT_VERSION: Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed

- NUM_INSTALMENT_NUMBER: On which installment we observe payment

- DAYS_INSTALMENT: When the installment of previous credit was supposed to be paid (relative to application date of current loan)

- DAYS_ENTRY_PAYMENT: When was the installments of previous credit paid actually (relative to application date of current loan)

- AMT_INSTALMENT: What was the prescribed installment amount of previous credit on this installment

- AMT_PAYMENT: What the client actually paid on previous credit on this installment

In [4]:
#2448869 went broke
# 1308766 not broke yet, and pays every time, but very unsteady AMT_INSTALLMENT
df[df["SK_ID_PREV"] == 1000038].sort_values("NUM_INSTALMENT_NUMBER")

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
10776788,1000038,310847,1.0,1,-2517.0,-2545.0,3973.095,3973.095
10419828,1000038,310847,1.0,2,-2487.0,-2487.0,3973.095,3973.095
10042354,1000038,310847,1.0,3,-2457.0,-2524.0,3973.095,3973.095
9780157,1000038,310847,1.0,4,-2427.0,-2524.0,3969.675,3969.675


In [5]:
# THIS LINE IS VERY IMPORTANT -- MUST RUN!
df = df.sort_values(by=['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER'])

In [6]:
# if the customers are really paying irregularly (e.g. 1308766), then don't clean with this function ???
# check stdev of each customer before cleaning with this function, and set a threshold (e.g. 10000)

def merge_and_remove_rows_by_installment_numbers_and_same_day_payments(df_original):
    df = df_original.copy()

    # check whether an installment number is equal to the row above
    bool_same_installment = pd.DataFrame(df.NUM_INSTALMENT_NUMBER.eq(df.NUM_INSTALMENT_NUMBER.shift()))
    
    # check whether an entry payment is less than 4 days after the row before
    bool_nearby_payment_dates = pd.DataFrame(np.ediff1d(df['DAYS_ENTRY_PAYMENT'].values, to_begin=np.NaN) < 4, index=bool_same_installment.index)
    bool_df = pd.concat([bool_same_installment, bool_nearby_payment_dates], axis=1)
    bool_df.columns = ["SAME_INSTALLMENT", "SAME_PAYMENT_DATES"]
    cat = pd.concat([df, bool_df], axis=1)
    
    #column 0 index and locational index (0 to len(df_original)) for our rows of interest
    col0_rows_of_interest = cat[(cat["SAME_INSTALLMENT"]==True) | (cat["SAME_PAYMENT_DATES"]==True)].index.values
    idx_rows_of_interest = list(cat.index.get_indexer(col0_rows_of_interest))    
    
    amt_payment_index_to_replace = []
    amt_payment_value_to_replace = []
    
    # append: rows of interest, while not in last row conditions
    row_index_to_eliminate = []
    
    
    for idx in idx_rows_of_interest:
 
        index_before = idx - 1
        index_after = idx + 1
        
        row_before = cat.iloc[index_before]
        row_of_interest = cat.iloc[idx]
        
        # Conditions: AMT_INSTALLMENT of last row and two rows above are not equal, or there is "debt" from the last row
        last_row_conditions = (row_before["AMT_INSTALMENT"] != cat.iloc[index_before - 1]["AMT_INSTALMENT"]) \
        or (row_before["AMT_PAYMENT"] != row_before["AMT_INSTALMENT"])
   
        if last_row_conditions:
            # add the sum of nearby-dates-payment up to get the real amount of payments within a short timeframe (3 days)
            new_row_before_amt_payment = row_before["AMT_PAYMENT"] + row_of_interest["AMT_PAYMENT"]
            row_index_to_eliminate.append(idx)
        
        else:
            continue
            
        # as long as next consecutive rows are also within idx_rows_of_interest (i.e. same installments or short timeframe), 
        # add their values to new_row_before_amt_payment
        # and then add append the rows to row_index_to_eliminate, until hit rows not in idx_rows_of_interest
        n = 1
        while idx + n in idx_rows_of_interest:
            new_row_before_amt_payment += cat.iloc[idx + n]["AMT_PAYMENT"]
            row_index_to_eliminate.append(idx + n)
            idx_rows_of_interest.remove(idx + n)
            n += 1
            
        amt_payment_index_to_replace.append(index_before)
        amt_payment_value_to_replace.append(new_row_before_amt_payment)
            
    for idx, val in zip(amt_payment_index_to_replace, amt_payment_value_to_replace):
        cat.iloc[idx, cat.columns.get_loc('AMT_PAYMENT')] = val
        cat.iloc[idx, cat.columns.get_loc('AMT_INSTALMENT')] = val
 
    cat = cat.drop(cat.index[row_index_to_eliminate])
    
    return cat

In [7]:
def extract_features(df_for_each_sk_id_prev):
    dict_for_this_sk_id = {}
    dict_for_this_sk_id["SK_ID_PREV"] = list(df_for_each_sk_id_prev["SK_ID_PREV"].unique())[0]

    # if 3 or more rows in the end are nan, AND still owes the bank more than 20% of a instalment 
    # AND if the amount is > 1000  --> default
    num_rows_with_na = df_for_each_sk_id_prev.shape[0] - df_for_each_sk_id_prev.dropna().shape[0]
    money_owed = df_for_each_sk_id_prev["AMT_OWED"].sum()
    percentage_installment_owed = money_owed/df_for_each_sk_id_prev["AMT_INSTALMENT"].mean()
    
    # print(num_rows_with_na, money_owed, percentage_installment_owed)
    default_conditions_bool = (num_rows_with_na > 3) & (money_owed > 1000) & (percentage_installment_owed > 0.1)
    
    dict_for_this_sk_id["DEFAULT"] = default_conditions_bool
    dict_for_this_sk_id["STDEV_PAYMENTS"] = df_for_each_sk_id_prev["AMT_PAYMENT"].std()
    print(df_for_each_sk_id_prev["AMT_PAYMENT"], dict_for_this_sk_id["STDEV_PAYMENTS"])
    dict_for_this_sk_id["MEAN_DAYS_LATE"] = df_for_each_sk_id_prev["DAYS_LATE_PAYMENT"].mean()
    return dict_for_this_sk_id
    
    
cat = merge_and_remove_rows_by_installment_numbers_and_same_day_payments(df[df["SK_ID_PREV"] == 1308766])

# if pay late, number will be positive
cat["DAYS_LATE_PAYMENT"] = cat["DAYS_ENTRY_PAYMENT"] - cat["DAYS_INSTALMENT"]

# positive if a person pays less than their installment
cat["AMT_OWED"] = cat["AMT_INSTALMENT"] - cat["AMT_PAYMENT"]

id_dict = extract_features(cat)
print(id_dict)

11910732    13500.000
12969604     3781.755
12842367    13500.000
11644235      517.590
12969257    13500.000
11972422    22500.000
11984210    29648.385
12293477    22500.000
12351283    22500.000
13274776     1656.450
12421791    45000.000
12614400    21600.000
12811556    22500.000
11977405    22500.000
11608484    29959.380
11818017      190.620
11845773    40500.000
12668674     4654.260
12772814    45000.000
11853079     1143.045
12244759    45459.585
12389827    13040.415
12308675    45000.000
12178256      488.565
11795720    45000.000
12000009    45000.000
13605398          NaN
Name: AMT_PAYMENT, dtype: float64 16540.07407270002
{'SK_ID_PREV': 1308766, 'DEFAULT': False, 'STDEV_PAYMENTS': 16540.07407270002, 'MEAN_DAYS_LATE': -6.576923076923077}


In [8]:
id_prev_to_preprocess = df.SK_ID_PREV.unique()
print(id_prev_to_preprocess)

[1000001 1000002 1000003 ... 2843497 2843498 2843499]


In [10]:
#1310347 (no broke but nans)
#2448869 (broke)
#1308766 (irregular paying pattern)
# 
#merge_and_remove_rows_by_installment_numbers_and_same_day_payments(df[df["SK_ID_PREV"] == 1310347])


# TODO: compare the stdev for before and after the cleaning, and check the dict by eyes
# to decide whether set a threshold for the initial stdev value or just clean all

id_mega_dict = []

for id_prev in id_prev_to_preprocess[:1]:
    df_for_the_id = df[df["SK_ID_PREV"] == 1000038]
    stdev = df_for_the_id["AMT_PAYMENT"].std()
    if stdev < 10000:
        cat = merge_and_remove_rows_by_installment_numbers_and_same_day_payments(df_for_the_id)
    else:
        cat = df_for_the_id  
    cat["DAYS_LATE_PAYMENT"] = cat["DAYS_ENTRY_PAYMENT"] - cat["DAYS_INSTALMENT"]
    cat["AMT_OWED"] = cat["AMT_INSTALMENT"] - cat["AMT_PAYMENT"]

    id_dict = extract_features(cat)
    
    id_mega_dict.append(id_dict)
    
    print(id_dict)

10776788    3973.095
10419828    3973.095
10042354    3973.095
9780157     3969.675
Name: AMT_PAYMENT, dtype: float64 1.709999999999809
{'SK_ID_PREV': 1000038, 'DEFAULT': False, 'STDEV_PAYMENTS': 1.709999999999809, 'MEAN_DAYS_LATE': -48.0}
