In [1]:
import csv
import sys
import time
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
from csv import DictReader

In [2]:
ins_pay_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments_sorted_by_days_installment.csv"

In [3]:
# number of id to preprocess -- nearly 1 million!
df = pd.read_csv(ins_pay_data_path)
id_prev_to_preprocess = df.SK_ID_PREV.unique()
print(len(id_prev_to_preprocess))

997752


In [4]:
def group_rows_by_months(df):
    
    first_day = df['DAYS_INSTALMENT'].values[0] - 0.1
    last_day = df['DAYS_INSTALMENT'].values[len(df) - 1] - 0.1
    
    # make bins using the first & last days, with 30 days interval
    days_in_a_month = 30
    my_bin = list(np.arange(first_day, last_day + days_in_a_month, days_in_a_month))

    df['group'] = np.digitize(df['DAYS_INSTALMENT'], bins=my_bin)
    df_grouped = df.groupby(['group'])
    
    # faster than groupby(['group']).agg{...}
    df_new = pd.DataFrame({'DAYS_INSTALMENT': df_grouped['DAYS_INSTALMENT'].mean(), 
                           'DAYS_ENTRY_PAYMENT': df_grouped['DAYS_ENTRY_PAYMENT'].mean(),
                           'AMT_INSTALMENT': df_grouped['AMT_INSTALMENT'].sum(),
                           'AMT_PAYMENT': df_grouped['AMT_PAYMENT'].sum(),
                            })
    return df_new

In [6]:
def extract_features(id_prev, df_grouped):
    dict_for_this_sk_id = {}
    dict_for_this_sk_id["SK_ID_PREV"] = id_prev

    # if there is only 1 installment (regardless of how many days ago)
    if len(df_grouped) < 2:
        dict_for_this_sk_id["DEFAULT"] = False
        dict_for_this_sk_id["STDEV_PAYMENTS"] = 0
        
        # has that installment actually been paid?
        no_payment_flag = df_grouped["AMT_PAYMENT"].values[0] == 0.0
        
        if no_payment_flag:
            # can't be paying 'late' if hasn't paid yet lol
            dict_for_this_sk_id["MEAN_DAYS_LATE"] = np.nan
        else:
            dict_for_this_sk_id["MEAN_DAYS_LATE"] = df_grouped["DAYS_ENTRY_PAYMENT"].values[0] - df_grouped["DAYS_INSTALMENT"].values[0]
        return dict_for_this_sk_id

    # use to decide whether the person went broke -- one usual pattern is more than 3 NAs in the end
    num_rows_with_na = df_grouped.isna().any(axis=1).sum()
    
    # how much does the person owe the bank in total
    money_owed = df_grouped["AMT_INSTALMENT"].sum() - df_grouped["AMT_PAYMENT"].sum()
    
    # how much money does the person owe, comparing with their usual monthly instalment
    percentage_instalment_owed = money_owed/df_grouped["AMT_INSTALMENT"].mean()
    
    # default conditions: 
    # more than 3 rows with na values 
    # money_owed > 10000 
    # percentage_instalment_owed > 20% of avg instalment
    default_conditions_bool = (num_rows_with_na > 3) & \
                            (money_owed > 10000) & \
                            (percentage_instalment_owed > 0.2) 
    
    dict_for_this_sk_id["DEFAULT"] = default_conditions_bool
    
    # STDEV_PAYMENTS and MEAN_DAYS_LATE calculation should ignore nans and zeroes
    df_rows_without_na = df_grouped[~df_grouped.isna().any(axis=1)]
    dict_for_this_sk_id["STDEV_PAYMENTS"] = df_rows_without_na["AMT_PAYMENT"].std()
    dict_for_this_sk_id["MEAN_DAYS_LATE"] = df_rows_without_na["DAYS_ENTRY_PAYMENT"].mean() - df_rows_without_na["DAYS_INSTALMENT"].mean()

    return dict_for_this_sk_id

In [8]:
# https://blog.samrid.me/how-to-save-python-objects-in-redis

# OBSERVATION
# start running at 16:53
# ends at before 17:53
# shortened to 1 hour! Yay!
# within estimated time below

# ESTIMATION
# after optimisation:
# time for row grouping < 0.005
# time for extracting features < 0.003
# misc ~0.02 (see %prune results)
# total time < 0.01
# i.e. about 1-2 hours for 997752 SK_ID_PREV
# on top of this, for SK_ID_PREV with a single row, time ~0.0

people_list = []

with open(ins_pay_data_path, "r") as read_obj:
    csv_reader = DictReader(read_obj)
    id_dict_temp = []
    id_dict = defaultdict(list)

    current_id = '1000001'
    for row in csv_reader:
        row.pop('')
        row.pop('Unnamed: 0')
        if row['SK_ID_PREV'] == current_id:
            id_dict_temp.append(row)
        else:
            # do all operations here!
            for d in id_dict_temp: 
                for key, value in d.items():
                    try:
                        id_dict[key].append(float(value))
                    except ValueError:
                        id_dict[key].append(np.nan)
            
            df = pd.DataFrame.from_dict(id_dict)
            
            df_grouped = group_rows_by_months(df)
            dict_features = extract_features(int(df['SK_ID_PREV'][0]), df)
            people_list.append(dict_features)
            
            # update current_id and empty out the temporary lists again
            current_id = row['SK_ID_PREV']
            id_dict_temp = []
            id_dict = defaultdict(list)
            
            # update the empty temp list with values of current row
            id_dict_temp.append(row)
            continue

  percentage_instalment_owed = money_owed/df_grouped["AMT_INSTALMENT"].mean()


In [9]:
print(len(people_list))

997751


In [16]:
size_in_bytes = sys.getsizeof(people_list) 
print("Size of the dict_list is {} MB".format(size_in_bytes/1000000))

Size of the dict_list is 8.448728 MB


In [13]:
# save people_list to pickle for loss prevention
with open(r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\insurance_data_list_of_dict.pkl", 'wb') as f:
    pickle.dump(people_list, f)

In [14]:
# save the extracted features to csv file
with open(r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\insurance_data_extracted_features.csv", 'w', encoding='utf8', newline='') as output_file:
    fc = csv.DictWriter(output_file, 
                        fieldnames=people_list[0].keys(),

                       )
    fc.writeheader()
    fc.writerows(people_list)

In [27]:
# this cell is just to calculate the time needed to extract features for one single SK_ID_PREV

def get_feature_dicts_of_all_id():

    people_list = []

    with open(ins_pay_data_path, "r") as read_obj:
        csv_reader = DictReader(read_obj)
        id_dict_temp = []
        id_dict = defaultdict(list)

        current_id = '1000001'
        for row in csv_reader:
            row.pop('')
            row.pop('Unnamed: 0')
            if row['SK_ID_PREV'] == current_id:
                id_dict_temp.append(row)
            else:
                # do all operations here!
                for d in id_dict_temp: 
                    for key, value in d.items():
                        try:
                            id_dict[key].append(float(value))
                        except ValueError:
                            id_dict[key].append(np.nan)
            
                df = pd.DataFrame.from_dict(id_dict)
            
                df_grouped = group_rows_by_months(df)
                dict_features = extract_features(int(df['SK_ID_PREV'][0]), df)
                people_list.append(dict_features)
            
                # update current_id and empty out the temporary lists again
                current_id = row['SK_ID_PREV']
                id_dict_temp = []
                id_dict = defaultdict(list)
            
                # update the empty temp list with values of current row
                id_dict_temp.append(row)
                break

%prun get_feature_dicts_of_all_id()


 

In [28]:
# now just load the feature file, match SK_ID_PREV with SK_ID_CURR and then save the features by SK_ID_CURR into another file
# this is needed only because I forgot to save SK_ID_CURR into feature file :)

ins_pay_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\installments_payments_sorted_by_days_installment.csv"
features_prev_data_path = r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\kahmin\installments_payments_extracted_features.csv"

# 

In [33]:
#df_id_curr = pd.read_csv(ins_pay_data_path)

col_needed = ["SK_ID_CURR", "SK_ID_PREV"]
              
df_id = df_id_curr[col_needed].drop_duplicates('SK_ID_PREV')
df_id.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV
0,158271,1000001
2,101962,1000002
6,252457,1000003
9,260094,1000004
16,176456,1000005


In [34]:
df_features_prev = pd.read_csv(features_prev_data_path)

In [37]:
df_features_new = pd.merge(df_id, df_features_prev, on='SK_ID_PREV')
df_features_new.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,DEFAULT,STDEV_PAYMENTS,MEAN_DAYS_LATE
0,158271,1000001,False,39339.747885,-16.0
1,101962,1000002,False,6089.7825,-19.75
2,252457,1000003,False,0.0,-15.333333
3,260094,1000004,False,3698.527885,-26.714286
4,176456,1000005,False,4432.07797,-8.454545


In [38]:
df_features_new.to_csv(path_or_buf=r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\kahmin\insurance_data_extracted_features.csv")

In [39]:
# throw into pickle file
with open(r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\kahmin\insurance_data_list_of_dict.pkl", 'wb') as f:
    pickle.dump(df_features_new, f)

In [42]:
# read from pickle file
with open(r"C:\Users\redal\Code\bootcamp_ppi\HomeCreditDefaultRisk\HomeCreditDefaultRisk\kahmin\insurance_data_list_of_dict.pkl", "rb") as input_file:
    e = pickle.load(input_file)

In [43]:
e

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,DEFAULT,STDEV_PAYMENTS,MEAN_DAYS_LATE
0,158271,1000001,False,39339.747885,-16.000000
1,101962,1000002,False,6089.782500,-19.750000
2,252457,1000003,False,0.000000,-15.333333
3,260094,1000004,False,3698.527885,-26.714286
4,176456,1000005,False,4432.077970,-8.454545
...,...,...,...,...,...
997746,292375,2843494,False,628430.873982,-10.000000
997747,260963,2843495,False,239114.242278,-3.857143
997748,425374,2843496,False,15722.730938,-4.000000
997749,451578,2843497,False,0.000000,-2.900000
