In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from functions import *

pd.set_option('display.max_columns', 100)

In [2]:
def process_installment(ins):

    ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)

    # Group payments and get Payment difference
    ins = do_sum(ins, ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'], 'AMT_PAYMENT', 'AMT_PAYMENT_GROUPED')
    ins['PAYMENT_DIFFERENCE'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT_GROUPED']
    ins['PAYMENT_RATIO'] = ins['AMT_INSTALMENT'] / ins['AMT_PAYMENT_GROUPED']
    ins['PAID_OVER_AMOUNT'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT']
    ins['PAID_OVER'] = (ins['PAID_OVER_AMOUNT'] > 0).astype(int)

    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values)
    ins['DPD_diff'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD_diff'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD_diff'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD_diff'].apply(lambda x: x if x > 0 else 0)

    # Flag late payment
    ins['LATE_PAYMENT'] = ins['DBD'].apply(lambda x: 1 if x > 0 else 0)
    ins['INSTALMENT_PAYMENT_RATIO'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['LATE_PAYMENT_RATIO'] = ins.apply(lambda x: x['INSTALMENT_PAYMENT_RATIO'] if x['LATE_PAYMENT'] == 1 else 0, axis=1)

    # Flag late payments that have a significant amount
    ins['SIGNIFICANT_LATE_PAYMENT'] = ins['LATE_PAYMENT_RATIO'].apply(lambda x: 1 if x > 0.05 else 0)
    
    # Flag k threshold late payments
    ins['DPD_7'] = ins['DPD'].apply(lambda x: 1 if x >= 7 else 0)
    ins['DPD_15'] = ins['DPD'].apply(lambda x: 1 if x >= 15 else 0)

    ins['INS_IS_DPD_UNDER_120'] = ins['DPD'].apply(lambda x: 1 if (x > 0) & (x < 120) else 0)
    ins['INS_IS_DPD_OVER_120'] = ins['DPD'].apply(lambda x: 1 if (x >= 120) else 0)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum', 'var'],
        'DBD': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum', 'min'],
        'SK_ID_PREV': ['size', 'nunique'],
        'PAYMENT_DIFFERENCE': ['mean'],
        'PAYMENT_RATIO': ['mean', 'max'],
        'LATE_PAYMENT': ['mean', 'sum'],
        'SIGNIFICANT_LATE_PAYMENT': ['mean', 'sum'],
        'LATE_PAYMENT_RATIO': ['mean'],
        'DPD_7': ['mean'],
        'DPD_15': ['mean'],
        'PAID_OVER': ['mean'],
        'DPD_diff':['mean', 'min', 'max'],
        'DBD_diff':['mean', 'min', 'max'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']
    }

    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    # from oof (DAYS_ENTRY_PAYMENT)
    cond_day = ins['DAYS_ENTRY_PAYMENT'] >= -365
    ins_d365_grp = ins[cond_day].groupby('SK_ID_CURR')
    ins_d365_agg_dict = {
        'SK_ID_CURR': ['count'],
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DAYS_ENTRY_PAYMENT': ['mean', 'max', 'sum'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_PAYMENT': ['mean', 'max', 'sum'],
        'PAYMENT_DIFF': ['mean', 'min', 'max', 'sum'],
        'PAYMENT_PERC': ['mean', 'max'],
        'DPD_diff': ['mean', 'min', 'max'],
        'DPD': ['mean', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']}

    ins_d365_agg = ins_d365_grp.agg(ins_d365_agg_dict)
    ins_d365_agg.columns = ['INS_D365' + ('_').join(column).upper() for column in ins_d365_agg.columns.ravel()]

    ins_agg = ins_agg.merge(ins_d365_agg, on='SK_ID_CURR', how='left')

    print('"Installments Payments" final shape:', ins_agg.shape)
    return ins_agg

raw

In [3]:
installments = pd.read_csv('raw-data/dseb63_installments_payments.csv')
installments.head()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR
0,1054186,1.0,6,-1180.0,-1187.0,6948.36,6948.36,147397.0
1,2452854,1.0,21,-546.0,-552.0,11302.605,11302.605,147397.0
2,1054186,1.0,2,-1300.0,-1307.0,6948.36,6948.36,147397.0
3,1682318,1.0,2,-240.0,-243.0,7374.51,7374.51,147397.0
4,2452854,1.0,10,-876.0,-882.0,11302.605,11302.605,147397.0


eda

In [4]:
installments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2138490 entries, 0 to 2138489
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   NUM_INSTALMENT_VERSION  float64
 2   NUM_INSTALMENT_NUMBER   int64  
 3   DAYS_INSTALMENT         float64
 4   DAYS_ENTRY_PAYMENT      float64
 5   AMT_INSTALMENT          float64
 6   AMT_PAYMENT             float64
 7   SK_ID_CURR              float64
dtypes: float64(6), int64(2)
memory usage: 130.5 MB


In [5]:
installments.describe()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR
count,2138490.0,2138490.0,2138490.0,2138490.0,2138082.0,2138490.0,2138081.0,2138489.0
mean,1900536.0,0.7532254,23.22613,-1104.81,-1112.679,16217.61,16236.99,154286.2
std,535907.6,0.9338448,30.0455,803.1695,803.0702,49030.47,52896.89,88578.95
min,1000011.0,0.0,1.0,-2922.0,-3115.0,0.0,0.0,0.0
25%,1434096.0,0.0,5.0,-1745.0,-1753.0,3425.985,2790.585,77353.0
50%,1895056.0,1.0,10.0,-920.0,-928.0,8167.455,7473.69,154551.0
75%,2365976.0,1.0,28.0,-411.0,-419.0,16097.04,15215.49,230212.0
max,2843477.0,42.0,244.0,-2.0,-2.0,3371884.0,3371884.0,307508.0


In [6]:
# # plot correlation matrix
# corr = installments.corr()
# plt.figure(figsize=(10, 10))
# sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
# plt.show()

In [7]:
# # plot histogram
# for col in installments.columns:
#     if col not in ['SK_ID_CURR', 'TARGET', 'SK_ID_PREV']:
#         fig, ax = plt.subplots(1, 2, figsize=(12, 4))
#         ax[0].hist(installments[col], bins=30)
#         ax[0].set_title(col)
#         ax[1].scatter(installments[col], installments['TARGET'])
#         ax[1].set_title(col)
#         plt.show()

fill na

In [8]:
# fill missing values with mode
for col in installments.columns:
    installments[col].fillna(installments[col].mode()[0], inplace=True)

installments.isnull().sum()

SK_ID_PREV                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
SK_ID_CURR                0
dtype: int64

In [9]:
# # plot histogram hue by target
# for col in installments.select_dtypes('number').columns:
#     if not col in ['SK_ID_CURR', 'TARGET', 'SK_ID_PREV']:
#         plt.figure(figsize=(12, 6))
#         sns.kdeplot(installments.loc[installments['TARGET'] == 0, col], label='target == 0')
#         sns.kdeplot(installments.loc[installments['TARGET'] == 1, col], label='target == 1')
#         plt.xlabel(col)
#         plt.ylabel('Density')
#         plt.title(f'{col} Distribution')
#         plt.legend()
#         plt.show()

feature engineering

In [10]:
# Change in Installment Version
installments['VERSION_CHANGE'] = installments.groupby('SK_ID_PREV')['NUM_INSTALMENT_VERSION'].diff().fillna(0)

# Installment Timing
installments['TIMING_DIFF'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']

# Payment Ratio
installments['PAYMENT_RATIO'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']

# Trend in Payment Amounts
installments['MOVING_AVG_PAYMENT'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# Cumulative Features
installments['TOTAL_PAID_SO_FAR'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].cumsum()

# Payment Regularity
installments['PAYMENT_REGULARITY'] = installments.groupby('SK_ID_PREV')['TIMING_DIFF'].transform('std')

# Delayed Payment Count
installments['DELAYED_PAYMENT_COUNT'] = (installments['TIMING_DIFF'] > 0).astype(int).groupby(installments['SK_ID_PREV']).cumsum()

# Interaction Features
installments['VERSION_PAYMENT_INTERACTION'] = installments['NUM_INSTALMENT_VERSION'] * installments['AMT_PAYMENT']

# Categorical Encoding of Version Changes
installments['VERSION_CHANGE_CAT'] = pd.Categorical(installments['VERSION_CHANGE'])


# Assuming installments is your DataFrame and DAYS_INSTALMENT is the relative day column
# Convert DAYS_INSTALMENT to a cumulative count
max_day = installments['DAYS_INSTALMENT'].abs().max()
installments['DAY_COUNT'] = max_day - installments['DAYS_INSTALMENT'].abs()

# Sort by this new count
installments = installments.sort_values(by=['SK_ID_PREV', 'DAY_COUNT'])

# Calculate the sum of payments for the last 180 days
# Here, 180 days is just an example, adjust as needed
installments['SUM_LAST_180_DAYS'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].rolling(window=180, min_periods=1).sum().reset_index(level=0, drop=True)

installments.head()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR,VERSION_CHANGE,TIMING_DIFF,PAYMENT_RATIO,MOVING_AVG_PAYMENT,TOTAL_PAID_SO_FAR,PAYMENT_REGULARITY,DELAYED_PAYMENT_COUNT,VERSION_PAYMENT_INTERACTION,VERSION_CHANGE_CAT,DAY_COUNT,SUM_LAST_180_DAYS
1132295,1000011,1.0,1,-435.0,-438.0,92435.04,92435.04,268838.0,0.0,-3.0,1.0,92435.04,1109158.65,1.969464,2,92435.04,0.0,2487.0,92435.04
1132292,1000011,1.0,2,-405.0,-411.0,92435.04,92435.04,268838.0,0.0,-6.0,1.0,92435.04,1016723.61,1.969464,2,92435.04,0.0,2517.0,184870.08
1132287,1000011,1.0,3,-375.0,-377.0,92435.04,92435.04,268838.0,0.0,-2.0,1.0,92414.43,646983.45,1.969464,1,92435.04,0.0,2547.0,277305.12
1132281,1000011,1.0,4,-345.0,-346.0,92435.04,92435.04,268838.0,0.0,-1.0,1.0,92435.04,369740.16,1.969464,1,92435.04,0.0,2577.0,369740.16
1132275,1000011,1.0,5,-315.0,-314.0,92435.04,92435.04,268838.0,0.0,1.0,1.0,92435.04,92435.04,1.969464,1,92435.04,0.0,2607.0,462175.2


In [11]:
# fill na with mode
for col in installments.columns:
    installments[col].fillna(installments[col].mode()[0], inplace=True)

installments.isnull().sum()

SK_ID_PREV                     0
NUM_INSTALMENT_VERSION         0
NUM_INSTALMENT_NUMBER          0
DAYS_INSTALMENT                0
DAYS_ENTRY_PAYMENT             0
AMT_INSTALMENT                 0
AMT_PAYMENT                    0
SK_ID_CURR                     0
VERSION_CHANGE                 0
TIMING_DIFF                    0
PAYMENT_RATIO                  0
MOVING_AVG_PAYMENT             0
TOTAL_PAID_SO_FAR              0
PAYMENT_REGULARITY             0
DELAYED_PAYMENT_COUNT          0
VERSION_PAYMENT_INTERACTION    0
VERSION_CHANGE_CAT             0
DAY_COUNT                      0
SUM_LAST_180_DAYS              0
dtype: int64

aggregate

In [12]:
installments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2138490 entries, 1132295 to 1054068
Data columns (total 19 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   SK_ID_PREV                   int64   
 1   NUM_INSTALMENT_VERSION       float64 
 2   NUM_INSTALMENT_NUMBER        int64   
 3   DAYS_INSTALMENT              float64 
 4   DAYS_ENTRY_PAYMENT           float64 
 5   AMT_INSTALMENT               float64 
 6   AMT_PAYMENT                  float64 
 7   SK_ID_CURR                   float64 
 8   VERSION_CHANGE               float64 
 9   TIMING_DIFF                  float64 
 10  PAYMENT_RATIO                float64 
 11  MOVING_AVG_PAYMENT           float64 
 12  TOTAL_PAID_SO_FAR            float64 
 13  PAYMENT_REGULARITY           float64 
 14  DELAYED_PAYMENT_COUNT        int64   
 15  VERSION_PAYMENT_INTERACTION  float64 
 16  VERSION_CHANGE_CAT           category
 17  DAY_COUNT                    float64 
 18  SUM_LAST_180_DAYS    

In [13]:
aggregations_1 = {
    'SK_ID_CURR': ['first'],
    'NUM_INSTALMENT_VERSION': ['mean', 'sum', 'std'],
    'NUM_INSTALMENT_NUMBER': ['mean', 'sum', 'std'],
    'DAYS_INSTALMENT': ['mean', 'sum', 'std'],
    'DAYS_ENTRY_PAYMENT': ['mean', 'sum', 'std'],
    'AMT_INSTALMENT': ['mean', 'sum', 'std'],
    'AMT_PAYMENT': ['mean', 'sum', 'std'],
    'VERSION_CHANGE': ['mean', 'sum', 'std'],
    'TIMING_DIFF': ['mean', 'sum', 'std', 'max'],
    'PAYMENT_RATIO': ['mean', 'sum', 'std', 'min'],
    'MOVING_AVG_PAYMENT': ['mean', 'sum', 'std'],
    'TOTAL_PAID_SO_FAR': ['mean', 'sum', 'std'],
    'PAYMENT_REGULARITY': ['mean', 'sum', 'std'],
    'DELAYED_PAYMENT_COUNT': ['mean', 'sum', 'std'],
    'VERSION_PAYMENT_INTERACTION': ['mean', 'sum', 'std'],
    'VERSION_CHANGE_CAT': ['nunique'],
    'DAY_COUNT': ['mean', 'sum', 'std'],
    'SUM_LAST_180_DAYS': ['mean', 'sum', 'std']
}

installments_agg_1 = installments.groupby('SK_ID_PREV').agg(aggregations_1)
installments_agg_1.columns = pd.Index([e[0] + "_" + e[1].upper() for e in installments_agg_1.columns.tolist()])
installments_agg_1.head()

Unnamed: 0_level_0,SK_ID_CURR_FIRST,NUM_INSTALMENT_VERSION_MEAN,NUM_INSTALMENT_VERSION_SUM,NUM_INSTALMENT_VERSION_STD,NUM_INSTALMENT_NUMBER_MEAN,NUM_INSTALMENT_NUMBER_SUM,NUM_INSTALMENT_NUMBER_STD,DAYS_INSTALMENT_MEAN,DAYS_INSTALMENT_SUM,DAYS_INSTALMENT_STD,DAYS_ENTRY_PAYMENT_MEAN,DAYS_ENTRY_PAYMENT_SUM,DAYS_ENTRY_PAYMENT_STD,AMT_INSTALMENT_MEAN,AMT_INSTALMENT_SUM,AMT_INSTALMENT_STD,AMT_PAYMENT_MEAN,AMT_PAYMENT_SUM,AMT_PAYMENT_STD,VERSION_CHANGE_MEAN,VERSION_CHANGE_SUM,VERSION_CHANGE_STD,TIMING_DIFF_MEAN,TIMING_DIFF_SUM,TIMING_DIFF_STD,TIMING_DIFF_MAX,PAYMENT_RATIO_MEAN,PAYMENT_RATIO_SUM,PAYMENT_RATIO_STD,PAYMENT_RATIO_MIN,MOVING_AVG_PAYMENT_MEAN,MOVING_AVG_PAYMENT_SUM,MOVING_AVG_PAYMENT_STD,TOTAL_PAID_SO_FAR_MEAN,TOTAL_PAID_SO_FAR_SUM,TOTAL_PAID_SO_FAR_STD,PAYMENT_REGULARITY_MEAN,PAYMENT_REGULARITY_SUM,PAYMENT_REGULARITY_STD,DELAYED_PAYMENT_COUNT_MEAN,DELAYED_PAYMENT_COUNT_SUM,DELAYED_PAYMENT_COUNT_STD,VERSION_PAYMENT_INTERACTION_MEAN,VERSION_PAYMENT_INTERACTION_SUM,VERSION_PAYMENT_INTERACTION_STD,VERSION_CHANGE_CAT_NUNIQUE,DAY_COUNT_MEAN,DAY_COUNT_SUM,DAY_COUNT_STD,SUM_LAST_180_DAYS_MEAN,SUM_LAST_180_DAYS_SUM,SUM_LAST_180_DAYS_STD
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
1000011,268838.0,1.0,12.0,0.0,6.5,78,3.605551,-270.0,-3240.0,108.166538,-271.666667,-3260.0,108.963158,92429.8875,1109158.65,17.848784,92429.8875,1109158.65,17.848784,0.0,0.0,0.0,-1.666667,-20.0,1.969464,1.0,1.0,12.0,0.0,1.0,92429.8875,1109159.0,9.321223,600791.6925,7209500.31,333251.994979,1.969464,23.633566,0.0,1.333333,16,0.492366,92429.8875,1109158.65,17.848784,1,2652.0,31824.0,108.166538,600822.6075,7209871.29,333270.702459
1000019,193910.0,1.0,13.0,0.0,6.538462,85,3.454837,-2140.846154,-27831.0,103.645105,-2153.384615,-27994.0,105.266755,6067.201154,78873.615,17.897402,5600.111538,72801.45,1682.691565,0.0,0.0,0.0,-12.538462,-163.0,8.362769,2.0,0.923077,12.0,0.277346,1.5e-05,5210.875962,67741.39,1838.070588,36388.35,473048.55,23622.799285,8.362769,108.715991,0.0,0.615385,8,0.50637,5600.111538,72801.45,1682.691565,1,781.153846,10155.0,103.645105,39230.570769,509997.42,20969.723379
1000025,3770.0,1.090909,12.0,0.301511,6.0,66,3.316625,-298.0,-3278.0,99.498744,-310.727273,-3418.0,98.041921,2850.435,31354.785,774.896216,2850.435,31354.785,774.896216,0.0,0.0,0.447214,-12.727273,-140.0,6.497552,-7.0,1.0,11.0,0.0,1.0,2772.555,30498.1,346.544123,16168.05,177848.55,9407.980354,6.497552,71.473072,0.0,0.0,0,0.0,3321.965455,36541.62,2338.785811,3,2624.0,28864.0,99.498744,15934.41,175278.51,9091.177523
1000042,62690.0,1.0,8.0,0.0,4.5,36,2.44949,-1676.0,-13408.0,73.484692,-1681.625,-13453.0,73.853015,5085.03375,40680.27,1.240972,5085.03375,40680.27,1.240972,0.0,0.0,0.0,-5.625,-45.0,2.66927,-1.0,1.0,8.0,0.0,1.0,5085.03375,40680.27,0.605533,22881.99375,183055.95,12456.198642,2.66927,21.354157,0.0,0.0,0,0.0,5085.03375,40680.27,1.240972,1,1246.0,9968.0,73.484692,22881.11625,183048.93,12455.379816
1000049,234821.0,3.5,21.0,1.870829,2.166667,13,1.169045,-280.0,-1680.0,35.071356,-290.0,-1740.0,36.823905,8539.02,51234.12,5831.210231,14539.02,87234.12,6541.326026,-0.333333,-2.0,2.73252,-10.0,-60.0,2.44949,-6.0,2.267795,13.606773,1.80577,1.0,12274.07,73644.42,5863.11094,44463.24,266779.44,31551.795435,2.44949,14.696938,0.0,0.0,0,0.0,44952.06,269712.36,29220.299006,6,2642.0,15852.0,35.071356,56821.08,340926.48,26032.843949


In [14]:
# rename Sk_ID_CURR_FIRST to SK_ID_CURR
installments_agg_1.rename(columns={'SK_ID_CURR_FIRST': 'SK_ID_CURR'}, inplace=True)
installments_agg_2 = installments_agg_1.groupby('SK_ID_CURR').agg(['mean', 'sum', 'std'])

# rename columns
installments_agg_2.columns = pd.Index([e[0] + "_" + e[1].upper() for e in installments_agg_2.columns.tolist()])

installments_agg_2.head()

Unnamed: 0_level_0,NUM_INSTALMENT_VERSION_MEAN_MEAN,NUM_INSTALMENT_VERSION_MEAN_SUM,NUM_INSTALMENT_VERSION_MEAN_STD,NUM_INSTALMENT_VERSION_SUM_MEAN,NUM_INSTALMENT_VERSION_SUM_SUM,NUM_INSTALMENT_VERSION_SUM_STD,NUM_INSTALMENT_VERSION_STD_MEAN,NUM_INSTALMENT_VERSION_STD_SUM,NUM_INSTALMENT_VERSION_STD_STD,NUM_INSTALMENT_NUMBER_MEAN_MEAN,NUM_INSTALMENT_NUMBER_MEAN_SUM,NUM_INSTALMENT_NUMBER_MEAN_STD,NUM_INSTALMENT_NUMBER_SUM_MEAN,NUM_INSTALMENT_NUMBER_SUM_SUM,NUM_INSTALMENT_NUMBER_SUM_STD,NUM_INSTALMENT_NUMBER_STD_MEAN,NUM_INSTALMENT_NUMBER_STD_SUM,NUM_INSTALMENT_NUMBER_STD_STD,DAYS_INSTALMENT_MEAN_MEAN,DAYS_INSTALMENT_MEAN_SUM,DAYS_INSTALMENT_MEAN_STD,DAYS_INSTALMENT_SUM_MEAN,DAYS_INSTALMENT_SUM_SUM,DAYS_INSTALMENT_SUM_STD,DAYS_INSTALMENT_STD_MEAN,DAYS_INSTALMENT_STD_SUM,DAYS_INSTALMENT_STD_STD,DAYS_ENTRY_PAYMENT_MEAN_MEAN,DAYS_ENTRY_PAYMENT_MEAN_SUM,DAYS_ENTRY_PAYMENT_MEAN_STD,DAYS_ENTRY_PAYMENT_SUM_MEAN,DAYS_ENTRY_PAYMENT_SUM_SUM,DAYS_ENTRY_PAYMENT_SUM_STD,DAYS_ENTRY_PAYMENT_STD_MEAN,DAYS_ENTRY_PAYMENT_STD_SUM,DAYS_ENTRY_PAYMENT_STD_STD,AMT_INSTALMENT_MEAN_MEAN,AMT_INSTALMENT_MEAN_SUM,AMT_INSTALMENT_MEAN_STD,AMT_INSTALMENT_SUM_MEAN,AMT_INSTALMENT_SUM_SUM,AMT_INSTALMENT_SUM_STD,AMT_INSTALMENT_STD_MEAN,AMT_INSTALMENT_STD_SUM,AMT_INSTALMENT_STD_STD,AMT_PAYMENT_MEAN_MEAN,AMT_PAYMENT_MEAN_SUM,AMT_PAYMENT_MEAN_STD,AMT_PAYMENT_SUM_MEAN,AMT_PAYMENT_SUM_SUM,...,TOTAL_PAID_SO_FAR_STD_SUM,TOTAL_PAID_SO_FAR_STD_STD,PAYMENT_REGULARITY_MEAN_MEAN,PAYMENT_REGULARITY_MEAN_SUM,PAYMENT_REGULARITY_MEAN_STD,PAYMENT_REGULARITY_SUM_MEAN,PAYMENT_REGULARITY_SUM_SUM,PAYMENT_REGULARITY_SUM_STD,PAYMENT_REGULARITY_STD_MEAN,PAYMENT_REGULARITY_STD_SUM,PAYMENT_REGULARITY_STD_STD,DELAYED_PAYMENT_COUNT_MEAN_MEAN,DELAYED_PAYMENT_COUNT_MEAN_SUM,DELAYED_PAYMENT_COUNT_MEAN_STD,DELAYED_PAYMENT_COUNT_SUM_MEAN,DELAYED_PAYMENT_COUNT_SUM_SUM,DELAYED_PAYMENT_COUNT_SUM_STD,DELAYED_PAYMENT_COUNT_STD_MEAN,DELAYED_PAYMENT_COUNT_STD_SUM,DELAYED_PAYMENT_COUNT_STD_STD,VERSION_PAYMENT_INTERACTION_MEAN_MEAN,VERSION_PAYMENT_INTERACTION_MEAN_SUM,VERSION_PAYMENT_INTERACTION_MEAN_STD,VERSION_PAYMENT_INTERACTION_SUM_MEAN,VERSION_PAYMENT_INTERACTION_SUM_SUM,VERSION_PAYMENT_INTERACTION_SUM_STD,VERSION_PAYMENT_INTERACTION_STD_MEAN,VERSION_PAYMENT_INTERACTION_STD_SUM,VERSION_PAYMENT_INTERACTION_STD_STD,VERSION_CHANGE_CAT_NUNIQUE_MEAN,VERSION_CHANGE_CAT_NUNIQUE_SUM,VERSION_CHANGE_CAT_NUNIQUE_STD,DAY_COUNT_MEAN_MEAN,DAY_COUNT_MEAN_SUM,DAY_COUNT_MEAN_STD,DAY_COUNT_SUM_MEAN,DAY_COUNT_SUM_SUM,DAY_COUNT_SUM_STD,DAY_COUNT_STD_MEAN,DAY_COUNT_STD_SUM,DAY_COUNT_STD_STD,SUM_LAST_180_DAYS_MEAN_MEAN,SUM_LAST_180_DAYS_MEAN_SUM,SUM_LAST_180_DAYS_MEAN_STD,SUM_LAST_180_DAYS_SUM_MEAN,SUM_LAST_180_DAYS_SUM_SUM,SUM_LAST_180_DAYS_SUM_STD,SUM_LAST_180_DAYS_STD_MEAN,SUM_LAST_180_DAYS_STD_SUM,SUM_LAST_180_DAYS_STD_STD
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
0.0,1.008333,4.033333,0.683333,3.25,13.0,2.5,0.432918,1.731671,0.307497,4.275,17.1,4.262531,59.0,236,102.127371,2.342536,9.370143,2.519231,-174.225,-696.9,133.984088,-1046.5,-4186.0,892.563536,42.334654,169.338617,21.820181,-194.658333,-778.633333,147.253479,-1128.25,-4513.0,928.598361,46.285659,185.142637,23.073263,17514.131812,70056.52725,15919.81711,77735.59875,310942.395,44149.780173,21746.780365,86987.12146,19847.275901,17514.131812,70056.52725,15919.81711,77735.59875,310942.395,...,118840.928285,21502.535674,9.715769,38.863077,12.162081,56.001934,224.007736,59.820937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,28815.823125,115263.2925,31829.756403,91216.99125,364867.965,96917.351614,40910.91376,163643.655041,47457.120175,2.0,8,0.816497,2747.775,10991.1,133.984088,20868.5,83474.0,24011.566831,42.334654,169.338617,21.820181,35637.304125,142549.2165,19292.800471,335304.9,1341220.0,466832.9,34393.312986,137573.251946,21278.096586
13.0,1.05,4.2,0.1,8.5,34.0,3.0,0.111803,0.447214,0.223607,3.875,15.5,1.108678,33.25,133,18.553975,2.065846,8.263383,0.652587,-1898.0,-7592.0,850.932821,-16248.75,-64995.0,10179.483136,61.975373,247.901491,19.577615,-1897.129167,-7588.516667,847.083397,-16226.5,-64906.0,10092.309729,62.116599,248.466396,18.978997,6981.040125,27924.1605,6579.676297,70153.3125,280613.25,88396.993206,363.541987,1454.167946,711.407053,4878.042938,19512.17175,2404.872114,44917.34625,179669.385,...,55727.405354,11993.84007,6.011489,24.045955,6.531278,59.828402,239.313608,83.016513,0.0,0.0,0.0,1.416667,5.666667,2.833333,17.0,68,34.0,0.841625,3.366502,1.683251,5208.054938,20832.21975,2341.48323,46567.40625,186269.625,37395.354749,3292.654019,13170.616074,4203.378907,1.5,6,1.0,1024.0,4096.0,850.932821,7857.75,31431.0,6014.75244,61.975373,247.901491,19.577615,24934.389188,99737.55675,21475.848273,253275.3,1013101.0,290386.7,13554.944109,54219.776437,11263.786381
24.0,1.036594,6.219564,0.050761,17.0,102.0,20.34699,0.127883,0.767298,0.155846,6.809277,40.855662,5.53705,204.333333,1226,395.494964,3.584652,21.507914,2.818806,-1607.555024,-9645.330144,516.829884,-19578.833333,-117473.0,14042.817216,107.539572,645.237435,84.564171,-1616.164452,-9696.986709,515.44187,-19730.333333,-118382.0,14226.867085,108.111699,648.670192,84.09091,14935.055567,89610.333403,10192.974526,374963.07,2249778.42,659343.812955,3334.788887,20008.73332,5778.476732,12991.032481,77946.194886,7610.318095,266221.245,1597327.47,...,465878.959845,118756.323016,8.014807,48.088842,4.6519,199.373146,1196.238875,353.210466,0.0,0.0,0.0,2.415736,14.494418,5.400545,129.333333,776,310.958947,1.085847,6.515084,2.405236,14407.822505,86446.93503,9309.327604,287195.4,1723172.4,418098.150101,9059.258252,54355.549512,13977.387123,1.833333,11,0.983192,1314.444976,7886.669856,516.829884,28634.166667,171805.0,45079.375481,107.539572,645.237435,84.564171,132371.133229,794226.799372,193451.287928,5369860.0,32219160.0,11838970.0,71304.141231,427824.847386,100189.96812
26.0,1.0,2.0,0.0,17.5,35.0,3.535534,0.0,0.0,0.0,7.15,14.3,0.353553,124.5,249,19.091883,3.605152,7.210304,0.628673,-923.5,-1847.0,980.757106,-17895.0,-35790.0,20428.314908,108.154563,216.309126,18.860203,-920.725,-1841.45,971.741494,-17830.5,-35661.0,20260.730601,110.800225,221.60045,15.228224,12701.172375,25402.34475,11947.746578,201149.685,402299.37,164180.139533,2.158365,4.316729,3.052389,11145.427875,22290.85575,12153.907095,173559.7125,347119.425,...,108874.531293,56773.897384,11.848987,23.697973,5.496447,217.073702,434.147405,138.080315,0.0,0.0,0.0,2.941667,5.883333,3.123055,57.0,114,65.053824,1.83077,3.66154,1.229828,11145.427875,22290.85575,12153.907095,173559.7125,347119.425,173288.33602,3788.713329,7577.426658,2363.689296,1.0,2,0.0,1998.5,3997.0,980.757106,33240.0,66480.0,10097.484835,108.154563,216.309126,18.860203,91587.721875,183175.44375,89815.024852,1444013.0,2888026.0,1247951.0,49732.041516,99464.083031,51774.853563
41.0,1.085556,5.427778,0.103309,8.8,44.0,4.549725,0.218306,1.091532,0.217652,4.115556,20.577778,2.311226,41.0,205,45.310043,2.182179,10.910894,1.369451,-1652.533333,-8262.666667,740.340972,-11795.0,-58975.0,5427.741289,65.465364,327.32682,41.083545,-1666.033889,-8330.169444,733.542844,-11908.0,-59540.0,5346.324485,76.87833,384.391651,36.785892,10475.698025,52378.490125,7296.512378,75933.9,379669.5,53286.068541,10161.201418,50806.007092,13670.262154,9733.315825,48666.579125,7675.386677,70924.005,354620.025,...,93616.835359,18288.83234,21.320304,106.60152,24.218941,136.839926,684.199629,109.270546,0.0,0.0,0.0,0.757222,3.786111,0.865374,6.0,30,7.842194,0.414013,2.070063,0.519394,14437.319275,72186.596375,14836.928798,101122.524,505612.62,89005.042097,22646.937192,113234.685962,27711.005093,2.0,10,1.0,1269.466667,6347.333333,740.340972,12165.4,60827.0,13461.960492,65.465364,327.32682,41.083545,29362.048825,146810.244125,18468.673915,278027.9,1390140.0,340201.5,21640.70755,108203.537752,16142.678672


In [15]:
installments_agg_2.to_csv('processed-data/installments_agg_2.csv')

In [16]:
app_train = pd.read_csv('processed-data/app_train.csv')
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head()

FileNotFoundError: [Errno 2] No such file or directory: 'processed-data/app_train.csv'

In [None]:
# merge train data with installments
app_train = app_train.merge(installments_agg_2, left_index=True, right_index=True, how='left')
app_train.head()


In [None]:
app_train.info()

In [None]:
# check inf values
app_train.replace([np.inf, -np.inf], np.nan, inplace=True)
app_train.isnull().sum().sum()

# fill na with 0
for col in app_train.columns:
    app_train[col].fillna(0, inplace=True)
app_train.isnull().sum().sum()

0

In [None]:
X, y = app_train.drop('TARGET', axis=1), app_train['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
logreg = LogisticRegression(class_weight='balanced', solver='newton-cholesky')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

gini(y_test, y_pred_proba)

0.4831450610737624

In [None]:
app_train = pd.read_csv('processed-data/app_train.csv')
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head()

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_QRT,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Other,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Other,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Other,NAME_FAMILY_STATUS_Single / not married,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Other,OCCUPATION_TYPE_Sales staff,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Self-employed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,1,0,0,0,2,2,0,0,0,0,0,0,0.126697,0.28518,0.0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,1,0,0,0,3,3,0,0,0,0,0,0,0.598301,0.7463,-142.0,0,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,1,0,0,0,2,2,0,0,0,0,0,0,0.293988,0.415347,0.0,0,1,0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,1,0,1,0,2,2,0,0,0,0,0,0,0.070575,0.397946,-725.0,0,1,0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,1,0,0,0,2,2,0,0,0,0,1,1,0.50179,0.52989,0.0,0,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
installment = process_installment(installments)
installment.head()

"Installments Payments" final shape: (97828, 85)


Unnamed: 0_level_0,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_DPD_MEAN,INSTAL_DPD_SUM,INSTAL_DPD_VAR,INSTAL_DBD_MAX,INSTAL_DBD_MEAN,INSTAL_DBD_SUM,INSTAL_DBD_VAR,INSTAL_PAYMENT_PERC_MAX,INSTAL_PAYMENT_PERC_MEAN,INSTAL_PAYMENT_PERC_SUM,INSTAL_PAYMENT_PERC_VAR,INSTAL_PAYMENT_DIFF_MAX,INSTAL_PAYMENT_DIFF_MEAN,INSTAL_PAYMENT_DIFF_SUM,INSTAL_PAYMENT_DIFF_VAR,INSTAL_AMT_INSTALMENT_MAX,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_INSTALMENT_MIN,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MIN,INSTAL_SK_ID_PREV_SIZE,INSTAL_SK_ID_PREV_NUNIQUE,INSTAL_PAYMENT_DIFFERENCE_MEAN,INSTAL_PAYMENT_RATIO_MEAN,INSTAL_PAYMENT_RATIO_MAX,INSTAL_LATE_PAYMENT_MEAN,INSTAL_LATE_PAYMENT_SUM,INSTAL_SIGNIFICANT_LATE_PAYMENT_MEAN,INSTAL_SIGNIFICANT_LATE_PAYMENT_SUM,INSTAL_LATE_PAYMENT_RATIO_MEAN,INSTAL_DPD_7_MEAN,INSTAL_DPD_15_MEAN,INSTAL_PAID_OVER_MEAN,INSTAL_DPD_diff_MEAN,INSTAL_DPD_diff_MIN,INSTAL_DPD_diff_MAX,INSTAL_DBD_diff_MEAN,INSTAL_DBD_diff_MIN,INSTAL_DBD_diff_MAX,INSTAL_DAYS_INSTALMENT_MEAN,INSTAL_DAYS_INSTALMENT_MAX,INSTAL_DAYS_INSTALMENT_SUM,INSTAL_INS_IS_DPD_UNDER_120_MEAN,INSTAL_INS_IS_DPD_UNDER_120_SUM,INSTAL_INS_IS_DPD_OVER_120_MEAN,INSTAL_INS_IS_DPD_OVER_120_SUM,INSTAL_COUNT,INS_D365SK_ID_CURR_COUNT,INS_D365NUM_INSTALMENT_VERSION_NUNIQUE,INS_D365DAYS_ENTRY_PAYMENT_MEAN,INS_D365DAYS_ENTRY_PAYMENT_MAX,INS_D365DAYS_ENTRY_PAYMENT_SUM,INS_D365DAYS_INSTALMENT_MEAN,INS_D365DAYS_INSTALMENT_MAX,INS_D365DAYS_INSTALMENT_SUM,INS_D365AMT_INSTALMENT_MEAN,INS_D365AMT_INSTALMENT_MAX,INS_D365AMT_INSTALMENT_SUM,INS_D365AMT_PAYMENT_MEAN,INS_D365AMT_PAYMENT_MAX,INS_D365AMT_PAYMENT_SUM,INS_D365PAYMENT_DIFF_MEAN,INS_D365PAYMENT_DIFF_MIN,INS_D365PAYMENT_DIFF_MAX,INS_D365PAYMENT_DIFF_SUM,INS_D365PAYMENT_PERC_MEAN,INS_D365PAYMENT_PERC_MAX,INS_D365DPD_DIFF_MEAN,INS_D365DPD_DIFF_MIN,INS_D365DPD_DIFF_MAX,INS_D365DPD_MEAN,INS_D365DPD_SUM,INS_D365INS_IS_DPD_UNDER_120_MEAN,INS_D365INS_IS_DPD_UNDER_120_SUM,INS_D365INS_IS_DPD_OVER_120_MEAN,INS_D365INS_IS_DPD_OVER_120_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1
0.0,3,0.0,0.0,0.0,0.0,64.0,10.9,327.0,374.575862,1.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,100287.765,10364.7465,310942.395,141.57,141.57,100287.765,10364.7465,310942.395,-5.0,-150.433333,-4513.0,-446.0,30,4,0.0,1.0,1.0,0.366667,11,0.366667,11,0.366667,0.0,0.0,0.0,-10.9,-64.0,0.0,10.9,0.0,64.0,-139.533333,-5.0,-4186.0,0.0,0,0.0,0,30,26.0,3.0,-107.961538,-5.0,-2807.0,-103.461538,-5.0,-2690.0,10965.515192,100287.765,285103.395,10965.515192,100287.765,285103.395,0.0,0.0,0.0,0.0,1.0,1.0,-4.5,-33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,4,0.0,0.0,0.0,0.0,99.0,11.581395,498.0,459.439646,2.51059,1.050525,45.172583,0.062151,0.0,-204.069767,-8775.0,911952.8,43050.285,5691.316395,244726.605,78.435,78.435,43050.285,5895.386163,253501.605,-5.0,-228.232558,-9814.0,-713.0,43,4,-612.209302,0.965116,1.0,0.511628,22,0.511628,22,0.562153,0.0,0.0,0.046512,-11.581395,-99.0,0.0,11.581395,0.0,99.0,-216.651163,-5.0,-9316.0,0.0,0,0.0,0,43,37.0,3.0,-175.675676,-5.0,-6500.0,-166.972973,-5.0,-6178.0,5453.012432,43050.285,201761.46,5453.012432,43050.285,201761.46,0.0,0.0,0.0,0.0,1.0,1.0,-8.702703,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,1,0.0,0.0,0.0,0.0,17.0,9.2,92.0,13.066667,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,9820.755,9818.6445,98186.445,9799.65,9799.65,9820.755,9818.6445,98186.445,-1808.0,-1944.2,-19442.0,-2083.0,10,1,0.0,1.0,1.0,1.0,10,1.0,10,1.0,0.0,0.0,0.0,-9.2,-17.0,-6.0,9.2,6.0,17.0,-1935.0,-1800.0,-19350.0,0.0,0,0.0,0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6.0,2,1.0,0.022222,1.0,0.022222,26.0,10.133333,456.0,55.3,1.0,1.0,45.0,0.0,0.0,0.0,0.0,0.0,185997.555,32920.481,1481421.645,4442.265,4442.265,185997.555,32920.481,1481421.645,-14.0,-466.355556,-20986.0,-1057.0,45,4,0.0,1.0,1.0,0.955556,43,0.955556,43,0.955556,0.0,0.0,0.0,-10.111111,-26.0,1.0,10.111111,-1.0,26.0,-456.244444,-14.0,-20531.0,0.022222,1,0.0,0,45,20.0,1.0,-170.45,-14.0,-3409.0,-160.2,-14.0,-3204.0,38963.682,61977.96,779273.64,38963.682,61977.96,779273.64,0.0,0.0,0.0,0.0,1.0,1.0,-10.25,-26.0,1.0,0.05,1.0,0.05,1.0,0.0,0.0
10.0,3,2.0,0.058824,6.0,0.075713,38.0,10.362745,1057.0,76.985925,1.0,0.990196,101.0,0.008126,13599.09,147.075882,15001.74,1828676.0,204649.695,20810.911324,2122712.955,6614.505,1402.65,204649.695,20663.835441,2107711.215,-35.0,-626.598039,-63913.0,-1548.0,102,5,0.0,1.0,1.0,0.921569,94,0.921569,94,0.912681,0.0,0.0,0.0,-10.303922,-38.0,2.0,10.303922,-2.0,38.0,-616.294118,-28.0,-62862.0,0.04902,5,0.0,0,102,30.0,3.0,-207.4,-35.0,-6222.0,-200.366667,-28.0,-6011.0,25097.958,204649.695,752938.74,25097.958,204649.695,752938.74,0.0,0.0,0.0,0.0,1.0,1.0,-7.033333,-35.0,1.0,0.1,3.0,0.1,3.0,0.0,0.0


In [None]:
installment.to_csv('processed-data/processed_installment.csv')