In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from functions import *

pd.set_option('display.max_columns', 100)

In [2]:
def process_installment(ins):

    ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)

    # Group payments and get Payment difference
    ins = do_sum(ins, ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'], 'AMT_PAYMENT', 'AMT_PAYMENT_GROUPED')
    ins['PAYMENT_DIFFERENCE'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT_GROUPED']
    ins['PAYMENT_RATIO'] = ins['AMT_INSTALMENT'] / ins['AMT_PAYMENT_GROUPED']
    ins['PAID_OVER_AMOUNT'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT']
    ins['PAID_OVER'] = (ins['PAID_OVER_AMOUNT'] > 0).astype(int)

    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values)
    ins['DPD_diff'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD_diff'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD_diff'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD_diff'].apply(lambda x: x if x > 0 else 0)

    # Flag late payment
    ins['LATE_PAYMENT'] = ins['DBD'].apply(lambda x: 1 if x > 0 else 0)
    ins['INSTALMENT_PAYMENT_RATIO'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['LATE_PAYMENT_RATIO'] = ins.apply(lambda x: x['INSTALMENT_PAYMENT_RATIO'] if x['LATE_PAYMENT'] == 1 else 0, axis=1)

    # Flag late payments that have a significant amount
    ins['SIGNIFICANT_LATE_PAYMENT'] = ins['LATE_PAYMENT_RATIO'].apply(lambda x: 1 if x > 0.05 else 0)
    
    # Flag k threshold late payments
    ins['DPD_7'] = ins['DPD'].apply(lambda x: 1 if x >= 7 else 0)
    ins['DPD_15'] = ins['DPD'].apply(lambda x: 1 if x >= 15 else 0)

    ins['INS_IS_DPD_UNDER_120'] = ins['DPD'].apply(lambda x: 1 if (x > 0) & (x < 120) else 0)
    ins['INS_IS_DPD_OVER_120'] = ins['DPD'].apply(lambda x: 1 if (x >= 120) else 0)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum', 'var'],
        'DBD': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum', 'min'],
        'SK_ID_PREV': ['size', 'nunique'],
        'PAYMENT_DIFFERENCE': ['mean'],
        'PAYMENT_RATIO': ['mean', 'max'],
        'LATE_PAYMENT': ['mean', 'sum'],
        'SIGNIFICANT_LATE_PAYMENT': ['mean', 'sum'],
        'LATE_PAYMENT_RATIO': ['mean'],
        'DPD_7': ['mean'],
        'DPD_15': ['mean'],
        'PAID_OVER': ['mean'],
        'DPD_diff':['mean', 'min', 'max'],
        'DBD_diff':['mean', 'min', 'max'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']
    }

    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    # from oof (DAYS_ENTRY_PAYMENT)
    cond_day = ins['DAYS_ENTRY_PAYMENT'] >= -365
    ins_d365_grp = ins[cond_day].groupby('SK_ID_CURR')
    ins_d365_agg_dict = {
        'SK_ID_CURR': ['count'],
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DAYS_ENTRY_PAYMENT': ['mean', 'max', 'sum'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_PAYMENT': ['mean', 'max', 'sum'],
        'PAYMENT_DIFF': ['mean', 'min', 'max', 'sum'],
        'PAYMENT_PERC': ['mean', 'max'],
        'DPD_diff': ['mean', 'min', 'max'],
        'DPD': ['mean', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']}

    ins_d365_agg = ins_d365_grp.agg(ins_d365_agg_dict)
    ins_d365_agg.columns = ['INS_D365' + ('_').join(column).upper() for column in ins_d365_agg.columns.ravel()]

    ins_agg = ins_agg.merge(ins_d365_agg, on='SK_ID_CURR', how='left')

    print('"Installments Payments" final shape:', ins_agg.shape)
    return ins_agg

raw

In [3]:
installments = pd.read_csv('raw-data/dseb63_installments_payments.csv')
installments.head()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR
0,1054186,1.0,6,-1180.0,-1187.0,6948.36,6948.36,147397.0
1,2452854,1.0,21,-546.0,-552.0,11302.605,11302.605,147397.0
2,1054186,1.0,2,-1300.0,-1307.0,6948.36,6948.36,147397.0
3,1682318,1.0,2,-240.0,-243.0,7374.51,7374.51,147397.0
4,2452854,1.0,10,-876.0,-882.0,11302.605,11302.605,147397.0


eda

In [15]:
installments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4403652 entries, 0 to 4403651
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   NUM_INSTALMENT_VERSION  float64
 2   NUM_INSTALMENT_NUMBER   int64  
 3   DAYS_INSTALMENT         float64
 4   DAYS_ENTRY_PAYMENT      float64
 5   AMT_INSTALMENT          float64
 6   AMT_PAYMENT             float64
 7   SK_ID_CURR              float64
 8   TARGET                  float64
dtypes: float64(7), int64(2)
memory usage: 302.4 MB


In [16]:
installments.describe()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR,TARGET
count,4403652.0,4403652.0,4403652.0,4403651.0,4402750.0,4403651.0,4402750.0,4403651.0,3520912.0
mean,1900949.0,0.8087169,20.64982,-1066.716,-1075.201,16610.73,16700.72,153999.8,0.07686134
std,536580.1,0.9468355,28.29919,801.9356,801.6065,50052.58,53967.49,88411.78,0.2663713
min,1000001.0,0.0,1.0,-2922.0,-3115.0,0.0,0.0,0.0,0.0
25%,1431026.0,0.0,4.0,-1693.0,-1702.0,3960.72,3313.89,77573.0,0.0
50%,1894018.0,1.0,9.0,-860.0,-868.0,8525.97,7875.0,153873.0,0.0
75%,2368454.0,1.0,22.0,-378.0,-387.0,16293.15,15602.94,229956.0,0.0
max,2843495.0,68.0,275.0,-2.0,-2.0,3771488.0,3771488.0,307508.0,1.0


In [17]:
# # plot correlation matrix
# corr = installments.corr()
# plt.figure(figsize=(10, 10))
# sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
# plt.show()

In [18]:
# # plot histogram
# for col in installments.columns:
#     if col not in ['SK_ID_CURR', 'TARGET', 'SK_ID_PREV']:
#         fig, ax = plt.subplots(1, 2, figsize=(12, 4))
#         ax[0].hist(installments[col], bins=30)
#         ax[0].set_title(col)
#         ax[1].scatter(installments[col], installments['TARGET'])
#         ax[1].set_title(col)
#         plt.show()

fill na

In [19]:
# fill missing values with mode
for col in installments.columns:
    installments[col].fillna(installments[col].mode()[0], inplace=True)

installments.isnull().sum()

SK_ID_PREV                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
SK_ID_CURR                0
TARGET                    0
dtype: int64

In [20]:
# # plot histogram hue by target
# for col in installments.select_dtypes('number').columns:
#     if not col in ['SK_ID_CURR', 'TARGET', 'SK_ID_PREV']:
#         plt.figure(figsize=(12, 6))
#         sns.kdeplot(installments.loc[installments['TARGET'] == 0, col], label='target == 0')
#         sns.kdeplot(installments.loc[installments['TARGET'] == 1, col], label='target == 1')
#         plt.xlabel(col)
#         plt.ylabel('Density')
#         plt.title(f'{col} Distribution')
#         plt.legend()
#         plt.show()

feature engineering

In [21]:
# Change in Installment Version
installments['VERSION_CHANGE'] = installments.groupby('SK_ID_PREV')['NUM_INSTALMENT_VERSION'].diff().fillna(0)

# Installment Timing
installments['TIMING_DIFF'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']

# Payment Ratio
installments['PAYMENT_RATIO'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']

# Trend in Payment Amounts
installments['MOVING_AVG_PAYMENT'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# Cumulative Features
installments['TOTAL_PAID_SO_FAR'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].cumsum()

# Payment Regularity
installments['PAYMENT_REGULARITY'] = installments.groupby('SK_ID_PREV')['TIMING_DIFF'].transform('std')

# Delayed Payment Count
installments['DELAYED_PAYMENT_COUNT'] = (installments['TIMING_DIFF'] > 0).astype(int).groupby(installments['SK_ID_PREV']).cumsum()

# Interaction Features
installments['VERSION_PAYMENT_INTERACTION'] = installments['NUM_INSTALMENT_VERSION'] * installments['AMT_PAYMENT']

# Categorical Encoding of Version Changes
installments['VERSION_CHANGE_CAT'] = pd.Categorical(installments['VERSION_CHANGE'])


# Assuming installments is your DataFrame and DAYS_INSTALMENT is the relative day column
# Convert DAYS_INSTALMENT to a cumulative count
max_day = installments['DAYS_INSTALMENT'].abs().max()
installments['DAY_COUNT'] = max_day - installments['DAYS_INSTALMENT'].abs()

# Sort by this new count
installments = installments.sort_values(by=['SK_ID_PREV', 'DAY_COUNT'])

# Calculate the sum of payments for the last 180 days
# Here, 180 days is just an example, adjust as needed
installments['SUM_LAST_180_DAYS'] = installments.groupby('SK_ID_PREV')['AMT_PAYMENT'].rolling(window=180, min_periods=1).sum().reset_index(level=0, drop=True)

installments.head()

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR,TARGET,VERSION_CHANGE,TIMING_DIFF,PAYMENT_RATIO,MOVING_AVG_PAYMENT,TOTAL_PAID_SO_FAR,PAYMENT_REGULARITY,DELAYED_PAYMENT_COUNT,VERSION_PAYMENT_INTERACTION,VERSION_CHANGE_CAT,DAY_COUNT,SUM_LAST_180_DAYS
0,1000001,1.0,1,-268.0,-294.0,6404.31,6404.31,117953.0,0.0,0.0,-26.0,1.0,6404.31,6404.31,14.142136,0,6404.31,0.0,2654.0,6404.31
1,1000001,2.0,2,-238.0,-244.0,62039.115,62039.115,117953.0,0.0,1.0,-6.0,1.0,34221.7125,68443.425,14.142136,0,124078.23,1.0,2684.0,68443.425
7,1000004,1.0,1,-862.0,-881.0,3391.11,3391.11,187620.0,0.0,0.0,-19.0,1.0,3391.11,20346.66,17.046156,0,3391.11,0.0,2060.0,3391.11
4,1000004,1.0,2,-832.0,-851.0,3391.11,3391.11,187620.0,0.0,0.0,-19.0,1.0,3391.11,10173.33,17.046156,0,3391.11,0.0,2090.0,6782.22
5,1000004,1.0,3,-802.0,-830.0,3391.11,3391.11,187620.0,0.0,0.0,-28.0,1.0,3391.11,13564.44,17.046156,0,3391.11,0.0,2120.0,10173.33


In [None]:
# fill na with mode
for col in installments.columns:
    installments[col].fillna(installments[col].mode()[0], inplace=True)

installments.isnull().sum()

aggregate

In [None]:
installments.info()

In [None]:
aggregations_1 = {
    'SK_ID_CURR': ['first'],
    'NUM_INSTALMENT_VERSION': ['mean', 'sum', 'std'],
    'NUM_INSTALMENT_NUMBER': ['mean', 'sum', 'std'],
    'DAYS_INSTALMENT': ['mean', 'sum', 'std'],
    'DAYS_ENTRY_PAYMENT': ['mean', 'sum', 'std'],
    'AMT_INSTALMENT': ['mean', 'sum', 'std'],
    'AMT_PAYMENT': ['mean', 'sum', 'std'],
    'VERSION_CHANGE': ['mean', 'sum', 'std'],
    'TIMING_DIFF': ['mean', 'sum', 'std', 'max'],
    'PAYMENT_RATIO': ['mean', 'sum', 'std', 'min'],
    'MOVING_AVG_PAYMENT': ['mean', 'sum', 'std'],
    'TOTAL_PAID_SO_FAR': ['mean', 'sum', 'std'],
    'PAYMENT_REGULARITY': ['mean', 'sum', 'std'],
    'DELAYED_PAYMENT_COUNT': ['mean', 'sum', 'std'],
    'VERSION_PAYMENT_INTERACTION': ['mean', 'sum', 'std'],
    'VERSION_CHANGE_CAT': ['nunique'],
    'DAY_COUNT': ['mean', 'sum', 'std'],
    'SUM_LAST_180_DAYS': ['mean', 'sum', 'std']
}

installments_agg_1 = installments.groupby('SK_ID_PREV').agg(aggregations_1)
installments_agg_1.columns = pd.Index([e[0] + "_" + e[1].upper() for e in installments_agg_1.columns.tolist()])
installments_agg_1.head()

In [None]:
# rename Sk_ID_CURR_FIRST to SK_ID_CURR
installments_agg_1.rename(columns={'SK_ID_CURR_FIRST': 'SK_ID_CURR'}, inplace=True)
installments_agg_2 = installments_agg_1.groupby('SK_ID_CURR').agg(['mean', 'sum', 'std'])

# rename columns
installments_agg_2.columns = pd.Index([e[0] + "_" + e[1].upper() for e in installments_agg_2.columns.tolist()])

installments_agg_2.head()

In [None]:
installments_agg_2.to_csv('processed-data/installments_agg_2.csv')

In [None]:
app_train = pd.read_csv('processed-data/app_train.csv')
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head()

In [None]:
# merge train data with installments
app_train = app_train.merge(installments_agg_2, left_index=True, right_index=True, how='left')
app_train.head()


In [None]:
app_train.info()

In [8]:
# check inf values
app_train.replace([np.inf, -np.inf], np.nan, inplace=True)
app_train.isnull().sum().sum()

# fill na with 0
for col in app_train.columns:
    app_train[col].fillna(0, inplace=True)
app_train.isnull().sum().sum()

0

In [9]:
X, y = app_train.drop('TARGET', axis=1), app_train['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
logreg = LogisticRegression(class_weight='balanced', solver='newton-cholesky')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

gini(y_test, y_pred_proba)

0.4831450610737624

In [5]:
app_train = pd.read_csv('processed-data/app_train.csv')
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head()

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_QRT,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Other,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Other,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Other,NAME_FAMILY_STATUS_Single / not married,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Other,OCCUPATION_TYPE_Sales staff,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Self-employed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,1,0,0,0,2,2,0,0,0,0,0,0,0.126697,0.28518,0.0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,1,0,0,0,3,3,0,0,0,0,0,0,0.598301,0.7463,-142.0,0,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,1,0,0,0,2,2,0,0,0,0,0,0,0.293988,0.415347,0.0,0,1,0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,1,0,1,0,2,2,0,0,0,0,0,0,0.070575,0.397946,-725.0,0,1,0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,1,0,0,0,2,2,0,0,0,0,1,1,0.50179,0.52989,0.0,0,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [4]:
installment = process_installment(installments)
installment.head()

"Installments Payments" final shape: (97828, 85)


Unnamed: 0_level_0,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_DPD_MEAN,INSTAL_DPD_SUM,INSTAL_DPD_VAR,INSTAL_DBD_MAX,INSTAL_DBD_MEAN,INSTAL_DBD_SUM,INSTAL_DBD_VAR,INSTAL_PAYMENT_PERC_MAX,INSTAL_PAYMENT_PERC_MEAN,INSTAL_PAYMENT_PERC_SUM,INSTAL_PAYMENT_PERC_VAR,INSTAL_PAYMENT_DIFF_MAX,INSTAL_PAYMENT_DIFF_MEAN,INSTAL_PAYMENT_DIFF_SUM,INSTAL_PAYMENT_DIFF_VAR,INSTAL_AMT_INSTALMENT_MAX,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_INSTALMENT_MIN,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MIN,INSTAL_SK_ID_PREV_SIZE,INSTAL_SK_ID_PREV_NUNIQUE,INSTAL_PAYMENT_DIFFERENCE_MEAN,INSTAL_PAYMENT_RATIO_MEAN,INSTAL_PAYMENT_RATIO_MAX,INSTAL_LATE_PAYMENT_MEAN,INSTAL_LATE_PAYMENT_SUM,INSTAL_SIGNIFICANT_LATE_PAYMENT_MEAN,INSTAL_SIGNIFICANT_LATE_PAYMENT_SUM,INSTAL_LATE_PAYMENT_RATIO_MEAN,INSTAL_DPD_7_MEAN,INSTAL_DPD_15_MEAN,INSTAL_PAID_OVER_MEAN,INSTAL_DPD_diff_MEAN,INSTAL_DPD_diff_MIN,INSTAL_DPD_diff_MAX,INSTAL_DBD_diff_MEAN,INSTAL_DBD_diff_MIN,INSTAL_DBD_diff_MAX,INSTAL_DAYS_INSTALMENT_MEAN,INSTAL_DAYS_INSTALMENT_MAX,INSTAL_DAYS_INSTALMENT_SUM,INSTAL_INS_IS_DPD_UNDER_120_MEAN,INSTAL_INS_IS_DPD_UNDER_120_SUM,INSTAL_INS_IS_DPD_OVER_120_MEAN,INSTAL_INS_IS_DPD_OVER_120_SUM,INSTAL_COUNT,INS_D365SK_ID_CURR_COUNT,INS_D365NUM_INSTALMENT_VERSION_NUNIQUE,INS_D365DAYS_ENTRY_PAYMENT_MEAN,INS_D365DAYS_ENTRY_PAYMENT_MAX,INS_D365DAYS_ENTRY_PAYMENT_SUM,INS_D365DAYS_INSTALMENT_MEAN,INS_D365DAYS_INSTALMENT_MAX,INS_D365DAYS_INSTALMENT_SUM,INS_D365AMT_INSTALMENT_MEAN,INS_D365AMT_INSTALMENT_MAX,INS_D365AMT_INSTALMENT_SUM,INS_D365AMT_PAYMENT_MEAN,INS_D365AMT_PAYMENT_MAX,INS_D365AMT_PAYMENT_SUM,INS_D365PAYMENT_DIFF_MEAN,INS_D365PAYMENT_DIFF_MIN,INS_D365PAYMENT_DIFF_MAX,INS_D365PAYMENT_DIFF_SUM,INS_D365PAYMENT_PERC_MEAN,INS_D365PAYMENT_PERC_MAX,INS_D365DPD_DIFF_MEAN,INS_D365DPD_DIFF_MIN,INS_D365DPD_DIFF_MAX,INS_D365DPD_MEAN,INS_D365DPD_SUM,INS_D365INS_IS_DPD_UNDER_120_MEAN,INS_D365INS_IS_DPD_UNDER_120_SUM,INS_D365INS_IS_DPD_OVER_120_MEAN,INS_D365INS_IS_DPD_OVER_120_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1
0.0,3,0.0,0.0,0.0,0.0,64.0,10.9,327.0,374.575862,1.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,100287.765,10364.7465,310942.395,141.57,141.57,100287.765,10364.7465,310942.395,-5.0,-150.433333,-4513.0,-446.0,30,4,0.0,1.0,1.0,0.366667,11,0.366667,11,0.366667,0.0,0.0,0.0,-10.9,-64.0,0.0,10.9,0.0,64.0,-139.533333,-5.0,-4186.0,0.0,0,0.0,0,30,26.0,3.0,-107.961538,-5.0,-2807.0,-103.461538,-5.0,-2690.0,10965.515192,100287.765,285103.395,10965.515192,100287.765,285103.395,0.0,0.0,0.0,0.0,1.0,1.0,-4.5,-33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,4,0.0,0.0,0.0,0.0,99.0,11.581395,498.0,459.439646,2.51059,1.050525,45.172583,0.062151,0.0,-204.069767,-8775.0,911952.8,43050.285,5691.316395,244726.605,78.435,78.435,43050.285,5895.386163,253501.605,-5.0,-228.232558,-9814.0,-713.0,43,4,-612.209302,0.965116,1.0,0.511628,22,0.511628,22,0.562153,0.0,0.0,0.046512,-11.581395,-99.0,0.0,11.581395,0.0,99.0,-216.651163,-5.0,-9316.0,0.0,0,0.0,0,43,37.0,3.0,-175.675676,-5.0,-6500.0,-166.972973,-5.0,-6178.0,5453.012432,43050.285,201761.46,5453.012432,43050.285,201761.46,0.0,0.0,0.0,0.0,1.0,1.0,-8.702703,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,1,0.0,0.0,0.0,0.0,17.0,9.2,92.0,13.066667,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,9820.755,9818.6445,98186.445,9799.65,9799.65,9820.755,9818.6445,98186.445,-1808.0,-1944.2,-19442.0,-2083.0,10,1,0.0,1.0,1.0,1.0,10,1.0,10,1.0,0.0,0.0,0.0,-9.2,-17.0,-6.0,9.2,6.0,17.0,-1935.0,-1800.0,-19350.0,0.0,0,0.0,0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6.0,2,1.0,0.022222,1.0,0.022222,26.0,10.133333,456.0,55.3,1.0,1.0,45.0,0.0,0.0,0.0,0.0,0.0,185997.555,32920.481,1481421.645,4442.265,4442.265,185997.555,32920.481,1481421.645,-14.0,-466.355556,-20986.0,-1057.0,45,4,0.0,1.0,1.0,0.955556,43,0.955556,43,0.955556,0.0,0.0,0.0,-10.111111,-26.0,1.0,10.111111,-1.0,26.0,-456.244444,-14.0,-20531.0,0.022222,1,0.0,0,45,20.0,1.0,-170.45,-14.0,-3409.0,-160.2,-14.0,-3204.0,38963.682,61977.96,779273.64,38963.682,61977.96,779273.64,0.0,0.0,0.0,0.0,1.0,1.0,-10.25,-26.0,1.0,0.05,1.0,0.05,1.0,0.0,0.0
10.0,3,2.0,0.058824,6.0,0.075713,38.0,10.362745,1057.0,76.985925,1.0,0.990196,101.0,0.008126,13599.09,147.075882,15001.74,1828676.0,204649.695,20810.911324,2122712.955,6614.505,1402.65,204649.695,20663.835441,2107711.215,-35.0,-626.598039,-63913.0,-1548.0,102,5,0.0,1.0,1.0,0.921569,94,0.921569,94,0.912681,0.0,0.0,0.0,-10.303922,-38.0,2.0,10.303922,-2.0,38.0,-616.294118,-28.0,-62862.0,0.04902,5,0.0,0,102,30.0,3.0,-207.4,-35.0,-6222.0,-200.366667,-28.0,-6011.0,25097.958,204649.695,752938.74,25097.958,204649.695,752938.74,0.0,0.0,0.0,0.0,1.0,1.0,-7.033333,-35.0,1.0,0.1,3.0,0.1,3.0,0.0,0.0


In [6]:
# merge train data with installment
app_train = app_train.merge(installment, left_index=True, right_index=True, how='left')
app_train.head()

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_QRT,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Other,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Other,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Other,NAME_FAMILY_STATUS_Single / not married,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,...,INSTAL_LATE_PAYMENT_SUM,INSTAL_SIGNIFICANT_LATE_PAYMENT_MEAN,INSTAL_SIGNIFICANT_LATE_PAYMENT_SUM,INSTAL_LATE_PAYMENT_RATIO_MEAN,INSTAL_DPD_7_MEAN,INSTAL_DPD_15_MEAN,INSTAL_PAID_OVER_MEAN,INSTAL_DPD_diff_MEAN,INSTAL_DPD_diff_MIN,INSTAL_DPD_diff_MAX,INSTAL_DBD_diff_MEAN,INSTAL_DBD_diff_MIN,INSTAL_DBD_diff_MAX,INSTAL_DAYS_INSTALMENT_MEAN,INSTAL_DAYS_INSTALMENT_MAX,INSTAL_DAYS_INSTALMENT_SUM,INSTAL_INS_IS_DPD_UNDER_120_MEAN,INSTAL_INS_IS_DPD_UNDER_120_SUM,INSTAL_INS_IS_DPD_OVER_120_MEAN,INSTAL_INS_IS_DPD_OVER_120_SUM,INSTAL_COUNT,INS_D365SK_ID_CURR_COUNT,INS_D365NUM_INSTALMENT_VERSION_NUNIQUE,INS_D365DAYS_ENTRY_PAYMENT_MEAN,INS_D365DAYS_ENTRY_PAYMENT_MAX,INS_D365DAYS_ENTRY_PAYMENT_SUM,INS_D365DAYS_INSTALMENT_MEAN,INS_D365DAYS_INSTALMENT_MAX,INS_D365DAYS_INSTALMENT_SUM,INS_D365AMT_INSTALMENT_MEAN,INS_D365AMT_INSTALMENT_MAX,INS_D365AMT_INSTALMENT_SUM,INS_D365AMT_PAYMENT_MEAN,INS_D365AMT_PAYMENT_MAX,INS_D365AMT_PAYMENT_SUM,INS_D365PAYMENT_DIFF_MEAN,INS_D365PAYMENT_DIFF_MIN,INS_D365PAYMENT_DIFF_MAX,INS_D365PAYMENT_DIFF_SUM,INS_D365PAYMENT_PERC_MEAN,INS_D365PAYMENT_PERC_MAX,INS_D365DPD_DIFF_MEAN,INS_D365DPD_DIFF_MIN,INS_D365DPD_DIFF_MAX,INS_D365DPD_MEAN,INS_D365DPD_SUM,INS_D365INS_IS_DPD_UNDER_120_MEAN,INS_D365INS_IS_DPD_UNDER_120_SUM,INS_D365INS_IS_DPD_OVER_120_MEAN,INS_D365INS_IS_DPD_OVER_120_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
0.0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,1,0,0,0,2,2,0,0,0,0,0,0,0.126697,0.28518,0.0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,11.0,0.366667,11.0,0.366667,0.0,0.0,0.0,-10.9,-64.0,0.0,10.9,0.0,64.0,-139.533333,-5.0,-4186.0,0.0,0.0,0.0,0.0,30.0,26.0,3.0,-107.961538,-5.0,-2807.0,-103.461538,-5.0,-2690.0,10965.515192,100287.765,285103.395,10965.515192,100287.765,285103.395,0.0,0.0,0.0,0.0,1.0,1.0,-4.5,-33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,1,0,0,0,3,3,0,0,0,0,0,0,0.598301,0.7463,-142.0,0,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,1,0,0,0,2,2,0,0,0,0,0,0,0.293988,0.415347,0.0,0,1,0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3.0,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,1,0,1,0,2,2,0,0,0,0,0,0,0.070575,0.397946,-725.0,0,1,0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,22.0,0.511628,22.0,0.562153,0.0,0.0,0.046512,-11.581395,-99.0,0.0,11.581395,0.0,99.0,-216.651163,-5.0,-9316.0,0.0,0.0,0.0,0.0,43.0,37.0,3.0,-175.675676,-5.0,-6500.0,-166.972973,-5.0,-6178.0,5453.012432,43050.285,201761.46,5453.012432,43050.285,201761.46,0.0,0.0,0.0,0.0,1.0,1.0,-8.702703,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,1,0,0,0,2,2,0,0,0,0,1,1,0.50179,0.52989,0.0,0,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
