In [None]:
import numpy as np 
import pandas as pd
from warnings import filterwarnings as f_w
# Silence every warning category for the rest of the session.
f_w(action='ignore')

# Raise the notebook display limits so wide/long frames are shown in full.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 500)
import gc

In [None]:
# Row key of this table inside the locator sheet.
table = 'inst'

# Lookup sheet indexed by table key; its 'initial' and 'feat_eng' columns are
# read further down as the input and output file paths.
locator = pd.read_excel(
    io='locator.xlsx',
    index_col=0,
)

In [None]:
# Load the raw table from the path stored in the locator's 'initial' column.
raw_inst = pd.read_csv(
    filepath_or_buffer=locator.loc[table, 'initial'],
)

In [None]:
# Schedule view of the raw table: the payment-side columns are removed and
# exact duplicate rows collapsed, then rows are ordered by due day.
# `drop` already returns a new frame, so `raw_inst` itself is left untouched.
installments = (
    raw_inst
    .drop(columns=['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT'])
    .drop_duplicates()
    .sort_values(by=['DAYS_INSTALMENT',
                     'SK_ID_PREV',
                     'NUM_INSTALMENT_NUMBER',
                     'NUM_INSTALMENT_VERSION'])
)

In [None]:
# Payment view of the raw table: the schedule-side columns are removed and
# exact duplicate rows collapsed, then rows are ordered by due day.
# Every remaining NaN (in any column) is replaced with 0.
payments = (
    raw_inst
    .drop(columns=['NUM_INSTALMENT_VERSION', 'AMT_INSTALMENT'])
    .drop_duplicates()
    .sort_values(by=['DAYS_INSTALMENT',
                     'SK_ID_PREV',
                     'NUM_INSTALMENT_NUMBER',
                     'DAYS_ENTRY_PAYMENT'])
    .fillna(0)
)

In [None]:
# An instalment is identified by (loan, due day, instalment number).
_inst_key = ['SK_ID_PREV', 'DAYS_INSTALMENT', 'NUM_INSTALMENT_NUMBER']
_per_instalment = installments.groupby(_inst_key)

# Amount due per instalment, summed over all of its schedule rows.
amt_inst = _per_instalment['AMT_INSTALMENT'].sum().reset_index()
# Highest schedule version recorded for the instalment.
inst_ch = (_per_instalment['NUM_INSTALMENT_VERSION']
           .max()
           .rename('MAX_NUM_VERSION')
           .reset_index())

# Attach both per-instalment figures to every payment row.
ip = (payments
      .merge(amt_inst, on=_inst_key, how='left')
      .merge(inst_ch, on=_inst_key, how='left'))

# Release the intermediates before moving on.
del amt_inst, inst_ch, _per_instalment, _inst_key
gc.collect()

# Fix the column order of the working frame.
ip = ip[[
    'SK_ID_PREV',
    'SK_ID_CURR',
    'NUM_INSTALMENT_NUMBER',
    'MAX_NUM_VERSION',
    'DAYS_INSTALMENT',
    'AMT_INSTALMENT',
    'DAYS_ENTRY_PAYMENT',
    'AMT_PAYMENT',
]]

In [None]:
def get_ip_saldo():
    """Build the running balance (SALDO) of every previous loan over time.

    Reads the module-level ``installments`` and ``payments`` frames.
    Instalments due enter the ledger as negative amounts on their due day and
    payments as positive amounts on their payment day; both days are stored in
    a common ``MOMENT`` column. The amounts are accumulated per ``SK_ID_PREV``
    in (MOMENT, NUM_INSTALMENT_NUMBER, SALDO) order.

    Returns:
        DataFrame with columns ``SK_ID_PREV``, ``MOMENT`` and ``SALDO`` holding
        one row per (loan, moment): the balance after the last ledger entry of
        that moment.
    """
    # Debits: scheduled instalments, sign-flipped.
    debits = (installments
              .drop(columns='NUM_INSTALMENT_VERSION')
              .rename(columns={'DAYS_INSTALMENT': 'MOMENT',
                               'AMT_INSTALMENT': 'SALDO'}))
    debits['SALDO'] = -debits['SALDO']

    # Credits: actual payments, dated by the day they were made.
    credits = (payments
               .drop(columns='DAYS_INSTALMENT')
               .rename(columns={'DAYS_ENTRY_PAYMENT': 'MOMENT',
                                'AMT_PAYMENT': 'SALDO'}))

    ledger = pd.concat([debits, credits])
    # Within one moment the sort puts the smaller amount first, so a debit
    # precedes a credit of the same instalment.
    ledger = ledger.sort_values(by=['SK_ID_PREV',
                                    'MOMENT',
                                    'NUM_INSTALMENT_NUMBER',
                                    'SALDO'])
    ledger = ledger.drop(columns=['SK_ID_CURR', 'NUM_INSTALMENT_NUMBER'])

    ledger['SALDO'] = ledger.groupby('SK_ID_PREV')['SALDO'].cumsum()

    # Keep only the closing balance of each (loan, moment).
    return ledger.drop_duplicates(subset=['SK_ID_PREV', 'MOMENT'],
                                  keep='last')

# Build the balance table once; it is looked up by (SK_ID_PREV, MOMENT) when
# the *_DAY_SALDO columns are added to `ip`.
saldo = get_ip_saldo()

In [None]:
# Gap between consecutive payment days within a loan. The diff is computed on
# a copy sorted by payment day and assigned back by index label, so the row
# order of `ip` is unchanged.
ip['DAYS_ENTRY_PAYMENT_DIFF'] = (ip.sort_values(by='DAYS_ENTRY_PAYMENT')
                                   .groupby('SK_ID_PREV')['DAYS_ENTRY_PAYMENT']
                                   .diff())

# Share of the instalment amount covered by this payment row. A zero
# AMT_INSTALMENT yields +/-inf here.
ip['PAYMENT_RATIO'] = ip.AMT_PAYMENT / ip.AMT_INSTALMENT

# Days past due / days before due: the positive part of the day difference,
# with 0 turned into NaN so that "not late" / "not early" rows count as missing.
# `Series.clip(lower=...)` replaces `clip_lower`, which was removed in pandas 1.0.
ip['DPD'] = (ip.DAYS_ENTRY_PAYMENT - ip.DAYS_INSTALMENT).clip(lower=0).replace({0: np.nan})
ip['DBD'] = (ip.DAYS_INSTALMENT - ip.DAYS_ENTRY_PAYMENT).clip(lower=0).replace({0: np.nan})

# Late-payment flag: DPD capped at 1, missing DPD counted as 0.
# `Series.clip(upper=...)` replaces the removed `clip_upper`.
ip['IS_PD'] = ip.DPD.clip(upper=1).fillna(0)

# Amount and ratio restricted to late payments (0 for rows that are not late).
ip['AMT_PD'] = ip.AMT_PAYMENT * ip.IS_PD
ip['PD_RATIO'] = ip.PAYMENT_RATIO * ip.IS_PD

# Day counts weighted by the share of the instalment that was paid.
ip['DPD_W'] = ip.PAYMENT_RATIO * ip.DPD
ip['DBD_W'] = ip.PAYMENT_RATIO * ip.DBD

In [None]:
# Look up the loan balance on the due day and on the payment day of every row.
# The merged frame gets a fresh 0..n-1 index and the SALDO column is assigned
# back by index label, so this relies on `ip` having the same default index
# and on `saldo` holding a single row per (SK_ID_PREV, MOMENT).
for _target, _day_column in (('INSTALMENT_DAY_SALDO', 'DAYS_INSTALMENT'),
                             ('PAYMENT_DAY_SALDO', 'DAYS_ENTRY_PAYMENT')):
    ip[_target] = ip.merge(saldo,
                           how='left',
                           left_on=['SK_ID_PREV', _day_column],
                           right_on=['SK_ID_PREV', 'MOMENT'])['SALDO']
del _target, _day_column

In [None]:
# Statistics computed per client for each `ip` column. The two late-payment
# columns skip 'min'; every other column gets the full set.
_FULL_STATS = ('min', 'max', 'mean', 'std')
_NO_MIN_STATS = ('max', 'mean', 'std')

aggregations = {
    column: list(stats)
    for column, stats in [
        ('DAYS_INSTALMENT', _FULL_STATS),
        ('DAYS_ENTRY_PAYMENT_DIFF', _FULL_STATS),
        ('DPD', _FULL_STATS),
        ('DBD', _FULL_STATS),
        ('DPD_W', _FULL_STATS),
        ('DBD_W', _FULL_STATS),
        ('PAYMENT_RATIO', _FULL_STATS),
        ('PD_RATIO', _NO_MIN_STATS),
        ('AMT_INSTALMENT', _FULL_STATS),
        ('AMT_PAYMENT', _FULL_STATS),
        ('AMT_PD', _NO_MIN_STATS),
        ('INSTALMENT_DAY_SALDO', _FULL_STATS),
        ('PAYMENT_DAY_SALDO', _FULL_STATS),
    ]
}
del _FULL_STATS, _NO_MIN_STATS

In [None]:
# Turn +/-inf (produced by dividing by a zero instalment amount) into NaN.
ip = ip.replace([np.inf, -np.inf], np.nan)

In [None]:
# Client-level statistics of the payment rows.
ip_agg = ip.groupby('SK_ID_CURR').agg(aggregations)

# Flatten the (column, statistic) pairs into names like INST_<COLUMN>_<STAT>.
ip_agg.columns = pd.Index([
    '_'.join(('INST', column, stat.upper()))
    for column, stat in ip_agg.columns.tolist()
])

In [None]:
# Load the 'prev' table from its locator path, indexed by loan id.
prev = pd.read_csv(
    filepath_or_buffer=locator.loc['prev', 'initial'],
    index_col='SK_ID_PREV',
)

In [None]:
# One row per previous loan: owning client and highest instalment number seen.
prev_ip = ip.groupby('SK_ID_PREV').agg({'SK_ID_CURR': 'max',
                                        'NUM_INSTALMENT_NUMBER': 'max'})

# Total amount due per loan. `ip` holds one row per payment, each carrying the
# full amount of its instalment, so summing AMT_INSTALMENT over `ip` would
# count an instalment once for every partial payment made against it and
# overstate the debt. The de-duplicated `installments` table (the same source
# get_ip_saldo uses for the amounts due) has each scheduled amount once.
prev_ip = prev_ip.join(installments.groupby('SK_ID_PREV')['AMT_INSTALMENT'].sum())

In [None]:
# Total paid per loan, then the outstanding difference rounded to 3 decimals.
prev_ip = prev_ip.join(payments.groupby('SK_ID_PREV')[['AMT_PAYMENT']].sum())
prev_ip['DEBT'] = prev_ip['AMT_INSTALMENT'].sub(prev_ip['AMT_PAYMENT']).round(3)

In [None]:
# Number of payment rows with a non-zero amount per loan (missing amounts were
# filled with 0 when `payments` was built, so they are not counted).
prev_ip = prev_ip.join(
    payments.groupby('SK_ID_PREV')['AMT_PAYMENT']
            .apply(np.count_nonzero)
            .rename('PAYMENTS_COUNT')
)

In [None]:
# Number of late payment rows per loan and their share of the non-zero payments.
prev_ip = prev_ip.join(
    ip.groupby('SK_ID_PREV')['IS_PD'].sum().rename('PAYMENTS_PD_COUNT')
)
prev_ip['PAYMENTS_PD_COUNT_RATIO'] = prev_ip['PAYMENTS_PD_COUNT'].div(prev_ip['PAYMENTS_COUNT'])

In [None]:
# Bring in the loan terms from the 'prev' table (matched on the SK_ID_PREV index).
prev_ip = prev_ip.join(prev.loc[:, ['AMT_CREDIT',
                                    'AMT_ANNUITY',
                                    'CNT_PAYMENT',
                                    'DAYS_TERMINATION']])

# 365243 is treated as "no value" -- presumably the dataset's placeholder for a
# missing date; confirm against the data dictionary.
prev_ip['DAYS_TERMINATION'] = prev_ip['DAYS_TERMINATION'].replace(365243, np.nan)

In [None]:
# Outstanding debt as a fraction of the credit amount.
prev_ip['DEBT_RATIO'] = prev_ip['DEBT'].div(prev_ip['AMT_CREDIT'])

In [None]:
# A loan is flagged active (1) when it has no termination day but does have an
# annuity; every other combination gives 0.
prev_ip['ACTIVE'] = (
    prev_ip['DAYS_TERMINATION'].isnull() & prev_ip['AMT_ANNUITY'].notnull()
).astype(int)

# The termination day is only needed for the flag.
prev_ip = prev_ip.drop(columns=['DAYS_TERMINATION'])

In [None]:
# Instalments still to come on active loans: absolute gap between the planned
# count and the highest instalment number seen, zeroed for inactive loans.
prev_ip['REMAINED_INST'] = (prev_ip['CNT_PAYMENT']
                            .sub(prev_ip['NUM_INSTALMENT_NUMBER'])
                            .abs()
                            .mul(prev_ip['ACTIVE']))

# Remaining amount: annuity times the remaining instalment count.
prev_ip['REMAINED_AMT'] = prev_ip['AMT_ANNUITY'].mul(prev_ip['REMAINED_INST'])

In [None]:
# Client-level aggregation plan for the per-loan frame: totals for the amounts
# and counts, mean/max for the ratios, mean/sum for the active flag.
prev_aggregations = {}
for _column in ('AMT_PAYMENT', 'DEBT', 'PAYMENTS_COUNT', 'PAYMENTS_PD_COUNT'):
    prev_aggregations[_column] = 'sum'
for _column in ('PAYMENTS_PD_COUNT_RATIO', 'DEBT_RATIO'):
    prev_aggregations[_column] = ['mean', 'max']
prev_aggregations['ACTIVE'] = ['mean', 'sum']
for _column in ('REMAINED_INST', 'REMAINED_AMT'):
    prev_aggregations[_column] = 'sum'
del _column

In [None]:
# Client-level statistics of the per-loan frame.
# PAYMENTS_PD_COUNT_RATIO divides by the number of non-zero payments, which is
# 0 for a loan whose payments are all zero/missing, and DEBT_RATIO divides by
# AMT_CREDIT; either division can produce +/-inf. The payment-row frame `ip`
# has its infinities turned into NaN before it is aggregated, so do the same
# here -- otherwise a single inf makes the client's mean/max infinite and that
# value is written to the output file. `prev_ip` itself is not modified.
prev_agg = (prev_ip
            .replace({np.inf: np.nan, -np.inf: np.nan})
            .groupby('SK_ID_CURR')
            .agg(prev_aggregations))

# Flatten the (column, statistic) pairs into names like INSTPREV_<COLUMN>_<STAT>.
prev_agg.columns = pd.Index(['INSTPREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

In [None]:
# Append the per-loan aggregates to the payment-row aggregates; both frames are
# indexed by SK_ID_CURR and every row of `ip_agg` is kept.
ip_agg = ip_agg.join(prev_agg, how='left')

In [None]:
# Write the feature table to the path stored in the locator's 'feat_eng' column.
ip_agg.to_csv(
    path_or_buf=locator.loc[table, 'feat_eng'],
)