In [None]:
# Numerical / tabular stack used throughout the notebook.
import numpy as np
import pandas as pd
import warnings
# Ignore every warning category (Warning is the base class of all warnings),
# so pandas / LightGBM warnings will not be shown in any later cell.
warnings.filterwarnings('ignore', category=Warning)
# Raise the pandas display limits so wide / long frames are not truncated.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500
# Project-local wrapper; its implementation is not visible from this notebook.
from homecredit import HomeCreditDataTable
from tqdm import tqdm_notebook
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
import gc
# Lookup table read from 'locator.xlsx' (first column as index). Later cells
# index it as locator.loc[<dataset>, <stage>] and pass the result to
# pd.read_csv / to_csv as a file path.
locator = pd.read_excel('locator.xlsx', index_col = 0)

In [None]:
# Project wrapper object. What its attributes and methods do (train / test /
# data / cv_split / early_stop_rounds / model) is defined in the homecredit
# module, which is not visible here.
hc = HomeCreditDataTable()
# Train and test application tables, indexed by SK_ID_CURR.
hc.train = pd.read_csv('train.csv',
                       index_col = 'SK_ID_CURR')
hc.test = pd.read_csv('test.csv',
                      index_col = 'SK_ID_CURR')
# Join the engineered application features (path taken from the locator table,
# CSV indexed by SK_ID_CURR) onto hc.data using DataFrame.join's default index alignment.
# NOTE(review): hc.data is read here without being assigned in this notebook —
# presumably the wrapper derives it from train/test; confirm in homecredit.
hc.data = hc.data.join(pd.read_csv(locator.loc['appl', 'feat_eng'],
                                   index_col = 'SK_ID_CURR'))
# Fixed random_state so the split is repeatable between runs.
hc.cv_split(random_state=8)
hc.early_stop_rounds = 100
# LightGBM classifier handed to the wrapper.
# NOTE(review): the hyperparameters look like the output of a tuning run
# (BayesianOptimization is imported above) — confirm their provenance.
hc.model = LGBMClassifier(n_estimators=10000, 
                          n_jobs=6, 
                          lambda_l1=8.611318144229598, 
                          max_depth=5, 
                          num_leaves=10) 

In [None]:
# Baseline run: validate before any debt feature is joined in, then show the
# first entry of hc.cv_score.
# NOTE(review): what validate() computes and what cv_score[0] holds is defined
# in the homecredit module — not visible here.
hc.validate()
print(hc.cv_score[0])

In [None]:
# Previous-application table; in every column whose name contains 'DAYS' the
# value 365243 is replaced by NaN.
prev = pd.read_csv(locator.loc['prev', 'initial'])
days_columns = [name for name in prev.columns if 'DAYS' in name]
for name in days_columns:
    prev[name] = prev[name].replace({365243: np.nan})

# Columns removed up front; none of them is referenced by the cells visible below.
todrop = [
    'AMT_DOWN_PAYMENT',
    'AMT_GOODS_PRICE',
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
    'FLAG_LAST_APPL_PER_CONTRACT',
    'NFLAG_LAST_APPL_IN_DAY',
    'RATE_DOWN_PAYMENT',
    'RATE_INTEREST_PRIMARY',
    'RATE_INTEREST_PRIVILEGED',
    'NAME_CASH_LOAN_PURPOSE',
    'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON',
    'NAME_TYPE_SUITE',
    'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY',
    'NAME_PORTFOLIO',
    'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE',
    'SELLERPLACE_AREA',
    'NAME_SELLER_INDUSTRY',
    'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION',
    'NFLAG_INSURED_ON_APPROVAL',
]
prev = prev.drop(columns=todrop)

In [None]:
# POS balance rows ordered by MONTHS_BALANCE, so that 'last' below refers to
# the highest MONTHS_BALANCE of each SK_ID_PREV.
pos = pd.read_csv(locator.loc['pos', 'initial']).sort_values(by='MONTHS_BALANCE')

pos_last_spec = {
    'SK_ID_CURR': 'max',
    'MONTHS_BALANCE': 'last',
    'CNT_INSTALMENT': 'last',
    'CNT_INSTALMENT_FUTURE': 'last',
    'NAME_CONTRACT_STATUS': 'last',
}
pos_grouped = pos.groupby('SK_ID_PREV').agg(pos_last_spec).reset_index()

In [None]:
raw_inst = pd.read_csv(locator.loc['inst', 'initial'])

# Scheduled side: drop the payment columns, de-duplicate, order the rows and
# collapse to one row per SK_ID_PREV.
installments = (
    raw_inst
    .drop(columns=['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT'])
    .drop_duplicates()
    .sort_values(by=['DAYS_INSTALMENT',
                     'SK_ID_PREV',
                     'NUM_INSTALMENT_NUMBER',
                     'NUM_INSTALMENT_VERSION'])
    .groupby('SK_ID_PREV')
    .agg({'SK_ID_CURR': 'max',
          'AMT_INSTALMENT': 'sum',
          'DAYS_INSTALMENT': 'max',
          'NUM_INSTALMENT_NUMBER': 'last'})
    .reset_index()
)

# Paid side: drop the schedule columns, de-duplicate, order the rows, turn
# missing values into 0 and collapse to one row per SK_ID_PREV.
payments = (
    raw_inst
    .drop(columns=['NUM_INSTALMENT_VERSION', 'AMT_INSTALMENT'])
    .drop_duplicates()
    .sort_values(by=['DAYS_INSTALMENT',
                     'SK_ID_PREV',
                     'NUM_INSTALMENT_NUMBER',
                     'DAYS_ENTRY_PAYMENT'])
    .fillna(0)
    .groupby('SK_ID_PREV')
    .agg({'SK_ID_CURR': 'max',
          'AMT_PAYMENT': 'sum',
          'DAYS_ENTRY_PAYMENT': 'max',
          'NUM_INSTALMENT_NUMBER': 'last'})
    .reset_index()
)

In [None]:
# Credit-card balance rows ordered by contract and month; the final row of
# each SK_ID_PREV group is therefore its highest MONTHS_BALANCE.
card = (
    pd.read_csv(locator.loc['card', 'initial'])
    .sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
)
card_grouped = card.groupby('SK_ID_PREV').tail(n=1)

In [None]:
# Run Python's garbage collector explicitly after the raw tables were loaded.
gc.collect()

In [None]:
# Sorted union of the SK_ID_PREV values found in any of the four source tables.
prev_id_sources = (prev, raw_inst, pos, card)
unique_ids = np.unique(
    np.concatenate([table['SK_ID_PREV'].unique() for table in prev_id_sources])
)

In [None]:
# One row per known SK_ID_PREV; feature columns are attached in the next cells.
debt = pd.DataFrame({'SK_ID_PREV': unique_ids})

In [None]:
def get_column(df, column, base=None):
    """Look up ``column`` of ``df`` for every ``SK_ID_PREV`` of a base frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table containing ``SK_ID_PREV`` and ``column``. It must hold at
        most one row per ``SK_ID_PREV``.
    column : str
        Name of the column of ``df`` to fetch.
    base : pandas.DataFrame, optional
        Frame whose ``SK_ID_PREV`` values drive the lookup. Defaults to the
        notebook-level ``debt`` frame, which keeps existing two-argument
        calls working.

    Returns
    -------
    pandas.Series
        Exactly one value per row of ``base``, in the same row order and
        carrying ``base.index``; NaN where ``df`` has no matching key.

    Raises
    ------
    pandas.errors.MergeError
        If ``SK_ID_PREV`` is duplicated in ``df``.
    """
    if base is None:
        base = debt
    # A left merge keeps the row order and row count of `base`; an outer merge
    # would append keys that exist only in `df` and re-sort the keys, which
    # breaks the index-aligned assignment done by the callers.
    # Only the key column of `base` is merged, so an existing column named
    # like `column` cannot trigger _x/_y suffixes.
    merged = base[['SK_ID_PREV']].merge(df[['SK_ID_PREV', column]],
                                        on='SK_ID_PREV',
                                        how='left',
                                        validate='many_to_one')
    # merge() returns a fresh RangeIndex; restore the caller's index so that
    # `frame[name] = get_column(...)` lines up row by row.
    return pd.Series(merged[column].values, index=base.index, name=column)

In [None]:
# (target column in debt, source column in prev), attached in this order.
prev_feature_map = [
    ('PREV_NAME_CONTRACT_TYPE', 'NAME_CONTRACT_TYPE'),
    ('PREV_NAME_CONTRACT_STATUS', 'NAME_CONTRACT_STATUS'),
    ('PREV_AMT_CREDIT', 'AMT_CREDIT'),
    ('PREV_CNT_PAYMENT', 'CNT_PAYMENT'),
    ('PREV_AMT_ANNUITY', 'AMT_ANNUITY'),
    ('PREV_DAYS_LAST_DUE', 'DAYS_LAST_DUE'),
    ('PREV_DAYS_TERMINATION', 'DAYS_TERMINATION'),
]
for target_name, source_name in prev_feature_map:
    debt[target_name] = get_column(prev, source_name)

In [None]:
# Latest POS state per contract: every column is copied under a POS_ prefix.
for source_name in ('MONTHS_BALANCE',
                    'CNT_INSTALMENT',
                    'CNT_INSTALMENT_FUTURE',
                    'NAME_CONTRACT_STATUS'):
    debt['POS_' + source_name] = get_column(pos_grouped, source_name)

In [None]:
# (target column in debt, source column in the aggregated installments table).
inst_feature_map = [
    ('INST_AMT_INSTALMENT', 'AMT_INSTALMENT'),
    ('INST_LAST_DAYS_INSTALMENT', 'DAYS_INSTALMENT'),
    ('INST_LAST_NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_NUMBER'),
]
for target_name, source_name in inst_feature_map:
    debt[target_name] = get_column(installments, source_name)

In [None]:
# (target column in debt, source column in the aggregated payments table).
paym_feature_map = [
    ('PAYM_AMT_PAYMENT', 'AMT_PAYMENT'),
    ('PAYM_LAST_DAYS_PAYMENT', 'DAYS_ENTRY_PAYMENT'),
    ('PAYM_LAST_NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_NUMBER'),
]
for target_name, source_name in paym_feature_map:
    debt[target_name] = get_column(payments, source_name)

In [None]:
# Latest credit-card state per contract: every column is copied under a CARD_ prefix.
for source_name in ('MONTHS_BALANCE',
                    'AMT_BALANCE',
                    'AMT_CREDIT_LIMIT_ACTUAL',
                    'AMT_TOTAL_RECEIVABLE'):
    debt['CARD_' + source_name] = get_column(card_grouped, source_name)

In [None]:
# SK_ID_CURR as reported by each source table, side by side per SK_ID_PREV.
# The explicit copy makes it clear that `debt` itself is not written to.
sk_id_curr = debt[['SK_ID_PREV']].copy()
curr_id_sources = [
    ('PREV_SK_ID_CURR', prev),
    ('POS_SK_ID_CURR', pos_grouped),
    ('INST_SK_ID_CURR', installments),
    ('PAYM_SK_ID_CURR', payments),
    ('CARD_SK_ID_CURR', card_grouped),
]
for target_name, source_table in curr_id_sources:
    sk_id_curr[target_name] = get_column(source_table, 'SK_ID_CURR')

In [None]:
# Applications that were not approved have no payment data, so they can be
# dropped (author's note, translated). Rows without any recorded status are kept.
has_status = debt.PREV_NAME_CONTRACT_STATUS.notnull()
notapproved = debt[has_status & (debt.PREV_NAME_CONTRACT_STATUS != 'Approved')]
notapproved_ids = notapproved.SK_ID_PREV.values
debt = debt[~debt.SK_ID_PREV.isin(notapproved_ids)]
# .copy() so that the column assignment below writes to an independent frame
# rather than to a filtered slice.
sk_id_curr = sk_id_curr[~sk_id_curr.SK_ID_PREV.isin(notapproved_ids)].copy()

# Collapse the per-source candidates into one SK_ID_CURR per contract.
# A row-wise maximum over the non-null candidates always yields an id that is
# present in the data, whereas a mean would invent a non-existent id as soon as
# two sources disagree. 'max' is also how SK_ID_CURR is collapsed in the
# per-table aggregations earlier in the notebook. When all sources agree the
# result is identical to the previous mean-based value.
# 'SK_ID_CURR' itself is excluded so that re-running the cell does not feed the
# previously derived value back into the calculation.
candidate_ids = sk_id_curr.drop(columns=['SK_ID_PREV', 'SK_ID_CURR'],
                                errors='ignore')
sk_id_curr['SK_ID_CURR'] = candidate_ids.max(axis=1).astype(int)

In [None]:
# Attach the resolved client id to every contract row.
curr_id_lookup = sk_id_curr[['SK_ID_PREV', 'SK_ID_CURR']]
debt = pd.merge(debt, curr_id_lookup, how='left', on='SK_ID_PREV')

In [None]:
# Keep only contracts whose PREV_DAYS_TERMINATION is missing.
termination_missing = debt['PREV_DAYS_TERMINATION'].isnull()
debt = debt[termination_missing]

In [None]:
# Revolving contracts only; CARD_DEBT is the row-wise maximum of the three
# card amounts (NaN when all three are missing), and rows without a CARD_DEBT
# are discarded afterwards.
is_revolving = debt['PREV_NAME_CONTRACT_TYPE'] == 'Revolving loans'
card_debt = debt[is_revolving].copy()
card_amount_columns = ['CARD_AMT_BALANCE',
                       'CARD_AMT_CREDIT_LIMIT_ACTUAL',
                       'CARD_AMT_TOTAL_RECEIVABLE']
# No dropna() here: index alignment on assignment would put the NaN rows back
# anyway, and they are removed by the filter on the next line.
card_debt['CARD_DEBT'] = card_debt[card_amount_columns].max(axis=1)
card_debt = card_debt[card_debt['CARD_DEBT'].notnull()]

In [None]:
# Contracts without any card balance record, not typed as revolving, and with
# a known number of future instalments.
card_only_columns = ['CARD_MONTHS_BALANCE',
                     'CARD_AMT_BALANCE',
                     'CARD_AMT_CREDIT_LIMIT_ACTUAL',
                     'CARD_AMT_TOTAL_RECEIVABLE']
is_noncard = (
    debt['CARD_MONTHS_BALANCE'].isnull()
    & (debt['PREV_NAME_CONTRACT_TYPE'] != 'Revolving loans')
    & debt['POS_CNT_INSTALMENT_FUTURE'].notnull()
)
noncard_debt = debt[is_noncard].drop(columns=card_only_columns)

# Future instalments times the annuity; missing factors count as 0.
future_count = noncard_debt['POS_CNT_INSTALMENT_FUTURE'].fillna(0)
annuity = noncard_debt['PREV_AMT_ANNUITY'].fillna(0)
noncard_debt['REMAINED_AMT_PAYMENTS'] = (future_count * annuity).round()

# Scheduled instalment total minus paid total.
noncard_debt['DEBT_PAYMENTS'] = (
    noncard_debt['INST_AMT_INSTALMENT'] - noncard_debt['PAYM_AMT_PAYMENT']
).round()
noncard_debt['TOTAL_DEBT'] = (
    noncard_debt['REMAINED_AMT_PAYMENTS'] + noncard_debt['DEBT_PAYMENTS']
).round()

# Contracts without a known type get the explicit label 'Unknown'.
noncard_debt['PREV_NAME_CONTRACT_TYPE'] = (
    noncard_debt['PREV_NAME_CONTRACT_TYPE'].fillna('Unknown')
)

In [None]:
# Per client: number of distinct card contracts and their summed CARD_DEBT.
card_agg_spec = {'SK_ID_PREV': 'nunique', 'CARD_DEBT': 'sum'}
card_debt_agg = card_debt.groupby('SK_ID_CURR').agg(card_agg_spec)

In [None]:
# Per client, over all non-card contracts: contract count, summed amounts and
# the largest number of future instalments.
noncard_agg_spec = {
    'SK_ID_PREV': 'nunique',
    'REMAINED_AMT_PAYMENTS': 'sum',
    'DEBT_PAYMENTS': 'sum',
    'TOTAL_DEBT': 'sum',
    'POS_CNT_INSTALMENT_FUTURE': 'max',
}
noncard_debt_agg = noncard_debt.groupby('SK_ID_CURR').agg(noncard_agg_spec)

In [None]:
# Same per-client aggregation, restricted to contracts typed 'Cash loans'.
is_cash_loan = noncard_debt['PREV_NAME_CONTRACT_TYPE'] == 'Cash loans'
cash_agg_spec = {
    'SK_ID_PREV': 'nunique',
    'REMAINED_AMT_PAYMENTS': 'sum',
    'DEBT_PAYMENTS': 'sum',
    'TOTAL_DEBT': 'sum',
    'POS_CNT_INSTALMENT_FUTURE': 'max',
}
cash_debt_agg = noncard_debt[is_cash_loan].groupby('SK_ID_CURR').agg(cash_agg_spec)

In [None]:
# Same per-client aggregation, restricted to contracts typed 'Consumer loans'.
is_consumer_loan = noncard_debt['PREV_NAME_CONTRACT_TYPE'] == 'Consumer loans'
cons_agg_spec = {
    'SK_ID_PREV': 'nunique',
    'REMAINED_AMT_PAYMENTS': 'sum',
    'DEBT_PAYMENTS': 'sum',
    'TOTAL_DEBT': 'sum',
    'POS_CNT_INSTALMENT_FUTURE': 'max',
}
cons_debt_agg = noncard_debt[is_consumer_loan].groupby('SK_ID_CURR').agg(cons_agg_spec)

In [None]:
# Same per-client aggregation, restricted to contracts labelled 'Unknown'.
is_unknown_type = noncard_debt['PREV_NAME_CONTRACT_TYPE'] == 'Unknown'
unknown_agg_spec = {
    'SK_ID_PREV': 'nunique',
    'REMAINED_AMT_PAYMENTS': 'sum',
    'DEBT_PAYMENTS': 'sum',
    'TOTAL_DEBT': 'sum',
    'POS_CNT_INSTALMENT_FUTURE': 'max',
}
unknown_agg = noncard_debt[is_unknown_type].groupby('SK_ID_CURR').agg(unknown_agg_spec)

In [None]:
# Bureau table; the path comes from the locator entry ('buro', 'initial').
buro = pd.read_csv(locator.loc['buro', 'initial'])

In [None]:
# Keep credits flagged 'Active' whose DAYS_CREDIT_UPDATE is greater than -60
# (presumably: updated within the last 60 days — confirm the column's sign convention).
is_active = buro['CREDIT_ACTIVE'] == 'Active'
recently_updated = buro['DAYS_CREDIT_UPDATE'] > -60
buro = buro[is_active & recently_updated]

In [None]:
# Split the bureau credits into credit cards and everything else, keeping only
# rows without a DAYS_ENDDATE_FACT value in both parts.
no_fact_enddate = buro['DAYS_ENDDATE_FACT'].isnull()
is_credit_card = buro['CREDIT_TYPE'] == 'Credit card'
buro_cards = buro[is_credit_card & no_fact_enddate]
buro_noncards = buro[~is_credit_card & no_fact_enddate]

In [None]:
# Card debt taken as the larger of the reported debt and the reported limit
# (row-wise maximum, NaN only when both are missing).
buro_card_amounts = buro_cards[['AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT']]
buro_cards['BURO_CARD_DEBT'] = buro_card_amounts.max(axis=1)

In [None]:
# For non-mortgage credits, a DAYS_CREDIT_ENDDATE above 3650 is blanked out.
is_not_mortgage = buro_noncards['CREDIT_TYPE'] != 'Mortgage'
enddate_over_limit = buro_noncards['DAYS_CREDIT_ENDDATE'] > 3650
idx = buro_noncards[is_not_mortgage & enddate_over_limit].index
buro_noncards.loc[idx, 'DAYS_CREDIT_ENDDATE'] = np.nan

In [None]:
# DAYS_CREDIT_ENDDATE divided by 30.
buro_noncards['BURO_CNT_PAYMENTS_REMAINED'] = buro_noncards['DAYS_CREDIT_ENDDATE'].div(30)

In [None]:
# Per client: number of distinct bureau card records and their summed card debt.
buro_cards_spec = {'SK_ID_BUREAU': 'nunique', 'BURO_CARD_DEBT': 'sum'}
buro_cards_agg = buro_cards.groupby('SK_ID_CURR').agg(buro_cards_spec)

In [None]:
# Per client, over all non-card bureau credits: record count, summed debt and
# overdue amounts, and the largest BURO_CNT_PAYMENTS_REMAINED.
buro_noncards_spec = {
    'SK_ID_BUREAU': 'nunique',
    'AMT_CREDIT_SUM_DEBT': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'BURO_CNT_PAYMENTS_REMAINED': 'max',
}
buro_noncards_agg = buro_noncards.groupby('SK_ID_CURR').agg(buro_noncards_spec)

In [None]:
# Same per-client aggregation, restricted to bureau credits typed 'Mortgage'.
is_mortgage = buro_noncards['CREDIT_TYPE'] == 'Mortgage'
buro_mortgage_spec = {
    'SK_ID_BUREAU': 'nunique',
    'AMT_CREDIT_SUM_DEBT': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'BURO_CNT_PAYMENTS_REMAINED': 'max',
}
buro_mortgage_agg = buro_noncards[is_mortgage].groupby('SK_ID_CURR').agg(buro_mortgage_spec)

In [None]:
# Same per-client aggregation, restricted to bureau credits not typed 'Mortgage'.
is_other_loan = buro_noncards['CREDIT_TYPE'] != 'Mortgage'
buro_nonmortgage_spec = {
    'SK_ID_BUREAU': 'nunique',
    'AMT_CREDIT_SUM_DEBT': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'BURO_CNT_PAYMENTS_REMAINED': 'max',
}
buro_nonmortgage_agg = buro_noncards[is_other_loan].groupby('SK_ID_CURR').agg(buro_nonmortgage_spec)

In [None]:
# Positional rename: the names follow the order of the aggregated columns
# (contract count first, summed CARD_DEBT second).
card_debt_agg.columns = ['HC_NUM_CRCARDS', 'HC_CRCARDS_DEBT']

In [None]:
# Positional rename, in the order of the aggregated columns: contract count,
# REMAINED_AMT_PAYMENTS, DEBT_PAYMENTS, TOTAL_DEBT, POS_CNT_INSTALMENT_FUTURE.
noncard_debt_agg.columns = [
    'HC_NUM_LOANS',
    'HC_LOANS_INSTALMENTS_REMAINED',
    'HC_LOANS_OVERDUE',
    'HC_LOANS_DEBT',
    'HC_CNT_FUTURE_LOANS_PAYMENTS',
]

In [None]:
# Positional rename: record count first, summed BURO_CARD_DEBT second.
buro_cards_agg.columns = ['BANKS_NUM_CRCARDS', 'BANKS_CRCARDS_DEBT']

In [None]:
# Positional rename, in the order of the aggregated columns: record count,
# summed debt, summed overdue, max BURO_CNT_PAYMENTS_REMAINED.
buro_mortgage_agg.columns = [
    'BANKS_NUM_MORTGAGES',
    'BANKS_MORTGAGES_DEBT',
    'BANKS_MORTGAGES_OVERDUE',
    'BANKS_CNT_FUTURE_MORTGAGES_PAYMENTS',
]

In [None]:
# Positional rename, in the order of the aggregated columns: record count,
# summed debt, summed overdue, max BURO_CNT_PAYMENTS_REMAINED.
buro_nonmortgage_agg.columns = [
    'BANKS_NUM_LOANS',
    'BANKS_LOANS_DEBT',
    'BANKS_LOANS_OVERDUE',
    'BANKS_CNT_FUTURE_LOANS_PAYMENTS',
]

In [None]:
# Empty frame that shares the index of hc.data; the per-client aggregates are
# joined onto it in the next cell.
total_debt = pd.DataFrame(index = hc.data.index)

In [None]:
# Per-client aggregates to attach, joined one after another on the index so
# the resulting column order matches the order of this list.
debt_feature_blocks = [
    noncard_debt_agg[['HC_NUM_LOANS',
                      'HC_LOANS_DEBT',
                      'HC_LOANS_OVERDUE',
                      'HC_CNT_FUTURE_LOANS_PAYMENTS']],
    card_debt_agg,
    buro_nonmortgage_agg[['BANKS_NUM_LOANS',
                          'BANKS_LOANS_DEBT',
                          'BANKS_LOANS_OVERDUE',
                          'BANKS_CNT_FUTURE_LOANS_PAYMENTS']],
    buro_mortgage_agg[['BANKS_NUM_MORTGAGES',
                       'BANKS_MORTGAGES_DEBT',
                       'BANKS_MORTGAGES_OVERDUE',
                       'BANKS_CNT_FUTURE_MORTGAGES_PAYMENTS']],
    buro_cards_agg,
]
for feature_block in debt_feature_blocks:
    total_debt = total_debt.join(feature_block)

In [None]:
# Totals and shares across Home Credit (HC_*) and bureau (BANKS_*) aggregates.
#
# The component columns are listed explicitly instead of being selected with
# `'DEBT' in c` / `'OVERDUE' in c`: once this cell has run, the derived columns
# (TOTAL_DEBT, HC_DEBT_RATIO, TOTAL_OVERDUE, HC_OVERDUE_RATIO) match those
# substrings as well, so a re-run would silently add them into the totals.
# The lists keep the original column order, so a first run gives the same
# values as before.
debt_component_columns = ['HC_LOANS_DEBT',
                          'HC_CRCARDS_DEBT',
                          'BANKS_LOANS_DEBT',
                          'BANKS_MORTGAGES_DEBT',
                          'BANKS_CRCARDS_DEBT']
overdue_component_columns = ['HC_LOANS_OVERDUE',
                             'BANKS_LOANS_OVERDUE',
                             'BANKS_MORTGAGES_OVERDUE']

# Row-wise sums skip NaN, so clients lacking some component still get a total.
total_debt['TOTAL_DEBT'] = total_debt[debt_component_columns].sum(axis=1)
total_debt['TOTAL_OVERDUE'] = total_debt[overdue_component_columns].sum(axis=1)

# Shares of the totals. A zero total yields NaN (0/0) or +/-inf (x/0) here.
total_debt['HC_DEBT_RATIO'] = total_debt[['HC_LOANS_DEBT',
                                          'HC_CRCARDS_DEBT']].sum(axis=1) / total_debt['TOTAL_DEBT']
total_debt['HC_OVERDUE_RATIO'] = total_debt['HC_LOANS_OVERDUE'] / total_debt['TOTAL_OVERDUE']
total_debt['LOANS_RATIO'] = total_debt[['HC_LOANS_DEBT',
                                        'BANKS_LOANS_DEBT']].sum(axis=1) / total_debt['TOTAL_DEBT']
total_debt['CRCARDS_RATIO'] = total_debt[['HC_CRCARDS_DEBT',
                                          'BANKS_CRCARDS_DEBT']].sum(axis=1) / total_debt['TOTAL_DEBT']
total_debt['MORTGAGE_RATIO'] = total_debt['BANKS_MORTGAGES_DEBT'] / total_debt['TOTAL_DEBT']

# Sum of the HC and bureau future-payment counts for loans.
total_debt['FUTURE_LOANS_NUM_PAYMENTS'] = total_debt[['HC_CNT_FUTURE_LOANS_PAYMENTS',
                                                      'BANKS_CNT_FUTURE_LOANS_PAYMENTS']].sum(axis=1)

In [None]:
# Prefix every feature name with 'DEBT_'.
total_debt.columns = ['DEBT_' + name for name in total_debt.columns]

In [None]:
# Attach the debt features to hc.data; DataFrame.join aligns on the index by default.
hc.data = hc.data.join(total_debt)

In [None]:
# Validate again now that the debt features are part of hc.data and show the
# first cv_score entry (same calls as the earlier baseline cell, for comparison).
hc.validate()
print(hc.cv_score[0])

In [None]:
# Write the debt feature table to the path stored in the locator under ('debt', 'feat_eng').
total_debt.to_csv(locator.loc['debt', 'feat_eng'])