In [1]:
__imp

Libraries have been loaded


# TODOS

## Load Additional Libraries

In [17]:
%reload_ext autoreload
%autoreload 2

from datetime import datetime

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

import lightgbm as lgb
from features import woe

basepath   = os.path.expanduser('../')

# train and validation fold
# TRAIN_PATH = os.path.join(basepath, 'data/processed/application_train_fold.feather')
# TEST_PATH  = os.path.join(basepath, 'data/processed/application_val_fold.feather')

# full training
TRAIN_PATH = os.path.join(basepath, 'data/processed/application_train.feather')
TEST_PATH  = os.path.join(basepath, 'data/processed/application_test.feather')

# MODEL PRESET
MODEL_PRESET   = 'M25'

# DATASET PREFIX
# DATASET_PREFIX = 'tr'
DATASET_PREFIX  = 'train'

## Load Processed Data

In [18]:
%%time

tr = pd.read_feather(f'{TRAIN_PATH}')
te = pd.read_feather(f'{TEST_PATH}')

# Application data from previous loans
bureau       = pd.read_feather(os.path.join(basepath, 'data/processed/bureau.feather'))
bureau_bal   = pd.read_feather(os.path.join(basepath, 'data/processed/bureau_balance.feather'))
prev_app     = pd.read_pickle(os.path.join(basepath, 'data/processed/prev_app.pkl'))
pos_cash     = pd.read_pickle(os.path.join(basepath, 'data/processed/pos_cash.pkl'))
credit_bal   = pd.read_pickle(os.path.join(basepath, 'data/processed/credit_card_balance.pkl'))
installments = pd.read_pickle(os.path.join(basepath, 'data/processed/installments_payments.pkl'))

CPU times: user 599 ms, sys: 1.35 s, total: 1.95 s
Wall time: 12.1 s


In [19]:
# concat training and test set
data   = pd.concat((tr, te))
ntrain = len(tr) 

del tr, te
gc.collect();

## Feature Engineering

In [20]:
AGG_FEATURES = ['mean', 'median', 'max', 'min', 'var', 'sum']

def get_agg_features(data, gp, f, on):
    agg         = gp.groupby(on)[f]\
                        .agg({np.mean, np.median, np.max, np.min, np.var, np.sum}).fillna(-1)
    
    cols        = [f'{AGG_FEATURES[i]}_{f}_{c}' for i, c in enumerate(agg.columns)]
    agg.columns = cols
    agg         = agg.reset_index()
    data        = data.merge(agg, on=on, how='left')
    
    del agg
    gc.collect();
    
    return data, cols    

def log_features(data, features):
    for f in features:
        data.loc[:, f] = data[f].map(lambda x: np.log(x + 1))
    
    return data

In [21]:
# %%time

# deviation in three external scores
data.loc[:, 'EXT_SOURCE_DEV']  = data.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].apply(np.std, axis=1)

# sum of external scores
data.loc[:, 'EXT_SOURCE_SUM'] = data['EXT_SOURCE_1'].fillna(0) + data['EXT_SOURCE_2'].fillna(0) + data['EXT_SOURCE_3'].fillna(0)

# mean of external scores
data.loc[:, 'MEAN_EXTERNAL_SCORE'] = (data['EXT_SOURCE_1'].fillna(0) + data['EXT_SOURCE_2'].fillna(0) + data['EXT_SOURCE_3'].fillna(0)) / 3

# number of null values in an application
data.loc[:, 'num_nulls'] = data.isnull().sum(axis=1)

# feature interactions
data.loc[:, 'EXT_3_1'] = data.loc[:, 'EXT_SOURCE_3'] / data.loc[:, 'EXT_SOURCE_1']
data.loc[:, 'EXT_3_2'] = data.loc[:, 'EXT_SOURCE_3'] / data.loc[:, 'EXT_SOURCE_2']
data.loc[:, 'EXT_2_1'] = data.loc[:, 'EXT_SOURCE_2'] / data.loc[:, 'EXT_SOURCE_1']

# relationship between amount credit and total income
data.loc[:, 'ratio_credit_income'] = data.loc[:, 'AMT_CREDIT'] / data.loc[:, 'AMT_INCOME_TOTAL']

# relationship between annual amount to be paid and income
data.loc[:, 'ratio_annuity_income'] = data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_INCOME_TOTAL']

# relationship between amount annuity and age
data.loc[:, 'ratio_annuity_age'] = data.loc[:, 'AMT_ANNUITY'] / (-data.loc[:, 'DAYS_BIRTH'] / 365)

# number of missing values in an application
data.loc[:, 'num_missing_values'] = data.loc[:, data.columns.drop('TARGET')].isnull().sum(axis=1).values

# feature interaction between age and days employed
data.loc[:, 'age_plus_employed']  = data.loc[:, 'DAYS_BIRTH'] + data.loc[:, 'DAYS_EMPLOYED']
data.loc[:, 'ratio_age_employed'] = (data.DAYS_EMPLOYED) / (data.DAYS_BIRTH)

# ratio of value of goods against which loan is given to total income
data.loc[:, 'ratio_goods_income'] = data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_INCOME_TOTAL']

# feature interaction between value of goods against which loan is given to annual loan amount to be paid
data.loc[:, 'ratio_goods_annuity'] = data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_ANNUITY']
data.loc[:, 'mult_goods_annuity']  = data.loc[:, 'AMT_GOODS_PRICE'] * data.loc[:, 'AMT_ANNUITY']

# feature interaction value of goods and amount credit
data.loc[:, 'ratio_goods_credit'] = (data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_CREDIT']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'mult_goods_credit']  = (data.loc[:, 'AMT_GOODS_PRICE'] * data.loc[:, 'AMT_CREDIT']).replace([np.inf, -np.inf], np.nan)

# feature interaction between annuity and amount credit
data.loc[:, 'ratio_annuity_credit'] = data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT'].replace([np.inf, -np.inf], np.nan)

# feature interaction between amount credit and age
data.loc[:, 'ratio_credit_age'] = data.AMT_CREDIT / (-data.DAYS_BIRTH / 365)

# feature interaction between amount credit and days before application id was changed
data.loc[:, 'ratio_credit_id_change'] = (data.AMT_CREDIT / -data.DAYS_ID_PUBLISH).replace([np.inf, -np.inf], np.nan)

# feature interaction between days id publish and age
data.loc[:, 'ratio_id_change_age'] = (data.DAYS_ID_PUBLISH / (-data.DAYS_BIRTH / 365))

# ratio of annuity and external score
data.loc[:, 'ratio_annuity_score_1'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_1']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_annuity_score_2'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_2']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_annuity_score_3'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_3']).replace([np.inf, -np.inf], np.nan)

# ratio of annuity, credit multiplied by external scores
data.loc[:, 'ratio_credit_annuity_score_1'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_1']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_credit_annuity_score_2'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_2']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_credit_annuity_score_3'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_3']).replace([np.inf, -np.inf], np.nan)


# ratio of owner's car age with his age
data.loc[:, 'ratio_car_person_age'] = (data.OWN_CAR_AGE / -data.DAYS_BIRTH)

# difference credit and income
data.loc[:, 'diff_credit_income'] = data.AMT_CREDIT - data.AMT_INCOME_TOTAL

# difference income total and annuity
data.loc[:, 'diff_income_annuity']  = data.AMT_ANNUITY - data.AMT_INCOME_TOTAL

# difference credit and goods price
data.loc[:, 'diff_credit_goods'] = data.AMT_CREDIT - data.AMT_GOODS_PRICE

# max, mean, std of featur groups related to days before any document was modified or changed
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_max_doc_modified.pkl')):
    data.loc[:, 'max_document_modified'] = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_max_doc_modified.pkl'))
else:
    max_document_modified = data.loc[:, ['DAYS_REGISTRATION',
                                         'DAYS_ID_PUBLISH',
                                         'DAYS_LAST_PHONE_CHANGE'
                                        ]].apply(np.max, axis=1)
    
    max_document_modified.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_max_doc_modified.pkl'))
    data.loc[:, 'max_document_modified'] = max_document_modified
    
    del max_document_modified
    gc.collect();
    
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_mean_doc_modified.pkl')):
    data.loc[:, 'mean_document_modified'] = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_mean_doc_modified.pkl'))
else:
    mean_document_modified = data.loc[:, ['DAYS_REGISTRATION',
                                         'DAYS_ID_PUBLISH',
                                         'DAYS_LAST_PHONE_CHANGE'
                                        ]].apply(np.mean, axis=1)
    
    mean_document_modified.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_mean_doc_modified.pkl'))
    data.loc[:, 'mean_document_modified'] = mean_document_modified
    
    del mean_document_modified
    gc.collect();


if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_std_doc_modified.pkl')):
    data.loc[:, 'std_document_modified'] = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_std_doc_modified.pkl'))
else:
    std_document_modified = data.loc[:, ['DAYS_REGISTRATION',
                                         'DAYS_ID_PUBLISH',
                                         'DAYS_LAST_PHONE_CHANGE'
                                        ]].apply(np.std, axis=1)
    
    std_document_modified.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_std_doc_modified.pkl'))
    data.loc[:, 'std_document_modified'] = std_document_modified
    
    del std_document_modified
    gc.collect();
    

# combine feature groups
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_flag_document.pkl')):
    data.loc[:, 'flag_document_summary'] = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_flag_document.pkl'))
else:
    flag_document_summary = data.loc[:, [f for f in data.columns if 'FLAG' in f]].apply(np.sum, axis=1)
    data.loc[:, 'flag_document_summary'] = flag_document_summary
    
    del flag_document_summary
    gc.collect();
    

if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_amt_reqd_credit.pkl')):
    data.loc[:, 'amt_reqd_summary_summary'] = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_amt_reqd_credit.pkl'))
else:
    amt_reqd_credit_summary = data.loc[:, [f for f in data.columns if 'AMT_REQD_CREDIT' in f]].apply(np.sum, axis=1)
    data.loc[:, 'amt_reqd_summary_summary'] = amt_reqd_credit_summary
    
    del amt_reqd_credit_summary
    gc.collect();

    
##########################################################################################################
#                            BUREAU                                                                      #
##########################################################################################################

# number of previous loans for a particular user
prev_num_loans = bureau.groupby('SK_ID_CURR').size()

# number of previous active credits
num_active_credits = bureau.groupby('SK_ID_CURR')['CREDIT_ACTIVE'].sum()

# aggregation features
data, dc_cols  = get_agg_features(data, bureau, 'DAYS_CREDIT', 'SK_ID_CURR')
data, acm_cols = get_agg_features(data, bureau, 'AMT_CREDIT_SUM', 'SK_ID_CURR')

# logarithm of features
data  = log_features(data, acm_cols)

# mean number of days overdue on any previous credit
mean_days_overdue = bureau.groupby('SK_ID_CURR')['CREDIT_DAY_OVERDUE'].mean()

# mean number of days of CB credit at the time of application
mean_days_credit_end = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].mean()

# mean of maximum amount overdue on any credit line
mean_max_amt_overdue = bureau.groupby('SK_ID_CURR')['AMT_CREDIT_MAX_OVERDUE'].mean().map(lambda x: np.log(x + 1))

# mean of total amount overdue on any credit line
mean_total_amt_overdue = bureau.groupby('SK_ID_CURR')['AMT_CREDIT_SUM_OVERDUE'].mean().map(lambda x: np.log(x + 1))

# sum of num times credit was prolonged
sum_num_times_prolonged = bureau.groupby('SK_ID_CURR')['CNT_CREDIT_PROLONG'].sum()

# number of different types of credit taken from CREDIT BUREAU
num_diff_credits = bureau.groupby('SK_ID_CURR')['CREDIT_TYPE'].nunique()

# mean number of days of last credit update
mean_days_credit_update = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_UPDATE'].mean()    
    
# summary of amount of annuity of credit bureau loans
mean_cb_credit_annuity = bureau.groupby('SK_ID_CURR')['AMT_ANNUITY'].mean().map(lambda x: np.log(x + 1))
std_cb_credit_annuity  = bureau.groupby('SK_ID_CURR')['AMT_ANNUITY'].std().map(lambda x: np.log(x + 1))

# latest application reported to Home Credit
latest_credit = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT'].max()
data.loc[:, 'latest_credit'] = data.SK_ID_CURR.map(latest_credit)

# day before current application date
credit_duration = (bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT).map(np.abs).groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'credit_duration'] = data.SK_ID_CURR.map(credit_duration).replace([np.inf, -np.inf], np.nan)

# deviation in difference between remaining duration of credit and how long before we applied for this credit
diff_prev_curr_credit = bureau.DAYS_CREDIT_ENDDATE.fillna(0) - bureau.DAYS_CREDIT.fillna(0)
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).std()
data.loc[:, 'std_diff_prev_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)


# mean of difference between remaining duration of credit and how long before we applied for this credit
diff_prev_curr_credit = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_prev_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of difference between days since cb credit ended and how long before we applied for current credit
diff_prev_curr_credit = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_ended_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of difference between days last credit ended and remaining duration of credit
diff_prev_curr_credit = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT_ENDDATE
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_prev_remaining_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of ratio of two differences
diff1 = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT
diff2 = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT
diff  = (diff1 / diff2).replace([np.inf, -np.inf], np.nan)
diff  = diff.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'ratio_two_diff'] = data.SK_ID_CURR.map(diff)

# number of null values in days credit end date
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl')):
    num_nulls_enddate = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl'))
else:    
    num_nulls_enddate = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].apply(lambda x: x.isnull().sum())
    num_nulls_enddate.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl'))

data.loc[:, 'num_nulls_enddate'] = data.SK_ID_CURR.map(num_nulls_enddate).fillna(-99).astype(np.int8)

# ratio of debt to total credit sum
ratio_debt_total                = (bureau.AMT_CREDIT_SUM_DEBT / (bureau.AMT_CREDIT_SUM + 1))
ratio_debt_total                = ratio_debt_total.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'ratio_debt_total'] = data.SK_ID_CURR.map(ratio_debt_total)


# merge back with original dataframe
data.loc[:, 'num_prev_loans']           = data.SK_ID_CURR.map(prev_num_loans).fillna(0).values
data.loc[:, 'num_prev_active_credits']  = data.SK_ID_CURR.map(num_active_credits).fillna(0).values
data.loc[:, 'mean_credit_days_overdue'] = data.SK_ID_CURR.map(mean_days_overdue).fillna(0).values
data.loc[:, 'mean_days_credit_end']     = data.SK_ID_CURR.map(mean_days_credit_end).fillna(0).values
data.loc[:, 'mean_max_amt_overdue']     = data.SK_ID_CURR.map(mean_max_amt_overdue).fillna(0).values
data.loc[:, 'mean_total_amt_overdue']   = data.SK_ID_CURR.map(mean_total_amt_overdue).values
data.loc[:, 'sum_num_times_prolonged']  = data.SK_ID_CURR.map(sum_num_times_prolonged).fillna(0).astype(np.int8).values
data.loc[:, 'mean_cb_credit_annuity']   = data.SK_ID_CURR.map(mean_cb_credit_annuity).fillna(0).values
data.loc[:, 'std_cb_credit_annuity']    = data.SK_ID_CURR.map(std_cb_credit_annuity).fillna(0).values
data.loc[:, 'num_diff_credits']         = data.SK_ID_CURR.map(num_diff_credits).fillna(0).values
data.loc[:, 'mean_days_credit_update']  = data.SK_ID_CURR.map(mean_days_credit_update).fillna(0).values

# load ratio credit overdue to credit sum
# ratio_credit_overdue_sum = pd.read_pickle(os.path.join(basepath, 'data/processed/bureau_ratio_overdue_sum_credit.pkl'))
# data.loc[:, 'ratio_cedit_overdue_sum'] = data.SK_ID_CURR.map(ratio_credit_overdue_sum).fillna(-1)

# interaction between credit amount and duration of credit
credit_times_duration = (bureau.AMT_CREDIT_SUM.fillna(0) *\
                         (bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT).map(np.abs))\
                        .replace([np.inf, -np.inf], np.nan)
credit_times_duration = credit_times_duration.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'credit_times_duration'] = data.SK_ID_CURR.map(credit_times_duration)


#############################################################################################################
#                      FEATURE INTERACTIONS                                                                 #
#############################################################################################################

# feature interaction between credit bureau annuity, current annuity and total income
data.loc[:, 'ratio_cb_goods_annuity'] = (data.AMT_GOODS_PRICE / (data.mean_cb_credit_annuity + data.AMT_ANNUITY)).replace([np.inf, -np.inf], np.nan)

# feature interaction between mean days credit update and last time id was changed by user
data.loc[:, 'ratio_update_id']        = (data.mean_days_credit_update / data.DAYS_ID_PUBLISH).replace([np.inf, -np.inf], np.nan)

# feature interaction between mean credit amount of previous credits with current credit
data.loc[:, 'ratio_curr_prev_credit'] = data.AMT_CREDIT / data.mean_AMT_CREDIT_SUM_mean

#############################################################################################################
#                           BUREAU and BUREAU BALANCE                                                       #
#############################################################################################################

prev_bal = bureau.loc[:, ['SK_ID_CURR', 'SK_ID_BUREAU']].merge(bureau_bal,
                                                   on='SK_ID_BUREAU',
                                                   how='left'
                                                  )

mean_status                      = prev_bal.groupby('SK_ID_BUREAU')['STATUS'].mean().fillna(-1)
bureau.loc[:, 'mean_status'] = bureau.SK_ID_BUREAU.map(mean_status).values

mean_status                = bureau.groupby('SK_ID_CURR')['mean_status'].mean()
data.loc[:, 'mean_status'] = data.SK_ID_CURR.map(mean_status).values

# previous loans history
credit_history                = prev_bal.groupby('SK_ID_CURR').size().fillna(0)
data.loc[:, 'credit_history'] = data.SK_ID_CURR.map(credit_history).values 

#############################################################################################################
#                          PREVIOUS APPLICATION                                                             #
#############################################################################################################

# number of previous applications
num_prev_apps                = prev_app.groupby('SK_ID_CURR').size()
data.loc[:, 'num_prev_apps'] = data.SK_ID_CURR.map(num_prev_apps).fillna(0).astype(np.int8) 

# mean amount to be paid annually for previous applications
prev_app_mean_annuity        = prev_app.groupby('SK_ID_CURR')['AMT_ANNUITY'].mean().map(lambda x: np.log(x + 1))
prev_app_mean_annuity        = data.SK_ID_CURR.map(prev_app_mean_annuity)

# ratio of previous annuity to current annuity
data.loc[:, 'ratio_prev_curr_annuity'] = (prev_app_mean_annuity / data.AMT_ANNUITY).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'diff_prev_curr_annuity']  = (prev_app_mean_annuity - data.AMT_ANNUITY).replace([np.inf, -np.inf], np.nan)


# ratio of down payment amount to application amount sum
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl')):
    down_payment_to_application = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl'))
else:    
    down_payment_to_application = prev_app.groupby('SK_ID_CURR').apply(lambda x: (x['AMT_DOWN_PAYMENT'].fillna(0) / x['AMT_APPLICATION']).sum())
    down_payment_to_application.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl'))

data.loc[:, 'down_payment_to_application'] = data.SK_ID_CURR.map(down_payment_to_application)

# mean interest rate on down payments of previous applications
mean_down_payment_rate                = prev_app.groupby('SK_ID_CURR')['RATE_DOWN_PAYMENT'].mean()
data.loc[:, 'mean_down_payment_rate'] = data.SK_ID_CURR.map(mean_down_payment_rate)

# most frequent rejection reason
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl')):
    most_freq_rejection_reason = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl'))
else:
    most_freq_rejection_reason = prev_app.groupby('SK_ID_CURR').apply(lambda x: x.CODE_REJECT_REASON.value_counts().index.values[0])
    most_freq_rejection_reason.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl'))

data.loc[:, 'most_freq_rejection_reason'] = data.SK_ID_CURR.map(most_freq_rejection_reason)

# median amount annuity
median_annuity                = prev_app.groupby('SK_ID_CURR')['AMT_ANNUITY'].median().map(lambda x: np.log(x + 1))
data.loc[:, 'median_annuity'] = data.SK_ID_CURR.map(median_annuity)

# mean of past annuity to credit applications
past_annuity_credit = (prev_app.AMT_ANNUITY / prev_app.AMT_CREDIT).replace([np.inf, -np.inf], np.nan)
past_annuity_credit = past_annuity_credit.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'past_annuity_to_credit'] = data.SK_ID_CURR.map(past_annuity_credit)

# difference between current credit and mean applied amount in previous applications
# mean_applied_amount = prev_app.groupby('SK_ID_CURR')['AMT_APPLICATION'].mean()
# mean_applied_amount = data.SK_ID_CURR.map(mean_applied_amount)
# data.loc[:, 'diff_applied_current_credit'] = mean_applied_amount - data.AMT_CREDIT

# difference of down_payment * rate and annuity
diff_dp_annuity = ((prev_app.AMT_DOWN_PAYMENT * prev_app.RATE_DOWN_PAYMENT) - prev_app.AMT_ANNUITY).replace([np.inf, -np.inf])
diff_dp_annuity = diff_dp_annuity.groupby(prev_app.SK_ID_CURR).sum()
data.loc[:, 'diff_dp_annuity'] = data.SK_ID_CURR.map(diff_dp_annuity)

# mean of decision on last application
mean_last_decision = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'mean_last_decision'] = data.SK_ID_CURR.map(mean_last_decision)

# mean of term of previous credit
mean_prev_credit = prev_app.groupby('SK_ID_CURR')['CNT_PAYMENT'].mean()
data.loc[:, 'mean_prev_credit'] = data.SK_ID_CURR.map(mean_prev_credit)


# deviation in hour, weekday at which previous application process started
dev_hour_process                = prev_app.groupby('SK_ID_CURR')['HOUR_APPR_PROCESS_START'].std()
data.loc[:, 'dev_hour_process'] = data.SK_ID_CURR.map(dev_hour_process)

dev_weekday_process                = prev_app.groupby('SK_ID_CURR')['WEEKDAY_APPR_PROCESS_START'].std()
data.loc[:, 'dev_weekday_process'] = data.SK_ID_CURR.map(dev_weekday_process)

# mean hour, weekday at which previous application process started
dev_hour_process                 = prev_app.groupby('SK_ID_CURR')['HOUR_APPR_PROCESS_START'].mean()
data.loc[:, 'mean_hour_process'] = data.SK_ID_CURR.map(dev_hour_process)

dev_weekday_process                 = prev_app.groupby('SK_ID_CURR')['WEEKDAY_APPR_PROCESS_START'].mean()
data.loc[:, 'mean_weekday_process'] = data.SK_ID_CURR.map(dev_weekday_process)

# mean days before applicaiton was made
prev_app_decision                = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'prev_app_decision'] = data.SK_ID_CURR.map(prev_app_decision)

# difference between termination of credit and day decision was made
diff_termination_decision                = prev_app.DAYS_TERMINATION.replace({365243: np.nan}) - prev_app.DAYS_DECISION
diff_termination_decision                = diff_termination_decision.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'diff_termination_decision'] = data.SK_ID_CURR.map(diff_termination_decision)

# ratio of amt annuity and amt goods price
ratio_prev_annuity_goods                = (prev_app.AMT_ANNUITY / prev_app.AMT_GOODS_PRICE).replace([np.inf, -np.inf], np.nan)
ratio_prev_annuity_goods                = ratio_prev_annuity_goods.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'mean_prev_annuity_goods']  = data.SK_ID_CURR.map(ratio_prev_annuity_goods)

# max of ratio of amt annuity to amt goods price
ratio_prev_annuity_goods                = (prev_app.AMT_ANNUITY / prev_app.AMT_GOODS_PRICE).replace([np.inf, -np.inf], np.nan)
ratio_prev_annuity_goods                = ratio_prev_annuity_goods.groupby(prev_app.SK_ID_CURR).max()
data.loc[:, 'max_prev_annuity_goods'] = data.SK_ID_CURR.map(ratio_prev_annuity_goods)


# max of ratio of amt annuity to amt_credit_sum
ratio_prev_annuity_credit                = (prev_app.AMT_ANNUITY / prev_app.AMT_CREDIT).replace([np.inf, -np.inf], np.nan)
ratio_prev_annuity_credit                = ratio_prev_annuity_goods.groupby(prev_app.SK_ID_CURR).max()
data.loc[:, 'max_prev_annuity_credit']   = data.SK_ID_CURR.map(ratio_prev_annuity_credit)

###############################################################################################################
#                                 POS CASH                                                                    #
###############################################################################################################
data, cif_cols = get_agg_features(data, pos_cash, 'CNT_INSTALMENT_FUTURE', 'SK_ID_CURR')

# mean of term of previous credits
mean_term = pos_cash.groupby('SK_ID_CURR')['CNT_INSTALMENT'].mean()
data.loc[:, 'mean_term'] = data.SK_ID_CURR.map(mean_term)

# total number of installments
total_installments                = pos_cash.CNT_INSTALMENT + pos_cash.CNT_INSTALMENT_FUTURE
total_installments                = total_installments.groupby(pos_cash.SK_ID_CURR).sum()
data.loc[:, 'total_installments'] = data.SK_ID_CURR.map(total_installments)

# ratio of paid to unpaid number of installments
ratio_paid_unpaid = (pos_cash.CNT_INSTALMENT_FUTURE / pos_cash.CNT_INSTALMENT).replace([np.inf, -np.inf], np.nan)
ratio_paid_unpaid = ratio_paid_unpaid.groupby(pos_cash.SK_ID_CURR).mean()
data.loc[:, 'ratio_paid_unpaid'] = data.SK_ID_CURR.map(ratio_paid_unpaid)

##############################################################################################################
#                                Credit Card Balance                                                         #
##############################################################################################################

# mean of amount balance during previous payments
mean_amt_balance = credit_bal.groupby('SK_ID_CURR')['AMT_BALANCE'].mean()
data.loc[:, 'mean_amt_balance'] = data.SK_ID_CURR.map(mean_amt_balance)

# mean of actual credit limit
mean_credit_limit = credit_bal.groupby('SK_ID_CURR')['AMT_CREDIT_LIMIT_ACTUAL'].mean()
data.loc[:, 'mean_credit_limit'] = data.SK_ID_CURR.map(mean_credit_limit)

# total paid installments on previous credit
total_paid_installments = credit_bal.groupby('SK_ID_CURR')['CNT_INSTALMENT_MATURE_CUM'].sum()
data.loc[:, 'total_paid_installments'] = data.SK_ID_CURR.map(total_paid_installments)

# mean total drawings
mean_total_drawings                = credit_bal.groupby('SK_ID_CURR')['AMT_DRAWINGS_CURRENT'].mean()
data.loc[:, 'mean_total_drawings'] = data.SK_ID_CURR.map(mean_total_drawings)

# sum of diff between balance and credit limit
diff_bal_credit   = credit_bal.AMT_BALANCE - credit_bal.AMT_CREDIT_LIMIT_ACTUAL
diff_bal_credit   = diff_bal_credit.groupby(credit_bal.SK_ID_CURR).sum()

data.loc[:, 'diff_bal_credit'] = data.SK_ID_CURR.map(diff_bal_credit)

# mean of ratio of balance and credit limit
ratio_bal_credit = credit_bal.AMT_BALANCE / credit_bal.AMT_CREDIT_LIMIT_ACTUAL
ratio_bal_credit = ratio_bal_credit.groupby(credit_bal.SK_ID_CURR).mean()

data.loc[:, 'ratio_bal_credit'] = data.SK_ID_CURR.map(ratio_bal_credit)

# aggregate features for MONTHS_BALANCE
data, mb_cols = get_agg_features(data, credit_bal, 'MONTHS_BALANCE', 'SK_ID_CURR')


# ratio of minimum installment on credit card with amount balance
ratio_min_installment_balance                = (credit_bal.AMT_BALANCE / credit_bal.AMT_INST_MIN_REGULARITY).replace([np.inf, -np.inf], np.nan)
ratio_min_installment_balance                = ratio_min_installment_balance.groupby(credit_bal.SK_ID_CURR).mean()
data.loc[:, 'ratio_min_installment_balance'] = data.SK_ID_CURR.map(ratio_min_installment_balance)

# difference of minimum installment on credit card with amount balance
diff_min_installment_balance                = (credit_bal.AMT_BALANCE - credit_bal.AMT_INST_MIN_REGULARITY).replace([np.inf, -np.inf], np.nan)
diff_min_installment_balance                = diff_min_installment_balance.groupby(credit_bal.SK_ID_CURR).mean().map(lambda x: np.log(x + 1))
data.loc[:, 'diff_min_installment_balance'] = data.SK_ID_CURR.map(diff_min_installment_balance)


###############################################################################################################
#                                  Installment Payments                                                       #
###############################################################################################################

# mean installment
mean_installment                = installments.groupby('SK_ID_CURR')['AMT_INSTALMENT'].mean()
data.loc[:, 'mean_installment'] = data.SK_ID_CURR.map(mean_installment)

# mean payment against installment
data, ap_cols               = get_agg_features(data, installments, 'AMT_PAYMENT', 'SK_ID_CURR')

# difference between actual day of installment versus when it was supposed to be paid

diff_actual_decided = -(installments.DAYS_ENTRY_PAYMENT - installments.DAYS_INSTALMENT)
diff_actual_decided = diff_actual_decided.groupby(installments.SK_ID_CURR).mean()

data.loc[:, 'diff_actual_decided'] = data.SK_ID_CURR.map(diff_actual_decided)

# ratio of installment to be paid versus actual amount paid

res = (installments.AMT_INSTALMENT / installments.AMT_PAYMENT).replace([np.inf, -np.inf], np.nan)
res = res.groupby(installments.SK_ID_CURR).mean()

data.loc[:, 'ratio_actual_decided_amount'] = data.SK_ID_CURR.map(res)

###############################################################################################################
#                                Previous and Current Application                                             #   
###############################################################################################################

# ratio of mean of amount credit sum from previous applications and current amount credit sum
prev_amt_credit_mean = prev_app.groupby('SK_ID_CURR')['AMT_CREDIT'].mean()
prev_amt_credit_mean = data.SK_ID_CURR.map(prev_amt_credit_mean)

data.loc[:, 'ratio_prev_curr_credit'] = (data.AMT_CREDIT / prev_amt_credit_mean)\
                                             .replace([np.inf, -np.inf], np.nan)


# diff (amt_annuity / amt_credit) current and previous application
ratio_annuity_credit = (prev_app.AMT_ANNUITY / prev_app.AMT_CREDIT).replace([np.inf, -np.inf], np.nan)
ratio_annuity_credit = ratio_annuity_credit.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'diff_annuity_credit_curr_prev'] = data.ratio_annuity_credit - ratio_annuity_credit


# diff (amt_goods_price / amt_credit_sum) current and previous application
ratio_goods_credit = (prev_app.AMT_GOODS_PRICE / prev_app.AMT_CREDIT).replace([np.inf, -np.inf], np.nan)
ratio_goods_credit = ratio_goods_credit.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'diff_credit_goods_curr_prev'] = data.ratio_goods_credit - ratio_goods_credit


###############################################################################################################
#                               Previous Application and Bureau                                               #
###############################################################################################################


# difference between day decision was made and remaining duration of CB Credit
dcn = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].mean()
dd  = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'diff_decision_credit_end'] = data.SK_ID_CURR.map(dd - dcn)

dcu = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_UPDATE'].mean() 
dd  = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'diff_decision_update'] = data.SK_ID_CURR.map(dd - dcu)

df  = bureau.groupby('SK_ID_CURR')['DAYS_ENDDATE_FACT'].mean() 
dd  = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'diff_decision_fact'] = data.SK_ID_CURR.map(dd - df)



    

# delete all intermediatory variables
del prev_num_loans, num_active_credits, bureau
del mean_days_overdue, mean_days_credit_end
del mean_max_amt_overdue, sum_num_times_prolonged
del mean_cb_credit_annuity, std_cb_credit_annuity
del num_diff_credits, mean_days_credit_update
del mean_status, prev_bal, credit_history
del num_prev_apps
del prev_app, down_payment_to_application
del mean_down_payment_rate, most_freq_rejection_reason
del median_annuity
del latest_credit, credit_duration
del mean_total_amt_overdue, credit_times_duration
del past_annuity_credit, mean_last_decision
del credit_bal, mean_amt_balance, mean_credit_limit
del total_paid_installments, mean_total_drawings
del diff_bal_credit, diff_prev_curr_credit
del diff1, diff2, diff, num_nulls_enddate
del ratio_debt_total, ratio_min_installment_balance
del diff_min_installment_balance
del total_installments, prev_amt_credit_mean
del dev_weekday_process, dev_hour_process
del prev_app_decision, diff_termination_decision
del ratio_prev_annuity_goods, dcn, dd, dcu, df
del ratio_prev_annuity_credit, diff_actual_decided

gc.collect();

In [22]:
# replace feature values with frequency less 20 with -100
for f in data.select_dtypes(include=['int8']).columns:
    if data[f].nunique() > 10:        
        low_freq_values = data[f].value_counts()
        low_freq_values = low_freq_values[low_freq_values < 20].index.values
        
        if len(low_freq_values) > 0:
            print('Feature: {}'.format(f))
            data.loc[data[f].isin(low_freq_values), f] = -100

Feature: CNT_CHILDREN
Feature: num_nulls_enddate
Feature: num_prev_apps


In [23]:
(data.isnull().sum() / len(data)).sort_values(ascending=False)

amt_reqd_summary_summary         1.000000
ratio_min_installment_balance    0.803803
diff_min_installment_balance     0.713548
ratio_bal_credit                 0.712010
median_MONTHS_BALANCE_median     0.709315
mean_amt_balance                 0.709315
diff_bal_credit                  0.709315
mean_MONTHS_BALANCE_mean         0.709315
mean_credit_limit                0.709315
max_MONTHS_BALANCE_amax          0.709315
min_MONTHS_BALANCE_var           0.709315
var_MONTHS_BALANCE_amin          0.709315
sum_MONTHS_BALANCE_sum           0.709315
mean_total_drawings              0.709315
total_paid_installments          0.709315
COMMONAREA_MODE                  0.697141
COMMONAREA_MEDI                  0.697141
COMMONAREA_AVG                   0.697141
NONLIVINGAPARTMENTS_MEDI         0.692933
NONLIVINGAPARTMENTS_AVG          0.692933
NONLIVINGAPARTMENTS_MODE         0.692933
LIVINGAPARTMENTS_MODE            0.682037
LIVINGAPARTMENTS_MEDI            0.682037
LIVINGAPARTMENTS_AVG             0

In [24]:
# unpack to train and test
tr = data.iloc[:ntrain]
te = data.iloc[ntrain:]

del data
gc.collect();

## Modelling

In [25]:
COLS_TO_REMOVE = [
                  'FLAG_DOCUMENT_21',
                  'FLAG_DOCUMENT_4',
                  'FLAG_MOBIL',
                  'FLAG_DOCUMENT_2',
                  'FLAG_DOCUMENT_20',
                  'FLAG_DOCUMENT_9',
                  'FLAG_DOCUMENT_17',
                  'FLAG_DOCUMENT_19',
                  'FLAG_DOCUMENT_5',
                  'FLAG_CONT_MOBILE',
                  'FLAG_DOCUMENT_10',
                  'HOUSETYPE_MODE',
                  'FLAG_DOCUMENT_12',
                  'FLAG_DOCUMENT_7',
                  'FLAG_DOCUMENT_11',
                  'AMT_REQ_CREDIT_BUREAU_HOUR',
                  'LIVE_REGION_NOT_WORK_REGION',
                  'AMT_REQ_CREDIT_BUREAU_DAY',
                  'FLAG_DOCUMENT_15',
                  'EMERGENCYSTATE_MODE',
                  'REG_REGION_NOT_LIVE_REGION',
                  'SK_ID_CURR',
                  'FLAG_EMP_PHONE',
                  'REG_REGION_NOT_WORK_REGION',
                  'FLAG_DOCUMENT_14',
                  'FLAG_DOCUMENT_6',
                  'TARGET'
                ]

In [26]:
features = [f for f in tr.columns if f not in COLS_TO_REMOVE]

Xtr  = tr.loc[:, features]
ytr  = tr.loc[:, 'TARGET']

Xval = te.loc[:, features]
# yval = te.loc[:, 'TARGET'] # only execute during validation phase

del tr, te
gc.collect();

In [27]:
print('Number of features used in the model are: {}'.format(len(features)))

Number of features used in the model are: 228


### Validation

In [15]:
params = {
    'objective': 'binary',
    'learning_rate': .02,
    'metric': 'auc',
    'min_data_in_leaf': 100,
    'num_leaves': 64,
    'feature_fraction': .65,
    'bagging_fraction': .8,
    'lambda_l1': 5,
    'lambda_l2': 5,
    'min_child_weight': 1.,
    'nthread': 4
}

ltrain = lgb.Dataset(Xtr, ytr, feature_name=features)
lval   = lgb.Dataset(Xval, yval, feature_name=features)

valid_sets  = [ltrain, lval]
valid_names = ['train', 'val']

num_boost_round       = 5000
early_stopping_rounds = 100

m = lgb.train(params, 
              ltrain, 
              num_boost_round, 
              valid_sets=valid_sets, 
              valid_names=valid_names, 
              early_stopping_rounds=early_stopping_rounds, 
              verbose_eval=20)

Training until validation scores don't improve for 100 rounds.
[20]	train's auc: 0.737369	val's auc: 0.726536
[40]	train's auc: 0.744584	val's auc: 0.730858
[60]	train's auc: 0.752315	val's auc: 0.736088
[80]	train's auc: 0.758626	val's auc: 0.7406
[100]	train's auc: 0.764957	val's auc: 0.745562
[120]	train's auc: 0.771259	val's auc: 0.749946
[140]	train's auc: 0.778105	val's auc: 0.754607
[160]	train's auc: 0.784398	val's auc: 0.758657
[180]	train's auc: 0.790219	val's auc: 0.762379
[200]	train's auc: 0.795905	val's auc: 0.765977
[220]	train's auc: 0.801274	val's auc: 0.769291
[240]	train's auc: 0.805945	val's auc: 0.771581
[260]	train's auc: 0.810492	val's auc: 0.773897
[280]	train's auc: 0.814734	val's auc: 0.77548
[300]	train's auc: 0.818672	val's auc: 0.776952
[320]	train's auc: 0.822331	val's auc: 0.778151
[340]	train's auc: 0.825982	val's auc: 0.779214
[360]	train's auc: 0.82952	val's auc: 0.780266
[380]	train's auc: 0.832878	val's auc: 0.781104
[400]	train's auc: 0.836236	val's

`[1068]	train's auc: 0.910518	val's auc: 0.787271`

### Feature Importance

In [16]:
# feature importance df
feat_imp = pd.DataFrame({'features': features,
                         'imp': m.feature_importance()
                        })

feat_imp.sort_values(by='imp', ascending=False)

Unnamed: 0,features,imp
113,ratio_annuity_credit,1213
219,sum_AMT_PAYMENT_sum,1041
96,EXT_SOURCE_SUM,970
20,DAYS_BIRTH,966
152,ratio_debt_total,963
34,EXT_SOURCE_2,909
220,diff_actual_decided,853
35,EXT_SOURCE_3,845
198,ratio_paid_unpaid,824
221,ratio_actual_decided_amount,726


In [15]:
feat_imp[feat_imp.features == 'ratio_actual_decided_amount']

Unnamed: 0,features,imp
219,ratio_actual_decided_amount,422


### OOF Predictions

In [19]:
%%time

HOLDOUT_SCORE  = 0.787271 

oof_preds = m.predict(Xval)
joblib.dump(oof_preds, os.path.join(basepath, f'data/oof_sub/{MODEL_PRESET}_{HOLDOUT_SCORE}_preds.pkl'))

CPU times: user 17.5 s, sys: 156 ms, total: 17.7 s
Wall time: 5.3 s


### Full Training

In [28]:
%%time

params = {
    'objective': 'binary',
    'learning_rate': (.02 / 1.2),
    'metric': 'auc',
    'min_data_in_leaf': 100,
    'num_leaves': 64,
    'feature_fraction': .65,
    'bagging_fraction': .8,
    'lambda_l1': 5,
    'lambda_l2': 5,
    'min_child_weight': 1.,
    'nthread': 4
}

BEST_ITERATION  = 1068

num_boost_round = int(BEST_ITERATION * 1.2)
ltrain          = lgb.Dataset(Xtr, ytr, feature_name=features)

m           = lgb.train(params, ltrain, num_boost_round)
final_preds = m.predict(Xval)

CPU times: user 30min 1s, sys: 2.88 s, total: 30min 4s
Wall time: 7min 49s


In [29]:
HOLDOUT_SCORE  = 0.787271

sub_identifier = "%s-%s-%.5f" % (datetime.now().strftime('%Y%m%d-%H%M'), MODEL_PRESET, HOLDOUT_SCORE)

sub           = pd.read_csv('../data/raw/sample_submission.csv.zip')
sub['TARGET'] = final_preds

sub.to_csv(os.path.join(basepath, 'submissions/%s.csv'%(sub_identifier)), index=False)