In [1]:
__imp

Libraries have been loaded


# TODOS

[ x ] Mean, Median, Std, Max, Min for every real valued feature.

## Load Additional Libraries

In [2]:
%reload_ext autoreload
%autoreload 2

from datetime import datetime

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

import lightgbm as lgb
from features import woe

basepath   = os.path.expanduser('../')

# train and validation fold
# TRAIN_PATH = os.path.join(basepath, 'data/processed/application_train_fold.feather')
# TEST_PATH  = os.path.join(basepath, 'data/processed/application_val_fold.feather')

# full training
TRAIN_PATH = os.path.join(basepath, 'data/processed/application_train.feather')
TEST_PATH  = os.path.join(basepath, 'data/processed/application_test.feather')

# MODEL PRESET
MODEL_PRESET   = 'M20'

# DATASET PREFIX
# DATASET_PREFIX = 'tr'
DATASET_PREFIX  = 'train'

## Load Processed Data

In [3]:
%%time

tr = pd.read_feather(f'{TRAIN_PATH}')
te = pd.read_feather(f'{TEST_PATH}')

# Application data from previous loans
bureau       = pd.read_feather(os.path.join(basepath, 'data/processed/bureau.feather'))
bureau_bal   = pd.read_feather(os.path.join(basepath, 'data/processed/bureau_balance.feather'))
prev_app     = pd.read_pickle(os.path.join(basepath, 'data/processed/prev_app.pkl'))
pos_cash     = pd.read_pickle(os.path.join(basepath, 'data/processed/pos_cash.pkl'))
credit_bal   = pd.read_pickle(os.path.join(basepath, 'data/processed/credit_card_balance.pkl'))
installments = pd.read_pickle(os.path.join(basepath, 'data/processed/installments_payments.pkl'))

CPU times: user 603 ms, sys: 1.54 s, total: 2.14 s
Wall time: 11 s


In [4]:
# concat training and test set
data   = pd.concat((tr, te))
ntrain = len(tr) 

del tr, te
gc.collect();

## Feature Engineering

In [5]:
AGG_FEATURES = ['mean', 'median', 'max', 'min', 'var', 'sum']

def get_agg_features(data, gp, f, on):
    agg         = gp.groupby(on)[f]\
                        .agg({np.mean, np.median, np.max, np.min, np.var, np.sum}).fillna(-1)
    
    cols        = [f'{AGG_FEATURES[i]}_{f}_{c}' for i, c in enumerate(agg.columns)]
    agg.columns = cols
    agg         = agg.reset_index()
    data        = data.merge(agg, on=on, how='left')
    
    del agg
    gc.collect();
    
    return data, cols    

def log_features(data, features):
    for f in features:
        data.loc[:, f] = data[f].map(lambda x: np.log(x + 1))
    
    return data

In [6]:
%%time

# deviation in three external scores
data.loc[:, 'EXT_SOURCE_DEV']  = data.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].apply(np.std, axis=1)

# sum of external scores
data.loc[:, 'EXT_SOURCE_SUM'] = data['EXT_SOURCE_1'].fillna(0) + data['EXT_SOURCE_2'].fillna(0) + data['EXT_SOURCE_3'].fillna(0)

# mean of external scores
data.loc[:, 'MEAN_EXTERNAL_SCORE'] = (data['EXT_SOURCE_1'].fillna(0) + data['EXT_SOURCE_2'].fillna(0) + data['EXT_SOURCE_3'].fillna(0)) / 3

# feature interactions
data.loc[:, 'EXT_3_1'] = data.loc[:, 'EXT_SOURCE_3'] / data.loc[:, 'EXT_SOURCE_1']
data.loc[:, 'EXT_3_2'] = data.loc[:, 'EXT_SOURCE_3'] / data.loc[:, 'EXT_SOURCE_2']
data.loc[:, 'EXT_2_1'] = data.loc[:, 'EXT_SOURCE_2'] / data.loc[:, 'EXT_SOURCE_1']

# relationship between amount credit and total income
data.loc[:, 'ratio_credit_income'] = data.loc[:, 'AMT_CREDIT'].map(np.log1p) / data.loc[:, 'AMT_INCOME_TOTAL'].map(np.log1p)

# relationship between annual amount to be paid and income
data.loc[:, 'ratio_annuity_income'] = data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_INCOME_TOTAL']

# relationship between amount annuity and age
data.loc[:, 'ratio_annuity_age'] = data.loc[:, 'AMT_ANNUITY'] / (-data.loc[:, 'DAYS_BIRTH'] / 365)

# number of missing values in an application
data.loc[:, 'num_missing_values'] = data.loc[:, data.columns.drop('TARGET')].isnull().sum(axis=1).values

# feature interaction between age and days employed
data.loc[:, 'age_plus_employed']  = data.loc[:, 'DAYS_BIRTH'] + data.loc[:, 'DAYS_EMPLOYED']
data.loc[:, 'ratio_age_employed'] = (data.DAYS_EMPLOYED) / (data.DAYS_BIRTH)

# ratio of value of goods against which loan is given to total income
data.loc[:, 'ratio_goods_income'] = data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_INCOME_TOTAL']

# feature interaction between value of goods against which loan is given to annual loan amount to be paid
data.loc[:, 'ratio_goods_annuity'] = data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_ANNUITY']
data.loc[:, 'mult_goods_annuity']  = data.loc[:, 'AMT_GOODS_PRICE'] * data.loc[:, 'AMT_ANNUITY']

# feature interaction value of goods and amount credit
data.loc[:, 'ratio_goods_credit'] = (data.loc[:, 'AMT_GOODS_PRICE'] / data.loc[:, 'AMT_CREDIT']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'mult_goods_credit']  = (data.loc[:, 'AMT_GOODS_PRICE'] * data.loc[:, 'AMT_CREDIT']).replace([np.inf, -np.inf], np.nan)

# feature interaction between annuity and amount credit
data.loc[:, 'ratio_annuity_credit'] = data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT'].replace([np.inf, -np.inf], np.nan)

# feature interaction between amount credit and age
data.loc[:, 'ratio_credit_age'] = data.AMT_CREDIT / (-data.DAYS_BIRTH / 365)

# feature interaction between amount credit and days before application id was changed
data.loc[:, 'ratio_credit_id_change'] = (data.AMT_CREDIT / -data.DAYS_ID_PUBLISH).replace([np.inf, -np.inf], np.nan)

# feature interaction between days id publish and age
data.loc[:, 'ratio_id_change_age'] = (data.DAYS_ID_PUBLISH / (-data.DAYS_BIRTH / 365))

# ratio of annuity and external score
data.loc[:, 'ratio_annuity_score_1'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_1']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_annuity_score_2'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_2']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_annuity_score_3'] = (data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'EXT_SOURCE_3']).replace([np.inf, -np.inf], np.nan)

# ratio of annuity, credit multiplied by external scores
data.loc[:, 'ratio_credit_annuity_score_1'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_1']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_credit_annuity_score_2'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_2']).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'ratio_credit_annuity_score_3'] = ((data.loc[:, 'AMT_ANNUITY'] / data.loc[:, 'AMT_CREDIT']) * data.loc[:, 'EXT_SOURCE_3']).replace([np.inf, -np.inf], np.nan)


# ratio of owner's car age with his age
data.loc[:, 'ratio_car_person_age'] = (data.OWN_CAR_AGE / -data.DAYS_BIRTH)


##########################################################################################################
#                            BUREAU                                                                      #
##########################################################################################################

# number of previous loans for a particular user
prev_num_loans = bureau.groupby('SK_ID_CURR').size()

# number of previous active credits
num_active_credits = bureau.groupby('SK_ID_CURR')['CREDIT_ACTIVE'].sum()

# aggregation features
data, dc_cols  = get_agg_features(data, bureau, 'DAYS_CREDIT', 'SK_ID_CURR')
data, acm_cols = get_agg_features(data, bureau, 'AMT_CREDIT_SUM', 'SK_ID_CURR')

# logarithm of features
data  = log_features(data, acm_cols)

# mean number of days overdue on any previous credit
mean_days_overdue = bureau.groupby('SK_ID_CURR')['CREDIT_DAY_OVERDUE'].mean()

# mean number of days of CB credit at the time of application
mean_days_credit_end = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].mean()

# mean of maximum amount overdue on any credit line
mean_max_amt_overdue = bureau.groupby('SK_ID_CURR')['AMT_CREDIT_MAX_OVERDUE'].mean().map(lambda x: np.log(x + 1))

# mean of total amount overdue on any credit line
mean_total_amt_overdue = bureau.groupby('SK_ID_CURR')['AMT_CREDIT_SUM_OVERDUE'].mean().map(lambda x: np.log(x + 1))

# sum of num times credit was prolonged
sum_num_times_prolonged = bureau.groupby('SK_ID_CURR')['CNT_CREDIT_PROLONG'].sum()

# number of different types of credit taken from CREDIT BUREAU
num_diff_credits = bureau.groupby('SK_ID_CURR')['CREDIT_TYPE'].nunique()

# mean number of days of last credit update
mean_days_credit_update = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_UPDATE'].mean()    
    
# summary of amount of annuity of credit bureau loans
mean_cb_credit_annuity = bureau.groupby('SK_ID_CURR')['AMT_ANNUITY'].mean().map(lambda x: np.log(x + 1))
std_cb_credit_annuity  = bureau.groupby('SK_ID_CURR')['AMT_ANNUITY'].std().map(lambda x: np.log(x + 1))

# latest application reported to Home Credit
latest_credit = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT'].max()
data.loc[:, 'latest_credit'] = data.SK_ID_CURR.map(latest_credit)

# day before current application date
credit_duration = (bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT).map(np.abs).groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'credit_duration'] = data.SK_ID_CURR.map(credit_duration).replace([np.inf, -np.inf], np.nan)

# deviation in difference between remaining duration of credit and how long before we applied for this credit
diff_prev_curr_credit = bureau.DAYS_CREDIT_ENDDATE.fillna(0) - bureau.DAYS_CREDIT.fillna(0)
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).std()
data.loc[:, 'std_diff_prev_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)


# mean of difference between remaining duration of credit and how long before we applied for this credit
diff_prev_curr_credit = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_prev_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of difference between days since cb credit ended and how long before we applied for current credit
diff_prev_curr_credit = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_ended_curr_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of difference between days last credit ended and remaining duration of credit
diff_prev_curr_credit = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT_ENDDATE
diff_prev_curr_credit = diff_prev_curr_credit.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'mean_diff_prev_remaining_credit'] = data.SK_ID_CURR.map(diff_prev_curr_credit)

# mean of ratio of two differences
diff1 = bureau.DAYS_ENDDATE_FACT - bureau.DAYS_CREDIT
diff2 = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT
diff  = (diff1 / diff2).replace([np.inf, -np.inf], np.nan)
diff  = diff.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'ratio_two_diff'] = data.SK_ID_CURR.map(diff)

# number of null values in days credit end date
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl')):
    num_nulls_enddate = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl'))
else:    
    num_nulls_enddate = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].apply(lambda x: x.isnull().sum())
    num_nulls_enddate.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_num_nulls_enddate.pkl'))

data.loc[:, 'num_nulls_enddate'] = data.SK_ID_CURR.map(num_nulls_enddate).fillna(-99).astype(np.int8)

# ratio of debt to total credit sum
ratio_debt_total                = (bureau.AMT_CREDIT_SUM_DEBT / (bureau.AMT_CREDIT_SUM + 1))
ratio_debt_total                = ratio_debt_total.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'ratio_debt_total'] = data.SK_ID_CURR.map(ratio_debt_total)


# merge back with original dataframe
data.loc[:, 'num_prev_loans']           = data.SK_ID_CURR.map(prev_num_loans).fillna(0).values
data.loc[:, 'num_prev_active_credits']  = data.SK_ID_CURR.map(num_active_credits).fillna(0).values
data.loc[:, 'mean_credit_days_overdue'] = data.SK_ID_CURR.map(mean_days_overdue).fillna(0).values
data.loc[:, 'mean_days_credit_end']     = data.SK_ID_CURR.map(mean_days_credit_end).fillna(0).values
data.loc[:, 'mean_max_amt_overdue']     = data.SK_ID_CURR.map(mean_max_amt_overdue).fillna(0).values
data.loc[:, 'mean_total_amt_overdue']   = data.SK_ID_CURR.map(mean_total_amt_overdue).values
data.loc[:, 'sum_num_times_prolonged']  = data.SK_ID_CURR.map(sum_num_times_prolonged).fillna(0).astype(np.int8).values
data.loc[:, 'mean_cb_credit_annuity']   = data.SK_ID_CURR.map(mean_cb_credit_annuity).fillna(0).values
data.loc[:, 'std_cb_credit_annuity']    = data.SK_ID_CURR.map(std_cb_credit_annuity).fillna(0).values
data.loc[:, 'num_diff_credits']         = data.SK_ID_CURR.map(num_diff_credits).fillna(0).values
data.loc[:, 'mean_days_credit_update']  = data.SK_ID_CURR.map(mean_days_credit_update).fillna(0).values

# load ratio credit overdue to credit sum
# ratio_credit_overdue_sum = pd.read_pickle(os.path.join(basepath, 'data/processed/bureau_ratio_overdue_sum_credit.pkl'))
# data.loc[:, 'ratio_cedit_overdue_sum'] = data.SK_ID_CURR.map(ratio_credit_overdue_sum).fillna(-1)

# interaction between credit amount and duration of credit
credit_times_duration = (bureau.AMT_CREDIT_SUM.fillna(0) *\
                         (bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT).map(np.abs))\
                        .replace([np.inf, -np.inf], np.nan)
credit_times_duration = credit_times_duration.groupby(bureau.SK_ID_CURR).mean()
data.loc[:, 'credit_times_duration'] = data.SK_ID_CURR.map(credit_times_duration)


#############################################################################################################
#                      FEATURE INTERACTIONS                                                                 #
#############################################################################################################

# feature interaction between credit bureau annuity, current annuity and total income
data.loc[:, 'ratio_cb_goods_annuity'] = (data.AMT_GOODS_PRICE / (data.mean_cb_credit_annuity + data.AMT_ANNUITY)).replace([np.inf, -np.inf], np.nan)

# feature interaction between mean days credit update and last time id was changed by user
data.loc[:, 'ratio_update_id']        = (data.mean_days_credit_update / data.DAYS_ID_PUBLISH).replace([np.inf, -np.inf], np.nan)

# feature interaction between mean credit amount of previous credits with current credit
data.loc[:, 'ratio_curr_prev_credit'] = data.AMT_CREDIT / data.mean_AMT_CREDIT_SUM_mean

#############################################################################################################
#                           BUREAU and BUREAU BALANCE                                                       #
#############################################################################################################

prev_bal = bureau.loc[:, ['SK_ID_CURR', 'SK_ID_BUREAU']].merge(bureau_bal,
                                                   on='SK_ID_BUREAU',
                                                   how='left'
                                                  )

mean_status                      = prev_bal.groupby('SK_ID_BUREAU')['STATUS'].mean().fillna(-1)
bureau.loc[:, 'mean_status'] = bureau.SK_ID_BUREAU.map(mean_status).values

mean_status                = bureau.groupby('SK_ID_CURR')['mean_status'].mean()
data.loc[:, 'mean_status'] = data.SK_ID_CURR.map(mean_status).values

# previous loans history
credit_history                = prev_bal.groupby('SK_ID_CURR').size().fillna(0)
data.loc[:, 'credit_history'] = data.SK_ID_CURR.map(credit_history).values 

#############################################################################################################
#                          PREVIOUS APPLICATION                                                             #
#############################################################################################################

# number of previous applications
num_prev_apps                = prev_app.groupby('SK_ID_CURR').size()
data.loc[:, 'num_prev_apps'] = data.SK_ID_CURR.map(num_prev_apps).fillna(0).astype(np.int8) 

# mean amount to be paid annually for previous applications
prev_app_mean_annuity        = prev_app.groupby('SK_ID_CURR')['AMT_ANNUITY'].mean().map(lambda x: np.log(x + 1))
prev_app_mean_annuity        = data.SK_ID_CURR.map(prev_app_mean_annuity)

# ratio of previous annuity to current annuity
data.loc[:, 'ratio_prev_curr_annuity'] = (prev_app_mean_annuity / data.AMT_ANNUITY).replace([np.inf, -np.inf], np.nan)
data.loc[:, 'diff_prev_curr_annuity']  = (prev_app_mean_annuity - data.AMT_ANNUITY).replace([np.inf, -np.inf], np.nan)


# ratio of down payment amount to application amount sum
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl')):
    down_payment_to_application = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl'))
else:    
    down_payment_to_application = prev_app.groupby('SK_ID_CURR').apply(lambda x: (x['AMT_DOWN_PAYMENT'].fillna(0) / x['AMT_APPLICATION']).sum())
    down_payment_to_application.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_down_payment_to_application.pkl'))

data.loc[:, 'down_payment_to_application'] = data.SK_ID_CURR.map(down_payment_to_application)

# mean interest rate on down payments of previous applications
mean_down_payment_rate                = prev_app.groupby('SK_ID_CURR')['RATE_DOWN_PAYMENT'].mean()
data.loc[:, 'mean_down_payment_rate'] = data.SK_ID_CURR.map(mean_down_payment_rate)

# most frequent rejection reason
if os.path.exists(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl')):
    most_freq_rejection_reason = pd.read_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl'))
else:
    most_freq_rejection_reason = prev_app.groupby('SK_ID_CURR').apply(lambda x: x.CODE_REJECT_REASON.value_counts().index.values[0])
    most_freq_rejection_reason.to_pickle(os.path.join(basepath, f'data/processed/{DATASET_PREFIX}_most_freq_reject_reason.pkl'))

data.loc[:, 'most_freq_rejection_reason'] = data.SK_ID_CURR.map(most_freq_rejection_reason)

# median amount annuity
median_annuity                = prev_app.groupby('SK_ID_CURR')['AMT_ANNUITY'].median().map(lambda x: np.log(x + 1))
data.loc[:, 'median_annuity'] = data.SK_ID_CURR.map(median_annuity)

# mean of past annuity to credit applications
past_annuity_credit = (prev_app.AMT_ANNUITY / prev_app.AMT_CREDIT).replace([np.inf, -np.inf], np.nan)
past_annuity_credit = past_annuity_credit.groupby(prev_app.SK_ID_CURR).mean()
data.loc[:, 'past_annuity_to_credit'] = data.SK_ID_CURR.map(past_annuity_credit)

# difference between current credit and mean applied amount in previous applications
# mean_applied_amount = prev_app.groupby('SK_ID_CURR')['AMT_APPLICATION'].mean()
# mean_applied_amount = data.SK_ID_CURR.map(mean_applied_amount)
# data.loc[:, 'diff_applied_current_credit'] = mean_applied_amount - data.AMT_CREDIT

# difference of down_payment * rate and annuity
diff_dp_annuity = ((prev_app.AMT_DOWN_PAYMENT * prev_app.RATE_DOWN_PAYMENT) - prev_app.AMT_ANNUITY).replace([np.inf, -np.inf])
diff_dp_annuity = diff_dp_annuity.groupby(prev_app.SK_ID_CURR).sum()
data.loc[:, 'diff_dp_annuity'] = data.SK_ID_CURR.map(diff_dp_annuity)

# mean of decision on last application
mean_last_decision = prev_app.groupby('SK_ID_CURR')['DAYS_DECISION'].mean()
data.loc[:, 'mean_last_decision'] = data.SK_ID_CURR.map(mean_last_decision)

# mean of term of previous credit
mean_prev_credit = prev_app.groupby('SK_ID_CURR')['CNT_PAYMENT'].mean()
data.loc[:, 'mean_prev_credit'] = data.SK_ID_CURR.map(mean_prev_credit)

###############################################################################################################
#                                 POS CASH                                                                    #
###############################################################################################################
data, cif_cols = get_agg_features(data, pos_cash, 'CNT_INSTALMENT_FUTURE', 'SK_ID_CURR')

# mean of term of previous credits
mean_term = pos_cash.groupby('SK_ID_CURR')['CNT_INSTALMENT'].mean()
data.loc[:, 'mean_term'] = data.SK_ID_CURR.map(mean_term)


##############################################################################################################
#                                Credit Card Balance                                                         #
##############################################################################################################

# mean of amount balance during previous payments
mean_amt_balance = credit_bal.groupby('SK_ID_CURR')['AMT_BALANCE'].mean()
data.loc[:, 'mean_amt_balance'] = data.SK_ID_CURR.map(mean_amt_balance)

# mean of actual credit limit
mean_credit_limit = credit_bal.groupby('SK_ID_CURR')['AMT_CREDIT_LIMIT_ACTUAL'].mean()
data.loc[:, 'mean_credit_limit'] = data.SK_ID_CURR.map(mean_credit_limit)

# total paid installments on previous credit
total_paid_installments = credit_bal.groupby('SK_ID_CURR')['CNT_INSTALMENT_MATURE_CUM'].sum()
data.loc[:, 'total_paid_installments'] = data.SK_ID_CURR.map(total_paid_installments)

# mean total drawings
mean_total_drawings                = credit_bal.groupby('SK_ID_CURR')['AMT_DRAWINGS_CURRENT'].mean()
data.loc[:, 'mean_total_drawings'] = data.SK_ID_CURR.map(mean_total_drawings)

# sum of diff between balance and credit limit
diff_bal_credit   = credit_bal.AMT_BALANCE - credit_bal.AMT_CREDIT_LIMIT_ACTUAL
diff_bal_credit   = diff_bal_credit.groupby(credit_bal.SK_ID_CURR).sum()

data.loc[:, 'diff_bal_credit'] = data.SK_ID_CURR.map(diff_bal_credit)

# mean of ratio of balance and credit limit
ratio_bal_credit = credit_bal.AMT_BALANCE / credit_bal.AMT_CREDIT_LIMIT_ACTUAL
ratio_bal_credit = ratio_bal_credit.groupby(credit_bal.SK_ID_CURR).mean()

data.loc[:, 'ratio_bal_credit'] = data.SK_ID_CURR.map(ratio_bal_credit)

###############################################################################################################
#                                  Installment Payments                                                       #
###############################################################################################################

# mean installment
mean_installment                = installments.groupby('SK_ID_CURR')['AMT_INSTALMENT'].mean()
data.loc[:, 'mean_installment'] = data.SK_ID_CURR.map(mean_installment)

# mean payment against installment
data, ap_cols               = get_agg_features(data, installments, 'AMT_PAYMENT', 'SK_ID_CURR')


# delete all intermediatory variables
del prev_num_loans, num_active_credits, bureau
del mean_days_overdue, mean_days_credit_end
del mean_max_amt_overdue, sum_num_times_prolonged
del mean_cb_credit_annuity, std_cb_credit_annuity
del num_diff_credits, mean_days_credit_update
del mean_status, prev_bal, credit_history
del num_prev_apps
del prev_app, down_payment_to_application
del mean_down_payment_rate, most_freq_rejection_reason
del median_annuity
del latest_credit, credit_duration
del mean_total_amt_overdue, credit_times_duration
del past_annuity_credit, mean_last_decision
del credit_bal, mean_amt_balance, mean_credit_limit
del total_paid_installments, mean_total_drawings
del diff_bal_credit, diff_prev_curr_credit
del diff1, diff2, diff, num_nulls_enddate
del ratio_debt_total

gc.collect();

CPU times: user 1min 4s, sys: 13.9 s, total: 1min 18s
Wall time: 1min 19s


In [7]:
# replace feature values with frequency less 20 with -100
for f in data.select_dtypes(include=['int8']).columns:
    if data[f].nunique() > 10:        
        low_freq_values = data[f].value_counts()
        low_freq_values = low_freq_values[low_freq_values < 20].index.values
        
        if len(low_freq_values) > 0:
            print('Feature: {}'.format(f))
            data.loc[data[f].isin(low_freq_values), f] = -100

Feature: CNT_CHILDREN
Feature: num_nulls_enddate
Feature: num_prev_apps


In [8]:
(data.isnull().sum() / len(data)).sort_values(ascending=False)

ratio_bal_credit               0.712010
total_paid_installments        0.709315
diff_bal_credit                0.709315
mean_total_drawings            0.709315
mean_credit_limit              0.709315
mean_amt_balance               0.709315
COMMONAREA_MODE                0.697141
COMMONAREA_AVG                 0.697141
COMMONAREA_MEDI                0.697141
NONLIVINGAPARTMENTS_AVG        0.692933
NONLIVINGAPARTMENTS_MEDI       0.692933
NONLIVINGAPARTMENTS_MODE       0.692933
LIVINGAPARTMENTS_MODE          0.682037
LIVINGAPARTMENTS_MEDI          0.682037
LIVINGAPARTMENTS_AVG           0.682037
FLOORSMIN_MEDI                 0.676785
FLOORSMIN_MODE                 0.676785
FLOORSMIN_AVG                  0.676785
YEARS_BUILD_AVG                0.663306
YEARS_BUILD_MODE               0.663306
YEARS_BUILD_MEDI               0.663306
ratio_car_person_age           0.660316
OWN_CAR_AGE                    0.660316
EXT_3_1                        0.625757
LANDAREA_MODE                  0.591835


In [9]:
# unpack to train and test
tr = data.iloc[:ntrain]
te = data.iloc[ntrain:]

del data
gc.collect();

## Modelling

In [10]:
COLS_TO_REMOVE = [
                  'FLAG_DOCUMENT_21',
                  'FLAG_DOCUMENT_4',
                  'FLAG_MOBIL',
                  'FLAG_DOCUMENT_2',
                  'FLAG_DOCUMENT_20',
                  'FLAG_DOCUMENT_9',
                  'FLAG_DOCUMENT_17',
                  'FLAG_DOCUMENT_19',
                  'FLAG_DOCUMENT_5',
                  'FLAG_CONT_MOBILE',
                  'FLAG_DOCUMENT_10',
                  'HOUSETYPE_MODE',
                  'FLAG_DOCUMENT_12',
                  'FLAG_DOCUMENT_7',
                  'FLAG_DOCUMENT_11',
                  'AMT_REQ_CREDIT_BUREAU_HOUR',
                  'LIVE_REGION_NOT_WORK_REGION',
                  'AMT_REQ_CREDIT_BUREAU_DAY',
                  'FLAG_DOCUMENT_15',
                  'EMERGENCYSTATE_MODE',
                  'REG_REGION_NOT_LIVE_REGION',
                  'SK_ID_CURR',
                  'FLAG_EMP_PHONE',
                  'TARGET'
                ]

In [11]:
features = [f for f in tr.columns if f not in COLS_TO_REMOVE]

Xtr  = tr.loc[:, features]
ytr  = tr.loc[:, 'TARGET']

Xval = te.loc[:, features]
# yval = te.loc[:, 'TARGET'] # only execute during validation phase

del tr, te
gc.collect();

In [12]:
print('Number of features used in the model are: {}'.format(len(features)))

Number of features used in the model are: 195


### Validation

In [17]:
params = {
    'objective': 'binary',
    'learning_rate': .02,
    'metric': 'auc',
    'min_data_in_leaf': 100,
    'num_leaves': 31,
    'feature_fraction': .7,
    'bagging_fraction': .8,
    'min_child_weight': 1.,
    'nthread': 4
}

ltrain = lgb.Dataset(Xtr, ytr, feature_name=features)
lval   = lgb.Dataset(Xval, yval, feature_name=features)

valid_sets  = [ltrain, lval]
valid_names = ['train', 'val']

num_boost_round       = 5000
early_stopping_rounds = 100

m = lgb.train(params, 
              ltrain, 
              num_boost_round, 
              valid_sets=valid_sets, 
              valid_names=valid_names, 
              early_stopping_rounds=early_stopping_rounds, 
              verbose_eval=20)

Training until validation scores don't improve for 100 rounds.
[20]	train's auc: 0.736105	val's auc: 0.726569
[40]	train's auc: 0.741578	val's auc: 0.730961
[60]	train's auc: 0.745228	val's auc: 0.733655
[80]	train's auc: 0.749601	val's auc: 0.737392
[100]	train's auc: 0.754208	val's auc: 0.74156
[120]	train's auc: 0.758728	val's auc: 0.745401
[140]	train's auc: 0.763383	val's auc: 0.749105
[160]	train's auc: 0.768861	val's auc: 0.753303
[180]	train's auc: 0.773935	val's auc: 0.757013
[200]	train's auc: 0.778895	val's auc: 0.760738
[220]	train's auc: 0.783532	val's auc: 0.764093
[240]	train's auc: 0.787907	val's auc: 0.767188
[260]	train's auc: 0.791499	val's auc: 0.769318
[280]	train's auc: 0.794956	val's auc: 0.771219
[300]	train's auc: 0.797996	val's auc: 0.772752
[320]	train's auc: 0.800938	val's auc: 0.774208
[340]	train's auc: 0.803682	val's auc: 0.775439
[360]	train's auc: 0.806494	val's auc: 0.776464
[380]	train's auc: 0.809054	val's auc: 0.777263
[400]	train's auc: 0.811648	va

`[1392]	train's auc: 0.892417	val's auc: 0.784025`

### Feature Importance

In [18]:
# feature importance df
feat_imp = pd.DataFrame({'features': features,
                         'imp': m.feature_importance()
                        })

feat_imp.sort_values(by='imp', ascending=False)

Unnamed: 0,features,imp
115,ratio_annuity_credit,891
99,EXT_SOURCE_SUM,862
194,sum_AMT_PAYMENT_sum,794
34,EXT_SOURCE_2,772
193,var_AMT_PAYMENT_amin,766
146,ratio_debt_total,736
20,DAYS_BIRTH,695
35,EXT_SOURCE_3,669
113,ratio_goods_credit,553
121,ratio_annuity_score_3,531


### OOF Predictions

In [19]:
%%time

HOLDOUT_SCORE  = 0.784025

oof_preds = m.predict(Xval)
joblib.dump(oof_preds, os.path.join(basepath, f'data/oof_sub/{MODEL_PRESET}_{HOLDOUT_SCORE}_preds.pkl'))

CPU times: user 12.9 s, sys: 140 ms, total: 13 s
Wall time: 4.09 s


### Full Training

In [13]:
%%time

params = {
    'objective': 'binary',
    'learning_rate': (.02 / 1.2),
    'metric': 'auc',
    'min_data_in_leaf': 100,
    'num_leaves': 31,
    'feature_fraction': .7,
    'bagging_fraction': .8,
    'min_child_weight': 1.,
    'nthread': 4
}

BEST_ITERATION  = 1392

num_boost_round = int(BEST_ITERATION * 1.2)
ltrain          = lgb.Dataset(Xtr, ytr, feature_name=features)

m           = lgb.train(params, ltrain, num_boost_round)
final_preds = m.predict(Xval)

CPU times: user 22min 33s, sys: 5.99 s, total: 22min 39s
Wall time: 7min 29s


In [14]:
HOLDOUT_SCORE  = 0.784025 

sub_identifier = "%s-%s-%.5f" % (datetime.now().strftime('%Y%m%d-%H%M'), MODEL_PRESET, HOLDOUT_SCORE)

sub           = pd.read_csv('../data/raw/sample_submission.csv.zip')
sub['TARGET'] = final_preds

sub.to_csv(os.path.join(basepath, 'submissions/%s.csv'%(sub_identifier)), index=False)