In [None]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
from functools import partial

%matplotlib inline
import seaborn as sns
import category_encoders as ce

In [None]:
# Root directory of the dataset; replace the placeholder with a real path before running.
DIR = '/PATH/TO/YOUR/DATA'

# Column descriptions are read with latin1 encoding; the two tables use pandas defaults.
description = pd.read_csv(
    filepath_or_buffer=os.path.join(DIR, 'data/HomeCredit_columns_description.csv'),
    encoding='latin1',
)
application = pd.read_csv(
    filepath_or_buffer=os.path.join(DIR, 'files/unzipped_data/application_train.csv'),
)
previous_application = pd.read_csv(
    filepath_or_buffer=os.path.join(DIR, 'files/unzipped_data/previous_application.csv'),
)

# Solution 6
### Hand-crafted features

In [None]:
# Feature table keyed by the current application id; later cells merge new columns onto it.
features = pd.DataFrame({'SK_ID_CURR': application['SK_ID_CURR']})

# Columns present in both tables, kept in the order they appear in `application`.
common_columns = [
    column
    for column in application.columns
    if column in previous_application.columns
]
application_common = application[common_columns]

# Right join: every current application is kept, paired with each of its previous
# applications. Shared non-key columns receive pandas' default suffixes:
# `_x` for the previous application (left) and `_y` for the current one (right).
merged_tables = pd.merge(
    previous_application[[*common_columns, 'DAYS_DECISION']],
    application_common,
    how='right',
    on='SK_ID_CURR',
)
merged_tables.head()

In [None]:
# Order rows by applicant, then by DAYS_DECISION ascending, so that the last rows of
# each SK_ID_CURR group are the ones with the largest DAYS_DECISION values.
merged_sorted = merged_tables.sort_values(
    by=['SK_ID_CURR', 'DAYS_DECISION'],
    ascending=True,
)

In [None]:
# Current application (`_y`) versus previous application (`_x`) amounts:
# absolute difference and ratio, for annuity and for credit.
merged_sorted['annuity_diff'] = merged_sorted['AMT_ANNUITY_y'].sub(merged_sorted['AMT_ANNUITY_x'])
merged_sorted['annuity_ratio'] = merged_sorted['AMT_ANNUITY_y'].div(merged_sorted['AMT_ANNUITY_x'])
merged_sorted['credit_diff'] = merged_sorted['AMT_CREDIT_y'].sub(merged_sorted['AMT_CREDIT_x'])
merged_sorted['credit_ratio'] = merged_sorted['AMT_CREDIT_y'].div(merged_sorted['AMT_CREDIT_x'])

In [None]:
# Flags comparing each previous application (`_x` columns) with the current
# application (`_y` columns). Equality comparisons are stored as 0/1 integers;
# `hour_diff` is previous hour minus current hour.
merged_sorted['the_same_contract_type'] = (
    merged_sorted['NAME_CONTRACT_TYPE_x'] == merged_sorted['NAME_CONTRACT_TYPE_y']
).astype(int)
merged_sorted['the_same_weekday'] = (
    merged_sorted['WEEKDAY_APPR_PROCESS_START_x'] == merged_sorted['WEEKDAY_APPR_PROCESS_START_y']
).astype(int)
merged_sorted['hour_diff'] = (
    merged_sorted['HOUR_APPR_PROCESS_START_x'] - merged_sorted['HOUR_APPR_PROCESS_START_y']
)
merged_sorted['the_same_type_suite'] = (
    merged_sorted['NAME_TYPE_SUITE_x'] == merged_sorted['NAME_TYPE_SUITE_y']
).astype(int)

# A missing previous-application type suite is counted as a match.
# Assign through a single .loc call: the chained form `df[col][mask] = 1` writes to an
# intermediate object, raises SettingWithCopyWarning, and leaves the frame unchanged
# when pandas Copy-on-Write is enabled.
merged_sorted.loc[merged_sorted['NAME_TYPE_SUITE_x'].isnull(), 'the_same_type_suite'] = 1

In [None]:
def _get_last_k_applications_feature_name(feature_name, number, suffix):
    return 'application_previous_application_{}_last_{}_applications_{}'.format(feature_name, number, suffix)


def get_last_k_credits_features(merged_sorted, numbers_of_applications):
    """Average the comparison features over the last k rows of every applicant.

    For each k in `numbers_of_applications`, the last k rows of every `SK_ID_CURR`
    group are taken in the row order of `merged_sorted` (so the caller is responsible
    for sorting the frame beforehand), the comparison columns are averaged per
    applicant, and the resulting columns are renamed with
    `_get_last_k_applications_feature_name(..., number=k, suffix='mean')`.

    Args:
        merged_sorted: DataFrame containing `SK_ID_CURR` and the eight comparison
            columns listed in the function body.
        numbers_of_applications: iterable of window sizes k.

    Returns:
        DataFrame with one row per unique `SK_ID_CURR` and eight mean columns for
        every requested window size.
    """
    compared_columns = [
        'annuity_diff',
        'annuity_ratio',
        'credit_diff',
        'credit_ratio',
        'the_same_contract_type',
        'the_same_type_suite',
        'the_same_weekday',
        'hour_diff',
    ]

    result = pd.DataFrame({'SK_ID_CURR': merged_sorted['SK_ID_CURR'].unique()})
    # The grouping does not depend on the window size, so build it once.
    by_applicant = merged_sorted.groupby('SK_ID_CURR')

    for number in numbers_of_applications:
        to_feature_name = partial(
            _get_last_k_applications_feature_name, number=number, suffix='mean')
        window_means = (
            by_applicant.tail(number)
            .groupby('SK_ID_CURR')[compared_columns]
            .agg('mean')
            .rename(columns=to_feature_name)
            .reset_index()
        )
        result = result.merge(window_means, how='left', on=['SK_ID_CURR'])

    return result

In [None]:
# Mean comparison features over the last 1, 3, 5 and 10 previous applications.
g = get_last_k_credits_features(
    merged_sorted,
    numbers_of_applications=[1, 3, 5, 10],
)
# NOTE: `features` is rebound to the merged result, so re-running this cell merges `g`
# a second time and yields suffixed duplicate columns. Re-run the cell that creates
# `features` first.
features = pd.merge(features, g, how='left', on=['SK_ID_CURR'])

In [None]:
# Attach the engineered features to the application table. `validate` makes pandas
# raise if SK_ID_CURR is not unique on either side of the join.
X = pd.merge(
    left=application,
    right=features,
    how='left',
    left_on=['SK_ID_CURR'],
    right_on=['SK_ID_CURR'],
    validate='one_to_one',
)

In [None]:
# Every engineered column, i.e. all of `features` except the join key.
engineered_numerical_columns = [
    column for column in features.columns if column != 'SK_ID_CURR'
]

# Keep only the engineered columns plus the label, then take absolute pairwise correlations.
X = X.loc[:, engineered_numerical_columns + ['TARGET']]
X_corr = abs(X.corr())

In [None]:
# Absolute correlation of every column with TARGET, strongest first.
X_corr.sort_values(by='TARGET', ascending=False).loc[:, 'TARGET']