In [1]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
from functools import partial

%matplotlib inline
import seaborn as sns
import category_encoders as ce

In [2]:
#DIR = '/PATH/TO/YOUR/DATA'
DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'
description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')
application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))
previous_application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/previous_application.csv'))

# Solution 6
### Hand crafted features

In [3]:
features = pd.DataFrame({'SK_ID_CURR': application['SK_ID_CURR']})

common_columns = [col for col in application.columns if col in previous_application.columns]
application_common = application[common_columns]
merged_tables = previous_application[common_columns + ['DAYS_DECISION']].merge(application_common, on='SK_ID_CURR',
                                                                               how='right')
merged_tables.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE_x,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,NAME_TYPE_SUITE_x,WEEKDAY_APPR_PROCESS_START_x,HOUR_APPR_PROCESS_START_x,DAYS_DECISION,NAME_CONTRACT_TYPE_y,AMT_CREDIT_y,AMT_ANNUITY_y,AMT_GOODS_PRICE_y,NAME_TYPE_SUITE_y,WEEKDAY_APPR_PROCESS_START_y,HOUR_APPR_PROCESS_START_y
0,271877,Consumer loans,17145.0,1730.43,17145.0,,SATURDAY,15.0,-73.0,Cash loans,533668.5,25803.0,477000.0,Unaccompanied,TUESDAY,17
1,271877,Consumer loans,1754721.0,68258.655,1800000.0,,SATURDAY,18.0,-472.0,Cash loans,533668.5,25803.0,477000.0,Unaccompanied,TUESDAY,17
2,271877,Consumer loans,119848.5,12417.39,108400.5,,SUNDAY,14.0,-548.0,Cash loans,533668.5,25803.0,477000.0,Unaccompanied,TUESDAY,17
3,108129,Cash loans,679671.0,25188.615,607500.0,Unaccompanied,THURSDAY,11.0,-164.0,Revolving loans,135000.0,6750.0,135000.0,Family,SUNDAY,10
4,108129,Cash loans,512370.0,21709.125,450000.0,,WEDNESDAY,9.0,-515.0,Revolving loans,135000.0,6750.0,135000.0,Family,SUNDAY,10


In [4]:
merged_sorted = merged_tables.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])

In [5]:
merged_sorted['annuity_diff'] = merged_sorted['AMT_ANNUITY_y'] - merged_sorted['AMT_ANNUITY_x']
merged_sorted['annuity_ratio'] = merged_sorted['AMT_ANNUITY_y'] / merged_sorted['AMT_ANNUITY_x']
merged_sorted['credit_diff'] = merged_sorted['AMT_CREDIT_y'] - merged_sorted['AMT_CREDIT_x']
merged_sorted['credit_ratio'] = merged_sorted['AMT_CREDIT_y'] / merged_sorted['AMT_CREDIT_x']

In [6]:
merged_sorted['the_same_contract_type'] = (
    merged_sorted['NAME_CONTRACT_TYPE_x'] == merged_sorted['NAME_CONTRACT_TYPE_y']).astype(int)
merged_sorted['the_same_weekday'] = (merged_sorted['WEEKDAY_APPR_PROCESS_START_x'] == merged_sorted['WEEKDAY_APPR_PROCESS_START_y']).astype(int)
merged_sorted['hour_diff'] = merged_sorted['HOUR_APPR_PROCESS_START_x'] - merged_sorted['HOUR_APPR_PROCESS_START_y']
merged_sorted['the_same_type_suite'] = (merged_sorted['NAME_TYPE_SUITE_x'] == merged_sorted['NAME_TYPE_SUITE_y']
                                       ).astype(int)
merged_sorted['the_same_type_suite'][merged_sorted['NAME_TYPE_SUITE_x'].isnull()] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [7]:
def _get_last_k_applications_feature_name(feature_name, number, suffix):
    return 'application_previous_application_{}_last_{}_applications_{}'.format(feature_name, number, suffix)


def get_last_k_credits_features(merged_sorted, numbers_of_applications):
    features = pd.DataFrame({'SK_ID_CURR': merged_sorted['SK_ID_CURR'].unique()})
    feature_list = ['annuity_diff', 'annuity_ratio', 'credit_diff', 'credit_ratio', 'the_same_contract_type',
                        'the_same_type_suite', 'the_same_weekday', 'hour_diff']

    for number in numbers_of_applications:
        table_tail = merged_sorted.groupby('SK_ID_CURR').tail(number)
        tail_groupby = table_tail.groupby('SK_ID_CURR')
        g = tail_groupby[feature_list].agg('mean')

        g = g.rename(axis='columns', mapper=partial(_get_last_k_applications_feature_name, number=number,
                                        suffix='mean')).reset_index()

        features = features.merge(g, how='left', on=['SK_ID_CURR'])
    return features

In [8]:
g = get_last_k_credits_features(merged_sorted, numbers_of_applications=[1,3,5,10])
features = features.merge(g, on=['SK_ID_CURR'], how='left')

In [9]:
X = application.merge(features,
                                left_on=['SK_ID_CURR'],
                                right_on=['SK_ID_CURR'],
                                how='left',
                                validate='one_to_one')

In [10]:
engineered_numerical_columns = list(features.columns)
engineered_numerical_columns.remove('SK_ID_CURR')
X = X[engineered_numerical_columns + ['TARGET']]
X_corr = abs(X.corr())

In [11]:
X_corr.sort_values('TARGET', ascending=False)['TARGET']

TARGET                                                                               1.000000
application_previous_application_annuity_ratio_last_1_applications_mean              0.029945
application_previous_application_annuity_ratio_last_3_applications_mean              0.029445
application_previous_application_annuity_ratio_last_5_applications_mean              0.029344
application_previous_application_annuity_ratio_last_10_applications_mean             0.029287
application_previous_application_the_same_contract_type_last_10_applications_mean    0.024471
application_previous_application_the_same_contract_type_last_5_applications_mean     0.022874
application_previous_application_annuity_diff_last_1_applications_mean               0.022078
application_previous_application_credit_diff_last_10_applications_mean               0.021103
application_previous_application_the_same_type_suite_last_10_applications_mean       0.020787
application_previous_application_credit_diff_last_5_applicat