In [13]:
%load_ext autoreload
%autoreload 2

import os
import sys
import warnings

from functools import partial
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
%matplotlib inline
import seaborn as sns

sys.path.append('../')
from src.utils import parallel_apply
from src.feature_extraction import add_features, add_features_in_group

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported in this cell.
warnings.filterwarnings('ignore')

# Root directory that holds the competition files. Export HOME_CREDIT_DIR to point
# the notebook at another location without editing it; when the variable is not
# set, the original shared mount is used.
DIR = os.environ.get('HOME_CREDIT_DIR', '/mnt/ml-team/minerva/open-solutions/home-credit')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Column description file; it is read with latin1 encoding.
description = pd.read_csv(
    os.path.join(DIR, 'data/HomeCredit_columns_description.csv'),
    encoding='latin1',
)

# application_train.csv and installments_payments.csv from the unzipped data folder.
application = pd.read_csv(
    os.path.join(DIR, 'files/unzipped_data/application_train.csv')
)
installments = pd.read_csv(
    os.path.join(DIR, 'files/unzipped_data/installments_payments.csv')
)

In [3]:
# Preview the first rows of the installments table loaded from installments_payments.csv.
installments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


# Preprocessing
## Solution 3

# Feature Engineering
## Solution 3

## Aggregations

In [None]:
# One recipe per grouping key: (group-by columns, [(column, aggregation), ...]).
# The pairs are ordered aggregation-major: every column for 'mean', then every
# column for 'min', and so on.
INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [
    (
        ['SK_ID_CURR'],
        [
            (column, method)
            for method in ['mean', 'min', 'max', 'sum', 'var']
            for column in [
                'AMT_INSTALMENT',
                'AMT_PAYMENT',
                'DAYS_ENTRY_PAYMENT',
                'DAYS_INSTALMENT',
                'NUM_INSTALMENT_NUMBER',
                'NUM_INSTALMENT_VERSION',
            ]
        ],
    ),
]

In [None]:
# Compute every (column, aggregation) recipe on `installments` and attach the
# results to `application` as new columns named '<group keys>_<agg>_<column>'.
#
# All aggregates that share a grouping key are assembled into one frame and merged
# once, instead of re-merging the whole `application` table for every single
# aggregate. Columns left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of producing '_x'/'_y' duplicates.
groupby_aggregate_names = []
for groupby_cols, specs in tqdm(INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES):
    group_object = installments.groupby(groupby_cols)

    aggregated_columns = []
    group_feature_names = []
    for select, agg in tqdm(specs):
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        # Series indexed by the group keys, renamed to the final feature name.
        aggregated_columns.append(group_object[select].agg(agg).rename(groupby_aggregate_name))
        group_feature_names.append(groupby_aggregate_name)

    if aggregated_columns:  # pd.concat raises on an empty list
        aggregated = pd.concat(aggregated_columns, axis=1).reset_index()
        application = (application
                       .drop(columns=group_feature_names, errors='ignore')
                       .merge(aggregated, on=groupby_cols, how='left'))

    groupby_aggregate_names.extend(group_feature_names)

In [None]:
# Preview `application`, which now carries the merged aggregate columns.
application.head()

In [None]:
# Keep only the aggregate features and the label, then take the absolute value
# of their pairwise correlations.
application_agg = application[[*groupby_aggregate_names, 'TARGET']]
application_agg_corr = application_agg.corr().abs()

In [None]:
# Absolute correlation of each aggregate with TARGET, strongest first
# (TARGET itself appears in the listing because it is a row of the matrix).
application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']

# Solution 4

In [None]:
# SK_ID_CURR values of the applications whose TARGET equals 1; show the first four.
positive_ID = application.loc[application['TARGET'] == 1, 'SK_ID_CURR'].tolist()
positive_ID[:4]

In [None]:
# Number of installment rows per SK_ID_CURR, restricted to the TARGET == 1 applicants.
value_counts = (
    installments
    .loc[installments['SK_ID_CURR'].isin(positive_ID), 'SK_ID_CURR']
    .value_counts()
)

In [None]:
# The applicants with the most installment rows (value_counts sorts descending by default).
value_counts.head()

In [None]:
# Distribution of the per-applicant installment row counts held in `value_counts`.
# NOTE(review): `distplot` is deprecated in newer seaborn releases — confirm the
# installed version still provides it before upgrading.
sns.distplot(value_counts)

In [None]:
# Payment history of one hand-picked applicant (SK_ID_CURR 328162) for manual inspection.
installments_one = installments.loc[installments['SK_ID_CURR'] == 328162]

In [None]:
# The ten rows of that applicant with the largest DAYS_INSTALMENT values.
installments_one.sort_values(['DAYS_INSTALMENT'],ascending=False).head(10)

In [4]:
# Work on a random subset of the installment rows to keep iteration fast.
# The seed is fixed so that Restart & Run All reproduces the same subset; the
# size is capped at the table length so short inputs do not make `sample` raise.
# (Alternative subset used during exploration: the rows of the first 100 IDs in
# `positive_ID`.)
INSTALLMENTS_SAMPLE_SIZE = 20000
INSTALLMENTS_SAMPLE_SEED = 0
installments_ = installments.sample(min(INSTALLMENTS_SAMPLE_SIZE, len(installments)),
                                    random_state=INSTALLMENTS_SAMPLE_SEED)

# Positive when the payment was entered after the scheduled installment day.
installments_['instalment_paid_late_in_days'] = (installments_['DAYS_ENTRY_PAYMENT']
                                                 - installments_['DAYS_INSTALMENT'])
# NOTE(review): a NaN difference compares False here, so rows with a missing
# DAYS_ENTRY_PAYMENT or DAYS_INSTALMENT are flagged 0 (not late) — confirm this is intended.
installments_['instalment_paid_late'] = (installments_['instalment_paid_late_in_days'] > 0).astype(int)

# Positive when more was paid than the installment amount.
installments_['instalment_paid_over_amount'] = installments_['AMT_PAYMENT'] - installments_['AMT_INSTALMENT']
installments_['instalment_paid_over'] = (installments_['instalment_paid_over_amount'] > 0).astype(int)

In [16]:
# One row per distinct SK_ID_CURR, in order of first appearance; the feature
# cells below add columns to this frame.
features = installments_[['SK_ID_CURR']].drop_duplicates().reset_index(drop=True)
# Shared group-by object consumed by the feature helpers.
groupby = installments_.groupby(['SK_ID_CURR'])

## Per-ID aggregations

In [6]:
feature_names = []

# Each entry is (source column, aggregations); the entries are handed to
# `add_features` one after another, in this order.
for column, aggregations in [
    ('instalment_paid_late_in_days', ['sum', 'mean', 'max', 'min', 'std']),
    ('instalment_paid_late', ['sum', 'mean']),
    ('instalment_paid_over_amount', ['sum', 'mean', 'max', 'min', 'std']),
    ('instalment_paid_over', ['sum', 'mean']),
]:
    features, feature_names = add_features(column, aggregations,
                                           features, feature_names, groupby)

display(features.head())

Unnamed: 0,SK_ID_CURR,instalment_paid_late_in_days_sum,instalment_paid_late_in_days_mean,instalment_paid_late_in_days_max,instalment_paid_late_in_days_min,instalment_paid_late_in_days_std,instalment_paid_late_sum,instalment_paid_late_mean,instalment_paid_over_amount_sum,instalment_paid_over_amount_mean,instalment_paid_over_amount_max,instalment_paid_over_amount_min,instalment_paid_over_amount_std,instalment_paid_over_sum,instalment_paid_over_mean
0,116269,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0.0,0.0,,0,0.0
1,379729,-4.0,-4.0,-4.0,-4.0,,0,0.0,0.0,0.0,0.0,0.0,,0,0.0
2,414067,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,196930,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0.0,0.0,,0,0.0
4,448398,-13.0,-13.0,-13.0,-13.0,,0,0.0,0.0,0.0,0.0,0.0,,0,0.0


## Per-ID information from the last k installments

In [7]:
def last_k_instalment_features(gr, periods):
    """Collect aggregate features over the first `period` rows of one group.

    The group is ordered by DAYS_INSTALMENT, largest value first, and for every
    entry of `periods` the leading `period` rows are passed to
    `add_features_in_group` together with the prefix 'last_<period>_'.

    Args:
        gr: DataFrame of one group; must contain DAYS_INSTALMENT and the four
            hand-crafted instalment columns listed below.
        periods: iterable of window sizes (number of leading rows to keep).

    Returns:
        The dict built up by the successive `add_features_in_group` calls.
    """
    # sort_values returns a new frame, so the caller's group is left untouched.
    ordered = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)

    # NOTE(review): the two flag columns use 'count' here, whereas the per-id
    # aggregation cell uses 'sum' — confirm which one is intended.
    aggregation_plan = (
        ('instalment_paid_late_in_days', ['sum', 'mean', 'max', 'min', 'std']),
        ('instalment_paid_late', ['count', 'mean']),
        ('instalment_paid_over_amount', ['sum', 'mean', 'max', 'min', 'std']),
        ('instalment_paid_over', ['count', 'mean']),
    )

    features = {}
    for period in periods:
        window = ordered.iloc[:period]
        prefix = 'last_{}_'.format(period)
        for column, aggregations in aggregation_plan:
            # A fresh list per call, as the helper may keep or modify it.
            features = add_features_in_group(features, window, column,
                                             list(aggregations), prefix)

    return features

In [8]:
# Bind the window sizes so the helper can be called with a single group argument.
func = partial(last_k_instalment_features, periods=[1,5,10,20,50,100])

# Run `func` over the groups through the project helper `parallel_apply`, then
# turn the resulting index back into a column so it can be used as a merge key.
g = parallel_apply(groupby, func, index_name='SK_ID_CURR',
                   num_workers=16, chunk_size=10000).reset_index()
# NOTE(review): `features` is overwritten with the merged frame, so running this
# cell a second time merges the same columns again.
features = features.merge(g, on='SK_ID_CURR', how='left')

display(features.head())

100%|██████████| 2/2.0 [00:58<00:00, 29.37s/it]


Unnamed: 0,SK_ID_CURR,instalment_paid_late_in_days_sum,instalment_paid_late_in_days_mean,instalment_paid_late_in_days_max,instalment_paid_late_in_days_min,instalment_paid_late_in_days_std,instalment_paid_late_sum,instalment_paid_late_mean,instalment_paid_over_amount_sum,instalment_paid_over_amount_mean,...,last_5_instalment_paid_late_in_days_std,last_5_instalment_paid_late_in_days_sum,last_5_instalment_paid_late_mean,last_5_instalment_paid_over_amount_max,last_5_instalment_paid_over_amount_mean,last_5_instalment_paid_over_amount_min,last_5_instalment_paid_over_amount_std,last_5_instalment_paid_over_amount_sum,last_5_instalment_paid_over_count,last_5_instalment_paid_over_mean
0,116269,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,0.0,,0.0,1,0.0
1,379729,-4.0,-4.0,-4.0,-4.0,,0,0.0,0.0,0.0,...,,-4.0,0.0,0.0,0.0,0.0,,0.0,1,0.0
2,414067,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
3,196930,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,0.0,,0.0,1,0.0
4,448398,-13.0,-13.0,-13.0,-13.0,,0,0.0,0.0,0.0,...,,-13.0,0.0,0.0,0.0,0.0,,0.0,1,0.0


## Per-ID dynamics (trend over the last k installments)

In [19]:
def trend_in_last_k_instalment_features(gr, periods):
    """Fit a linear trend to the leading rows of one group, per window size.

    The group is ordered by DAYS_INSTALMENT, largest value first. For each
    entry of `periods` the leading `period` rows are kept and the slope of a
    degree-1 fit is stored for 'instalment_paid_late_in_days' and
    'instalment_paid_over_amount'.

    Args:
        gr: DataFrame of one group; must contain DAYS_INSTALMENT and the two
            columns named above. It is not modified.
        periods: iterable of window sizes (number of leading rows to keep).
            A window larger than the group simply selects the whole group.

    Returns:
        dict mapping '<period>_period_trend_<column>' to the fitted slope
        (NaN when fewer than two finite observations are available).
    """
    # sort_values returns a new frame, so the caller's group is left untouched.
    gr_ = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)

    features = {}
    for period in periods:
        gr_period = gr_.iloc[:period]
        prefix = '{}_period_trend_'.format(period)
        for feature_name in ('instalment_paid_late_in_days',
                             'instalment_paid_over_amount'):
            features = _add_trend_feature(features, gr_period, feature_name, prefix)
    return features


def _add_trend_feature(features, gr, feature_name, prefix):
    """Store the slope of `feature_name` against row position under `prefix`.

    Row positions 0..n-1 follow the order of `gr`, so with the descending sort
    applied by the caller position 0 is the row with the largest
    DAYS_INSTALMENT. Non-finite observations are skipped but keep their
    position. A line cannot be determined from fewer than two points, so NaN
    is stored in that case instead of calling `np.polyfit` on a
    rank-deficient problem.

    Args:
        features: dict to update in place.
        gr: DataFrame holding the rows to fit.
        feature_name: column whose trend is computed.
        prefix: string prepended to `feature_name` to form the dict key.

    Returns:
        The same `features` dict, with one key added or overwritten.
    """
    y = np.asarray(gr[feature_name].values, dtype=float)
    x = np.arange(len(y))
    valid = np.isfinite(y)

    if valid.sum() < 2:
        slope = np.nan
    else:
        # polyfit returns coefficients highest degree first: [slope, intercept].
        slope = np.polyfit(x[valid], y[valid], 1)[0]

    features['{}{}'.format(prefix, feature_name)] = slope
    return features

In [20]:
# Bind the window sizes so the helper can be called with a single group argument.
# NOTE(review): a window larger than a group selects the whole group
# (`iloc[:period]`), so for small groups all four windows give the same slope.
func = partial(trend_in_last_k_instalment_features, periods=[10,50,100,500])

# Run `func` over the groups through the project helper `parallel_apply`, then
# turn the resulting index back into a column so it can be used as a merge key.
g = parallel_apply(groupby, func, index_name='SK_ID_CURR',
                   num_workers=16, chunk_size=10000).reset_index()
# NOTE(review): `features` is overwritten with the merged frame, so running this
# cell a second time merges the same columns again.
features = features.merge(g, on='SK_ID_CURR', how='left')

display(features.head())

100%|██████████| 2/2.0 [00:23<00:00, 11.70s/it]


Unnamed: 0,SK_ID_CURR,100_period_trend_instalment_paid_late_in_days,100_period_trend_instalment_paid_over_amount,10_period_trend_instalment_paid_late_in_days,10_period_trend_instalment_paid_over_amount,500_period_trend_instalment_paid_late_in_days,500_period_trend_instalment_paid_over_amount,50_period_trend_instalment_paid_late_in_days,50_period_trend_instalment_paid_over_amount
0,116269,,,,,,,,
1,379729,,,,,,,,
2,414067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,196930,,,,,,,,
4,448398,,,,,,,,


In [None]:
# Debugging aid: bind the first (key, sub-frame) pair yielded by `groupby` to
# `k` and `gr` for interactive inspection, then stop iterating.
for k, gr in groupby:
    break

In [32]:
# Left-join the engineered features onto the applications, then keep only those
# feature columns plus the label.
X = (
    application
    .merge(features, how='left', on='SK_ID_CURR')
    [[*features.columns.drop('SK_ID_CURR'), 'TARGET']]
)
X.head()

Unnamed: 0,100_period_trend_instalment_paid_late_in_days,100_period_trend_instalment_paid_over_amount,10_period_trend_instalment_paid_late_in_days,10_period_trend_instalment_paid_over_amount,500_period_trend_instalment_paid_late_in_days,500_period_trend_instalment_paid_over_amount,50_period_trend_instalment_paid_late_in_days,50_period_trend_instalment_paid_over_amount,TARGET
0,,,,,,,,,1
1,,,,,,,,,0
2,,,,,,,,,0
3,,,,,,,,,0
4,,,,,,,,,0


In [33]:
# Absolute pairwise correlations; list each feature's value against TARGET,
# strongest first.
X_corr = X.corr().abs()
X_corr.sort_values('TARGET', ascending=False)['TARGET']

TARGET                                           1.000000
100_period_trend_instalment_paid_late_in_days    0.058820
10_period_trend_instalment_paid_late_in_days     0.058820
500_period_trend_instalment_paid_late_in_days    0.058820
50_period_trend_instalment_paid_late_in_days     0.058820
100_period_trend_instalment_paid_over_amount     0.018065
10_period_trend_instalment_paid_over_amount      0.018065
500_period_trend_instalment_paid_over_amount     0.018065
50_period_trend_instalment_paid_over_amount      0.018065
Name: TARGET, dtype: float64