In [1]:
from copy import deepcopy
from functools import partial
from src import *
import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, iqr, skew
from sklearn.externals import joblib

In [2]:
# Load the installment payment records from the local ./input directory and
# preview the first rows. Later cells add derived columns to this frame in place.
installments = pd.read_csv('./input/installments_payments.csv')
installments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [3]:
# Window sizes (number of most recent installments) used for the trend features;
# passed to generate_features as `trend_periods`.
last_k_trend_periods = [10, 50, 100, 500]

# Window sizes used for the aggregate statistics; passed as `agg_periods`.
last_k_agg_periods = [1, 5, 10, 20, 50, 100]

# (short, long) window pairs for which short/long ratio features are derived;
# passed as `period_fractions`. Every value used here also appears in
# last_k_agg_periods.
last_k_agg_period_fractions = [(5, 20), (5, 50), (10, 50), (10, 100), (20, 100)]

# Worker count handed to parallel_apply.
num_workers = 4

In [4]:
def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    """Append aggregate statistics of one column to a feature dictionary.

    For every entry of ``aggs`` that names a supported statistic, the statistic
    is computed over ``gr_[feature_name]`` and stored in ``features`` under the
    key ``'<prefix><feature_name>_<name>'``. Entries that match no supported
    name are skipped without error.

    Supported names: ``sum``, ``mean``, ``max``, ``min``, ``std``, ``count``
    and ``median`` (methods of the selected column), and ``skew``, ``kurt``
    and ``iqr`` (the ``skew``, ``kurtosis`` and ``iqr`` functions imported from
    ``scipy.stats``).

    Args:
        features: dict that is updated in place and also returned.
        gr_: object indexable by ``feature_name``; the callers in this
            notebook pass DataFrame slices.
        feature_name: name of the column to aggregate.
        aggs: iterable of statistic names, handled in the given order.
        prefix: string prepended to every generated key.

    Returns:
        The same ``features`` object that was passed in.
    """
    # Ordered (name, reducer) pairs. Lambdas defer the lookup of the scipy
    # helpers until the matching statistic is actually requested.
    reducers = (
        ('sum', lambda col: col.sum()),
        ('mean', lambda col: col.mean()),
        ('max', lambda col: col.max()),
        ('min', lambda col: col.min()),
        ('std', lambda col: col.std()),
        ('count', lambda col: col.count()),
        ('skew', lambda col: skew(col)),
        ('kurt', lambda col: kurtosis(col)),
        ('iqr', lambda col: iqr(col)),
        ('median', lambda col: col.median()),
    )

    for requested in aggs:
        for name, reduce_column in reducers:
            if requested == name:
                key = '{}{}_{}'.format(prefix, feature_name, name)
                features[key] = reduce_column(gr_[feature_name])
                break

    return features

def add_trend_feature(features, gr, feature_name, prefix):
    """Store the linear-trend slope of one column in ``features``.

    A ``LinearRegression`` is fitted with the row position (0, 1, 2, ...) as
    the single predictor and ``gr[feature_name].values`` as the target. The
    first fitted coefficient (the slope) is stored under the key
    ``'<prefix><feature_name>'``. If building or fitting the model raises an
    ordinary exception, ``np.nan`` is stored instead.

    Args:
        features: dict that is updated in place and also returned.
        gr: object indexable by ``feature_name`` whose column exposes
            ``.values``; the callers in this notebook pass DataFrame slices.
        feature_name: name of the column whose trend is estimated.
        prefix: string prepended to ``feature_name`` to form the key.

    Returns:
        The same ``features`` object that was passed in.

    Raises:
        Whatever ``gr[feature_name]`` raises (the column lookup happens outside
        the guarded section), plus ``KeyboardInterrupt`` / ``SystemExit``,
        which are deliberately no longer swallowed.
    """
    y = gr[feature_name].values
    try:
        # Row position is the only predictor, so the slope is "change per row"
        # in whatever order the caller supplied the rows.
        x = np.arange(0, len(y)).reshape(-1, 1)
        # NOTE(review): LinearRegression is not imported in this notebook's
        # import cell; presumably it is provided by `from src import *`.
        # Confirm, because a missing name is caught below and would silently
        # turn every trend feature into NaN.
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except Exception:
        # Best effort for data problems; `Exception` rather than a bare
        # `except` so Ctrl-C and interpreter shutdown still get through.
        trend = np.nan
    features['{}{}'.format(prefix, feature_name)] = trend
    return features


def get_feature_names_by_period(features, period):
    """Return the sorted keys of ``features`` that contain ``'_<period>_'``.

    Args:
        features: mapping whose keys are feature-name strings.
        period: value formatted into the ``'_<period>_'`` marker.

    Returns:
        A new, alphabetically sorted list of the matching keys.
    """
    marker = '_{}_'.format(period)
    matching = [name for name in features.keys() if marker in name]
    matching.sort()
    return matching

def safe_div(a, b):
    """Divide ``a`` by ``b`` as floats, returning 0.0 when that is not possible.

    Both operands are converted with ``float()`` before dividing. Any ordinary
    exception raised by the conversion or the division (for example a zero
    divisor or a non-numeric operand) results in ``0.0``.

    Args:
        a: numerator, anything accepted by ``float()``.
        b: denominator, anything accepted by ``float()``.

    Returns:
        ``float(a) / float(b)``, or ``0.0`` on failure.
    """
    try:
        return float(a) / float(b)
    except Exception:
        # `Exception` instead of a bare `except` so KeyboardInterrupt and
        # SystemExit are not silently converted into a 0.0 result.
        return 0.0

In [5]:
def generate_features(gr, agg_periods, trend_periods, period_fractions):
    """Build the full feature vector for one group of installment rows.

    Four feature dictionaries are computed, in this order: whole-history
    aggregates, last-k aggregates with their short/long ratios, last-k trends,
    and aggregates restricted to the most recent loan. They are merged in that
    same order (a later dictionary wins on a duplicate key) and returned as a
    Series.

    Args:
        gr: the rows of one group, as handed over by the group-wise apply.
        agg_periods: window sizes for the last-k aggregates.
        trend_periods: window sizes for the last-k trends.
        period_fractions: (short, long) window pairs for the ratio features.

    Returns:
        ``pd.Series`` indexed by feature name.
    """
    whole_history = all_installment_features(gr)
    windowed = last_k_installment_features_with_fractions(gr,
                                                          agg_periods,
                                                          period_fractions)
    trends = trend_in_last_k_installment_features(gr, trend_periods)
    latest_loan = last_loan_features(gr)

    combined = {}
    for part in (whole_history, windowed, trends, latest_loan):
        combined.update(part)
    return pd.Series(combined)


def all_installment_features(gr):
    """Aggregate features computed over every row of the group.

    Delegates to ``last_k_installment_features`` with a single, very large
    period; that function treats any period above ``10e10`` as "use the whole
    history" and prefixes the resulting keys with ``all_installment_``.
    """
    whole_history_periods = [10e16]
    return last_k_installment_features(gr, periods=whole_history_periods)


def last_k_installment_features_with_fractions(gr, periods, period_fractions):
    """Last-k aggregate features plus short-window / long-window ratios.

    The aggregates for every window in ``periods`` are computed first. Then,
    for each ``(short, long)`` pair, the sorted feature names of the two
    windows are paired positionally and ``safe_div(short_value, long_value)``
    is stored under the short feature's name with ``'_<short>_'`` replaced by
    ``'_<short>by<long>_fraction_'``.

    Args:
        gr: the rows of one group.
        periods: window sizes forwarded to ``last_k_installment_features``.
        period_fractions: iterable of ``(short_period, long_period)`` pairs.

    Returns:
        dict holding both the aggregate and the ratio features.
    """
    features = last_k_installment_features(gr, periods)

    for short_period, long_period in period_fractions:
        short_chunk = '_{}_'.format(short_period)
        fraction_chunk = '_{}by{}_fraction_'.format(short_period, long_period)

        # Both name lists are materialized before any ratio key is inserted.
        name_pairs = zip(get_feature_names_by_period(features, short_period),
                         get_feature_names_by_period(features, long_period))

        for numerator_name, denominator_name in name_pairs:
            ratio = safe_div(features[numerator_name], features[denominator_name])
            features[numerator_name.replace(short_chunk, fraction_chunk)] = ratio
    return features

def last_k_installment_features(gr, periods):
    """Aggregate statistics over the first ``k`` rows for each ``k`` in ``periods``.

    Rows are ordered by ``DAYS_INSTALMENT`` descending before slicing. A period
    greater than ``10e10`` selects every row and uses the key prefix
    ``all_installment_``; any other period ``k`` selects the first ``k`` rows
    of that ordering and uses the prefix ``last_<k>_``.

    Args:
        gr: the rows of one group; left unmodified.
        periods: iterable of window sizes.

    Returns:
        dict mapping feature names to aggregate values.
    """
    full_stats = ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew', 'kurt', 'iqr']
    flag_stats = ['count', 'mean']
    # (column, statistics) in the order the features are emitted.
    column_plan = (
        ('NUM_INSTALMENT_VERSION', full_stats),
        ('installment_paid_late_in_days', full_stats),
        ('installment_paid_late', flag_stats),
        ('installment_paid_over_amount', full_stats),
        ('installment_paid_over', flag_stats),
    )

    ordered = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)

    features = {}
    for period in periods:
        if period > 10e10:
            label = 'all_installment_'
            window = ordered.copy()
        else:
            label = 'last_{}_'.format(period)
            window = ordered.iloc[:period]

        for column, stats in column_plan:
            features = add_features_in_group(features, window, column, stats, label)
    return features


def trend_in_last_k_installment_features(gr, periods):
    """Trend features over the first ``k`` rows for each ``k`` in ``periods``.

    Rows are ordered by ``DAYS_INSTALMENT`` descending, the first ``k`` rows
    are taken, and ``add_trend_feature`` is applied to
    ``installment_paid_late_in_days`` and ``installment_paid_over_amount``.
    Keys are prefixed with ``'<k>_period_trend_'``.

    Args:
        gr: the rows of one group; left unmodified.
        periods: iterable of window sizes.

    Returns:
        dict mapping trend feature names to their values.
    """
    trend_columns = ('installment_paid_late_in_days', 'installment_paid_over_amount')
    ordered = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)

    features = {}
    for period in periods:
        window = ordered.iloc[:period]
        key_prefix = '{}_period_trend_'.format(period)
        for column in trend_columns:
            features = add_trend_feature(features, window, column, key_prefix)
    return features


def last_loan_features(gr):
    """Aggregate features restricted to one ``SK_ID_PREV`` of the group.

    The ``SK_ID_PREV`` is taken from the first row after ordering by
    ``DAYS_INSTALMENT`` descending; only rows sharing that id are aggregated.
    All keys are prefixed with ``last_loan_``.

    Args:
        gr: the rows of one group; left unmodified. Must contain at least one
            row, since the first row is read unconditionally.

    Returns:
        dict mapping feature names to aggregate values.
    """
    ordered = gr.sort_values(['DAYS_INSTALMENT'], ascending=False)
    latest_prev_id = ordered['SK_ID_PREV'].iloc[0]
    loan_rows = ordered[ordered['SK_ID_PREV'] == latest_prev_id]

    numeric_stats = ['sum', 'mean', 'max', 'min', 'std']
    flag_stats = ['count', 'mean']
    # (column, statistics) in the order the features are emitted.
    column_plan = (
        ('installment_paid_late_in_days', numeric_stats),
        ('installment_paid_late', flag_stats),
        ('installment_paid_over_amount', numeric_stats),
        ('installment_paid_over', flag_stats),
    )

    features = {}
    for column, stats in column_plan:
        features = add_features_in_group(features, loan_rows, column, stats, 'last_loan_')
    return features

In [6]:
# Derive the per-row columns that the feature functions aggregate.
# Payment entry day minus scheduled installment day; the next line treats a
# positive difference as a late payment.
installments['installment_paid_late_in_days'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
# 1 when the difference is positive, otherwise 0 (a NaN difference compares as False, so it becomes 0).
installments['installment_paid_late'] = (installments['installment_paid_late_in_days'] > 0).astype(int)
# Amount paid minus amount scheduled; the next line treats a positive difference as an over-payment.
installments['installment_paid_over_amount'] = installments['AMT_PAYMENT'] - installments['AMT_INSTALMENT']
installments['installment_paid_over'] = (installments['installment_paid_over_amount'] > 0).astype(int)

# One row per distinct SK_ID_CURR; the computed features are left-joined onto it below.
features = pd.DataFrame({'SK_ID_CURR': installments['SK_ID_CURR'].unique()})
groupby = installments.groupby(['SK_ID_CURR'])

# Bind the window settings so that the callable only needs the group itself.
func = partial(generate_features,
               agg_periods=last_k_agg_periods,
               period_fractions=last_k_agg_period_fractions,
               trend_periods=last_k_trend_periods)
# parallel_apply is not defined in this notebook (presumably provided by `from src import *`).
# NOTE(review): it appears to apply `func` to every group using `num_workers` workers and to
# return a frame indexed by SK_ID_CURR — confirm against its definition.
g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=num_workers).reset_index()
features = features.merge(g, on='SK_ID_CURR', how='left')


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)
  interpolation=interpolation)





In [7]:
# Preview the first rows of the merged feature table.
features.head()

Unnamed: 0,SK_ID_CURR,all_installment_NUM_INSTALMENT_VERSION_sum,all_installment_NUM_INSTALMENT_VERSION_mean,all_installment_NUM_INSTALMENT_VERSION_max,all_installment_NUM_INSTALMENT_VERSION_min,all_installment_NUM_INSTALMENT_VERSION_std,all_installment_NUM_INSTALMENT_VERSION_median,all_installment_NUM_INSTALMENT_VERSION_skew,all_installment_NUM_INSTALMENT_VERSION_kurt,all_installment_NUM_INSTALMENT_VERSION_iqr,...,last_loan_installment_paid_late_in_days_std,last_loan_installment_paid_late_count,last_loan_installment_paid_late_mean,last_loan_installment_paid_over_amount_sum,last_loan_installment_paid_over_amount_mean,last_loan_installment_paid_over_amount_max,last_loan_installment_paid_over_amount_min,last_loan_installment_paid_over_amount_std,last_loan_installment_paid_over_count,last_loan_installment_paid_over_mean
0,161674,105.0,1.039604,2.0,1.0,0.196,1.0,4.72136,20.291237,0.0,...,11.350739,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
1,151639,57.0,0.360759,2.0,0.0,0.507502,0.0,0.871002,-0.58085,1.0,...,8.281976,29.0,0.172414,-78611.985,-2710.758103,0.0,-26067.465,7820.699128,29.0,0.0
2,193053,8.0,2.666667,3.0,2.0,0.57735,3.0,-0.707107,-1.5,0.5,...,15.69501,3.0,0.333333,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3,199697,30.0,1.111111,3.0,1.0,0.423659,1.0,3.818018,13.561224,0.0,...,23.782947,21.0,0.52381,-148257.9,-7059.9,0.0,-21174.3,10061.70889,21.0,0.0
4,167756,33.0,1.1,2.0,1.0,0.305129,1.0,2.666667,5.111111,0.0,...,14.985532,17.0,0.411765,-14341.59,-843.622941,0.0,-2389.68,853.505401,17.0,0.0


In [8]:
# Write the feature table to disk without the DataFrame index.
# NOTE(review): ./preprocessed_data is assumed to exist already — confirm, or create it before this cell runs.
features.to_csv('./preprocessed_data/preprocess_installment.csv',index=False)