In [1]:
from copy import deepcopy
from functools import partial
from src import parallel_apply
import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, iqr, skew
from sklearn.externals import joblib

In [2]:
last_k_trend_periods= [6, 12]
last_k_agg_periods= [6, 12, 30]
num_workers = 4

In [3]:
def generate_features(gr, agg_periods, trend_periods):
    one_time = one_time_features(gr)
    all = all_installment_features(gr)
    agg = last_k_installment_features(gr, agg_periods)
    trend = trend_in_last_k_installment_features(gr, trend_periods)
    last = last_loan_features(gr)
    features = {**one_time, **all, **agg, **trend, **last}
    return pd.Series(features)


def one_time_features(gr):
    gr_ = gr.copy()
    gr_.sort_values(['MONTHS_BALANCE'], inplace=True)
    features = {}

    features['pos_cash_remaining_installments'] = gr_['CNT_INSTALMENT_FUTURE'].tail(1)
    features['pos_cash_completed_contracts'] = gr_['is_contract_status_completed'].agg('sum')

    return features


def all_installment_features(gr):
    return last_k_installment_features(gr, periods=[10e16])


def last_k_installment_features(gr, periods):
    gr_ = gr.copy()
    gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

    features = {}
    for period in periods:
        if period > 10e10:
            period_name = 'all_installment_'
            gr_period = gr_.copy()
        else:
            period_name = 'last_{}_'.format(period)
            gr_period = gr_.iloc[:period]

        features = add_features_in_group(features, gr_period, 'pos_cash_paid_late',
                                         ['count', 'mean'],
                                         period_name)
        features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance',
                                         ['count', 'mean'],
                                         period_name)
        features = add_features_in_group(features, gr_period, 'SK_DPD',
                                         ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                         period_name)
        features = add_features_in_group(features, gr_period, 'SK_DPD_DEF',
                                         ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                         period_name)
    return features


def trend_in_last_k_installment_features(gr, periods):
    gr_ = gr.copy()
    gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

    features = {}
    for period in periods:
        gr_period = gr_.iloc[:period]

        features = add_trend_feature(features, gr_period,
                                     'SK_DPD', '{}_period_trend_'.format(period)
                                     )
        features = add_trend_feature(features, gr_period,
                                     'SK_DPD_DEF', '{}_period_trend_'.format(period)
                                     )
        features = add_trend_feature(features, gr_period,
                                     'CNT_INSTALMENT_FUTURE', '{}_period_trend_'.format(period)
                                     )
    return features


def last_loan_features(gr):
    gr_ = gr.copy()
    gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)
    last_installment_id = gr_['SK_ID_PREV'].iloc[0]
    gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id]

    features={}
    features = add_features_in_group(features, gr_, 'pos_cash_paid_late',
                                     ['count', 'sum', 'mean'],
                                     'last_loan_')
    features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance',
                                     ['mean'],
                                     'last_loan_')
    features = add_features_in_group(features, gr_, 'SK_DPD',
                                     ['sum', 'mean', 'max', 'std'],
                                     'last_loan_')
    features = add_features_in_group(features, gr_, 'SK_DPD_DEF',
                                     ['sum', 'mean', 'max', 'std'],
                                     'last_loan_')

    return features

In [4]:
def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    for agg in aggs:
        if agg == 'sum':
            features['{}{}_sum'.format(prefix, feature_name)] = gr_[feature_name].sum()
        elif agg == 'mean':
            features['{}{}_mean'.format(prefix, feature_name)] = gr_[feature_name].mean()
        elif agg == 'max':
            features['{}{}_max'.format(prefix, feature_name)] = gr_[feature_name].max()
        elif agg == 'min':
            features['{}{}_min'.format(prefix, feature_name)] = gr_[feature_name].min()
        elif agg == 'std':
            features['{}{}_std'.format(prefix, feature_name)] = gr_[feature_name].std()
        elif agg == 'count':
            features['{}{}_count'.format(prefix, feature_name)] = gr_[feature_name].count()
        elif agg == 'skew':
            features['{}{}_skew'.format(prefix, feature_name)] = skew(gr_[feature_name])
        elif agg == 'kurt':
            features['{}{}_kurt'.format(prefix, feature_name)] = kurtosis(gr_[feature_name])
        elif agg == 'iqr':
            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])
        elif agg == 'median':
            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()

    return features

def add_trend_feature(features, gr, feature_name, prefix):
    y = gr[feature_name].values
    try:
        x = np.arange(0, len(y)).reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except:
        trend = np.nan
    features['{}{}'.format(prefix, feature_name)] = trend
    return features


def get_feature_names_by_period(features, period):
    return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])


In [5]:
pos_cash = pd.read_csv('./input/POS_CASH_balance.csv')
pos_cash.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [6]:
pos_cash.loc[pos_cash['CNT_INSTALMENT_FUTURE'] > 60, 'CNT_INSTALMENT_FUTURE'] = np.nan

In [7]:
pos_cash['is_contract_status_completed'] = pos_cash['NAME_CONTRACT_STATUS'] == 'Completed'
pos_cash['pos_cash_paid_late'] = (pos_cash['SK_DPD'] > 0).astype(int)
pos_cash['pos_cash_paid_late_with_tolerance'] = (pos_cash['SK_DPD_DEF'] > 0).astype(int)

features = pd.DataFrame({'SK_ID_CURR': pos_cash['SK_ID_CURR'].unique()})
groupby = pos_cash.groupby(['SK_ID_CURR'])
func = partial(generate_features,
               agg_periods=last_k_agg_periods,
               trend_periods=last_k_trend_periods)
g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=num_workers).reset_index()
features = features.merge(g, on='SK_ID_CURR', how='left')

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [8]:
features.head()

Unnamed: 0,SK_ID_CURR,pos_cash_remaining_installments,pos_cash_completed_contracts,all_installment_pos_cash_paid_late_count,all_installment_pos_cash_paid_late_mean,all_installment_pos_cash_paid_late_with_tolerance_count,all_installment_pos_cash_paid_late_with_tolerance_mean,all_installment_SK_DPD_sum,all_installment_SK_DPD_mean,all_installment_SK_DPD_max,...,last_loan_pos_cash_paid_late_mean,last_loan_pos_cash_paid_late_with_tolerance_mean,last_loan_SK_DPD_sum,last_loan_SK_DPD_mean,last_loan_SK_DPD_max,last_loan_SK_DPD_std,last_loan_SK_DPD_DEF_sum,last_loan_SK_DPD_DEF_mean,last_loan_SK_DPD_DEF_max,last_loan_SK_DPD_DEF_std
0,182943,"1660128 15.0 Name: CNT_INSTALMENT_FUTURE, d...",1,43,0.0,43,0.0,0,0.0,0,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,367990,"5420722 0.0 Name: CNT_INSTALMENT_FUTURE, dt...",2,27,0.0,27,0.0,0,0.0,0,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,397406,"5123775 0.0 Name: CNT_INSTALMENT_FUTURE, dt...",6,109,0.174312,109,0.027523,4110,37.706422,485,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,269225,"2512526 39.0 Name: CNT_INSTALMENT_FUTURE, d...",5,114,0.0,114,0.0,0,0.0,0,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,334279,"35079 1.0 Name: CNT_INSTALMENT_FUTURE, dtyp...",4,126,0.0,126,0.0,0,0.0,0,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [9]:
features.to_csv('./preprocessed_data/preprocess_POSCashBalance.csv',index=False)