In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one;
# the inspection cells below rely on this to show both `.shape` and `.head()`.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the zipped previous-application table and order it so that, for each
# SK_ID_CURR, the row with the highest SK_ID_PREV comes first. The `latest`
# aggregator defined later in this notebook picks the first row of each group,
# so it depends on this ordering.
PREV_APPLICATIONS_ZIP = './data/rawdata/previous_application.csv.zip'
prev = pd.read_csv(PREV_APPLICATIONS_ZIP, compression='zip').sort_values(
    by=['SK_ID_CURR', 'SK_ID_PREV'],
    ascending=[True, False],
)
# Constant helper column: one per application row.
prev['cnt'] = 1

In [3]:
# Sanity-check the loaded table: dimensions first, then the first few rows.
prev.shape
prev.head()

(1670214, 38)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,cnt
201668,1369693,100001,Consumer loans,3951.0,24835.5,23787.0,2520.0,24835.5,FRIDAY,13,...,8.0,high,POS mobile with interest,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0,1
892077,1038818,100002,Consumer loans,9251.775,179055.0,179055.0,0.0,179055.0,SATURDAY,9,...,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1
1021650,2636178,100003,Consumer loans,64567.665,337500.0,348637.5,0.0,337500.0,SUNDAY,17,...,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0,1
1223745,2396755,100003,Consumer loans,6737.31,68809.5,68053.5,6885.0,68809.5,SATURDAY,15,...,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0,1
575941,1810518,100003,Cash loans,98356.995,900000.0,1035882.0,,900000.0,FRIDAY,12,...,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0,1


In [4]:
# Maps each raw column of `prev` to the aggregation family applied to it:
#   amount_features      -> sum / mean / max / min per client
#   numerical_features   -> min / max / mean / median / latest per client
#   categorical_features -> collect_set / countd / latest on the stringified values
#   pivot_features       -> same as categorical, plus one-hot indicators per client
# Insertion order matters: it determines the column order of the derived lists
# below and therefore of the generated feature frames.
features_type = {
    # --- monetary amounts ---
    'AMT_ANNUITY': 'amount_features',
    'AMT_APPLICATION': 'amount_features',
    'AMT_CREDIT': 'amount_features',
    'AMT_GOODS_PRICE': 'amount_features',
    'AMT_DOWN_PAYMENT': 'amount_features',
    # --- categorical columns ---
    # NAME_CONTRACT_TYPE holds strings such as 'Consumer loans' / 'Cash loans',
    # so it cannot take part in numeric mean/median aggregation; it was
    # previously mis-tagged as 'numerical_features'.
    'NAME_CONTRACT_TYPE': 'categorical_features',
    # --- numeric columns ---
    'RATE_DOWN_PAYMENT': 'numerical_features',
    'RATE_INTEREST_PRIMARY': 'numerical_features',
    'RATE_INTEREST_PRIVILEGED': 'numerical_features',
    'DAYS_DECISION': 'numerical_features',
    'SELLERPLACE_AREA': 'numerical_features',
    'DAYS_FIRST_DRAWING': 'numerical_features',
    'DAYS_FIRST_DUE': 'numerical_features',
    'DAYS_LAST_DUE_1ST_VERSION': 'numerical_features',
    'DAYS_LAST_DUE': 'numerical_features',
    'DAYS_TERMINATION': 'numerical_features',
    # --- categorical columns (continued) ---
    'WEEKDAY_APPR_PROCESS_START': 'categorical_features',
    'HOUR_APPR_PROCESS_START': 'categorical_features',
    'NAME_CASH_LOAN_PURPOSE': 'categorical_features',
    'NAME_GOODS_CATEGORY': 'categorical_features',
    # --- categorical columns that are additionally one-hot encoded ---
    'NAME_CONTRACT_STATUS': 'pivot_features',
    'NAME_PAYMENT_TYPE': 'pivot_features',
    'CODE_REJECT_REASON': 'pivot_features',
    'NAME_TYPE_SUITE': 'pivot_features',
    'NAME_CLIENT_TYPE': 'pivot_features',
    'NAME_PORTFOLIO': 'pivot_features',
    'NAME_PRODUCT_TYPE': 'pivot_features',
    'CHANNEL_TYPE': 'pivot_features',
    'NAME_SELLER_INDUSTRY': 'pivot_features',
    'NAME_YIELD_GROUP': 'pivot_features',
    'PRODUCT_COMBINATION': 'pivot_features',
}

# One column list per aggregation family, in dictionary insertion order.
numerical_features = [name for name, family in features_type.items() if family == 'numerical_features']
categorical_features = [name for name, family in features_type.items() if family == 'categorical_features']
amount_features = [name for name, family in features_type.items() if family == 'amount_features']
pivot_features = [name for name, family in features_type.items() if family == 'pivot_features']


def latest(group):
    """Return the first element of ``group``.

    Used as a groupby aggregator. It yields the "latest" record only because
    `prev` is sorted by SK_ID_PREV descending within each SK_ID_CURR before
    grouping (assumes a higher SK_ID_PREV means a more recent application --
    TODO confirm).

    Parameters
    ----------
    group : iterable
        Values of one group (a pandas Series when called through ``agg``).

    Returns
    -------
    The first value produced by iterating ``group``; missing values are not
    skipped.

    Raises
    ------
    IndexError
        If ``group`` is empty.
    """
    # Stop after the first element instead of copying the whole group into a list.
    for value in group:
        return value
    raise IndexError('latest() received an empty group')

def countd(group):
    """Count the distinct values in ``group`` (a pandas Series).

    Missing values are not counted. Note that where a column is converted with
    ``astype(str)`` before aggregation, a missing value has already become a
    string and is counted like any other value.
    """
    distinct_total = group.nunique(dropna=True)
    return distinct_total

def collect_set(group):
    """Return the distinct tokens of ``group`` as one comma-separated string.

    Parameters
    ----------
    group : iterable of str
        String values of one group.

    Returns
    -------
    str
        The unique tokens joined with ``','`` in sorted order. Values that
        themselves contain a comma are split into separate tokens, because the
        values are joined and then split on ``','``.

    Notes
    -----
    Tokens are sorted so the result is reproducible: iterating a ``set`` of
    strings has no guaranteed order and can differ from one interpreter run to
    the next, which would make the generated feature value unstable.
    """
    tokens = ','.join(group).split(',')
    return ','.join(sorted(set(tokens)))

def cnt_positive(group):
    """Count the entries of ``group`` that are strictly greater than zero.

    Parameters
    ----------
    group : pandas.Series or numpy.ndarray
        Numeric values of one group.

    Returns
    -------
    int
        Number of elements ``> 0``. Missing values compare as not positive and
        are therefore not counted.
    """
    # Vectorised reduction: avoids looping over the boolean mask in Python and
    # also skips <NA> entries that nullable dtypes produce in the mask.
    is_positive = group > 0
    return int(is_positive.sum())

def cnt_negative(group):
    """Count the entries of ``group`` that are strictly less than zero.

    Parameters
    ----------
    group : pandas.Series or numpy.ndarray
        Numeric values of one group.

    Returns
    -------
    int
        Number of elements ``< 0``. Missing values compare as not negative and
        are therefore not counted.
    """
    # Vectorised reduction: avoids looping over the boolean mask in Python and
    # also skips <NA> entries that nullable dtypes produce in the mask.
    is_negative = group < 0
    return int(is_negative.sum())


In [5]:
# Per-client (SK_ID_CURR) aggregates of the raw columns, one family at a time.
client_key = prev.SK_ID_CURR

# Both string-valued families share the same set of aggregators.
label_aggregators = [collect_set, countd, latest]

numeric_stats = prev[numerical_features].groupby(client_key).agg(
    ['min', 'max', 'mean', 'median', latest]
)
# Values are stringified first so the string aggregators accept every column.
categorical_stats = prev[categorical_features].astype(str).groupby(client_key).agg(label_aggregators)
pivot_stats = prev[pivot_features].astype(str).groupby(client_key).agg(label_aggregators)
amount_stats = prev[amount_features].groupby(client_key).agg(['sum', 'mean', 'max', 'min'])

prev_source_features = pd.concat(
    [numeric_stats, categorical_stats, pivot_stats, amount_stats],
    axis=1,
)

# Flatten the (column, aggregator) header into single '<column>_<aggregator>' names.
prev_source_features.columns = list(map('_'.join, prev_source_features.columns))


In [6]:
# Hand-picked per-client (SK_ID_CURR) features that do not fit the generic families.
client_key = prev.SK_ID_CURR

special_parts = [
    # Number of previous applications per client.
    prev.SK_ID_PREV.groupby(client_key).count(),
    # How many down payments were below / above zero.
    prev.AMT_DOWN_PAYMENT.groupby(client_key).agg([cnt_negative, cnt_positive]),
    prev.HOUR_APPR_PROCESS_START.groupby(client_key).median(),
    # 1 only when the smallest flag in the group is 'Y', otherwise 0.
    prev.FLAG_LAST_APPL_PER_CONTRACT.groupby(client_key).agg(
        lambda flags: 1 if np.min(flags) == 'Y' else 0
    ),
    prev.NFLAG_LAST_APPL_IN_DAY.groupby(client_key).min(),
    prev.NFLAG_INSURED_ON_APPROVAL.groupby(client_key).min(),
]
prev_special_features = pd.concat(special_parts, axis=1)

# Names are assigned by position and must stay in the same order as `special_parts`
# (the AMT_DOWN_PAYMENT entry contributes two columns).
prev_special_features.columns = [
    'PREV_cnt',
    'AMT_DOWN_PAYMENT_cnt_negative',
    'AMT_DOWN_PAYMENT_cnt_positive',
    'HOUR_APPR_PROCESS_START_median',
    'FLAG_LAST_APPL_PER_CONTRACT_1',
    'NFLAG_LAST_APPL_IN_DAY_1',
    'NFLAG_INSURED_ON_APPROVAL',
]


In [7]:
# One-hot encode the pivot columns per application row, then collapse to one row
# per client: taking the max marks a category as present if any of the client's
# applications had it.
pivot_by_client = prev[[*pivot_features, 'SK_ID_CURR']].set_index('SK_ID_CURR')
pivot_indicators = pd.get_dummies(pivot_by_client).groupby(level=0).max()

# Join all feature families side by side on the client index, then turn
# SK_ID_CURR back into an ordinary column.
prev_features = pd.concat(
    [prev_special_features, pivot_indicators, prev_source_features],
    axis=1,
).reset_index()

In [8]:
# Sanity-check the assembled feature table: dimensions first, then the first few rows.
prev_features.shape
prev_features.head()

(338857, 200)

Unnamed: 0,SK_ID_CURR,PREV_cnt,AMT_DOWN_PAYMENT_cnt_negative,AMT_DOWN_PAYMENT_cnt_positive,HOUR_APPR_PROCESS_START_median,FLAG_LAST_APPL_PER_CONTRACT_1,NFLAG_LAST_APPL_IN_DAY_1,NFLAG_INSURED_ON_APPROVAL,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,...,AMT_CREDIT_max,AMT_CREDIT_min,AMT_GOODS_PRICE_sum,AMT_GOODS_PRICE_mean,AMT_GOODS_PRICE_max,AMT_GOODS_PRICE_min,AMT_DOWN_PAYMENT_sum,AMT_DOWN_PAYMENT_mean,AMT_DOWN_PAYMENT_max,AMT_DOWN_PAYMENT_min
0,100001,1,0.0,1.0,13.0,1,1,0.0,1,0,...,23787.0,23787.0,24835.5,24835.5,24835.5,24835.5,2520.0,2520.0,2520.0,2520.0
1,100002,1,0.0,0.0,9.0,1,1,0.0,1,0,...,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,0.0,0.0,0.0,0.0
2,100003,3,0.0,1.0,15.0,1,1,0.0,1,0,...,1035882.0,68053.5,1306309.5,435436.5,900000.0,68809.5,6885.0,3442.5,6885.0,0.0
3,100004,1,0.0,1.0,5.0,1,1,0.0,1,0,...,20106.0,20106.0,24282.0,24282.0,24282.0,24282.0,4860.0,4860.0,4860.0,4860.0
4,100005,2,0.0,1.0,10.5,1,1,0.0,1,1,...,40153.5,0.0,44617.5,44617.5,44617.5,44617.5,4464.0,4464.0,4464.0,4464.0


In [9]:
# Persist the per-client feature table; index=False leaves the row index out of the file.
prev_features.to_csv('./data/rawdata/only_prev_features.csv',index=False)
