In [2]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import multiprocessing
from os.path import exists
import os
# Seed NumPy's global generator from the wall clock, so each kernel start
# produces a different random stream (results are not reproducible run-to-run).
np.random.seed(int(time.time()))
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

In [3]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool
        Forwarded as ``dummy_na``: when true, each encoded column also gets
        an indicator column for missing values.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that did not exist
        in the input.
    """
    columns_before = list(df.columns)
    object_columns = []
    for name in df.columns:
        # Only plain object columns are treated as categorical.
        if df[name].dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

def read_df(name):
    """Load the table called ``name`` from ``../input``, caching it as HDF5.

    If ``../input/<name>.h5`` exists it is read (HDF key ``str(name)``).
    Otherwise ``../input/<name>.csv`` is parsed and written to that HDF5
    path so later calls can skip the CSV parse.

    Parameters
    ----------
    name : str
        File stem, used both for the file names and as the HDF key.

    Returns
    -------
    pandas.DataFrame
    """
    hdf_path = '../input/%s.h5' %name
    if exists(hdf_path):
        df = pd.read_hdf(hdf_path, key=str(name))
    else:
        df = pd.read_csv('../input/%s.csv' %name)
        # Pass ``key`` by keyword: newer pandas releases deprecate passing
        # it positionally to ``to_hdf``.
        df.to_hdf(hdf_path, key=str(name))
    return df

def fillna_with_gaussian(df):
    """Replace missing numeric values with per-column Gaussian draws, in place.

    Each numeric column is handled independently: its NaNs are replaced by
    samples from a normal distribution whose mean and standard deviation are
    computed from that column's observed values. Draws come from NumPy's
    global random state.

    The previous implementation passed the per-column mean/std vectors
    together with ``size=<total NaN count>`` to ``np.random.normal``, which
    fails to broadcast for multi-column frames, and wrote through
    ``df.values``, which is not guaranteed to be a view of the frame.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to fill; modified in place. Non-numeric DataFrame columns are
        left untouched.

    Returns
    -------
    The same object that was passed in.
    """
    def _sample_missing(values):
        # Return (NaN mask, replacement draws); draws is None when there is
        # nothing to fill or no observed value to estimate the mean from.
        mask = values.isna()
        n_missing = int(mask.sum())
        if n_missing == 0:
            return mask, None
        mu = values.mean()
        if pd.isna(mu):
            return mask, None
        sigma = values.std()
        if pd.isna(sigma):
            # A single observed value has no sample std; fall back to the mean.
            sigma = 0.0
        return mask, np.random.normal(mu, sigma, size=n_missing)

    if isinstance(df, pd.Series):
        mask, draws = _sample_missing(df)
        if draws is not None:
            df[mask] = draws
        return df

    for col in df.select_dtypes(include=[np.number]).columns:
        mask, draws = _sample_missing(df[col])
        if draws is not None:
            df.loc[mask.values, col] = draws
    return df

def group_target_by_cols(df, target, cols, method='mean', residual = False):
    """Attach a group-level aggregate of ``target`` to every row of ``df``.

    ``target`` is aggregated with ``method`` within each combination of
    ``cols`` and merged back (left join on ``cols``) as a new column named
    ``<target>_BY_<col1>_<col2>...``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
    target : str
        Column to aggregate.
    cols : list of str
        Grouping columns.
    method : str or callable
        Aggregation passed to ``groupby(...).agg``.
    residual : bool
        When true, the new column holds ``target - group aggregate`` instead
        of the aggregate itself.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; the merge gives it a fresh index.
    """
    cols = list(cols)
    name_grouped_target = target+'_BY_'+'_'.join(cols)
    print ('name_grouped_target', name_grouped_target)
    # Notebook cells get re-run: if the aggregate column already exists,
    # merge would emit `_x`/`_y` duplicates (and the residual step would
    # raise KeyError). Drop the stale column so the call is repeatable.
    if name_grouped_target in df.columns:
        df = df.drop(columns=[name_grouped_target])
    grouped = (
        df.groupby(cols)[target]
        .agg(method)
        .rename(name_grouped_target)
        .reset_index()
    )
    df = df.merge(grouped, how='left', on=cols)
    if residual:
        df[name_grouped_target] = df[target] - df[name_grouped_target]
    return df


In [7]:
# Read data and merge
df = read_df('application_train')
test_df = read_df('application_test')
print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. reset_index() is kept without drop=True so the resulting
# 'index' column is still produced, exactly as before.
df = pd.concat([df, test_df]).reset_index()
# Optional: Remove 4 applications with XNA CODE_GENDER (train set)
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger SettingWithCopyWarning.
df = df[df['CODE_GENDER'] != 'XNA'].copy()

# Document-flag columns, and the remaining FLAG_* columns.
docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
live = [_f for _f in df.columns if 'FLAG_' in _f and 'FLAG_DOC' not in _f]

# DAYS_EMPLOYED uses the value 365243 as a missing marker -> NaN.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection, which is not guaranteed to update the frame.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

# Median income per organization type, used as a lookup table below.
inc_by_org = df.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()

df['NUM_INSTALMENTS'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
# NOTE: despite the "_RATIO" suffix this is a difference; the column name is
# kept unchanged so any later cell that refers to it keeps working.
df['DIFF_CREDIT_AND_GOODS_RATIO'] = df['AMT_CREDIT'] - df['AMT_GOODS_PRICE']
df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1)
df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)

Train samples: 307511, test samples: 48744


In [16]:
# Parameters for a step-by-step walkthrough of the grouped aggregate:
# which column to aggregate, which columns to group by, and how.
method = 'mean'
cols = ['CODE_GENDER', 'NAME_EDUCATION_TYPE']
target = 'AMT_INCOME_TOTAL'
# Name of the derived column: <target>_BY_<col1>_<col2>...
name_grouped_target = '{}_BY_{}'.format(target, '_'.join(cols))
name_grouped_target

'AMT_INCOME_TOTAL_BY_CODE_GENDER_NAME_EDUCATION_TYPE'

In [17]:
# Aggregate the target within every combination of the grouping columns;
# the result is indexed by those grouping columns.
tmp = df.loc[:, cols + [target]].groupby(by=cols).agg(method)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,AMT_INCOME_TOTAL
CODE_GENDER,NAME_EDUCATION_TYPE,Unnamed: 2_level_1
F,Academic degree,214079.527559
F,Higher education,192680.501529
F,Incomplete higher,165536.663782
F,Lower secondary,119385.787803
F,Secondary / secondary special,145221.353602
M,Academic degree,272376.923077
M,Higher education,245449.436461
M,Incomplete higher,211365.974571
M,Lower secondary,149420.667553
M,Secondary / secondary special,177755.249074


In [18]:
# Turn the group keys back into ordinary columns, then give the aggregate
# its derived name (row labels are converted to strings by index=str).
tmp = tmp.reset_index()
tmp = tmp.rename(index=str, columns={target: name_grouped_target})
tmp

Unnamed: 0,CODE_GENDER,NAME_EDUCATION_TYPE,AMT_INCOME_TOTAL_BY_CODE_GENDER_NAME_EDUCATION_TYPE
0,F,Academic degree,214079.527559
1,F,Higher education,192680.501529
2,F,Incomplete higher,165536.663782
3,F,Lower secondary,119385.787803
4,F,Secondary / secondary special,145221.353602
5,M,Academic degree,272376.923077
6,M,Higher education,245449.436461
7,M,Incomplete higher,211365.974571
8,M,Lower secondary,149420.667553
9,M,Secondary / secondary special,177755.249074


In [8]:
# Number of applications per weekday on which the application process started.
df['WEEKDAY_APPR_PROCESS_START'].value_counts()

TUESDAY      63651
WEDNESDAY    60391
MONDAY       59120
THURSDAY     59008
FRIDAY       57586
SATURDAY     38455
SUNDAY       18040
Name: WEEKDAY_APPR_PROCESS_START, dtype: int64

In [7]:
# Encode CODE_GENDER as integer codes; `uniques` keeps the original labels
# in code order. The column key must be the string 'CODE_GENDER' -- the bare
# name CODE_GENDER is undefined and raised NameError.
df['CODE_GENDER'], uniques = pd.factorize(df['CODE_GENDER'])

In [None]:
# Add the median income of each (gender, education) group to every row.
df = group_target_by_cols(
    df,
    target='AMT_INCOME_TOTAL',
    cols=['CODE_GENDER', 'NAME_EDUCATION_TYPE'],
    method='median',
)

# Sequential features
### First, find the temporal axis
### The last order state (t = -1) ===> xxx_t_-1
### The second-to-last order state (t = -2) ===> xxx_t_-2

# Is data augmentation possible?
### Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample)
### X = ?
### Y = ?

# clustering features
# https://github.com/nicodv/kmodes

In [4]:
# Load the table that describes the dataset's columns.
description = pd.read_csv(filepath_or_buffer='../input/HomeCredit_columns_description.csv')

# Features related to the temporal axis

In [5]:
# Description rows whose column name contains 'DAYS'.
description.loc[description['Row'].str.contains('DAYS')]

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
17,20,application_{train|test}.csv,DAYS_BIRTH,Client's age in days at the time of application,time only relative to the application
18,21,application_{train|test}.csv,DAYS_EMPLOYED,How many days before the application the perso...,time only relative to the application
19,22,application_{train|test}.csv,DAYS_REGISTRATION,How many days before the application did clien...,time only relative to the application
20,23,application_{train|test}.csv,DAYS_ID_PUBLISH,How many days before the application did clien...,time only relative to the application
95,98,application_{train|test}.csv,DAYS_LAST_PHONE_CHANGE,How many days before application did client ch...,
126,129,bureau.csv,DAYS_CREDIT,How many days before current application did c...,time only relative to the application
128,131,bureau.csv,DAYS_CREDIT_ENDDATE,Remaining duration of CB credit (in days) at t...,time only relative to the application
129,132,bureau.csv,DAYS_ENDDATE_FACT,Days since CB credit ended at the time of appl...,time only relative to the application
137,140,bureau.csv,DAYS_CREDIT_UPDATE,How many days before loan application did last...,time only relative to the application
191,194,previous_application.csv,DAYS_DECISION,Relative to current application when was the d...,time only relative to the application


In [6]:
# Description rows for the WEEKDAY_APPR_PROCESS_START column (one per table that has it).
description.loc[description['Row'].str.contains('WEEKDAY_APPR_PROCESS_START')]

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
32,35,application_{train|test}.csv,WEEKDAY_APPR_PROCESS_START,On which day of the week did the client apply ...,
181,184,previous_application.csv,WEEKDAY_APPR_PROCESS_START,On which day of the week did the client apply ...,
