In [60]:
# libraries
import gc
import os
import re
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import warnings
warnings.filterwarnings('ignore') # Suppress warnings
from scipy import stats
from scipy.stats import norm, skew
%matplotlib inline

In [61]:
# Load the raw application table from the working directory.
application_train = pd.read_csv('application_train.csv')
# df_train is an independent copy of the raw table; df_full starts out as
# another name for that same copy (no second copy is made here).
df_full = df_train = application_train.copy()

In [62]:
# Data cleaning
# Row filters: each one keeps only the rows that satisfy the condition.
df_full = df_full[df_full['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
# Max income in test is 4M; train has a 117M value
df_full = df_full[df_full['AMT_INCOME_TOTAL'] < 20000000].copy()  # explicit copy: the columns below are reassigned

# Replace placeholder values with NaN.
# NOTE: the result is assigned back to the column instead of calling
# `df_full[col].replace(..., inplace=True)`: an in-place method on a selected
# column is chained assignment, which does not update the parent frame when
# pandas Copy-on-Write is active.
# NOTE(review): 365243 and 0 are treated as "missing" markers here - confirm
# against the dataset description.
df_full['DAYS_EMPLOYED'] = df_full['DAYS_EMPLOYED'].replace(365243, np.nan)
df_full['DAYS_LAST_PHONE_CHANGE'] = df_full['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

In [63]:
# Document flags: per-row count and kurtosis over every column whose name
# contains 'FLAG_DOC'.
docs = [column for column in df_full.columns if 'FLAG_DOC' in column]
doc_flags = df_full[docs]  # selected once, before the new columns are added
df_full['DOCUMENT_COUNT'] = doc_flags.sum(axis=1)
df_full['NEW_DOC_KURT'] = doc_flags.kurt(axis=1)

In [64]:
def get_age_label(days_birth):
    """Return an integer age-band code for a (negative) day count.

    The input is negated and divided by 365 to obtain an age in years, which
    is then compared against increasing upper bounds:

        < 27 -> 1,  < 40 -> 2,  < 50 -> 3,  < 65 -> 4,  < 99 -> 5

    Anything that is below none of the bounds (99 or more, or a NaN input,
    for which every comparison is false) yields 0.
    """
    age_years = -days_birth / 365
    upper_bounds = (27, 40, 50, 65, 99)
    # Labels are 1-based and follow the order of the bounds.
    for label, bound in enumerate(upper_bounds, start=1):
        if age_years < bound:
            return label
    return 0

In [65]:
# Age band per applicant (bands chosen from the target=1 plot).
# get_age_label already takes a single value, so it is passed to apply directly.
df_full['AGE_RANGE'] = df_full['DAYS_BIRTH'].apply(get_age_label)

In [66]:
# New features based on External sources
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Product and fixed-weight (2, 1, 3) combination of the three scores.
df_full['EXT_SOURCES_PROD'] = df_full['EXT_SOURCE_1'] * df_full['EXT_SOURCE_2'] * df_full['EXT_SOURCE_3']
df_full['EXT_SOURCES_WEIGHTED'] = df_full.EXT_SOURCE_1 * 2 + df_full.EXT_SOURCE_2 * 1 + df_full.EXT_SOURCE_3 * 3

# Row-wise statistics across the three scores. Each name is looked up on the
# numpy module with getattr (no eval of a built string) and is also used,
# upper-cased, as the column suffix - so 'nanmedian' produces the column
# 'EXT_SOURCES_NANMEDIAN'.
for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
    feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
    df_full[feature_name] = getattr(np, function_name)(df_full[ext_source_cols], axis=1)

In [67]:
# Credit ratios: credit amount relative to the annuity and to the goods price.
df_full['CREDIT_TO_ANNUITY_RATIO'] = df_full['AMT_CREDIT'].div(df_full['AMT_ANNUITY'])
df_full['CREDIT_TO_GOODS_RATIO'] = df_full['AMT_CREDIT'].div(df_full['AMT_GOODS_PRICE'])

In [68]:
# Income ratios: new column -> (numerator column, denominator column).
income_ratio_specs = {
    'ANNUITY_TO_INCOME_RATIO': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    'CREDIT_TO_INCOME_RATIO': ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    'INCOME_TO_EMPLOYED_RATIO': ('AMT_INCOME_TOTAL', 'DAYS_EMPLOYED'),
    'INCOME_TO_BIRTH_RATIO': ('AMT_INCOME_TOTAL', 'DAYS_BIRTH'),
}
# Columns are added in the order listed above.
for ratio_name, (numerator, denominator) in income_ratio_specs.items():
    df_full[ratio_name] = df_full[numerator] / df_full[denominator]

In [69]:
# Time ratios: new column -> (numerator column, denominator column).
time_ratio_specs = {
    'EMPLOYED_TO_BIRTH_RATIO': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    'ID_TO_BIRTH_RATIO': ('DAYS_ID_PUBLISH', 'DAYS_BIRTH'),
    'CAR_TO_BIRTH_RATIO': ('OWN_CAR_AGE', 'DAYS_BIRTH'),
    'CAR_TO_EMPLOYED_RATIO': ('OWN_CAR_AGE', 'DAYS_EMPLOYED'),
    'PHONE_TO_BIRTH_RATIO': ('DAYS_LAST_PHONE_CHANGE', 'DAYS_BIRTH'),
}
# Columns are added in the order listed above.
for ratio_name, (numerator, denominator) in time_ratio_specs.items():
    df_full[ratio_name] = df_full[numerator] / df_full[denominator]

In [70]:
def _merge_group_stat(df, group_cols, counted, agg_name, stat):
    """Attach one per-group statistic of ``counted`` to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_cols : list of str
        Columns that define the groups (also the merge keys).
    counted : str
        Column whose values are aggregated within each group.
    agg_name : str
        Name of the new column that receives the statistic.
    stat : str
        Name of the groupby aggregation to apply ('mean', 'median', ...).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` left-merged with the per-group statistic, so the
        rows of ``df`` are all kept and rows without a matching group get NaN
        in ``agg_name``.
    """
    group_stats = (
        df[group_cols + [counted]]
        .groupby(group_cols)[counted]
        .agg(stat)
        .reset_index()
        .rename(columns={counted: agg_name})
    )
    merged = df.merge(group_stats, on=group_cols, how='left')
    # Drop the intermediate table and force a collection before returning.
    del group_stats
    gc.collect()
    return merged


def do_mean(df, group_cols, counted, agg_name):
    """Add ``agg_name``: the mean of ``counted`` within each ``group_cols`` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'mean')


def do_median(df, group_cols, counted, agg_name):
    """Add ``agg_name``: the median of ``counted`` within each ``group_cols`` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'median')


def do_std(df, group_cols, counted, agg_name):
    """Add ``agg_name``: the standard deviation of ``counted`` within each ``group_cols`` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'std')


def do_sum(df, group_cols, counted, agg_name):
    """Add ``agg_name``: the sum of ``counted`` within each ``group_cols`` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'sum')

In [None]:
# Group statistics: each applicant receives aggregate values computed over all
# applications that share the same combination of the columns in `group`.
group = ['ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_RANGE', 'CODE_GENDER']

# (aggregation helper, source column, new column) - applied in this order.
group_stat_specs = [
    (do_median, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_MEDIAN'),
    (do_std, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_STD'),
    (do_mean, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_MEAN'),
    (do_std, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_STD'),
    (do_mean, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_MEAN'),
    (do_std, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_STD'),
    (do_mean, 'AMT_CREDIT', 'GROUP_CREDIT_MEAN'),
    (do_mean, 'AMT_ANNUITY', 'GROUP_ANNUITY_MEAN'),
    (do_std, 'AMT_ANNUITY', 'GROUP_ANNUITY_STD'),
]
for aggregate, source_col, target_col in group_stat_specs:
    df_full = aggregate(df_full, group, source_col, target_col)

In [None]:
def label_encoder(df, categorical_columns=None):
    """Replace categorical columns with integer codes from ``pandas.factorize``.

    When ``categorical_columns`` is None or empty, every column of ``df`` whose
    dtype is ``object`` is encoded. The columns are overwritten in ``df``
    itself, and the same frame is returned.

    Returns
    -------
    tuple
        ``(df, categorical_columns)`` - the frame and the list of columns that
        were encoded.
    """
    if categorical_columns:
        targets = categorical_columns
    else:
        targets = [name for name in df.columns if df[name].dtype == 'object']

    for name in targets:
        # factorize returns (codes, uniques); only the codes are kept.
        codes, _ = pd.factorize(df[name])
        df[name] = codes
    return df, targets

In [None]:
# Integer-encode every object-dtype column (the default when no column list is
# passed) and keep the list of encoded column names.
df_full, le_encoded_cols = label_encoder(df_full)

In [None]:
def drop_application_columns(df):
    """Remove a fixed set of application columns from ``df``.

    The set was chosen from permutation feature importance. It consists of the
    named columns below plus most of the ``FLAG_DOCUMENT_<n>`` columns. The
    columns are dropped from ``df`` itself and the same frame is returned; a
    missing column makes ``DataFrame.drop`` raise ``KeyError``.
    """
    named_columns = [
        'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START',
        'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'FLAG_PHONE',
        'FLAG_OWN_REALTY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_WORK_CITY', 'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
        'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR',
        'COMMONAREA_MODE', 'NONLIVINGAREA_MODE', 'ELEVATORS_MODE', 'NONLIVINGAREA_AVG',
        'FLOORSMIN_MEDI', 'LANDAREA_MODE', 'NONLIVINGAREA_MEDI', 'LIVINGAPARTMENTS_MODE',
        'FLOORSMIN_AVG', 'LANDAREA_AVG', 'FLOORSMIN_MODE', 'LANDAREA_MEDI',
        'COMMONAREA_MEDI', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'BASEMENTAREA_AVG',
        'BASEMENTAREA_MODE', 'NONLIVINGAPARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
        'LIVINGAPARTMENTS_AVG', 'ELEVATORS_AVG', 'YEARS_BUILD_MEDI', 'ENTRANCES_MODE',
        'NONLIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'LIVINGAPARTMENTS_MEDI',
        'YEARS_BUILD_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'ELEVATORS_MEDI', 'LIVINGAREA_MEDI',
        'YEARS_BEGINEXPLUATATION_MODE', 'NONLIVINGAPARTMENTS_AVG', 'HOUSETYPE_MODE',
        'FONDKAPREMONT_MODE', 'EMERGENCYSTATE_MODE',
    ]
    # Document flags to remove; the numbers absent from this tuple (3, 8, 18)
    # are left in the frame.
    dropped_doc_numbers = (2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21)
    document_columns = ['FLAG_DOCUMENT_{}'.format(number) for number in dropped_doc_numbers]

    df.drop(columns=named_columns + document_columns, inplace=True)
    return df

In [None]:
# Remove the selected columns. drop_application_columns drops them from the
# frame it is given and returns that same frame, and it raises KeyError when a
# listed column is absent - so running this cell a second time on the already
# reduced frame fails.
df_full = drop_application_columns(df_full)