In [7]:
import pandas as pd
import numpy as np
import gc

In [8]:
def application_train_test(nan_as_category = False, data_dir = './data/rawdata'):
    """Load the application train/test CSVs, stack them, and add ratio features.

    Parameters
    ----------
    nan_as_category : bool, default False
        Currently unused. It was only consumed by the one-hot encoding step,
        which is disabled in this function; the parameter is kept so existing
        callers keep working.
    data_dir : str, default './data/rawdata'
        Directory containing ``application_train.csv.zip`` and
        ``application_test.csv.zip``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows. The original per-file row numbers
        are kept in an ``index`` column (``reset_index()`` without ``drop``),
        the three binary flag columns are integer-encoded, and five ratio
        columns are appended at the end.
    """
    train = pd.read_csv('{}/application_train.csv.zip'.format(data_dir), compression='zip')
    test = pd.read_csv('{}/application_test.csv.zip'.format(data_dir), compression='zip')
    print("train: {}, test: {}".format(train.shape, test.shape))

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # sort=True keeps the alphabetical column order that append produced when
    # the two frames' columns differ (test has one column fewer than train).
    df = pd.concat([train, test], sort=True).reset_index()

    # The stacked copy is all we need from here on; release both inputs.
    del train, test
    gc.collect()

    # Integer-encode the two-valued columns. factorize assigns codes in order
    # of first appearance and maps missing values to -1.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        codes, _ = pd.factorize(df[bin_feature])
        df[bin_feature] = codes

    # 365243 is treated as "missing". Assign the result back instead of calling
    # replace(inplace=True) on df[...]: the chained in-place form silently does
    # nothing once pandas copy-on-write is active.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    return df
    

In [9]:
# Build the combined train+test application frame with the default nan_as_category=False.
df = application_train_test()

train: (307511, 122), test: (48744, 121)


In [10]:
# (rows, columns) of the combined frame returned by application_train_test().
df.shape

(356255, 128)

In [11]:
# Preview the first rows of the combined frame, including the derived ratio columns at the end.
df.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,0.9722,0.9722,0.6192,0.6243,0.6341,0.067329,0.498036,202500.0,0.121978,0.060749
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,0.9851,0.9851,0.796,0.7987,0.804,0.070862,0.208736,135000.0,0.132217,0.027598
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,0.011814,0.5,67500.0,0.1,0.05
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,,,,,,0.159905,0.431748,67500.0,0.2199,0.094941
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,0.152418,0.236842,121500.0,0.179963,0.042623


In [20]:
def bureau_and_balance(nan_as_category = True, data_dir = './data/rawdata'):
    """Aggregate bureau and bureau_balance records to one row per SK_ID_CURR.

    The function works in three stages:

    1. bureau_balance is aggregated per ``SK_ID_BUREAU`` and joined onto
       bureau.
    2. bureau is aggregated per ``SK_ID_CURR``; output columns are prefixed
       ``BURO_``.
    3. The numeric aggregations are repeated on the rows flagged
       ``CREDIT_ACTIVE_Active`` and ``CREDIT_ACTIVE_Closed`` and joined on
       with ``ACTIVE_`` / ``CLOSED_`` prefixes.

    Parameters
    ----------
    nan_as_category : bool, default True
        Forwarded unchanged to ``one_hot_encoder`` for both tables.
    data_dir : str, default './data/rawdata'
        Directory containing ``bureau.csv.zip`` and
        ``bureau_balance.csv.zip``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` (the groupby key), one row per value.

    Notes
    -----
    ``one_hot_encoder`` is not defined in this section of the notebook and
    must already exist in the kernel. It is assumed to return
    ``(encoded_frame, list_of_new_indicator_column_names)`` -- TODO confirm
    against its definition.
    """
    bureau = pd.read_csv('{}/bureau.csv.zip'.format(data_dir), compression='zip')
    bureau_balance = pd.read_csv('{}/bureau_balance.csv.zip'.format(data_dir), compression='zip')
    bb, bb_cat = one_hot_encoder(bureau_balance, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # The key must be 'MONTHS_BALANCE' so that the flattened column names
    # (MONTHS_BALANCE_MIN / _MAX / _SIZE) match the keys that
    # num_aggregations looks up further down.
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']

    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # Flatten the (column, function) MultiIndex into "<COLUMN>_<FUNC>" names.
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])

    bureau = bureau.join(bb_agg, how = 'left', on = 'SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis = 1, inplace = True)
    del bb, bb_agg
    gc.collect()

    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE':['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE':['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }

    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    # bureau_balance indicators were already averaged per credit above, so
    # they now live in bureau under "<name>_MEAN".
    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

    # Repeat the numeric aggregations on the Active and Closed subsets.
    status_subsets = (
        ('ACTIVE_', 'CREDIT_ACTIVE_Active'),
        ('CLOSED_', 'CREDIT_ACTIVE_Closed'),
    )
    for prefix, flag_col in status_subsets:
        subset = bureau[bureau[flag_col] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = pd.Index([prefix + e[0] + '_' + e[1].upper() for e in subset_agg.columns.tolist()])
        bureau_agg = bureau_agg.join(subset_agg, how = 'left', on = 'SK_ID_CURR')
        del subset, subset_agg
        gc.collect()

    del bureau
    gc.collect()
    return bureau_agg

