In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [2]:
# Read in Data
# The paths are listed in the same order as the names unpacked below.
_raw_csv_paths = (
    'dataFiles/application_train.csv',
    'dataFiles/application_test.csv',
    'dataFiles/bureau.csv',
    'dataFiles/bureau_balance.csv',
    'dataFiles/previous_application.csv',
    'dataFiles/POS_CASH_balance.csv',
    'dataFiles/installments_payments.csv',
    'dataFiles/credit_card_balance.csv',
)
(train, test, bureau_data, bureau_balance_data, prev_app_data,
 pos_cash_balance_data, installments_data, cc_data) = [pd.read_csv(_path) for _path in _raw_csv_paths]

In [3]:
# Columns of cc_data that are summarised per SK_ID_PREV; each one gets the
# same five statistics (sum, median, mean, min, max).
_cc_value_columns = [
    'MONTHS_BALANCE',
    'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_DRAWINGS_ATM_CURRENT',
    'AMT_DRAWINGS_CURRENT',
    'AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_POS_CURRENT',
    'AMT_INST_MIN_REGULARITY',
    'AMT_PAYMENT_CURRENT',
    'AMT_PAYMENT_TOTAL_CURRENT',
    'AMT_RECEIVABLE_PRINCIPAL',
    'AMT_RECIVABLE',
    'AMT_TOTAL_RECEIVABLE',
    'CNT_DRAWINGS_ATM_CURRENT',
    'CNT_DRAWINGS_CURRENT',
    'CNT_DRAWINGS_OTHER_CURRENT',
    'CNT_DRAWINGS_POS_CURRENT',
    'CNT_INSTALMENT_MATURE_CUM',
    'SK_DPD',
    'SK_DPD_DEF',
]
_cc_agg_spec = {_col: [sum, 'median', 'mean', min, max] for _col in _cc_value_columns}

# Object-typed columns are excluded before grouping.
_cc_non_object = cc_data.select_dtypes(exclude='object')
cc_data_grouped = _cc_non_object.groupby('SK_ID_PREV', as_index = False).agg(_cc_agg_spec)

In [4]:
# Flatten the two-level (column, statistic) header produced by .agg() into
# single names of the form '<column>_CC_<statistic>'; the grouping key
# ('SK_ID_PREV', '') becomes plain 'SK_ID_PREV'.
# Names that are already plain strings are passed through unchanged, so
# re-running this cell is a no-op instead of joining the individual
# characters of every name with '_CC_'.
cc_data_grouped.columns = [
    col if not isinstance(col, tuple)
    else col[0] if col == ('SK_ID_PREV', '')
    else '_CC_'.join(col)
    for col in cc_data_grouped.columns
]

In [5]:
# Attach the credit-card aggregates to the previous applications; the left
# join keeps every row of prev_app_data.
prev_app_data_merged = pd.merge(prev_app_data, cc_data_grouped, how='left', on='SK_ID_PREV')

In [6]:
# Instalment columns summarised per SK_ID_PREV with sum, median, mean, min and max.
_inst_value_columns = [
    'NUM_INSTALMENT_VERSION',
    'NUM_INSTALMENT_NUMBER',
    'DAYS_INSTALMENT',
    'DAYS_ENTRY_PAYMENT',
    'AMT_INSTALMENT',
    'AMT_PAYMENT',
]
_inst_agg_spec = {_col: [sum, 'median', 'mean', min, max] for _col in _inst_value_columns}
installments_data_grouped = installments_data.groupby('SK_ID_PREV', as_index = False).agg(_inst_agg_spec)

In [7]:
# Flatten the two-level (column, statistic) header produced by .agg() into
# single names of the form '<column>_INST_<statistic>'; the grouping key
# ('SK_ID_PREV', '') becomes plain 'SK_ID_PREV'.
# Names that are already plain strings are passed through unchanged, so
# re-running this cell is a no-op instead of joining the individual
# characters of every name with '_INST_'.
installments_data_grouped.columns = [
    col if not isinstance(col, tuple)
    else col[0] if col == ('SK_ID_PREV', '')
    else '_INST_'.join(col)
    for col in installments_data_grouped.columns
]

In [8]:
# Add the instalment aggregates; the left join keeps every previous application.
prev_app_data_merged = pd.merge(prev_app_data_merged, installments_data_grouped, how='left', on='SK_ID_PREV')

In [9]:
# One row per SK_ID_PREV: minimum MONTHS_BALANCE, median CNT_INSTALMENT and
# CNT_INSTALMENT_FUTURE, and summed SK_DPD / SK_DPD_DEF.
# 'CNT_INSTALMENT_FUTURE' used to appear twice in this dict literal; the
# repeated key had no effect and has been removed.
_pos_cash_agg_spec = {
    'MONTHS_BALANCE': min,
    'CNT_INSTALMENT': 'median',
    'CNT_INSTALMENT_FUTURE': 'median',
    'SK_DPD': sum,
    'SK_DPD_DEF': sum,
}
pos_cash_balance_data_grouped = pos_cash_balance_data.groupby('SK_ID_PREV', as_index=False).agg(_pos_cash_agg_spec)

In [10]:
# Add the POS/cash aggregates; the left join keeps every previous application.
prev_app_data_merged = pd.merge(prev_app_data_merged, pos_cash_balance_data_grouped, how='left', on='SK_ID_PREV')

In [11]:
# One-hot encode bureau_balance_data, then reduce to one row per SK_ID_BUREAU:
# minimum MONTHS_BALANCE plus the summed STATUS_* indicator columns.
# 'STATUS_3' and 'STATUS_5' were each listed twice in the original dict
# literal; the repeated keys had no effect and have been removed.
# NOTE(review): if get_dummies also yields a 'STATUS_C' column it is not
# aggregated here -- confirm whether that omission is intentional.
_bureau_balance_agg_spec = {
    'MONTHS_BALANCE': min,
    'STATUS_0': sum,
    'STATUS_1': sum,
    'STATUS_2': sum,
    'STATUS_3': sum,
    'STATUS_4': sum,
    'STATUS_5': sum,
    'STATUS_X': sum,
}
bureau_balance_data_grouped = pd.get_dummies(bureau_balance_data).groupby('SK_ID_BUREAU', as_index=False).agg(_bureau_balance_agg_spec)

In [12]:
# Attach the per-SK_ID_BUREAU balance aggregates; the left join keeps every bureau record.
bureau_data = pd.merge(bureau_data, bureau_balance_data_grouped, how='left', on='SK_ID_BUREAU')

In [13]:
# Joining Bureau Grouped Data with Train Data
# Reduce bureau_data to one row per SK_ID_CURR, then left-join the result
# onto both the train and test application tables.
# 'STATUS_3' and 'STATUS_5' were each listed twice in the original dict
# literal; the repeated keys had no effect and have been removed.
_bureau_agg_spec = {
    'SK_ID_BUREAU': 'count',
    'AMT_CREDIT_MAX_OVERDUE': sum,
    'AMT_CREDIT_SUM_OVERDUE': sum,
    'DAYS_CREDIT': 'mean',
    'CREDIT_DAY_OVERDUE': sum,
    'DAYS_CREDIT_ENDDATE': 'mean',
    'AMT_CREDIT_SUM': sum,
    'AMT_CREDIT_SUM_DEBT': sum,
    'MONTHS_BALANCE': min,
    'STATUS_0': sum,
    'STATUS_1': sum,
    'STATUS_2': sum,
    'STATUS_3': sum,
    'STATUS_4': sum,
    'STATUS_5': sum,
    'STATUS_X': sum,
}
bureau_data_grouped = bureau_data.groupby('SK_ID_CURR', as_index=False).agg(_bureau_agg_spec)

train_merged = train.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

test_merged = test.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

In [19]:
prev_app_data_subset = prev_app_data_merged[['SK_ID_CURR','AMT_ANNUITY','AMT_APPLICATION','AMT_CREDIT','AMT_DOWN_PAYMENT','AMT_GOODS_PRICE',\
                         'CNT_PAYMENT','DAYS_FIRST_DRAWING','DAYS_FIRST_DUE','DAYS_LAST_DUE_1ST_VERSION',\
                         'DAYS_LAST_DUE','DAYS_TERMINATION','NFLAG_INSURED_ON_APPROVAL','MONTHS_BALANCE','CNT_INSTALMENT',\
                                             'CNT_INSTALMENT_FUTURE', 'CNT_INSTALMENT_FUTURE', 'SK_DPD', 'SK_DPD_DEF','DAYS_ENTRY_PAYMENT_INST_sum',\
                                             'DAYS_ENTRY_PAYMENT_INST_median','DAYS_ENTRY_PAYMENT_INST_mean','DAYS_ENTRY_PAYMENT_INST_min','DAYS_ENTRY_PAYMENT_INST_max',\
                                             'AMT_PAYMENT_INST_sum','AMT_PAYMENT_INST_median','AMT_PAYMENT_INST_mean','AMT_PAYMENT_INST_min','AMT_PAYMENT_INST_max',\
                                             'AMT_INSTALMENT_INST_sum','AMT_INSTALMENT_INST_median','AMT_INSTALMENT_INST_mean','AMT_INSTALMENT_INST_min',\
                                             'AMT_INSTALMENT_INST_max','NUM_INSTALMENT_NUMBER_INST_sum','NUM_INSTALMENT_NUMBER_INST_median','NUM_INSTALMENT_NUMBER_INST_mean',\
                                             'NUM_INSTALMENT_NUMBER_INST_min','NUM_INSTALMENT_NUMBER_INST_max','NUM_INSTALMENT_VERSION_INST_sum',\
                                             'NUM_INSTALMENT_VERSION_INST_median','NUM_INSTALMENT_VERSION_INST_mean','NUM_INSTALMENT_VERSION_INST_min','NUM_INSTALMENT_VERSION_INST_max',\
                                             'AMT_DRAWINGS_CURRENT_CC_sum','AMT_DRAWINGS_CURRENT_CC_median','AMT_DRAWINGS_CURRENT_CC_mean','AMT_DRAWINGS_CURRENT_CC_min','AMT_DRAWINGS_CURRENT_CC_max','CNT_DRAWINGS_POS_CURRENT_CC_sum','CNT_DRAWINGS_POS_CURRENT_CC_median','CNT_DRAWINGS_POS_CURRENT_CC_mean','CNT_DRAWINGS_POS_CURRENT_CC_min','CNT_DRAWINGS_POS_CURRENT_CC_max','SK_DPD_CC_sum','SK_DPD_CC_median','SK_DPD_CC_mean','SK_DPD_CC_min','SK_DPD_CC_max','AMT_RECIVABLE_CC_sum','AMT_RECIVABLE_CC_median','AMT_RECIVABLE_CC_mean','AMT_RECIVABLE_CC_min','AMT_RECIVABLE_CC_max','SK_DPD_DEF_CC_sum','SK_DPD_DEF_CC_median','SK_DPD_DEF_CC_mean','SK_DPD_DEF_CC_min','SK_DPD_DEF_CC_max','MONTHS_BALANCE_CC_sum','MONTHS_BALANCE_CC_median','MONTHS_BALANCE_CC_mean','MONTHS_BALANCE_CC_min','MONTHS_BALANCE_CC_max','AMT_RECEIVABLE_PRINCIPAL_CC_sum','AMT_RECEIVABLE_PRINCIPAL_CC_median','AMT_RECEIVABLE_PRINCIPAL_CC_mean','AMT_RECEIVABLE_PRINCIPAL_CC_min','AMT_RECEIVABLE_PRINCIPAL_CC_max','AMT_CREDIT_LIMIT_ACTUAL_CC_sum','AMT_CREDIT_LIMIT_ACTUAL_CC_median','AMT_CREDIT_LIMIT_ACTUAL_CC_mean','AMT_CREDIT_LIMIT_ACTUAL_CC_min','AMT_CREDIT_LIMIT_ACTUAL_CC_max','AMT_PAYMENT_TOTAL_CURRENT_CC_sum','AMT_PAYMENT_TOTAL_CURRENT_CC_median','AMT_PAYMENT_TOTAL_CURRENT_CC_mean','AMT_PAYMENT_TOTAL_CURRENT_CC_min','AMT_PAYMENT_TOTAL_CURRENT_CC_max','CNT_DRAWINGS_OTHER_CURRENT_CC_sum','CNT_DRAWINGS_OTHER_CURRENT_CC_median','CNT_DRAWINGS_OTHER_CURRENT_CC_mean','CNT_DRAWINGS_OTHER_CURRENT_CC_min','CNT_DRAWINGS_OTHER_CURRENT_CC_max','AMT_DRAWINGS_OTHER_CURRENT_CC_sum','AMT_DRAWINGS_OTHER_CURRENT_CC_median','AMT_DRAWINGS_OTHER_CURRENT_CC_mean','AMT_DRAWINGS_OTHER_CURRENT_CC_min','AMT_DRAWINGS_OTHER_CURRENT_CC_max','AMT_PAYMENT_CURRENT_CC_sum','AMT_PAYMENT_CURRENT_CC_median','AMT_PAYMENT_CURRENT_CC_mean','AMT_PAYMENT_CURRENT_CC_min','AMT_PAYMENT_CURRENT_CC_max','AMT_DRAWINGS_POS_CURRENT_CC_sum','AMT_DRAWINGS_POS_CURRENT_CC_median','AMT_DRAWINGS_POS_CURRENT_CC_mean','AMT_DRAWINGS_POS_CURRENT_CC_min','AMT_DRAWINGS_POS
_CURRENT_CC_max','AMT_BALANCE_CC_sum','AMT_BALANCE_CC_median','AMT_BALANCE_CC_mean','AMT_BALANCE_CC_min','AMT_BALANCE_CC_max','AMT_TOTAL_RECEIVABLE_CC_sum','AMT_TOTAL_RECEIVABLE_CC_median','AMT_TOTAL_RECEIVABLE_CC_mean','AMT_TOTAL_RECEIVABLE_CC_min','AMT_TOTAL_RECEIVABLE_CC_max','AMT_DRAWINGS_ATM_CURRENT_CC_sum','AMT_DRAWINGS_ATM_CURRENT_CC_median','AMT_DRAWINGS_ATM_CURRENT_CC_mean','AMT_DRAWINGS_ATM_CURRENT_CC_min','AMT_DRAWINGS_ATM_CURRENT_CC_max','AMT_INST_MIN_REGULARITY_CC_sum','AMT_INST_MIN_REGULARITY_CC_median','AMT_INST_MIN_REGULARITY_CC_mean','AMT_INST_MIN_REGULARITY_CC_min','AMT_INST_MIN_REGULARITY_CC_max','CNT_DRAWINGS_ATM_CURRENT_CC_sum','CNT_DRAWINGS_ATM_CURRENT_CC_median','CNT_DRAWINGS_ATM_CURRENT_CC_mean','CNT_DRAWINGS_ATM_CURRENT_CC_min','CNT_DRAWINGS_ATM_CURRENT_CC_max','CNT_INSTALMENT_MATURE_CUM_CC_sum','CNT_INSTALMENT_MATURE_CUM_CC_median','CNT_INSTALMENT_MATURE_CUM_CC_mean','CNT_INSTALMENT_MATURE_CUM_CC_min','CNT_INSTALMENT_MATURE_CUM_CC_max','CNT_DRAWINGS_CURRENT_CC_sum','CNT_DRAWINGS_CURRENT_CC_median','CNT_DRAWINGS_CURRENT_CC_mean','CNT_DRAWINGS_CURRENT_CC_min','CNT_DRAWINGS_CURRENT_CC_max'
                                            ]]

In [20]:
prev_app_data_subset.columns = ['SK_ID_CURR','AMT_ANNUITY_PRE','AMT_APPLICATION_PRE','AMT_CREDIT_PRE','AMT_DOWN_PAYMENT_PRE','AMT_GOODS_PRICE_PRE',\
                         'CNT_PAYMENT_PRE','DAYS_FIRST_DRAWING_PRE','DAYS_FIRST_DUE_PRE','DAYS_LAST_DUE_1ST_VERSION_PRE',\
                         'DAYS_LAST_DUE_PRE','DAYS_TERMINATION_PRE','NFLAG_INSURED_ON_APPROVAL_PRE','MONTHS_BALANCE_PRE','CNT_INSTALMENT_PRE',\
                                             'CNT_INSTALMENT_FUTURE_PRE', 'CNT_INSTALMENT_FUTURE_PRE', 'SK_DPD_PRE', 'SK_DPD_DEF_PRE','DAYS_ENTRY_PAYMENT_INST_sum',\
                                             'DAYS_ENTRY_PAYMENT_INST_median','DAYS_ENTRY_PAYMENT_INST_mean','DAYS_ENTRY_PAYMENT_INST_min','DAYS_ENTRY_PAYMENT_INST_max',\
                                             'AMT_PAYMENT_INST_sum','AMT_PAYMENT_INST_median','AMT_PAYMENT_INST_mean','AMT_PAYMENT_INST_min','AMT_PAYMENT_INST_max',\
                                             'AMT_INSTALMENT_INST_sum','AMT_INSTALMENT_INST_median','AMT_INSTALMENT_INST_mean','AMT_INSTALMENT_INST_min',\
                                             'AMT_INSTALMENT_INST_max','NUM_INSTALMENT_NUMBER_INST_sum','NUM_INSTALMENT_NUMBER_INST_median','NUM_INSTALMENT_NUMBER_INST_mean',\
                                             'NUM_INSTALMENT_NUMBER_INST_min','NUM_INSTALMENT_NUMBER_INST_max','NUM_INSTALMENT_VERSION_INST_sum',\
                                             'NUM_INSTALMENT_VERSION_INST_median','NUM_INSTALMENT_VERSION_INST_mean','NUM_INSTALMENT_VERSION_INST_min','NUM_INSTALMENT_VERSION_INST_max',\
                                             'AMT_DRAWINGS_CURRENT_CC_sum','AMT_DRAWINGS_CURRENT_CC_median','AMT_DRAWINGS_CURRENT_CC_mean','AMT_DRAWINGS_CURRENT_CC_min','AMT_DRAWINGS_CURRENT_CC_max','CNT_DRAWINGS_POS_CURRENT_CC_sum','CNT_DRAWINGS_POS_CURRENT_CC_median','CNT_DRAWINGS_POS_CURRENT_CC_mean','CNT_DRAWINGS_POS_CURRENT_CC_min','CNT_DRAWINGS_POS_CURRENT_CC_max','SK_DPD_CC_sum','SK_DPD_CC_median','SK_DPD_CC_mean','SK_DPD_CC_min','SK_DPD_CC_max','AMT_RECIVABLE_CC_sum','AMT_RECIVABLE_CC_median','AMT_RECIVABLE_CC_mean','AMT_RECIVABLE_CC_min','AMT_RECIVABLE_CC_max','SK_DPD_DEF_CC_sum','SK_DPD_DEF_CC_median','SK_DPD_DEF_CC_mean','SK_DPD_DEF_CC_min','SK_DPD_DEF_CC_max','MONTHS_BALANCE_CC_sum','MONTHS_BALANCE_CC_median','MONTHS_BALANCE_CC_mean','MONTHS_BALANCE_CC_min','MONTHS_BALANCE_CC_max','AMT_RECEIVABLE_PRINCIPAL_CC_sum','AMT_RECEIVABLE_PRINCIPAL_CC_median','AMT_RECEIVABLE_PRINCIPAL_CC_mean','AMT_RECEIVABLE_PRINCIPAL_CC_min','AMT_RECEIVABLE_PRINCIPAL_CC_max','AMT_CREDIT_LIMIT_ACTUAL_CC_sum','AMT_CREDIT_LIMIT_ACTUAL_CC_median','AMT_CREDIT_LIMIT_ACTUAL_CC_mean','AMT_CREDIT_LIMIT_ACTUAL_CC_min','AMT_CREDIT_LIMIT_ACTUAL_CC_max','AMT_PAYMENT_TOTAL_CURRENT_CC_sum','AMT_PAYMENT_TOTAL_CURRENT_CC_median','AMT_PAYMENT_TOTAL_CURRENT_CC_mean','AMT_PAYMENT_TOTAL_CURRENT_CC_min','AMT_PAYMENT_TOTAL_CURRENT_CC_max','CNT_DRAWINGS_OTHER_CURRENT_CC_sum','CNT_DRAWINGS_OTHER_CURRENT_CC_median','CNT_DRAWINGS_OTHER_CURRENT_CC_mean','CNT_DRAWINGS_OTHER_CURRENT_CC_min','CNT_DRAWINGS_OTHER_CURRENT_CC_max','AMT_DRAWINGS_OTHER_CURRENT_CC_sum','AMT_DRAWINGS_OTHER_CURRENT_CC_median','AMT_DRAWINGS_OTHER_CURRENT_CC_mean','AMT_DRAWINGS_OTHER_CURRENT_CC_min','AMT_DRAWINGS_OTHER_CURRENT_CC_max','AMT_PAYMENT_CURRENT_CC_sum','AMT_PAYMENT_CURRENT_CC_median','AMT_PAYMENT_CURRENT_CC_mean','AMT_PAYMENT_CURRENT_CC_min','AMT_PAYMENT_CURRENT_CC_max','AMT_DRAWINGS_POS_CURRENT_CC_sum','AMT_DRAWINGS_POS_CURRENT_CC_median','AMT_DRAWINGS_POS_CURRENT_CC_mean','AMT_DRAWINGS_POS_CURRENT_CC_min','AMT_DRAWINGS_POS
_CURRENT_CC_max','AMT_BALANCE_CC_sum','AMT_BALANCE_CC_median','AMT_BALANCE_CC_mean','AMT_BALANCE_CC_min','AMT_BALANCE_CC_max','AMT_TOTAL_RECEIVABLE_CC_sum','AMT_TOTAL_RECEIVABLE_CC_median','AMT_TOTAL_RECEIVABLE_CC_mean','AMT_TOTAL_RECEIVABLE_CC_min','AMT_TOTAL_RECEIVABLE_CC_max','AMT_DRAWINGS_ATM_CURRENT_CC_sum','AMT_DRAWINGS_ATM_CURRENT_CC_median','AMT_DRAWINGS_ATM_CURRENT_CC_mean','AMT_DRAWINGS_ATM_CURRENT_CC_min','AMT_DRAWINGS_ATM_CURRENT_CC_max','AMT_INST_MIN_REGULARITY_CC_sum','AMT_INST_MIN_REGULARITY_CC_median','AMT_INST_MIN_REGULARITY_CC_mean','AMT_INST_MIN_REGULARITY_CC_min','AMT_INST_MIN_REGULARITY_CC_max','CNT_DRAWINGS_ATM_CURRENT_CC_sum','CNT_DRAWINGS_ATM_CURRENT_CC_median','CNT_DRAWINGS_ATM_CURRENT_CC_mean','CNT_DRAWINGS_ATM_CURRENT_CC_min','CNT_DRAWINGS_ATM_CURRENT_CC_max','CNT_INSTALMENT_MATURE_CUM_CC_sum','CNT_INSTALMENT_MATURE_CUM_CC_median','CNT_INSTALMENT_MATURE_CUM_CC_mean','CNT_INSTALMENT_MATURE_CUM_CC_min','CNT_INSTALMENT_MATURE_CUM_CC_max','CNT_DRAWINGS_CURRENT_CC_sum','CNT_DRAWINGS_CURRENT_CC_median','CNT_DRAWINGS_CURRENT_CC_mean','CNT_DRAWINGS_CURRENT_CC_min','CNT_DRAWINGS_CURRENT_CC_max']

In [21]:
# Collapse to one row per SK_ID_CURR by summing every remaining column.
_prev_by_applicant = prev_app_data_subset.groupby('SK_ID_CURR', as_index=False)
prev_app_data_grouped = _prev_by_applicant.sum()

In [22]:
# Left-join the summed previous-application features onto both application tables.
train_merged = pd.merge(train_merged, prev_app_data_grouped, how='left', on='SK_ID_CURR')

test_merged = pd.merge(test_merged, prev_app_data_grouped, how='left', on='SK_ID_CURR')

In [23]:
# Median of OWN_CAR_AGE over the training rows; read by own_car_missing().
own_car_median = train_merged['OWN_CAR_AGE'].median()

In [24]:
# One-hot encode the two ownership flags for train and test alike;
# drop_first=True drops the first level of each flag.
_ownership_flags = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

train_dummies = pd.get_dummies(train_merged[_ownership_flags], drop_first=True)

test_dummies = pd.get_dummies(test_merged[_ownership_flags], drop_first=True)

In [25]:
def own_car_missing(x):
    """Pick the OWN_CAR_AGE value to use for a single row.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'FLAG_OWN_CAR'; 'OWN_CAR_AGE' is read unless the flag is 'N'.

    Returns
    -------
    28 when FLAG_OWN_CAR is 'N'; the module-level ``own_car_median`` when the
    flag is 'Y' and OWN_CAR_AGE is null; otherwise the row's own OWN_CAR_AGE.
    """
    ownership_flag = x['FLAG_OWN_CAR']
    if ownership_flag == 'N':
        # NOTE(review): the rationale for the constant 28 is not visible here -- confirm.
        return 28

    car_age = x['OWN_CAR_AGE']
    if ownership_flag == 'Y' and pd.isnull(car_age):
        return own_car_median
    return car_age

In [26]:
# Append the ownership dummies column-wise, then discard every object-typed column.
_train_with_dummies = pd.concat([train_merged, train_dummies], axis=1)
train_merged = _train_with_dummies.select_dtypes(exclude=['object'])

_test_with_dummies = pd.concat([test_merged, test_dummies], axis=1)
test_merged = _test_with_dummies.select_dtypes(exclude=['object'])

In [27]:
# Median-impute missing values. The medians are learned from the training
# features only and then applied to both train and test.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer); this cell
# still relies on the import at the top of the notebook.

# Drop the label once instead of three times.
_train_features = train_merged.drop('TARGET', axis = 1)

# The test frame is transformed positionally and then labelled with the
# training column names, so a different column layout would silently mislabel
# features. Fail loudly instead.
if list(test_merged.columns) != list(_train_features.columns):
    raise ValueError('test_merged columns do not match the training feature columns')

imputer = Imputer(strategy = 'median')
imputer.fit(_train_features)
train_merged_imputed = pd.DataFrame(imputer.transform(_train_features), columns = _train_features.columns)
test_merged_imputed = pd.DataFrame(imputer.transform(test_merged), columns = _train_features.columns)

In [28]:
# Add squared versions of four features to both imputed frames.
# Each pair is (new column name, source column name).
_squared_features = [
    ('DAYS_EMPLOYED_^2', 'DAYS_EMPLOYED'),
    ('AMT_GOODS_PRICE_^2', 'AMT_GOODS_PRICE'),
    ('DAYS_CREDIT_^2', 'DAYS_CREDIT'),
    ('DAYS_BIRTH_^2', 'DAYS_BIRTH'),
]

for _frame in (train_merged_imputed, test_merged_imputed):
    for _squared_name, _source_name in _squared_features:
        _frame[_squared_name] = _frame[_source_name] ** 2

The minimum supported version is 2.4.6



In [47]:
# Names of the features whose correlation with TARGET is above .03 or below -.02.
# The label's correlation with itself (1.0) is dropped first so that 'TARGET'
# does not end up in the feature list, matching the later selection cell.
# NOTE(review): the thresholds are asymmetric (.03 vs -.02) -- confirm that is intended.
_target_corr = pd.concat([train_merged_imputed, train['TARGET']], axis = 1).corr()['TARGET'].sort_values()
_target_corr = _target_corr.drop('TARGET')
column_corr = _target_corr[(_target_corr > .03) | (_target_corr < -.02)].index.values.tolist()

In [50]:
#column_corr = train_merged.corr()['TARGET'].sort_values()
#column_corr = column_corr[(column_corr > .01) | (column_corr < -.01)].index.values.tolist()
#column_corr.remove('TARGET')

In [51]:
#train_subset = train_merged[column_corr]

#test_subset = test_merged[column_corr]

In [52]:
# Fit the polynomial expansion (degree 1) on the imputed training frame and
# expand that same frame; fit() returns the fitted transformer itself.
poly_transformer = PolynomialFeatures(degree=1)
train_poly_features = poly_transformer.fit(train_merged_imputed).transform(train_merged_imputed)

In [53]:
# Wrap the expanded training matrix in a DataFrame labelled with the
# transformer's generated feature names.
_train_poly_names = poly_transformer.get_feature_names(input_features=list(train_merged_imputed.columns))
train_subset_poly = pd.DataFrame(train_poly_features, columns=_train_poly_names)

In [54]:
# Expand the test frame with the transformer fitted on the training frame.
test_poly_features = poly_transformer.transform(X=test_merged_imputed)

In [55]:
# Label the expanded test matrix with the same generated names, derived from
# the training columns.
_test_poly_names = poly_transformer.get_feature_names(input_features=list(train_merged_imputed.columns))
test_subset_poly = pd.DataFrame(test_poly_features, columns=_test_poly_names)

In [56]:
# Scaler that maps each feature onto the interval [0, 1].
_unit_interval = (0, 1)
scaler = MinMaxScaler(feature_range=_unit_interval)

In [57]:
# Learn the scaling from the training frame only, then apply it to both frames.
scaler.fit(train_subset_poly)
train_scaled, test_scaled = [scaler.transform(_frame) for _frame in (train_subset_poly, test_subset_poly)]

In [58]:
from catboost import CatBoostClassifier

# Hyper-parameters passed to the classifier; everything else is left at the library defaults.
_cat_params = {'iterations': 500, 'learning_rate': 0.1}
cat_model = CatBoostClassifier(**_cat_params)

In [59]:
# Fit on the scaled training matrix; the labels come from the raw application table.
_train_labels = train['TARGET']
cat_model.fit(train_scaled, _train_labels)

0:	learn: 0.5866823	total: 774ms	remaining: 6m 26s
1:	learn: 0.5067410	total: 1.56s	remaining: 6m 29s
2:	learn: 0.4474333	total: 2.32s	remaining: 6m 24s
3:	learn: 0.4018047	total: 3.07s	remaining: 6m 20s
4:	learn: 0.3691285	total: 3.83s	remaining: 6m 18s
5:	learn: 0.3427282	total: 4.62s	remaining: 6m 20s
6:	learn: 0.3230446	total: 5.36s	remaining: 6m 17s
7:	learn: 0.3084037	total: 6.14s	remaining: 6m 17s
8:	learn: 0.2963694	total: 6.89s	remaining: 6m 15s
9:	learn: 0.2867935	total: 7.65s	remaining: 6m 14s
10:	learn: 0.2801034	total: 8.41s	remaining: 6m 14s
11:	learn: 0.2741726	total: 9.22s	remaining: 6m 15s
12:	learn: 0.2694177	total: 10s	remaining: 6m 14s
13:	learn: 0.2658679	total: 10.9s	remaining: 6m 16s
14:	learn: 0.2631855	total: 11.8s	remaining: 6m 22s
15:	learn: 0.2609704	total: 12.6s	remaining: 6m 22s
16:	learn: 0.2586760	total: 13.4s	remaining: 6m 21s
17:	learn: 0.2570305	total: 14.3s	remaining: 6m 22s
18:	learn: 0.2556474	total: 15.2s	remaining: 6m 25s
19:	learn: 0.2546734	tot

<catboost.core._CatBoostBase at 0x116987e90>

In [60]:
# Build the submission frame: the test ids next to the predict_proba output
# with its column 0 removed (presumably leaving the positive-class
# probability -- confirm against the model's class order).
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
_proba_without_first = test_y_cat.drop(0, axis=1)
submission_cat = pd.concat([test.SK_ID_CURR, _proba_without_first], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [61]:
#max_sub_cat = submission_cat.Target.max()
#min_sub_cat = submission_cat.Target.min()

In [62]:
#submission_cat['Target'] = submission_cat['Target'].apply(lambda x: (x - min_sub_cat) / (max_sub_cat - min_sub_cat))

In [63]:
# Write the submission to disk without the DataFrame index.
_submission_path = 'reduced_cat16.csv'
submission_cat.to_csv(_submission_path, index=False)

In [None]:
########################################################################

In [None]:
# Correlate every expanded feature with TARGET, discard the label's own
# entry, and keep the names whose correlation is above .03 or below -.03.
_poly_with_target = pd.concat([train_subset_poly, train['TARGET']], axis=1)
_corr_to_target = _poly_with_target.corr()['TARGET'].sort_values().drop('TARGET')
_passes_threshold = (_corr_to_target > .03) | (_corr_to_target < -.03)
column_corr = _corr_to_target[_passes_threshold].index.values.tolist()

In [None]:
# Keep only the correlation-selected columns of the training frame.
train_subset_poly = train_subset_poly.loc[:, column_corr]

In [None]:
# Keep the same correlation-selected columns in the test frame.
test_subset_poly = test_subset_poly.loc[:, column_corr]

In [None]:
# Refit the scaler on the correlation-filtered frames from the cells above.
# The original referenced train_subset_imputed / test_subset_imputed, which
# are not defined anywhere in the visible notebook (NameError on a fresh
# kernel); the frames just filtered by column_corr are train_subset_poly and
# test_subset_poly.
scaler.fit(train_subset_poly)
train_scaled = scaler.transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [None]:
# Refit on the reduced feature set. The original passed `target`, a name that
# is not defined anywhere in the visible notebook; the labels are taken from
# train.TARGET exactly as in the first fit.
cat_model.fit(train_scaled, train.TARGET)

In [None]:
# Build and save the submission for the reduced model: the test ids next to
# the predict_proba output with its column 0 removed (presumably leaving the
# positive-class probability -- confirm against the model's class order).
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
_proba_without_first = test_y_cat.drop(0, axis=1)
submission_cat = pd.concat([test.SK_ID_CURR, _proba_without_first], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']
_submission_path = 'reduced_cat11.csv'
submission_cat.to_csv(_submission_path, index=False)