In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
from scipy.stats import boxcox

In [39]:
#Read in Data
train = pd.read_csv('dataFiles/application_train.csv')
test = pd.read_csv('dataFiles/application_test.csv')
bureau_data = pd.read_csv('dataFiles/bureau.csv')
bureau_balance_data = pd.read_csv('dataFiles/bureau_balance.csv')
prev_app_data = pd.read_csv('dataFiles/previous_application.csv')
pos_cash_balance_data = pd.read_csv('dataFiles/POS_CASH_balance.csv')
installments_data = pd.read_csv('dataFiles/installments_payments.csv')
cc_data = pd.read_csv('dataFiles/credit_card_balance.csv')

In [40]:
#Clipping Outliers
train['CNT_CHILDREN'] = train['CNT_CHILDREN'].apply(lambda x: 5 if x >=5 else x)
train['CNT_FAM_MEMBERS'] = train['CNT_FAM_MEMBERS'].apply(lambda x: 5 if x >=5 else x)
train['AMT_INCOME_TOTAL'] = train['AMT_INCOME_TOTAL'].apply(lambda x: 14 if x >=14 else x)
train['HOUR_APPR_PROCESS_START'] = train['HOUR_APPR_PROCESS_START'].apply(lambda x: 2 if x <= 2 else x)
train['COMMONAREA_AVG'] = train['COMMONAREA_AVG'].fillna(0).apply(lambda x: 0.3 if x >= 0.3 else x)
train['AMT_REQ_CREDIT_BUREAU_YEAR'] = train['AMT_REQ_CREDIT_BUREAU_YEAR'].apply(lambda x: 9 if x >= 9 else x).fillna(10)

test['CNT_CHILDREN'] = test['CNT_CHILDREN'].apply(lambda x: 5 if x >=5 else x)
test['CNT_FAM_MEMBERS'] = test['CNT_FAM_MEMBERS'].apply(lambda x: 5 if x >=5 else x)
test['AMT_INCOME_TOTAL'] = test['AMT_INCOME_TOTAL'].apply(lambda x: 14 if x >=14 else x)
test['HOUR_APPR_PROCESS_START'] = test['HOUR_APPR_PROCESS_START'].apply(lambda x: 2 if x <= 2 else x)
test['COMMONAREA_AVG'] = test['COMMONAREA_AVG'].fillna(0).apply(lambda x: 0.3 if x >= 0.3 else x)
test['AMT_REQ_CREDIT_BUREAU_YEAR'] = test['AMT_REQ_CREDIT_BUREAU_YEAR'].apply(lambda x: 9 if x >= 9 else x).fillna(10)

In [41]:
#Log Transformations
train[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY',\
       'AMT_GOODS_PRICE']] = np.log(train[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE']])

test[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY',\
       'AMT_GOODS_PRICE']] = np.log(test[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE']])

In [42]:
#BINNING THE TWO HUMPS FROM DAYS_EMPLOYED
train['DAYS_EMPLOYED_BIN_1'] = train['DAYS_EMPLOYED'].apply(lambda x: 1 if x < 150000 else 0)
train['DAYS_EMPLOYED_BIN_2'] = train['DAYS_EMPLOYED'].apply(lambda x: 1 if x >= 150000 else 0)

#BINNING THE TWO HUMPS FROM DAYS_EMPLOYED
test['DAYS_EMPLOYED_BIN_1'] = test['DAYS_EMPLOYED'].apply(lambda x: 1 if x < 150000 else 0)
test['DAYS_EMPLOYED_BIN_2'] = test['DAYS_EMPLOYED'].apply(lambda x: 1 if x >= 150000 else 0)

In [43]:
#ABS & BOX COX Transformation
train['DAYS_REGISTRATION'] = np.abs(train['DAYS_REGISTRATION'])
train['DAYS_REGISTRATION'] = train['DAYS_REGISTRATION'].apply(lambda x: 0.01 if x == 0 else x)
train['DAYS_REGISTRATION'] = boxcox(train['DAYS_REGISTRATION'],0.5)

train['DAYS_ID_PUBLISH'] = np.abs(train['DAYS_ID_PUBLISH'])
train['DAYS_ID_PUBLISH'] = train['DAYS_ID_PUBLISH'].apply(lambda x: 0.01 if x == 0 else x)
train['DAYS_ID_PUBLISH'] = boxcox(train['DAYS_ID_PUBLISH'],0.5)

train['ENTRANCES_AVG'] = train['ENTRANCES_AVG'].apply(lambda x: np.sqrt(x) if pd.notnull(x) else x)

test['DAYS_REGISTRATION'] = np.abs(test['DAYS_REGISTRATION'])
test['DAYS_REGISTRATION'] = test['DAYS_REGISTRATION'].apply(lambda x: 0.01 if x == 0 else x)
test['DAYS_REGISTRATION'] = boxcox(test['DAYS_REGISTRATION'],0.5)

test['DAYS_ID_PUBLISH'] = np.abs(test['DAYS_ID_PUBLISH'])
test['DAYS_ID_PUBLISH'] = test['DAYS_ID_PUBLISH'].apply(lambda x: 0.01 if x == 0 else x)
test['DAYS_ID_PUBLISH'] = boxcox(test['DAYS_ID_PUBLISH'],0.5)

test['ENTRANCES_AVG'] = test['ENTRANCES_AVG'].apply(lambda x: np.sqrt(x) if pd.notnull(x) else x)

In [44]:
#4 Bins for Own Car Age
def own_car_age_bins(x):
    if pd.isnull(x):
        return 0
    elif x <= 10:
        return 1
    elif x <= 30:
        return 2
    else:
        return 3
    
train['OWN_CAR_AGE_BINS'] = train['OWN_CAR_AGE'].apply(own_car_age_bins)

test['OWN_CAR_AGE_BINS'] = test['OWN_CAR_AGE'].apply(own_car_age_bins)

In [45]:
#4 Bins for BASEMENTAREA_AVG
def basement_avg_bins(x):
    if pd.isnull(x):
        return 0
    elif x <= 0.1:
        return 1
    elif x <= 0.2:
        return 2
    else:
        return 3
    
train['BASEMENTAREA_AVG_BINS'] = train['BASEMENTAREA_AVG'].apply(basement_avg_bins)

test['BASEMENTAREA_AVG_BINS'] = test['BASEMENTAREA_AVG'].apply(basement_avg_bins)

In [46]:
#Columns to drop because of sparse features
train = train.drop(['FLAG_MOBIL','FLAG_CONT_MOBILE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',\
                    'FLAG_DOCUMENT_7','FLAG_DOCUMENT_10','FLAG_DOCUMENT_12','FLAG_DOCUMENT_15',\
                    'FLAG_DOCUMENT_17','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21'], axis = 1)

test = test.drop(['FLAG_MOBIL','FLAG_CONT_MOBILE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',\
                    'FLAG_DOCUMENT_7','FLAG_DOCUMENT_10','FLAG_DOCUMENT_12','FLAG_DOCUMENT_15',\
                    'FLAG_DOCUMENT_17','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21'], axis = 1)

In [47]:
#FillNA Columns
train[['APARTMENTS_AVG','BASEMENTAREA_AVG',\
       'YEARS_BEGINEXPLUATATION_AVG','YEARS_BUILD_AVG',\
       'ELEVATORS_AVG','ENTRANCES_AVG','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY',\
       'AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',\
       'AMT_REQ_CREDIT_BUREAU_QRT']] = train[['APARTMENTS_AVG','BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG',\
                                  'YEARS_BUILD_AVG','ELEVATORS_AVG','ENTRANCES_AVG','AMT_REQ_CREDIT_BUREAU_HOUR',\
                                              'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',\
                                              'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT']].fillna(0)

test[['APARTMENTS_AVG','BASEMENTAREA_AVG',\
       'YEARS_BEGINEXPLUATATION_AVG','YEARS_BUILD_AVG',\
       'ELEVATORS_AVG','ENTRANCES_AVG','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY',\
       'AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',\
       'AMT_REQ_CREDIT_BUREAU_QRT']] = test[['APARTMENTS_AVG','BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG',\
                                  'YEARS_BUILD_AVG','ELEVATORS_AVG','ENTRANCES_AVG','AMT_REQ_CREDIT_BUREAU_HOUR',\
                                              'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',\
                                              'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT']].fillna(0)

In [48]:
#### Bureau Balance ####


In [49]:
bureau_balance_data_grouped = pd.get_dummies(bureau_balance_data).groupby('SK_ID_BUREAU', as_index=False).agg({'STATUS_1':'count','MONTHS_BALANCE':min,\
                                                                                 'STATUS_C':sum,'STATUS_0':sum,'STATUS_X':sum})

In [50]:
bureau_balance_data_grouped = bureau_balance_data_grouped.rename(columns={'STATUS_1':'BALANCE_COUNT'})

In [51]:
bureau_balance_data_grouped['STATUS_X_RATIO'] = bureau_balance_data_grouped['STATUS_X'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)
bureau_balance_data_grouped['STATUS_C_RATIO'] = bureau_balance_data_grouped['STATUS_C'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)
bureau_balance_data_grouped['STATUS_0_RATIO'] = bureau_balance_data_grouped['STATUS_0'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)

In [52]:
bureau_data = bureau_data.merge(bureau_balance_data_grouped, how = 'left')

In [53]:
bureau_data[['MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'BALANCE_COUNT', 'STATUS_0', 'STATUS_X_RATIO','STATUS_C_RATIO', 'STATUS_0_RATIO']] = bureau_data[['MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'BALANCE_COUNT', 'STATUS_0', 'STATUS_X_RATIO','STATUS_C_RATIO', 'STATUS_0_RATIO']].fillna(0)

In [54]:
##### Bureau Data #####
bureau_data_grouped = bureau_data.select_dtypes(exclude='object').drop('SK_ID_BUREAU', axis = 1).groupby('SK_ID_CURR').sum()
#bureau_data_grouped.columns = ['_'.join(col) if col != ('SK_ID_CURR', '') else col[0] for col in bureau_data_grouped.columns]
bureau_data_grouped = bureau_data_grouped.reset_index()

In [55]:
#Past Loan Count
loan_count = bureau_data[['SK_ID_CURR','SK_ID_BUREAU']].groupby('SK_ID_CURR', \
                                                   as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU':'LOAN_COUNT'})

bureau_data_grouped = bureau_data_grouped.merge(loan_count, how = 'left')

In [56]:
#Unique Loan Types
unique_loan_count = bureau_data[['SK_ID_CURR','CREDIT_TYPE']].groupby('SK_ID_CURR',\
                                                                      as_index=False).agg({'CREDIT_TYPE':'nunique'}).rename(columns={'CREDIT_TYPE':'UNIQUE_CREDIT_TYPES'})

bureau_data_grouped = bureau_data_grouped.merge(unique_loan_count, how = 'left')

In [57]:
#Total Active Loans
bureau_data['CREDIT_ACTIVE_BINARY'] = bureau_data['CREDIT_ACTIVE'].apply(lambda x: 1 if x == 'Active' else 0)

active_loan_count = bureau_data[['SK_ID_CURR','CREDIT_ACTIVE_BINARY']].groupby('SK_ID_CURR', \
                                                   as_index=False)['CREDIT_ACTIVE_BINARY'].sum().rename(columns = {'CREDIT_ACTIVE_BINARY':'ACTIVE_LOANS'})

bureau_data_grouped = bureau_data_grouped.merge(active_loan_count, how = 'left')

In [58]:
#Days Between Successive Past Applications
grp = bureau_data[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT']].groupby(by = ['SK_ID_CURR'])
grp1 = grp.apply(lambda x: x.sort_values(['DAYS_CREDIT'], ascending = False)).reset_index(drop = True)

grp1['DAYS_CREDIT1'] = grp1['DAYS_CREDIT']*-1
grp1['DAYS_DIFF'] = grp1.groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT1'].diff()
grp1['DAYS_DIFF'] = grp1['DAYS_DIFF'].fillna(0).astype('uint32')
del grp1['DAYS_CREDIT1'], grp1['DAYS_CREDIT']

In [59]:
past_app_days = grp1.groupby('SK_ID_CURR', as_index=False)['DAYS_DIFF'].mean()

In [60]:
bureau_data_grouped = bureau_data_grouped.merge(past_app_days, how = 'left')

In [61]:
# Days Credit Expires
bureau_data['CREDIT_ENDDATE_BINARY'] = bureau_data['DAYS_CREDIT_ENDDATE'].apply(lambda x: 0 if x < 0 else 1) 

In [62]:
B1 = bureau_data.loc[bureau_data['CREDIT_ENDDATE_BINARY'] == 1]

In [63]:
grp = B1[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT_ENDDATE']].groupby(by = ['SK_ID_CURR'])
# Sort the values of CREDIT_ENDDATE for each customer ID 
grp1 = grp.apply(lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending = True)).reset_index(drop = True)
del grp

In [64]:
grp1['DAYS_ENDDATE_DIFF'] = grp1.groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()
grp1['DAYS_ENDDATE_DIFF'] = grp1['DAYS_ENDDATE_DIFF'].fillna(0).astype('uint32')
del grp1['DAYS_CREDIT_ENDDATE']

In [65]:
credit_expires_days = grp1.groupby('SK_ID_CURR', as_index = False)['DAYS_ENDDATE_DIFF'].mean()

bureau_data_grouped = bureau_data_grouped.merge(credit_expires_days, how = 'left')

In [66]:
# % Active Loans
bureau_data_grouped['ACTIVE_LOAN_PERC'] = bureau_data_grouped['ACTIVE_LOANS'] / bureau_data_grouped['LOAN_COUNT'].astype(float)

In [67]:
##### Credit Card Data

In [68]:
cc_data_one_hot = pd.concat([cc_data['SK_ID_PREV'], \
                            pd.get_dummies(cc_data.select_dtypes(include=['object']), drop_first = True)],\
                            axis = 1)

In [69]:
cc_data_one_hot_grouped = cc_data_one_hot.groupby('SK_ID_PREV', as_index=False).max()

In [70]:
cc_data_numeric_grouped = cc_data.select_dtypes(exclude=['object']).groupby('SK_ID_PREV', as_index = False).agg(['count', sum, 'mean', min, max])

In [71]:
cc_data_numeric_grouped.columns = ['_CC_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in cc_data_numeric_grouped.columns]

In [72]:
filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', cc_data_numeric_grouped.columns.tolist())
filtered_cols.insert(0, 'MONTHS_BALANCE_CC_count')

In [73]:
cc_data_numeric_grouped_filtered = cc_data_numeric_grouped[filtered_cols].reset_index()

In [74]:
cc_data_grouped = cc_data_numeric_grouped_filtered.merge(cc_data_one_hot_grouped, how = 'left')

In [75]:
prev_app_data_dummies = pd.get_dummies(prev_app_data[['NAME_CONTRACT_TYPE','FLAG_LAST_APPL_PER_CONTRACT','NAME_CONTRACT_STATUS',\
                                       'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE',\
                                       'NAME_PORTFOLIO','CHANNEL_TYPE','NAME_YIELD_GROUP','PRODUCT_COMBINATION']]).fillna(0)

In [76]:
prev_app_data = pd.concat([prev_app_data.select_dtypes(exclude=['object']), prev_app_data_dummies], axis = 1)

In [77]:
prev_app_data_merged = prev_app_data.merge(cc_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

In [78]:
installments_data_grouped = installments_data.groupby('SK_ID_PREV', as_index = False).agg(['count', sum, 'mean', min, max])

In [79]:
installments_data_grouped.columns = ['_INST_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in installments_data_grouped.columns]

In [80]:
installments_data_grouped = installments_data_grouped.reset_index()

In [81]:
filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', installments_data_grouped.columns.tolist())
filtered_cols.insert(0, 'SK_ID_CURR_INST_count')

In [82]:
installments_data_grouped = installments_data_grouped[filtered_cols]

In [83]:
prev_app_data_merged = prev_app_data_merged.merge(installments_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

In [84]:
pos_cash_balance_data = pd.get_dummies(pos_cash_balance_data)

In [85]:
pos_cash_balance_data_grouped = pos_cash_balance_data.groupby('SK_ID_PREV', as_index=False).agg(['count', sum, 'mean', min, max])

In [86]:
pos_cash_balance_data_grouped.columns = ['_POS_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in pos_cash_balance_data_grouped.columns]

In [87]:
pos_cash_balance_data_grouped = pos_cash_balance_data_grouped.reset_index()

In [88]:
filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', pos_cash_balance_data_grouped.columns.tolist())
filtered_cols.insert(0, 'SK_ID_CURR_POS_count')

In [89]:
pos_cash_balance_data_grouped = pos_cash_balance_data_grouped[filtered_cols]

In [90]:
prev_app_data_merged = prev_app_data_merged.merge(pos_cash_balance_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

In [91]:
filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', bureau_data_grouped.columns.tolist())
filtered_cols.insert(0, 'SK_ID_BUREAU_count')

In [92]:
train_dummies = pd.get_dummies(train[['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',\
'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE','HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']]).fillna(0)

In [93]:
test_dummies = pd.get_dummies(test[['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',\
'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE','HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']]).fillna(0)

In [94]:
train_one_hot = pd.concat([train.select_dtypes(exclude=['object']), train_dummies], axis = 1)

test_one_hot = pd.concat([test.select_dtypes(exclude=['object']), test_dummies], axis = 1)

In [95]:
train_merged = train_one_hot.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

test_merged = test_one_hot.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

In [96]:
prev_app_data_subset = prev_app_data_merged[prev_app_data_merged.columns.tolist()].drop('SK_ID_PREV', axis = 1)

In [97]:
prev_app_data_grouped = prev_app_data_subset.groupby('SK_ID_CURR', as_index=False).sum()

In [98]:
#prev_app_data_grouped.columns = ['_'.join(col) if col != ('SK_ID_CURR', '') else col[0] for col in prev_app_data_grouped.columns]
#prev_app_data_grouped = prev_app_data_grouped.reset_index()

In [99]:
train_merged = train_merged.merge(prev_app_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

test_merged = test_merged.merge(prev_app_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

In [100]:
own_car_median = train_merged.OWN_CAR_AGE.median()

In [101]:
def own_car_missing(x):
    if x['FLAG_OWN_CAR'] == 'N':
        return 28
    elif x['FLAG_OWN_CAR'] == 'Y' and pd.isnull(x['OWN_CAR_AGE']):
        return own_car_median
    else:
        return x['OWN_CAR_AGE']

In [102]:
train_merged_subset = train_merged.dropna(thresh=len(train_merged) - 200000000, axis = 1)

In [103]:
column_corr_subset = train_merged_subset.columns.tolist()

In [104]:
len(column_corr_subset)

438

In [105]:
#col_corr = train_merged_subset.corr()['TARGET'].sort_values()

In [106]:
#column_corr_subset = col_corr[(col_corr >= 0.03) | (col_corr < -0.035)].index.values.tolist()

In [107]:
column_corr_subset.remove('TARGET')

In [108]:
column_corr_subset.remove('CODE_GENDER_XNA')
column_corr_subset.remove('NAME_FAMILY_STATUS_Unknown')
column_corr_subset.remove('NAME_INCOME_TYPE_Maternity leave')

In [109]:
train_subset = train_merged_subset[column_corr_subset]

test_subset = test_merged[column_corr_subset]

In [110]:
imputer = Imputer()
imputer.fit(train_subset)
train_merged_imputed = pd.DataFrame(imputer.transform(train_subset), columns = train_subset.columns)
test_merged_imputed = pd.DataFrame(imputer.transform(test_subset), columns = train_subset.columns)

In [111]:
train_merged_imputed['DAYS_EMPLOYED_^2'] = train_merged_imputed['DAYS_EMPLOYED'] ** 2
#train_merged_imputed['AMT_GOODS_PRICE_^2'] = train_merged_imputed['AMT_GOODS_PRICE'] ** 2
train_merged_imputed['DAYS_CREDIT^2'] = train_merged_imputed['DAYS_CREDIT'] ** 2
#train_merged_imputed['DAYS_CREDIT_median^2'] = train_merged_imputed['DAYS_CREDIT_median'] ** 2
train_merged_imputed['DAYS_BIRTH_^2'] = train_merged_imputed['DAYS_BIRTH'] ** 2
train_merged_imputed['REGION_RATING_CLIENT_W_CITY_^2'] = train_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 2
train_merged_imputed['REGION_RATING_CLIENT_^2'] = train_merged_imputed['REGION_RATING_CLIENT'] ** 2
train_merged_imputed['NAME_INCOME_TYPE_Working_^2'] = train_merged_imputed['NAME_INCOME_TYPE_Working'] ** 2
train_merged_imputed['DAYS_LAST_PHONE_CHANGE_^2'] = train_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 2
train_merged_imputed['CODE_GENDER_M_^2'] = train_merged_imputed['CODE_GENDER_M'] ** 2
train_merged_imputed['EXT_SOURCE_1_^2'] = train_merged_imputed['EXT_SOURCE_1'] ** 2
train_merged_imputed['EXT_SOURCE_2_^2'] = train_merged_imputed['EXT_SOURCE_2'] ** 2
train_merged_imputed['EXT_SOURCE_3_^2'] = train_merged_imputed['EXT_SOURCE_3'] ** 2
train_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^2'] = train_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 2
train_merged_imputed['CODE_GENDER_F_^2']= train_merged_imputed['CODE_GENDER_F'] ** 2

train_merged_imputed['DAYS_EMPLOYED_^3'] = train_merged_imputed['DAYS_EMPLOYED'] ** 3
#train_merged_imputed['AMT_GOODS_PRICE_^3'] = train_merged_imputed['AMT_GOODS_PRICE'] ** 3
train_merged_imputed['DAYS_CREDIT^3'] = train_merged_imputed['DAYS_CREDIT'] ** 3
#train_merged_imputed['DAYS_CREDIT_median^3'] = train_merged_imputed['DAYS_CREDIT_median'] ** 3
train_merged_imputed['DAYS_BIRTH_^3'] = train_merged_imputed['DAYS_BIRTH'] ** 3
train_merged_imputed['REGION_RATING_CLIENT_W_CITY_^3'] = train_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 3
train_merged_imputed['REGION_RATING_CLIENT_^3'] = train_merged_imputed['REGION_RATING_CLIENT'] ** 3
train_merged_imputed['NAME_INCOME_TYPE_Working_^3'] = train_merged_imputed['NAME_INCOME_TYPE_Working'] ** 3
train_merged_imputed['DAYS_LAST_PHONE_CHANGE_^3'] = train_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 3
train_merged_imputed['CODE_GENDER_M_^3'] = train_merged_imputed['CODE_GENDER_M'] ** 3
train_merged_imputed['EXT_SOURCE_1_^3'] = train_merged_imputed['EXT_SOURCE_1'] ** 3
train_merged_imputed['EXT_SOURCE_2_^3'] = train_merged_imputed['EXT_SOURCE_2'] ** 3
train_merged_imputed['EXT_SOURCE_3_^3'] = train_merged_imputed['EXT_SOURCE_3'] ** 3
train_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^3'] = train_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 3
train_merged_imputed['CODE_GENDER_F_^3']= train_merged_imputed['CODE_GENDER_F'] ** 3

test_merged_imputed['DAYS_EMPLOYED_^2'] = test_merged_imputed['DAYS_EMPLOYED'] ** 2
#test_merged_imputed['AMT_GOODS_PRICE_^2'] = test_merged_imputed['AMT_GOODS_PRICE'] ** 2
test_merged_imputed['DAYS_CREDIT_^2'] = test_merged_imputed['DAYS_CREDIT'] ** 2
#test_merged_imputed['DAYS_CREDIT_median^2'] = test_merged_imputed['DAYS_CREDIT_median'] ** 2
test_merged_imputed['DAYS_BIRTH_^2'] = test_merged_imputed['DAYS_BIRTH'] ** 2
test_merged_imputed['REGION_RATING_CLIENT_W_CITY_^2'] = test_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 2
test_merged_imputed['REGION_RATING_CLIENT_^2'] = test_merged_imputed['REGION_RATING_CLIENT'] ** 2
test_merged_imputed['NAME_INCOME_TYPE_Working_^2'] = test_merged_imputed['NAME_INCOME_TYPE_Working'] ** 2
test_merged_imputed['DAYS_LAST_PHONE_CHANGE_^2'] = test_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 2
test_merged_imputed['CODE_GENDER_M_^2'] = test_merged_imputed['CODE_GENDER_M'] ** 2
test_merged_imputed['EXT_SOURCE_1_^2'] = test_merged_imputed['EXT_SOURCE_1'] ** 2
test_merged_imputed['EXT_SOURCE_2_^2'] = test_merged_imputed['EXT_SOURCE_2'] ** 2
test_merged_imputed['EXT_SOURCE_3_^2'] = test_merged_imputed['EXT_SOURCE_3'] ** 2
test_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^2'] = test_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 2
test_merged_imputed['CODE_GENDER_F_^2']= test_merged_imputed['CODE_GENDER_F'] ** 2

test_merged_imputed['DAYS_EMPLOYED_^3'] = test_merged_imputed['DAYS_EMPLOYED'] ** 3
#test_merged_imputed['AMT_GOODS_PRICE_^3'] = test_merged_imputed['AMT_GOODS_PRICE'] ** 3
test_merged_imputed['DAYS_CREDIT^3'] = test_merged_imputed['DAYS_CREDIT'] ** 3
#test_merged_imputed['DAYS_CREDIT_median^3'] = test_merged_imputed['DAYS_CREDIT_median'] ** 3
test_merged_imputed['DAYS_BIRTH_^3'] = test_merged_imputed['DAYS_BIRTH'] ** 3
test_merged_imputed['REGION_RATING_CLIENT_W_CITY_^3'] = test_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 3
test_merged_imputed['REGION_RATING_CLIENT_^3'] = test_merged_imputed['REGION_RATING_CLIENT'] ** 3
test_merged_imputed['NAME_INCOME_TYPE_Working_^3'] = test_merged_imputed['NAME_INCOME_TYPE_Working'] ** 3
test_merged_imputed['DAYS_LAST_PHONE_CHANGE_^3'] = test_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 3
test_merged_imputed['CODE_GENDER_M_^3'] = test_merged_imputed['CODE_GENDER_M'] ** 3
test_merged_imputed['EXT_SOURCE_1_^3'] = test_merged_imputed['EXT_SOURCE_1'] ** 3
test_merged_imputed['EXT_SOURCE_2_^3'] = test_merged_imputed['EXT_SOURCE_2'] ** 3
test_merged_imputed['EXT_SOURCE_3_^3'] = test_merged_imputed['EXT_SOURCE_3'] ** 3
test_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^3'] = test_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 3
test_merged_imputed['CODE_GENDER_F_^3']= test_merged_imputed['CODE_GENDER_F'] ** 3

In [112]:
poly_transformer = PolynomialFeatures(degree = 1)
poly_transformer.fit(train_merged_imputed)
train_poly_features = poly_transformer.transform(train_merged_imputed)

In [113]:
train_subset_poly = pd.DataFrame(train_poly_features, columns = poly_transformer.get_feature_names(
    input_features = train_merged_imputed.columns.tolist()
))

In [114]:
test_poly_features = poly_transformer.transform(test_merged_imputed)

In [115]:
test_subset_poly = pd.DataFrame(test_poly_features, columns = poly_transformer.get_feature_names(input_features = test_merged_imputed.columns.tolist()))

In [116]:
scaler = MinMaxScaler(feature_range = (0, 1))

In [117]:
scaler.fit(train_subset_poly)
train_scaled = scaler.transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [126]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations = 3000, random_state = 42, learning_rate = 0.2)

In [None]:
cat_model.fit(train_scaled, train.TARGET)

0:	learn: 0.4980158	total: 1.5s	remaining: 1h 15m 3s
1:	learn: 0.3935329	total: 3.07s	remaining: 1h 16m 44s
2:	learn: 0.3347561	total: 4.5s	remaining: 1h 14m 50s
3:	learn: 0.3028508	total: 5.99s	remaining: 1h 14m 45s
4:	learn: 0.2838303	total: 7.67s	remaining: 1h 16m 36s
5:	learn: 0.2708611	total: 9.08s	remaining: 1h 15m 30s
6:	learn: 0.2629275	total: 10.5s	remaining: 1h 14m 38s
7:	learn: 0.2582778	total: 11.9s	remaining: 1h 14m 7s
8:	learn: 0.2551190	total: 13.3s	remaining: 1h 13m 52s
9:	learn: 0.2528541	total: 14.8s	remaining: 1h 13m 54s
10:	learn: 0.2511789	total: 16.2s	remaining: 1h 13m 20s
11:	learn: 0.2500904	total: 17.7s	remaining: 1h 13m 26s
12:	learn: 0.2490829	total: 19s	remaining: 1h 12m 51s
13:	learn: 0.2483663	total: 20.5s	remaining: 1h 12m 51s
14:	learn: 0.2478848	total: 21.9s	remaining: 1h 12m 39s
15:	learn: 0.2471633	total: 23.3s	remaining: 1h 12m 27s
16:	learn: 0.2468254	total: 24.7s	remaining: 1h 12m 11s
17:	learn: 0.2463423	total: 26.3s	remaining: 1h 12m 43s
18:	lear

In [None]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier(n_estimators = 300, silent=True, learning_rate = 0.1)

In [None]:
#xgb_model.fit(train_scaled, train.TARGET)

In [120]:
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
submission_cat = pd.concat([test.SK_ID_CURR, test_y_cat], axis=1).drop(0, axis = 1)
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [121]:
submission_cat.to_csv('cat_fe.csv', index=False)

In [None]:
#max_sub_cat = submission_cat.Target.max()
#min_sub_cat = submission_cat.Target.min()

In [None]:
#submission_cat['Target'] = submission_cat['Target'].apply(lambda x: (x - min_sub_cat) / (max_sub_cat - min_sub_cat))

In [122]:
#cat_submission = pd.read_csv('cat_lr75.csv')
xgb_submission = pd.read_csv('xgb1.csv')

In [124]:
submission_cat['Target'] = (submission_cat['Target']+ xgb_submission['Target']) /2

In [125]:
submission_cat.to_csv('combined10.csv', index = False)