In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [2]:
# Load the raw CSV inputs from the local dataFiles/ directory.
# Files are read in the same order as before: train, test, bureau,
# bureau balance, previous applications.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataFiles/application_train.csv',
        'dataFiles/application_test.csv',
    )
)
bureau_data, bureau_balance_data, prev_app_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataFiles/bureau.csv',
        'dataFiles/bureau_balance.csv',
        'dataFiles/previous_application.csv',
    )
)

In [3]:
# Collapse the bureau balance table to one row per SK_ID_BUREAU: one-hot encode
# its categorical columns, then keep the minimum MONTHS_BALANCE and the sum of
# each listed STATUS_* indicator column.
#
# Aggregations are named with strings rather than the builtins `min`/`sum`
# (newer pandas releases warn when builtins are passed to `agg`), and every key
# appears exactly once -- the previous dict repeated 'STATUS_3' and 'STATUS_5'.
# NOTE(review): any other STATUS_* dummy column produced by get_dummies is
# dropped by this aggregation -- confirm that is intended.
bureau_balance_data_grouped = (
    pd.get_dummies(bureau_balance_data)
    .groupby('SK_ID_BUREAU', as_index=False)
    .agg({
        'MONTHS_BALANCE': 'min',
        'STATUS_0': 'sum',
        'STATUS_1': 'sum',
        'STATUS_2': 'sum',
        'STATUS_3': 'sum',
        'STATUS_4': 'sum',
        'STATUS_5': 'sum',
        'STATUS_X': 'sum',
    })
)

In [4]:
# Attach the per-bureau-record balance aggregates; bureau rows without a
# matching SK_ID_BUREAU keep NaN in the added columns (left join).
bureau_data = bureau_data.merge(
    bureau_balance_data_grouped,
    on='SK_ID_BUREAU',
    how='left',
)

In [5]:
# Aggregate the bureau records to one row per applicant (SK_ID_CURR) and
# left-join the result onto the train and test application tables.
#
# Aggregations are named with strings rather than the builtins `min`/`sum`
# (newer pandas releases warn when builtins are passed to `agg`), and every key
# appears exactly once -- the previous dict repeated 'STATUS_3' and 'STATUS_5'.
# After aggregation the SK_ID_BUREAU column holds the per-applicant record count.
bureau_data_grouped = bureau_data.groupby('SK_ID_CURR', as_index=False).agg({
    'SK_ID_BUREAU': 'count',
    'AMT_CREDIT_MAX_OVERDUE': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'DAYS_CREDIT': 'mean',
    'CREDIT_DAY_OVERDUE': 'sum',
    'DAYS_CREDIT_ENDDATE': 'mean',
    'AMT_CREDIT_SUM': 'sum',
    'AMT_CREDIT_SUM_DEBT': 'sum',
    'MONTHS_BALANCE': 'min',
    'STATUS_0': 'sum',
    'STATUS_1': 'sum',
    'STATUS_2': 'sum',
    'STATUS_3': 'sum',
    'STATUS_4': 'sum',
    'STATUS_5': 'sum',
    'STATUS_X': 'sum',
})

# Left joins keep every application row; applicants without bureau history get NaN.
train_merged = train.merge(bureau_data_grouped, how='left', on='SK_ID_CURR')

test_merged = test.merge(bureau_data_grouped, how='left', on='SK_ID_CURR')

In [6]:
# Previous-application fields to carry forward. Each is renamed with a _PRE
# suffix (presumably to avoid clashing with same-named application columns
# once merged) and then summed per applicant.
prev_app_fields = [
    'AMT_ANNUITY',
    'AMT_APPLICATION',
    'AMT_CREDIT',
    'AMT_DOWN_PAYMENT',
    'AMT_GOODS_PRICE',
    'CNT_PAYMENT',
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
    'NFLAG_INSURED_ON_APPROVAL',
]
prev_app_fields_pre = [field + '_PRE' for field in prev_app_fields]

# Keep only the key and the selected fields, then apply the suffixed names.
prev_app_data = prev_app_data[['SK_ID_CURR'] + prev_app_fields]
prev_app_data.columns = ['SK_ID_CURR'] + prev_app_fields_pre

# One row per applicant: sum of every suffixed field over that applicant's rows.
prev_app_data_grouped = (
    prev_app_data
    .groupby('SK_ID_CURR', as_index=False)[prev_app_fields_pre]
    .sum()
)

In [7]:
# Add the per-applicant previous-application totals to both tables (left join,
# so applicants with no previous applications keep NaN in the new columns).
train_merged = train_merged.merge(prev_app_data_grouped, on='SK_ID_CURR', how='left')
test_merged = test_merged.merge(prev_app_data_grouped, on='SK_ID_CURR', how='left')

In [11]:
# Rank columns by their correlation with TARGET and keep those whose absolute
# correlation exceeds 0.005. The resulting list still contains 'TARGET' itself.
#
# Numeric/bool columns are selected explicitly before calling corr(): from
# pandas 2.0 on, DataFrame.corr() no longer drops non-numeric columns silently
# and raises if train_merged contains any. The explicit selection behaves the
# same on older and newer pandas versions.
target_corr = (
    train_merged
    .select_dtypes(include=['number', 'bool'])
    .corr()['TARGET']
    .sort_values()
)
column_corr = target_corr[target_corr.abs() > .005].index.values.tolist()

In [12]:
# Drop the label from the feature list. Rebuilding the list (instead of
# list.remove) keeps this cell idempotent: re-running it no longer raises
# ValueError once 'TARGET' has already been removed.
column_corr = [name for name in column_corr if name != 'TARGET']

In [13]:
# Display the selected feature names (sorted by correlation with TARGET).
column_corr

['EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'EXT_SOURCE_1',
 'DAYS_EMPLOYED',
 'FLOORSMAX_AVG',
 'FLOORSMAX_MEDI',
 'FLOORSMAX_MODE',
 'DAYS_FIRST_DRAWING_PRE',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MEDI',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MEDI',
 'FLOORSMIN_MODE',
 'TOTALAREA_MODE',
 'ELEVATORS_MODE',
 'LIVINGAREA_MODE',
 'AMT_CREDIT',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'FLAG_DOCUMENT_6',
 'APARTMENTS_MODE',
 'AMT_DOWN_PAYMENT_PRE',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MEDI',
 'HOUR_APPR_PROCESS_START',
 'FLAG_PHONE',
 'LIVINGAPARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_AVG',
 'BASEMENTAREA_MEDI',
 'YEARS_BUILD_MODE',
 'BASEMENTAREA_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'COMMONAREA_MEDI',
 'COMMONAREA_AVG',
 'DAYS_FIRST_DUE_PRE',
 'ENTRANCES_MODE',
 'COMMONAREA_MODE',
 'NFLAG_INSURED_ON_APPROVAL_PRE',
 'AMT_CREDIT_SUM',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'AMT

In [14]:
# Restrict both tables to the selected feature columns, in the same order.
train_subset, test_subset = (
    frame[column_corr] for frame in (train_merged, test_merged)
)

In [15]:
# Learn per-column medians from the training features only, then use those
# training medians to fill missing values in both train and test.
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer` -- revisit together with the import cell when
# upgrading scikit-learn.
imputer = Imputer(strategy='median').fit(train_subset)
train_subset_imputed = imputer.transform(train_subset)
test_subset_imputed = imputer.transform(test_subset)

In [16]:
# Fit the polynomial expansion on the imputed training matrix and apply it.
# With degree=1 no higher-order terms are generated; the output is the input
# features plus whatever bias column PolynomialFeatures adds by default.
poly_transformer = PolynomialFeatures(degree=1)
train_poly_features = (
    poly_transformer
    .fit(train_subset_imputed)
    .transform(train_subset_imputed)
)

In [17]:
# Wrap the expanded training matrix in a DataFrame, labelling the columns with
# the names the transformer derives from the original feature names.
train_poly_names = poly_transformer.get_feature_names(
    input_features=list(train_subset.columns)
)
train_subset_poly = pd.DataFrame(train_poly_features, columns=train_poly_names)

In [18]:
# Apply the expansion fitted on the training data to the imputed test matrix.
test_poly_features = poly_transformer.transform(test_subset_imputed)

In [19]:
# Wrap the expanded test matrix in a DataFrame. Column labels are derived from
# the *training* feature names so train and test share identical columns.
test_poly_names = poly_transformer.get_feature_names(
    input_features=list(train_subset.columns)
)
test_subset_poly = pd.DataFrame(test_poly_features, columns=test_poly_names)

In [20]:
# Min-max scaler targeting the (0, 1) range; it is fitted in a later cell.
scaler = MinMaxScaler(feature_range = (0, 1))

In [21]:
# Fit the scaler on the training features only, then scale train and test
# with the same fitted parameters.
train_scaled = scaler.fit(train_subset_poly).transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [24]:
# NOTE(review): this import lives mid-notebook; consider moving it to the
# first cell with the other imports so a missing install fails early.
from catboost import CatBoostClassifier

# CatBoost classifier configured for 500 boosting iterations at a 0.1 learning rate.
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
)

In [25]:
# Train on the scaled feature matrix against the TARGET column of the raw
# training table (rows are assumed to still be in the original train order).
cat_model.fit(train_scaled, train['TARGET'])

0:	learn: 0.5863954	total: 415ms	remaining: 3m 26s
1:	learn: 0.5062075	total: 802ms	remaining: 3m 19s
2:	learn: 0.4474104	total: 1.19s	remaining: 3m 16s
3:	learn: 0.4032619	total: 1.58s	remaining: 3m 15s
4:	learn: 0.3696034	total: 1.92s	remaining: 3m 9s
5:	learn: 0.3438035	total: 2.46s	remaining: 3m 22s
6:	learn: 0.3245369	total: 2.79s	remaining: 3m 16s
7:	learn: 0.3096118	total: 3.13s	remaining: 3m 12s
8:	learn: 0.2976256	total: 3.46s	remaining: 3m 8s
9:	learn: 0.2893352	total: 3.79s	remaining: 3m 5s
10:	learn: 0.2819728	total: 4.12s	remaining: 3m 3s
11:	learn: 0.2761199	total: 4.46s	remaining: 3m 1s
12:	learn: 0.2715834	total: 4.79s	remaining: 2m 59s
13:	learn: 0.2676047	total: 5.12s	remaining: 2m 57s
14:	learn: 0.2649409	total: 5.49s	remaining: 2m 57s
15:	learn: 0.2623176	total: 5.82s	remaining: 2m 56s
16:	learn: 0.2602107	total: 6.15s	remaining: 2m 54s
17:	learn: 0.2586211	total: 6.49s	remaining: 2m 53s
18:	learn: 0.2573818	total: 6.82s	remaining: 2m 52s
19:	learn: 0.2562482	total:

<catboost.core._CatBoostBase at 0x10ce4bb50>

In [26]:
# predict_proba output becomes a frame with integer column labels; column 0 is
# discarded and the remaining column (presumably the positive-class
# probability) is paired with the applicant id and labelled 'Target'.
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
submission_cat = pd.concat(
    [test['SK_ID_CURR'], test_y_cat],
    axis=1,
).drop(columns=[0])
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [27]:
#max_sub_cat = submission_cat.Target.max()
#min_sub_cat = submission_cat.Target.min()

In [28]:
#submission_cat['Target'] = submission_cat['Target'].apply(lambda x: (x - min_sub_cat) / (max_sub_cat - min_sub_cat))

In [29]:
# Write the submission to disk without the DataFrame index column.
submission_cat.to_csv('reduced_cat12.csv', index = False)

In [None]:
########################################################################

In [None]:
# Second, stricter selection pass: correlate every expanded feature with
# TARGET and keep the names whose absolute correlation exceeds 0.05.
poly_with_target = pd.concat([train_subset_poly, train['TARGET']], axis=1)
poly_target_corr = (
    poly_with_target
    .corr()['TARGET']
    .sort_values()
    .drop('TARGET')
)
column_corr = poly_target_corr[poly_target_corr.abs() > .05].index.tolist()

In [None]:
# Keep only the strongly correlated columns of the expanded training frame.
train_subset_poly = train_subset_poly.loc[:, column_corr]

In [None]:
# Apply the same column selection to the expanded test frame.
test_subset_poly = test_subset_poly.loc[:, column_corr]

In [None]:
# Re-fit the scaler on the correlation-filtered feature frames and scale both
# train and test with it. The previous version scaled `train_subset_imputed` /
# `test_subset_imputed`, which silently ignored the column selection made in
# the preceding cells, so the "reduced" model saw the unreduced feature set.
scaler.fit(train_subset_poly)
train_scaled = scaler.transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [None]:
# Fit against the TARGET column of the training table. The previous call used
# a `target` variable that no visible cell defines, so it raised NameError on a
# fresh kernel; this matches how the earlier fit cell passes its labels.
cat_model.fit(train_scaled, train['TARGET'])

In [None]:
# Build the submission for the reduced-feature model: drop predict_proba
# column 0, pair the remaining column (presumably the positive-class
# probability) with the applicant id, and write it out without the index.
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
submission_cat = pd.concat(
    [test['SK_ID_CURR'], test_y_cat],
    axis=1,
).drop(columns=[0])
submission_cat.columns = ['SK_ID_CURR', 'Target']
submission_cat.to_csv('reduced_cat11.csv', index=False)