In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [2]:
# Load the raw CSV tables: the train/test application tables plus the
# bureau and previous-application tables.
# NOTE(review): prev_app_data is not referenced by any later cell visible here.
train, test, bureau_data, prev_app_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataFiles/application_train.csv',
        'dataFiles/application_test.csv',
        'dataFiles/bureau.csv',
        'dataFiles/previous_application.csv',
    )
)

In [3]:
# Aggregate the bureau table to one row per applicant (SK_ID_CURR) and
# left-join the result onto the train and test application tables.
# Aggregations are spelled as strings: passing the builtin `sum` to `agg`
# is deprecated in recent pandas, while 'sum' selects the same groupby sum.
bureau_aggregations = {
    'SK_ID_BUREAU': 'count',
    'AMT_CREDIT_MAX_OVERDUE': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'DAYS_CREDIT': 'mean',
    'CREDIT_DAY_OVERDUE': 'sum',
    'DAYS_CREDIT_ENDDATE': 'mean',
    'AMT_CREDIT_SUM': 'sum',
    'AMT_CREDIT_SUM_DEBT': 'sum',
}
bureau_data_grouped = (
    bureau_data
    .groupby('SK_ID_CURR', as_index=False)
    .agg(bureau_aggregations)
)

# how='left' keeps every application row; applicants without any bureau
# record get NaN in the aggregated columns.
train_merged = train.merge(bureau_data_grouped, how='left', on='SK_ID_CURR')

test_merged = test.merge(bureau_data_grouped, how='left', on='SK_ID_CURR')

In [4]:
# Columns used as model inputs: the EXT_SOURCE_* columns, two application
# fields and the bureau aggregates joined in the previous cell.
feature_columns = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY',
    'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT',
    'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
]

# The training subset additionally carries the label as its first column.
train_subset = train_merged[['TARGET'] + feature_columns]

test_subset = test_merged[feature_columns]

In [5]:
# Correlation of every selected column with the TARGET label
# (DataFrame.corr uses Pearson correlation by default).
subset_correlations = train_subset.corr()
subset_correlations['TARGET']

TARGET                         1.000000
EXT_SOURCE_1                  -0.155317
EXT_SOURCE_2                  -0.160472
EXT_SOURCE_3                  -0.178919
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
AMT_CREDIT_MAX_OVERDUE         0.002491
AMT_CREDIT_SUM_OVERDUE         0.013335
DAYS_CREDIT                    0.089729
CREDIT_DAY_OVERDUE             0.006298
DAYS_CREDIT_ENDDATE            0.046983
AMT_CREDIT_SUM                -0.014057
AMT_CREDIT_SUM_DEBT            0.007144
Name: TARGET, dtype: float64

In [6]:
# Split the label from the feature columns.
# The label is read from train_merged (the frame train_subset was selected
# from) and the drop tolerates an already-removed column, so re-running this
# cell no longer raises KeyError.
target = train_merged['TARGET']
train_subset = train_subset.drop(columns='TARGET', errors='ignore')

In [7]:
# Fill missing values with medians learned on the training features only;
# the same fitted imputer is then applied to the test features.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22 in favour of
# `sklearn.impute.SimpleImputer` — confirm the pinned scikit-learn version.
imputer = Imputer(strategy='median').fit(train_subset)
train_subset_imputed, test_subset_imputed = (
    imputer.transform(frame) for frame in (train_subset, test_subset)
)

In [8]:
# Expand the imputed training matrix with polynomial terms up to degree 3.
# The transformer is fitted here and reused for the test matrix later.
poly_transformer = PolynomialFeatures(degree=3)
train_poly_features = poly_transformer.fit_transform(train_subset_imputed)

In [9]:
# Wrap the polynomial training matrix in a DataFrame whose column labels are
# the names the transformer generates from the original input column names.
poly_input_names = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY',
    'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT',
    'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
]
train_poly_names = poly_transformer.get_feature_names(input_features=poly_input_names)
train_subset_poly = pd.DataFrame(train_poly_features, columns=train_poly_names)

In [10]:
# Apply the polynomial expansion fitted on the training data to the imputed
# test matrix (no re-fitting on test data).
test_poly_features = poly_transformer.transform(X=test_subset_imputed)

In [11]:
# Label the polynomial test matrix with the transformer-generated feature
# names, built from the same input column names as the training frame.
poly_input_names = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY',
    'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT',
    'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
]
test_poly_names = poly_transformer.get_feature_names(input_features=poly_input_names)
test_subset_poly = pd.DataFrame(test_poly_features, columns=test_poly_names)

In [12]:
# Scaler that maps each feature's fitted min/max onto the interval [0, 1].
unit_interval = (0, 1)
scaler = MinMaxScaler(feature_range=unit_interval)

In [13]:
# Learn the per-column min/max on the polynomial training frame and scale it,
# then scale the test frame with those same training statistics.
train_scaled = scaler.fit_transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [14]:
from catboost import CatBoostClassifier

# CatBoost classifier configured for 1500 boosting iterations at a
# learning rate of 0.1; all other settings stay at the library defaults.
cat_params = {'iterations': 1500, 'learning_rate': 0.1}
cat_model = CatBoostClassifier(**cat_params)

In [15]:
# Train the classifier on the scaled polynomial features against the label.
# The per-iteration log printed below comes from CatBoost's default verbosity.
cat_model.fit(X=train_scaled, y=target)

0:	learn: 0.5858358	total: 1.3s	remaining: 32m 29s
1:	learn: 0.5056302	total: 2.55s	remaining: 31m 50s
2:	learn: 0.4460608	total: 3.81s	remaining: 31m 42s
3:	learn: 0.4016873	total: 5.1s	remaining: 31m 46s
4:	learn: 0.3680946	total: 6.41s	remaining: 31m 56s
5:	learn: 0.3430225	total: 7.81s	remaining: 32m 24s
6:	learn: 0.3233636	total: 9.23s	remaining: 32m 48s
7:	learn: 0.3086618	total: 10.6s	remaining: 32m 48s
8:	learn: 0.2973987	total: 12.1s	remaining: 33m 19s
9:	learn: 0.2883602	total: 13.3s	remaining: 33m 7s
10:	learn: 0.2813233	total: 14.6s	remaining: 32m 57s
11:	learn: 0.2762308	total: 15.9s	remaining: 32m 48s
12:	learn: 0.2716453	total: 17.1s	remaining: 32m 35s
13:	learn: 0.2681942	total: 18.3s	remaining: 32m 27s
14:	learn: 0.2654032	total: 19.6s	remaining: 32m 23s
15:	learn: 0.2630482	total: 20.9s	remaining: 32m 16s
16:	learn: 0.2613257	total: 22.5s	remaining: 32m 44s
17:	learn: 0.2599501	total: 23.9s	remaining: 32m 43s
18:	learn: 0.2587124	total: 25.4s	remaining: 32m 59s
19:	le

<catboost.core._CatBoostBase at 0x115f99a10>

In [16]:
# Predict class probabilities for the test set and build the submission
# frame: the applicant id next to probability column 1 (column 0 is left out).
class_probabilities = cat_model.predict_proba(test_scaled)
test_y_cat = pd.DataFrame(class_probabilities)
submission_cat = pd.concat([test['SK_ID_CURR'], test_y_cat[[1]]], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [17]:
# Triple every predicted score above 0.1; scores at or below 0.1 are kept.
# NOTE: this overwrites the column in place, so re-running the cell rescales
# the already-boosted values again.
raw_scores = submission_cat['Target']
submission_cat['Target'] = raw_scores.mask(raw_scores > .1, raw_scores * 3)

In [18]:
# Replace any score above 1 with 0.95; all other scores are kept.
# NOTE(review): a raw score above 1/3 that was tripled by the preceding cell
# ends up at 0.95 here, i.e. below tripled scores that came from
# (0.95/3, 1/3] — the ordering of those predictions is not preserved.
# Confirm this is intended.
boosted_scores = submission_cat['Target']
submission_cat['Target'] = boosted_scores.mask(boosted_scores > 1, .95)

In [19]:
# Persist the first submission (ids + adjusted scores) without the row index.
submission_path = 'reduced_cat8.csv'
submission_cat.to_csv(submission_path, index=False)

In [None]:
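########################################################################
# Second experiment: correlation-based selection of the polynomial
# features, followed by a refit and a second submission file.
########################################################################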

In [None]:
# Correlation of every polynomial feature with the label, sorted ascending.
# The label is appended as a 'TARGET' column so it takes part in .corr();
# its own entry (correlation with itself) is still present in the result.
column_corr = (
    train_subset_poly
    .assign(TARGET=train['TARGET'])
    .corr()['TARGET']
    .sort_values()
)

In [None]:
# Remove the label's own entry so only feature correlations remain.
column_corr = column_corr.drop(labels=['TARGET'])

In [None]:
# Keep the names of features whose correlation with the label exceeds 0.1 in
# absolute value. From here on `column_corr` is a list of column names,
# no longer a Series.
strongly_correlated = column_corr.abs() > .1
column_corr = column_corr[strongly_correlated].index.tolist()

In [None]:
# Restrict the polynomial training frame to the selected feature names.
train_subset_poly = train_subset_poly.loc[:, column_corr]

In [None]:
# Restrict the polynomial test frame to the same selected feature names.
test_subset_poly = test_subset_poly.loc[:, column_corr]

In [None]:
# Re-fit the scaler for the second experiment on the correlation-filtered
# polynomial features. This cell previously scaled `train_subset_imputed` /
# `test_subset_imputed`, which left the column selection made in the
# preceding cells unused by the model.
scaler.fit(train_subset_poly)
train_scaled = scaler.transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [None]:
# Train the same classifier object again, now on the re-scaled feature matrix.
cat_model.fit(X=train_scaled, y=target)

In [None]:
# Predict with the refitted model, pair the applicant id with probability
# column 1 (column 0 is left out) and write the second submission file
# without the row index.
class_probabilities = cat_model.predict_proba(test_scaled)
test_y_cat = pd.DataFrame(class_probabilities)
submission_cat = pd.concat([test['SK_ID_CURR'], test_y_cat[[1]]], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']
submission_path = 'reduced_cat9.csv'
submission_cat.to_csv(submission_path, index=False)