In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [2]:
# Load the four CSV inputs from the relative `dataFiles/` directory.
# NOTE(review): `bureau_data` and `prev_app_data` are not referenced by any
# cell visible in this notebook excerpt — confirm they are still needed.
train, test, bureau_data, prev_app_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataFiles/application_train.csv',
        'dataFiles/application_test.csv',
        'dataFiles/bureau.csv',
        'dataFiles/previous_application.csv',
    )
)

In [3]:
# The five predictor columns used throughout the notebook; the training
# subset additionally carries the TARGET label as its first column.
feature_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY']

train_subset = train[['TARGET'] + feature_cols]
test_subset = test[feature_cols]

In [4]:
# Show how each selected column correlates with the label
# (the TARGET entry itself is trivially 1).
train_subset.corr().loc[:, 'TARGET']

TARGET                         1.000000
EXT_SOURCE_1                  -0.155317
EXT_SOURCE_2                  -0.160472
EXT_SOURCE_3                  -0.178919
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
Name: TARGET, dtype: float64

In [5]:
# Separate the label from the predictors.
# The label is read from the untouched `train` frame (the same column that
# `train_subset` was built from) and the drop tolerates an already-missing
# column, so this cell can be re-run without raising KeyError once TARGET
# has been removed from `train_subset`.
target = train['TARGET']
train_subset = train_subset.drop(columns='TARGET', errors='ignore')

In [6]:
# Fill missing values with per-column medians learned from the training
# predictors only; the same fitted imputer is then applied to both sets.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22 (replaced by `sklearn.impute.SimpleImputer`) — pin the environment or
# migrate the import before running on a newer version.
imputer = Imputer(strategy='median').fit(train_subset)
train_subset_imputed, test_subset_imputed = (
    imputer.transform(frame) for frame in (train_subset, test_subset)
)

In [7]:
# Expand the imputed training predictors into polynomial/interaction terms
# up to degree 3. The fitted transformer is kept so the test set can be
# expanded with the same term layout.
poly_transformer = PolynomialFeatures(degree=3)
train_poly_features = poly_transformer.fit_transform(train_subset_imputed)

In [8]:
# Wrap the polynomial training matrix in a DataFrame whose column labels
# are the term names generated from the five original predictor names.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY']
poly_column_names = poly_transformer.get_feature_names(input_features=poly_input_names)
train_subset_poly = pd.DataFrame(train_poly_features, columns=poly_column_names)

In [9]:
# Expand the imputed test predictors with the transformer already fitted on
# the training data, so both matrices share the same polynomial terms.
test_poly_features = poly_transformer.transform(test_subset_imputed)

In [10]:
# Wrap the polynomial test matrix in a DataFrame, labelling columns with the
# term names generated from the five original predictor names.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY']
poly_column_names = poly_transformer.get_feature_names(input_features=poly_input_names)
test_subset_poly = pd.DataFrame(test_poly_features, columns=poly_column_names)

In [11]:
# Scaler that maps each feature to the [0, 1] range; it is fitted in a later
# cell and refitted again after the feature selection further down.
scaler = MinMaxScaler(feature_range = (0, 1))

In [12]:
# Learn the per-column scaling from the training features and apply it to
# both sets; the test set is never used for fitting.
train_scaled = scaler.fit_transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [13]:
from catboost import CatBoostClassifier

# Training hyper-parameters gathered in one place so they are easy to tune.
cat_params = dict(iterations=500, learning_rate=0.1)
cat_model = CatBoostClassifier(**cat_params)

In [14]:
# Train the CatBoost classifier on the scaled polynomial features against
# the TARGET label; the per-iteration log is printed as the cell output.
cat_model.fit(train_scaled, target)

0:	learn: 0.5854307	total: 462ms	remaining: 3m 50s
1:	learn: 0.5049098	total: 682ms	remaining: 2m 49s
2:	learn: 0.4449308	total: 931ms	remaining: 2m 34s
3:	learn: 0.4005456	total: 1.16s	remaining: 2m 23s
4:	learn: 0.3687840	total: 1.42s	remaining: 2m 21s
5:	learn: 0.3433566	total: 1.64s	remaining: 2m 15s
6:	learn: 0.3250838	total: 1.91s	remaining: 2m 14s
7:	learn: 0.3096458	total: 2.21s	remaining: 2m 16s
8:	learn: 0.2977818	total: 2.47s	remaining: 2m 14s
9:	learn: 0.2887551	total: 2.7s	remaining: 2m 12s
10:	learn: 0.2817778	total: 2.95s	remaining: 2m 11s
11:	learn: 0.2762248	total: 3.19s	remaining: 2m 9s
12:	learn: 0.2717473	total: 3.43s	remaining: 2m 8s
13:	learn: 0.2685420	total: 3.67s	remaining: 2m 7s
14:	learn: 0.2660024	total: 3.91s	remaining: 2m 6s
15:	learn: 0.2637172	total: 4.15s	remaining: 2m 5s
16:	learn: 0.2619192	total: 4.38s	remaining: 2m 4s
17:	learn: 0.2604453	total: 4.62s	remaining: 2m 3s
18:	learn: 0.2592343	total: 4.86s	remaining: 2m 2s
19:	learn: 0.2581841	total: 5.1

<catboost.core._CatBoostBase at 0x114339350>

In [19]:
# Predict class probabilities for the test set and pair the second
# probability column (label 1) with each applicant id.
# NOTE(review): the output header is 'Target' while the training label is
# 'TARGET' — confirm the casing expected by whoever consumes the CSV.
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
submission_cat = pd.concat([test['SK_ID_CURR'], test_y_cat[[1]]], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [21]:
# Triple every predicted probability that exceeds 0.1; smaller values are
# left unchanged.
# The boost is recomputed from the raw class-1 column of `test_y_cat` rather
# than from the column being overwritten, so re-running this cell does not
# compound the multiplication. `mask` applies the rule column-wise instead
# of calling a Python lambda once per row.
raw_target = test_y_cat[1]
submission_cat['Target'] = raw_target.mask(raw_target > .1, raw_target * 3)

In [22]:
# Cap the boosted scores at 0.95.
# Previously only values above 1 were replaced by 0.95, which left scores in
# (0.95, 1] ranked *above* rows whose boosted score had been even larger.
# Clipping keeps the mapping monotonic (larger inputs never end up below
# smaller ones) and is safe to re-run.
submission_cat['Target'] = submission_cat['Target'].clip(upper=.95)

In [23]:
# Write the adjusted predictions to disk without the DataFrame index column.
submission_cat.to_csv('reduced_cat3.csv', index = False)

In [16]:
# Correlate every polynomial feature with the label and sort ascending.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, i.e. the notebook was run out of order — verify it still works
# under Restart & Run All.
poly_with_target = pd.concat([train_subset_poly, train['TARGET']], axis=1)
target_corr = poly_with_target.corr()['TARGET']
column_corr = target_corr.sort_values()

In [17]:
# Remove the label's correlation with itself so only features remain.
column_corr = column_corr.drop(labels=['TARGET'])

In [18]:
# Keep only the names of features whose correlation with the label is
# stronger than 0.1 in absolute value. From here on `column_corr` is a plain
# list of column names rather than a Series.
is_strong = column_corr.abs() > .1
column_corr = column_corr[is_strong].index.tolist()

In [None]:
# Restrict the training features to the selected column names.
train_subset_poly = train_subset_poly.loc[:, column_corr]

In [None]:
# Restrict the test features to the same selected column names.
test_subset_poly = test_subset_poly.loc[:, column_corr]

In [None]:
# Refit the scaler on the reduced training features and rescale both sets,
# replacing the earlier full-width `train_scaled` / `test_scaled` arrays.
train_scaled = scaler.fit_transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [None]:
# Fit the same `cat_model` object again, now on the reduced, rescaled
# feature set. NOTE(review): presumably this replaces the earlier fit rather
# than continuing it — confirm against the CatBoost documentation.
cat_model.fit(train_scaled, target)

In [None]:
# Predict with the refitted model, keep the second probability column
# (label 1) next to each applicant id, and write the result to disk.
# Unlike the earlier submission, no boost or cap is applied here.
test_y_cat = pd.DataFrame(cat_model.predict_proba(test_scaled))
submission_cat = pd.concat([test['SK_ID_CURR'], test_y_cat[[1]]], axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']
submission_cat.to_csv('reduced_cat2.csv', index=False)