In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [2]:
#Read in Data
# Load the two application tables and every auxiliary history table, in this order.
(train, test, bureau_data, bureau_balance_data,
 prev_app_data, pos_cash_balance_data, installments_data, cc_data) = map(
    pd.read_csv,
    [
        'dataFiles/application_train.csv',
        'dataFiles/application_test.csv',
        'dataFiles/bureau.csv',
        'dataFiles/bureau_balance.csv',
        'dataFiles/previous_application.csv',
        'dataFiles/POS_CASH_balance.csv',
        'dataFiles/installments_payments.csv',
        'dataFiles/credit_card_balance.csv',
    ]
)

In [3]:
# One-hot encode the balance table, then collapse it to one row per SK_ID_BUREAU.
# The count taken over STATUS_1 is renamed to BALANCE_COUNT in the next cell;
# the other STATUS_* indicator columns are summed and MONTHS_BALANCE is minimised.
bureau_balance_data_grouped = (
    pd.get_dummies(bureau_balance_data)
    .groupby('SK_ID_BUREAU', as_index=False)
    .agg({
        'STATUS_1': 'count',
        'MONTHS_BALANCE': 'min',
        'STATUS_C': 'sum',
        'STATUS_0': 'sum',
        'STATUS_X': 'sum',
    })
)

In [4]:
# The per-group count computed over STATUS_1 is used as a record count from here on.
bureau_balance_data_grouped = bureau_balance_data_grouped.rename(
    columns={'STATUS_1': 'BALANCE_COUNT'}
)

In [5]:
# Divide each summed status indicator by BALANCE_COUNT (cast to float) to get a ratio.
balance_count = bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)
for status_col in ('STATUS_X', 'STATUS_C', 'STATUS_0'):
    bureau_balance_data_grouped[status_col + '_RATIO'] = (
        bureau_balance_data_grouped[status_col] / balance_count
    )

The minimum supported version is 2.4.6



In [6]:
# Left-join the per-loan balance aggregates onto the bureau records. No key is given,
# so pandas joins on whatever columns the two frames have in common.
bureau_data = pd.merge(bureau_data, bureau_balance_data_grouped, how='left')

In [7]:
# Rows with no match in the left join above carry NaN in the balance columns; set those to 0.
balance_feature_cols = [
    'MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'BALANCE_COUNT', 'STATUS_0',
    'STATUS_X_RATIO', 'STATUS_C_RATIO', 'STATUS_0_RATIO',
]
bureau_data[balance_feature_cols] = bureau_data[balance_feature_cols].fillna(0)

In [18]:
##### Bureau Data #####
# Sum every non-object column (except SK_ID_BUREAU) per SK_ID_CURR and
# bring SK_ID_CURR back as an ordinary column.
bureau_data_grouped = (
    bureau_data
    .select_dtypes(exclude='object')
    .drop('SK_ID_BUREAU', axis=1)
    .groupby('SK_ID_CURR')
    .sum()
    .reset_index()
)

In [19]:
#Past Loan Count
# Count the SK_ID_BUREAU entries recorded for each SK_ID_CURR.
loan_count = (
    bureau_data[['SK_ID_CURR', 'SK_ID_BUREAU']]
    .groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU']
    .count()
    .rename(columns={'SK_ID_BUREAU': 'LOAN_COUNT'})
)

bureau_data_grouped = pd.merge(bureau_data_grouped, loan_count, how='left')

In [20]:
#Unique Loan Types
# Number of distinct CREDIT_TYPE values seen for each SK_ID_CURR.
unique_loan_count = (
    bureau_data[['SK_ID_CURR', 'CREDIT_TYPE']]
    .groupby('SK_ID_CURR', as_index=False)
    .agg({'CREDIT_TYPE': 'nunique'})
    .rename(columns={'CREDIT_TYPE': 'UNIQUE_CREDIT_TYPES'})
)

bureau_data_grouped = pd.merge(bureau_data_grouped, unique_loan_count, how='left')

In [21]:
#Total Active Loans
# 1 where CREDIT_ACTIVE is exactly 'Active', 0 for every other value.
bureau_data['CREDIT_ACTIVE_BINARY'] = (bureau_data['CREDIT_ACTIVE'] == 'Active').astype('int64')

# Summing the 0/1 flag per SK_ID_CURR gives the number of active records.
active_loan_count = (
    bureau_data[['SK_ID_CURR', 'CREDIT_ACTIVE_BINARY']]
    .groupby('SK_ID_CURR', as_index=False)['CREDIT_ACTIVE_BINARY']
    .sum()
    .rename(columns={'CREDIT_ACTIVE_BINARY': 'ACTIVE_LOANS'})
)

bureau_data_grouped = pd.merge(bureau_data_grouped, active_loan_count, how='left')

In [22]:
#Days Between Successive Past Applications
grp = bureau_data[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT']].groupby(by=['SK_ID_CURR'])
# Within each SK_ID_CURR, order the records by DAYS_CREDIT, largest value first.
grp1 = grp.apply(
    lambda customer_rows: customer_rows.sort_values(['DAYS_CREDIT'], ascending=False)
).reset_index(drop=True)

# Negate DAYS_CREDIT and take row-to-row differences inside each SK_ID_CURR.
# The first row of a group has no predecessor, so its NaN difference becomes 0.
grp1['DAYS_CREDIT1'] = -grp1['DAYS_CREDIT']
customer_gaps = grp1.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT1'].diff()
grp1['DAYS_DIFF'] = customer_gaps.fillna(0).astype('uint32')
grp1 = grp1.drop(['DAYS_CREDIT1', 'DAYS_CREDIT'], axis=1)

In [23]:
# Average DAYS_DIFF per SK_ID_CURR.
past_app_days = (
    grp1
    .groupby('SK_ID_CURR', as_index=False)['DAYS_DIFF']
    .mean()
)

In [24]:
# Attach the mean DAYS_DIFF to the per-customer bureau features (left join on shared columns).
bureau_data_grouped = pd.merge(bureau_data_grouped, past_app_days, how='left')

In [26]:
# Days Credit Expires
# Flag is 0 when DAYS_CREDIT_ENDDATE is negative and 1 otherwise.
# NOTE(review): a missing (NaN) end date is not < 0, so it also receives flag 1 — confirm this is intended.
bureau_data['CREDIT_ENDDATE_BINARY'] = np.where(bureau_data['DAYS_CREDIT_ENDDATE'] < 0, 0, 1)

In [27]:
# Keep only the bureau records whose CREDIT_ENDDATE_BINARY flag equals 1.
has_enddate_flag = bureau_data['CREDIT_ENDDATE_BINARY'] == 1
B1 = bureau_data[has_enddate_flag]

In [28]:
grp = B1[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT_ENDDATE']].groupby(by=['SK_ID_CURR'])
# Within each SK_ID_CURR, order the records by DAYS_CREDIT_ENDDATE, smallest value first.
grp1 = grp.apply(
    lambda customer_rows: customer_rows.sort_values(['DAYS_CREDIT_ENDDATE'], ascending=True)
).reset_index(drop=True)
# The groupby object is not needed once the sorted frame exists.
del grp

In [29]:
# Row-to-row difference of DAYS_CREDIT_ENDDATE inside each SK_ID_CURR.
# NaN differences (e.g. the first row of a group) become 0 before the cast to uint32.
enddate_gaps = grp1.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()
grp1['DAYS_ENDDATE_DIFF'] = enddate_gaps.fillna(0).astype('uint32')
grp1 = grp1.drop('DAYS_CREDIT_ENDDATE', axis=1)

In [30]:
# Average DAYS_ENDDATE_DIFF per SK_ID_CURR, then left-join it onto the bureau features.
credit_expires_days = (
    grp1
    .groupby('SK_ID_CURR', as_index=False)['DAYS_ENDDATE_DIFF']
    .mean()
)

bureau_data_grouped = pd.merge(bureau_data_grouped, credit_expires_days, how='left')

In [31]:
# % Active Loans
# ACTIVE_LOANS divided by LOAN_COUNT (cast to float).
total_loans = bureau_data_grouped['LOAN_COUNT'].astype(float)
bureau_data_grouped['ACTIVE_LOAN_PERC'] = bureau_data_grouped['ACTIVE_LOANS'] / total_loans

In [37]:
##### Credit Card Data

In [38]:
# One-hot encode the object-typed credit card columns (first level of each dropped)
# and keep SK_ID_PREV alongside the indicators.
cc_categorical_dummies = pd.get_dummies(
    cc_data.select_dtypes(include=['object']),
    drop_first=True,
)
cc_data_one_hot = pd.concat([cc_data['SK_ID_PREV'], cc_categorical_dummies], axis=1)

In [39]:
# Per SK_ID_PREV, take the maximum of every indicator column.
cc_data_one_hot_grouped = (
    cc_data_one_hot
    .groupby('SK_ID_PREV', as_index=False)
    .max()
)

In [40]:
# Aggregate every non-object credit card column per SK_ID_PREV with five statistics;
# the result has two-level column labels (source column, statistic).
cc_numeric_columns = cc_data.select_dtypes(exclude=['object'])
cc_data_numeric_grouped = (
    cc_numeric_columns
    .groupby('SK_ID_PREV', as_index=False)
    .agg(['count', 'sum', 'mean', 'min', 'max'])
)

In [41]:
# Flatten the two-level column labels to '<column>_CC_<statistic>';
# a ('SK_ID_PREV', '') label, if present, is kept as plain 'SK_ID_PREV'.
flattened_cc_columns = []
for col in cc_data_numeric_grouped.columns:
    if col == ('SK_ID_PREV', ''):
        flattened_cc_columns.append(col[0])
    else:
        flattened_cc_columns.append('_CC_'.join(col))
cc_data_numeric_grouped.columns = flattened_cc_columns

In [42]:
# Keep every flattened column except the '...count' columns and the SK_ID_CURR
# aggregates, and put one count column (MONTHS_BALANCE_CC_count) first.
# A list comprehension replaces filter(): on Python 3 filter() returns an iterator,
# which has no .insert(), so the original cell raised AttributeError there.
filtered_cols = ['MONTHS_BALANCE_CC_count'] + [
    col_name
    for col_name in cc_data_numeric_grouped.columns.tolist()
    if not col_name.endswith('count') and not col_name.startswith('SK_ID_CURR')
]

In [43]:
# Select the retained columns, then move the index into the frame as a column.
cc_selected_columns = cc_data_numeric_grouped[filtered_cols]
cc_data_numeric_grouped_filtered = cc_selected_columns.reset_index()

In [44]:
# Combine the numeric aggregates with the one-hot aggregates (left join on shared columns).
cc_data_grouped = pd.merge(
    cc_data_numeric_grouped_filtered,
    cc_data_one_hot_grouped,
    how='left',
)

In [45]:
# One-hot encode the listed previous-application columns; any NaN in the result is set to 0.
prev_app_categorical_cols = [
    'NAME_CONTRACT_TYPE', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CONTRACT_STATUS',
    'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_PORTFOLIO', 'CHANNEL_TYPE', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
]
prev_app_data_dummies = pd.get_dummies(prev_app_data[prev_app_categorical_cols]).fillna(0)

In [46]:
# Replace prev_app_data with its non-object columns plus the indicator columns.
# Note that this rebinds prev_app_data, so the raw table is no longer available afterwards.
prev_app_non_object = prev_app_data.select_dtypes(exclude=['object'])
prev_app_data = pd.concat([prev_app_non_object, prev_app_data_dummies], axis=1)

In [47]:
# Left-join the credit card aggregates onto the previous applications by SK_ID_PREV.
prev_app_data_merged = pd.merge(
    prev_app_data,
    cc_data_grouped,
    how='left',
    on='SK_ID_PREV',
)

In [48]:
# Aggregate every installments column per SK_ID_PREV with five statistics;
# the result has two-level column labels (source column, statistic).
installments_data_grouped = (
    installments_data
    .groupby('SK_ID_PREV', as_index=False)
    .agg(['count', 'sum', 'mean', 'min', 'max'])
)

In [49]:
# Flatten the two-level column labels to '<column>_INST_<statistic>';
# a ('SK_ID_PREV', '') label, if present, is kept as plain 'SK_ID_PREV'.
flattened_inst_columns = []
for col in installments_data_grouped.columns:
    if col == ('SK_ID_PREV', ''):
        flattened_inst_columns.append(col[0])
    else:
        flattened_inst_columns.append('_INST_'.join(col))
installments_data_grouped.columns = flattened_inst_columns

In [50]:
# Move the current index into the frame as an ordinary column.
installments_data_grouped = installments_data_grouped.reset_index()

In [51]:
# Keep every flattened column except the '...count' columns and the SK_ID_CURR
# aggregates, and put one count column (SK_ID_CURR_INST_count) first.
# A list comprehension replaces filter(): on Python 3 filter() returns an iterator,
# which has no .insert(), so the original cell raised AttributeError there.
filtered_cols = ['SK_ID_CURR_INST_count'] + [
    col_name
    for col_name in installments_data_grouped.columns.tolist()
    if not col_name.endswith('count') and not col_name.startswith('SK_ID_CURR')
]

In [52]:
# Restrict the installments aggregates to the columns listed in filtered_cols.
installments_data_grouped = installments_data_grouped[filtered_cols]

In [53]:
# Left-join the installments aggregates onto the previous applications by SK_ID_PREV.
prev_app_data_merged = pd.merge(
    prev_app_data_merged,
    installments_data_grouped,
    how='left',
    on='SK_ID_PREV',
)

In [54]:
# One-hot encode the POS cash balance table; this rebinds pos_cash_balance_data,
# so the raw table is no longer available afterwards.
pos_cash_balance_data = pd.get_dummies(pos_cash_balance_data)

In [55]:
# Aggregate every POS cash column per SK_ID_PREV with five statistics;
# the result has two-level column labels (source column, statistic).
pos_cash_balance_data_grouped = (
    pos_cash_balance_data
    .groupby('SK_ID_PREV', as_index=False)
    .agg(['count', 'sum', 'mean', 'min', 'max'])
)

In [56]:
# Flatten the two-level column labels to '<column>_POS_<statistic>';
# a ('SK_ID_PREV', '') label, if present, is kept as plain 'SK_ID_PREV'.
flattened_pos_columns = []
for col in pos_cash_balance_data_grouped.columns:
    if col == ('SK_ID_PREV', ''):
        flattened_pos_columns.append(col[0])
    else:
        flattened_pos_columns.append('_POS_'.join(col))
pos_cash_balance_data_grouped.columns = flattened_pos_columns

In [57]:
# Move the current index into the frame as an ordinary column.
pos_cash_balance_data_grouped = pos_cash_balance_data_grouped.reset_index()

In [58]:
# Keep every flattened column except the '...count' columns and the SK_ID_CURR
# aggregates, and put one count column (SK_ID_CURR_POS_count) first.
# A list comprehension replaces filter(): on Python 3 filter() returns an iterator,
# which has no .insert(), so the original cell raised AttributeError there.
filtered_cols = ['SK_ID_CURR_POS_count'] + [
    col_name
    for col_name in pos_cash_balance_data_grouped.columns.tolist()
    if not col_name.endswith('count') and not col_name.startswith('SK_ID_CURR')
]

In [59]:
# Restrict the POS cash aggregates to the columns listed in filtered_cols.
pos_cash_balance_data_grouped = pos_cash_balance_data_grouped[filtered_cols]

In [60]:
# Left-join the POS cash aggregates onto the previous applications by SK_ID_PREV.
prev_app_data_merged = pd.merge(
    prev_app_data_merged,
    pos_cash_balance_data_grouped,
    how='left',
    on='SK_ID_PREV',
)

In [61]:
# Column list for the bureau features: everything except '...count' columns and columns
# starting with SK_ID_CURR, with 'SK_ID_BUREAU_count' placed first.
# A list comprehension replaces filter(): on Python 3 filter() returns an iterator,
# which has no .insert(), so the original cell raised AttributeError there.
# NOTE(review): this list is not referenced again in the visible cells, and no visible cell
# creates an 'SK_ID_BUREAU_count' column — confirm whether this cell is still needed.
filtered_cols = ['SK_ID_BUREAU_count'] + [
    col_name
    for col_name in bureau_data_grouped.columns.tolist()
    if not col_name.endswith('count') and not col_name.startswith('SK_ID_CURR')
]

In [62]:
# One-hot encode the listed application columns of the train table; NaN in the result becomes 0.
train_categorical_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]
train_dummies = pd.get_dummies(train[train_categorical_cols]).fillna(0)

In [63]:
# One-hot encode the listed application columns of the test table; NaN in the result becomes 0.
# The encoding is derived from the test table alone, so its indicator columns follow
# the categories present in that table.
test_categorical_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]
test_dummies = pd.get_dummies(test[test_categorical_cols]).fillna(0)

In [64]:
# Non-object application columns side by side with the indicator columns, for each table.
train_non_object = train.select_dtypes(exclude=['object'])
train_one_hot = pd.concat([train_non_object, train_dummies], axis=1)

test_non_object = test.select_dtypes(exclude=['object'])
test_one_hot = pd.concat([test_non_object, test_dummies], axis=1)

In [65]:
# Left-join the per-customer bureau features onto both application tables by SK_ID_CURR.
train_merged = pd.merge(train_one_hot, bureau_data_grouped, how='left', on='SK_ID_CURR')

test_merged = pd.merge(test_one_hot, bureau_data_grouped, how='left', on='SK_ID_CURR')

In [66]:
# All columns of the merged previous-application frame except SK_ID_PREV.
all_prev_app_columns = prev_app_data_merged.columns.tolist()
prev_app_data_subset = prev_app_data_merged[all_prev_app_columns].drop('SK_ID_PREV', axis=1)

In [67]:
# Sum every previous-application feature per SK_ID_CURR.
prev_app_data_grouped = (
    prev_app_data_subset
    .groupby('SK_ID_CURR', as_index=False)
    .sum()
)

In [68]:
#prev_app_data_grouped.columns = ['_'.join(col) if col != ('SK_ID_CURR', '') else col[0] for col in prev_app_data_grouped.columns]
#prev_app_data_grouped = prev_app_data_grouped.reset_index()

In [69]:
# Left-join the summed previous-application features onto both tables by SK_ID_CURR.
train_merged = pd.merge(train_merged, prev_app_data_grouped, how='left', on='SK_ID_CURR')

test_merged = pd.merge(test_merged, prev_app_data_grouped, how='left', on='SK_ID_CURR')

In [70]:
# Median of OWN_CAR_AGE over the merged train frame; read by own_car_missing.
own_car_median = train_merged['OWN_CAR_AGE'].median()

In [71]:
def own_car_missing(x):
    """Return the OWN_CAR_AGE value to use for one row.

    x is a row-like mapping with 'FLAG_OWN_CAR' and 'OWN_CAR_AGE' entries.
    - FLAG_OWN_CAR == 'N': returns the constant 28.
    - FLAG_OWN_CAR == 'Y' with a missing OWN_CAR_AGE: returns the module-level
      own_car_median.
    - anything else: returns the row's own OWN_CAR_AGE unchanged.
    """
    car_flag = x['FLAG_OWN_CAR']
    if car_flag == 'N':
        return 28
    if car_flag == 'Y' and pd.isnull(x['OWN_CAR_AGE']):
        return own_car_median
    return x['OWN_CAR_AGE']

In [73]:
# Drop columns that have fewer than `min_non_null` non-missing values.
# NOTE(review): unless train_merged has more than 20,000,000 rows this threshold is
# negative, in which case no column can fall below it — confirm the intended allowance.
min_non_null = len(train_merged) - 20000000
train_merged_subset = train_merged.dropna(axis=1, thresh=min_non_null)

In [74]:
# Start the candidate feature list from every column of the train subset.
column_corr_subset = list(train_merged_subset.columns)

In [75]:
# Display how many columns are currently in the candidate list.
len(column_corr_subset)

446

In [76]:
#col_corr = train_merged_subset.corr()['TARGET'].sort_values()

In [77]:
#column_corr_subset = col_corr[(col_corr >= 0.03) | (col_corr < -0.035)].index.values.tolist()

In [78]:
# Take TARGET out of the feature list. list.remove mutates in place and raises
# ValueError if the name is absent, so this cell cannot be run twice in a row.
column_corr_subset.remove('TARGET')

In [79]:
# Take three indicator columns out of the feature list (in place).
# Presumably these categories are absent from the test table — TODO confirm.
for dropped_col in ('CODE_GENDER_XNA',
                    'NAME_FAMILY_STATUS_Unknown',
                    'NAME_INCOME_TYPE_Maternity leave'):
    column_corr_subset.remove(dropped_col)

In [80]:
# Restrict both frames to the same feature columns, in the same order.
train_subset = train_merged_subset[column_corr_subset]

# The test frame is indexed with the train-derived list, so every listed column
# must also exist in test_merged.
test_subset = test_merged[column_corr_subset]

In [81]:
# Fit a default-configured Imputer on the train features only, then apply it to both frames.
# NOTE(review): sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22 (replacement: sklearn.impute.SimpleImputer) — this cell needs an older release.
imputer = Imputer().fit(train_subset)
feature_names = train_subset.columns
train_merged_imputed = pd.DataFrame(imputer.transform(train_subset), columns=feature_names)
test_merged_imputed = pd.DataFrame(imputer.transform(test_subset), columns=feature_names)

In [85]:
# Add squared and cubed copies of selected features to both the train and test frames.
#
# Each entry is (source column, prefix of the derived column name); the derived column
# is named prefix + '^2' or prefix + '^3'. The prefixes reproduce the names the train
# frame has always used, including 'DAYS_CREDIT^2' / 'DAYS_CREDIT^3' without an underscore.
#
# Fix: the test frame used to name its squared DAYS_CREDIT column 'DAYS_CREDIT_^2' while
# the train frame used 'DAYS_CREDIT^2'. Driving both frames from one specification gives
# them identical column names in identical order (all squares first, then all cubes).
#
# AMT_GOODS_PRICE and DAYS_CREDIT_median were commented out in the original cell and
# remain excluded here.
power_feature_specs = [
    ('DAYS_EMPLOYED', 'DAYS_EMPLOYED_'),
    ('DAYS_CREDIT', 'DAYS_CREDIT'),
    ('DAYS_BIRTH', 'DAYS_BIRTH_'),
    ('REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT_W_CITY_'),
    ('REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_'),
    ('NAME_INCOME_TYPE_Working', 'NAME_INCOME_TYPE_Working_'),
    ('DAYS_LAST_PHONE_CHANGE', 'DAYS_LAST_PHONE_CHANGE_'),
    ('CODE_GENDER_M', 'CODE_GENDER_M_'),
    ('EXT_SOURCE_1', 'EXT_SOURCE_1_'),
    ('EXT_SOURCE_2', 'EXT_SOURCE_2_'),
    ('EXT_SOURCE_3', 'EXT_SOURCE_3_'),
    ('NAME_EDUCATION_TYPE_Higher education', 'NAME_EDUCATION_TYPE_Higher education_'),
    ('CODE_GENDER_F', 'CODE_GENDER_F_'),
]

for imputed_frame in (train_merged_imputed, test_merged_imputed):
    for exponent in (2, 3):
        for source_col, name_prefix in power_feature_specs:
            derived_col = name_prefix + '^' + str(exponent)
            imputed_frame[derived_col] = imputed_frame[source_col] ** exponent

In [96]:
# Degree-1 polynomial expansion, fitted on the train features and applied to them.
poly_transformer = PolynomialFeatures(degree=1).fit(train_merged_imputed)
train_poly_features = poly_transformer.transform(train_merged_imputed)

In [97]:
# Wrap the transformed train array in a DataFrame labelled with the transformer's feature names.
train_poly_names = poly_transformer.get_feature_names(
    input_features=train_merged_imputed.columns.tolist()
)
train_subset_poly = pd.DataFrame(train_poly_features, columns=train_poly_names)

In [98]:
# Apply the transformer that was fitted on the train features to the test features.
test_poly_features = poly_transformer.transform(test_merged_imputed)

In [99]:
# Wrap the transformed test array in a DataFrame; the labels are derived from the
# test frame's own column names.
test_poly_names = poly_transformer.get_feature_names(
    input_features=test_merged_imputed.columns.tolist()
)
test_subset_poly = pd.DataFrame(test_poly_features, columns=test_poly_names)

In [100]:
# Min-max scaler that maps features into the range [0, 1].
scaler = MinMaxScaler(feature_range = (0, 1))

In [101]:
# Learn the scaling from the train features only, then apply it to both frames.
train_scaled = scaler.fit(train_subset_poly).transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [110]:
from catboost import CatBoostClassifier

# CatBoost classifier: 2000 boosting iterations, learning rate 0.1, fixed seed 42.
cat_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    random_state=42,
)

In [None]:
# Train on the scaled train features against the TARGET column of the raw train table.
cat_model.fit(train_scaled, train['TARGET'])

0:	learn: 0.5856974	total: 1.68s	remaining: 55m 52s
1:	learn: 0.5058570	total: 3.29s	remaining: 54m 50s
2:	learn: 0.4452592	total: 4.98s	remaining: 55m 12s
3:	learn: 0.4007032	total: 6.46s	remaining: 53m 44s
4:	learn: 0.3674868	total: 7.78s	remaining: 51m 43s
5:	learn: 0.3423636	total: 9.04s	remaining: 50m 3s
6:	learn: 0.3230338	total: 10.3s	remaining: 48m 47s
7:	learn: 0.3080931	total: 11.5s	remaining: 47m 50s
8:	learn: 0.2966196	total: 12.8s	remaining: 47m 7s
9:	learn: 0.2871394	total: 14.1s	remaining: 46m 40s
10:	learn: 0.2796928	total: 16.3s	remaining: 49m 1s
11:	learn: 0.2735445	total: 17.8s	remaining: 49m
12:	learn: 0.2690623	total: 19.4s	remaining: 49m 21s
13:	learn: 0.2656367	total: 20.8s	remaining: 49m 11s
14:	learn: 0.2625419	total: 22.1s	remaining: 48m 51s
15:	learn: 0.2603011	total: 23.5s	remaining: 48m 38s
16:	learn: 0.2582318	total: 24.8s	remaining: 48m 17s
17:	learn: 0.2565185	total: 26.8s	remaining: 49m 9s
18:	learn: 0.2552433	total: 29.2s	remaining: 50m 49s
19:	learn: 

In [104]:
#from xgboost import XGBClassifier
#xgb_model = XGBClassifier(n_estimators = 1000, silent=True, learning_rate = 0.05)

In [105]:
#xgb_model.fit(train_scaled, train.TARGET)

In [106]:
# Predicted class probabilities for the test rows, one column per class.
class_probabilities = cat_model.predict_proba(test_scaled)
test_y_cat = pd.DataFrame(class_probabilities)

# Put SK_ID_CURR next to the probabilities and discard probability column 0,
# keeping the remaining column (presumably the positive-class probability).
submission_cat = pd.concat([test['SK_ID_CURR'], test_y_cat], axis=1)
submission_cat = submission_cat.drop(0, axis=1)
submission_cat.columns = ['SK_ID_CURR', 'Target']

In [107]:
#max_sub_cat = submission_cat.Target.max()
#min_sub_cat = submission_cat.Target.min()

In [108]:
#submission_cat['Target'] = submission_cat['Target'].apply(lambda x: (x - min_sub_cat) / (max_sub_cat - min_sub_cat))

In [109]:
# Write the submission frame to cat24.csv without the row index.
submission_cat.to_csv('cat24.csv', index = False)