In [1]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/c3/eb/496aa2f5d356af4185f770bc76055307f8d1870e11016b10fd779b21769c/xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1MB)
[K     |████████████████████████████████| 297.1MB 1.1MB/s eta 0:00:011     |██████████████████▊             | 173.7MB 62.7MB/s eta 0:00:02
[?25hCollecting numpy
[?25l  Downloading https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 33.1MB/s eta 0:00:01
[?25hCollecting scipy
[?25l  Downloading https://files.pythonhosted.org/packages/69/f0/fb07a9548e48b687b8bf2fa81d71aba9cfc548d365046ca1c791e24db99d/scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5MB)
[K     |████████████████████████████████| 34.5MB 48.4MB/s eta 0:00:01
[31mERROR: snowflake-ml-python 1.0.1 has requirement pa

In [2]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/1f/a7/82a2856613e36245663cfabde87cb1f80878548dd9e116e83a4f0e72eff7/catboost-1.2.5-cp38-cp38-manylinux2014_x86_64.whl (98.2MB)
[K     |████████████████████████████████| 98.2MB 43.6MB/s eta 0:00:01
[?25hCollecting six
  Downloading https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl
Collecting scipy
[?25l  Downloading https://files.pythonhosted.org/packages/69/f0/fb07a9548e48b687b8bf2fa81d71aba9cfc548d365046ca1c791e24db99d/scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5MB)
[K     |████████████████████████████████| 34.5MB 49.0MB/s eta 0:00:01
[?25hCollecting graphviz
[?25l  Downloading https://files.pythonhosted.org/packages/00/be/d59db2d1d52697c6adc9eacaf50e8965b6345cc143f671e1ed068818d5cf/graphviz-0.20.3-py3-none-any.whl (47kB)
[K     |████████████████████████████████| 51kB 42.7MB/s eta 0

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
from snowflake.snowpark import Session
import configparser

In [125]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import gc

In [16]:
NFOLDS = 3
SEED = 0
NROWS = None

# Code to establish connection and read data from Snowflake

In [17]:
config = configparser.ConfigParser()
config.read("snowflake_connection.ini")

['snowflake_connection.ini']

In [18]:
connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    "password": f'{config["Snowflake"]["password"]}',
    #"password": os.getenv('snowflake_password'),
    "account": f'{config["Snowflake"]["account"]}',
    #"account": os.getenv('snowflake_account'),
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

In [19]:
def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [41]:
application_train_sf  = session.table("CRA_APPLICATION_TRAIN_DETAILS")
application_test_sf  = session.table("CRA_APPLICATION_TEST_DETAILS")
previous_application_sf  = session.table("CRA_PREVIOUS_APPLICATION_DETAILS")

# Convert Snowflake data into Pandas dataframes

In [52]:
data = application_train_sf.to_pandas()
test = application_test_sf.to_pandas()
prev = previous_application_sf.to_pandas()

In [53]:
data.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
test.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
prev.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)

In [54]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [55]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [56]:
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [57]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [58]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [59]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [61]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [62]:
x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [63]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [64]:
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [65]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [66]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [67]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [68]:
rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [69]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

In [70]:
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

In [71]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)

In [72]:
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



0:	total: 160ms	remaining: 31.8s
1:	total: 269ms	remaining: 26.7s
2:	total: 356ms	remaining: 23.4s
3:	total: 473ms	remaining: 23.2s
4:	total: 570ms	remaining: 22.2s
5:	total: 694ms	remaining: 22.4s
6:	total: 825ms	remaining: 22.8s
7:	total: 921ms	remaining: 22.1s
8:	total: 1.03s	remaining: 21.8s
9:	total: 1.14s	remaining: 21.7s
10:	total: 1.25s	remaining: 21.4s
11:	total: 1.34s	remaining: 21s
12:	total: 1.44s	remaining: 20.7s
13:	total: 1.55s	remaining: 20.6s
14:	total: 1.66s	remaining: 20.5s
15:	total: 1.75s	remaining: 20.1s
16:	total: 1.85s	remaining: 19.9s
17:	total: 1.94s	remaining: 19.6s
18:	total: 2.03s	remaining: 19.4s
19:	total: 2.13s	remaining: 19.2s
20:	total: 2.23s	remaining: 19s
21:	total: 2.33s	remaining: 18.9s
22:	total: 2.41s	remaining: 18.5s
23:	total: 2.53s	remaining: 18.5s
24:	total: 2.62s	remaining: 18.3s
25:	total: 2.7s	remaining: 18.1s
26:	total: 2.81s	remaining: 18s
27:	total: 2.9s	remaining: 17.8s
28:	total: 2.99s	remaining: 17.6s
29:	total: 3.09s	remaining: 17.5

42:	total: 4.21s	remaining: 15.4s
43:	total: 4.32s	remaining: 15.3s
44:	total: 4.44s	remaining: 15.3s
45:	total: 4.52s	remaining: 15.1s
46:	total: 4.61s	remaining: 15s
47:	total: 4.7s	remaining: 14.9s
48:	total: 4.79s	remaining: 14.8s
49:	total: 4.88s	remaining: 14.6s
50:	total: 4.97s	remaining: 14.5s
51:	total: 5.05s	remaining: 14.4s
52:	total: 5.13s	remaining: 14.2s
53:	total: 5.21s	remaining: 14.1s
54:	total: 5.31s	remaining: 14s
55:	total: 5.4s	remaining: 13.9s
56:	total: 5.49s	remaining: 13.8s
57:	total: 5.57s	remaining: 13.6s
58:	total: 5.66s	remaining: 13.5s
59:	total: 5.75s	remaining: 13.4s
60:	total: 5.83s	remaining: 13.3s
61:	total: 5.91s	remaining: 13.2s
62:	total: 6.02s	remaining: 13.1s
63:	total: 6.12s	remaining: 13s
64:	total: 6.25s	remaining: 13s
65:	total: 6.34s	remaining: 12.9s
66:	total: 6.44s	remaining: 12.8s
67:	total: 6.55s	remaining: 12.7s
68:	total: 6.66s	remaining: 12.6s
69:	total: 6.75s	remaining: 12.5s
70:	total: 6.86s	remaining: 12.5s
71:	total: 6.94s	remaini

83:	total: 7.79s	remaining: 10.8s
84:	total: 7.87s	remaining: 10.6s
85:	total: 7.97s	remaining: 10.6s
86:	total: 8.06s	remaining: 10.5s
87:	total: 8.13s	remaining: 10.4s
88:	total: 8.22s	remaining: 10.3s
89:	total: 8.3s	remaining: 10.1s
90:	total: 8.38s	remaining: 10s
91:	total: 8.46s	remaining: 9.93s
92:	total: 8.54s	remaining: 9.83s
93:	total: 8.67s	remaining: 9.78s
94:	total: 8.76s	remaining: 9.68s
95:	total: 8.85s	remaining: 9.59s
96:	total: 8.93s	remaining: 9.48s
97:	total: 9.02s	remaining: 9.39s
98:	total: 9.09s	remaining: 9.28s
99:	total: 9.15s	remaining: 9.15s
100:	total: 9.27s	remaining: 9.08s
101:	total: 9.35s	remaining: 8.98s
102:	total: 9.44s	remaining: 8.89s
103:	total: 9.53s	remaining: 8.79s
104:	total: 9.62s	remaining: 8.7s
105:	total: 9.8s	remaining: 8.69s
106:	total: 9.91s	remaining: 8.62s
107:	total: 9.99s	remaining: 8.51s
108:	total: 10.1s	remaining: 8.44s
109:	total: 10.2s	remaining: 8.36s
110:	total: 10.3s	remaining: 8.25s
111:	total: 10.4s	remaining: 8.15s
112:	to

In [100]:
print("XGBoost-CV Recall-Score: {}".format((recall_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV Recall-Score: {}".format((recall_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV Recall-Score: {}".format((recall_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV Recall-Score: {}".format((recall_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV Recall-Score: 0.020503524672708964
Extra Tree-CV Recall-Score: 0.00016112789526686808
RandomForest-CV Recall-Score: 0.0002014098690835851
CatBoost-CV Recall-Score: 0.38328298086606244


In [101]:
print("XGBoost-CV Precision-Score: {}".format((precision_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV Precision-Score: {}".format((precision_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV Precision-Score: {}".format((precision_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV Precision-Score: {}".format((precision_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV Precision-Score: 0.5520607375271149
Extra Tree-CV Precision-Score: 0.8
RandomForest-CV Precision-Score: 0.8333333333333334
CatBoost-CV Precision-Score: 0.2610568481123793


In [98]:
print("XGBoost-CV F1-Score: {}".format((f1_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV F1-Score: {}".format((f1_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV F1-Score: {}".format((f1_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV F1-Score: {}".format((f1_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV F1-Score: 0.03953858701984697
Extra Tree-CV F1-Score: 0.0003221908981071284
RandomForest-CV F1-Score: 0.0004027224034473038
CatBoost-CV F1-Score: 0.31057725262350466


In [99]:
print("XGBoost-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV AUC-Score: 0.5095212698464541
Extra Tree-CV AUC-Score: 0.5000787952006845
RandomForest-CV AUC-Score: 0.5000989361875929
CatBoost-CV AUC-Score: 0.6440038288579975


In [102]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

In [103]:
print("{},{}".format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [119]:
logistic_regression = RandomForestClassifier()
logistic_regression.fit(x_train,y_train)

In [120]:
y_log_train = logistic_regression.predict(x_train)

In [124]:
sum(y_train)

24825

In [121]:
sum(y_log_train)

24803

In [126]:
print("RandomForest-Stacking Recall-Score: {}".format((recall_score(y_train, y_log_train))))
print("RandomForest-Stacking Precision-Score: {}".format((precision_score(y_train, y_log_train))))
print("RandomForest-Stacking F1-Score: {}".format((f1_score(y_train, y_log_train))))
print("RandomForest-Stacking AUC-Score: {}".format((roc_auc_score(y_train, y_log_train))))

RandomForest-Stacking Recall-Score: 0.9991137965760323
RandomForest-Stacking Precision-Score: 1.0
RandomForest-Stacking F1-Score: 0.9995567018618523
RandomForest-Stacking AUC-Score: 0.9995568982880161


In [127]:
print("RandomForest-Stacking AUC-Score: {}".format((confusion_matrix(y_train, y_log_train))))

RandomForest-Stacking AUC-Score: [[282686      0]
 [    22  24803]]
