In [None]:
!pip install refractml

In [1]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/c3/eb/496aa2f5d356af4185f770bc76055307f8d1870e11016b10fd779b21769c/xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1MB)
[K     |████████████████████████████████| 297.1MB 49.0MB/s eta 0:00:01
[?25hCollecting numpy
[?25l  Downloading https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 40.3MB/s eta 0:00:01
[?25hCollecting scipy
[?25l  Downloading https://files.pythonhosted.org/packages/69/f0/fb07a9548e48b687b8bf2fa81d71aba9cfc548d365046ca1c791e24db99d/scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5MB)
[K     |████████████████████████████████| 34.5MB 44.5MB/s eta 0:00:01
[31mERROR: snowflake-ml-python 1.0.1 has requirement packaging<24,>=20.9, but you'll have packaging 24.0 which is incompati

In [3]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/1f/a7/82a2856613e36245663cfabde87cb1f80878548dd9e116e83a4f0e72eff7/catboost-1.2.5-cp38-cp38-manylinux2014_x86_64.whl (98.2MB)
[K     |████████████████████████████████| 98.2MB 47.7MB/s eta 0:00:01   |▎                               | 931kB 3.9MB/s eta 0:00:26
[?25hCollecting graphviz
[?25l  Downloading https://files.pythonhosted.org/packages/00/be/d59db2d1d52697c6adc9eacaf50e8965b6345cc143f671e1ed068818d5cf/graphviz-0.20.3-py3-none-any.whl (47kB)
[K     |████████████████████████████████| 51kB 45.6MB/s eta 0:00:01
[?25hCollecting matplotlib
[?25l  Downloading https://files.pythonhosted.org/packages/30/33/cc27211d2ffeee4fd7402dca137b6e8a83f6dcae3d4be8d0ad5068555561/matplotlib-3.7.5-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2MB)
[K     |████████████████████████████████| 9.2MB 42.4MB/s eta 0:00:01
[?25hCollecting pandas>=0.24
[?25l  Downloading https://files.pythonhosted.org/packages/f

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
from snowflake.snowpark import Session
import configparser

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import gc

In [None]:
from refractml import *
from refractml.constants import MLModelFlavours
import requests

In [3]:
NFOLDS = 3
SEED = 0
NROWS = None

# Code to establish connection and read data from Snowflake

In [4]:
config = configparser.ConfigParser()
config.read("snowflake_connection.ini")

['snowflake_connection.ini']

In [5]:
connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    "password": f'{config["Snowflake"]["password"]}',
    #"password": os.getenv('snowflake_password'),
    "account": f'{config["Snowflake"]["account"]}',
    #"account": os.getenv('snowflake_account'),
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

In [6]:
def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [7]:
application_train_sf  = session.table("CRA_APPLICATION_TRAIN_DETAILS")
application_test_sf  = session.table("CRA_APPLICATION_TEST_DETAILS")
previous_application_sf  = session.table("CRA_PREVIOUS_APPLICATION_DETAILS")

# Convert Snowflake data into Pandas dataframes

In [8]:
data = application_train_sf.to_pandas()
test = application_test_sf.to_pandas()
prev = previous_application_sf.to_pandas()

In [9]:
data.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
test.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
prev.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)

In [10]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [11]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [12]:
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [13]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [14]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [15]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [16]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [17]:
x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [18]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [19]:
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [20]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [21]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [22]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [23]:
rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [24]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

In [25]:
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

In [26]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)

In [27]:
xg_oof_train, xg_oof_test = get_oof(xg)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



In [None]:
et_oof_train, et_oof_test = get_oof(et)

In [None]:
rf_oof_train, rf_oof_test = get_oof(rf)

In [None]:
cb_oof_train, cb_oof_test = get_oof(cb)

In [None]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    y_pred = model.predict(model)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": y_pred, "Probability: ": y_prod }
    return temp_dict

In [None]:
payload  = x_train.iloc[0].to_dict()
payload

In [None]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
output = score(xg, y_req)
output

In [100]:
print("XGBoost-CV Recall-Score: {}".format((recall_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV Recall-Score: {}".format((recall_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV Recall-Score: {}".format((recall_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV Recall-Score: {}".format((recall_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV Recall-Score: 0.020503524672708964
Extra Tree-CV Recall-Score: 0.00016112789526686808
RandomForest-CV Recall-Score: 0.0002014098690835851
CatBoost-CV Recall-Score: 0.38328298086606244


In [101]:
print("XGBoost-CV Precision-Score: {}".format((precision_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV Precision-Score: {}".format((precision_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV Precision-Score: {}".format((precision_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV Precision-Score: {}".format((precision_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV Precision-Score: 0.5520607375271149
Extra Tree-CV Precision-Score: 0.8
RandomForest-CV Precision-Score: 0.8333333333333334
CatBoost-CV Precision-Score: 0.2610568481123793


In [98]:
print("XGBoost-CV F1-Score: {}".format((f1_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV F1-Score: {}".format((f1_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV F1-Score: {}".format((f1_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV F1-Score: {}".format((f1_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV F1-Score: 0.03953858701984697
Extra Tree-CV F1-Score: 0.0003221908981071284
RandomForest-CV F1-Score: 0.0004027224034473038
CatBoost-CV F1-Score: 0.31057725262350466


In [99]:
print("XGBoost-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(xg_oof_train)))))
print("Extra Tree-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(et_oof_train)))))
print("RandomForest-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(rf_oof_train)))))
print("CatBoost-CV AUC-Score: {}".format((roc_auc_score(y_train, np.round(cb_oof_train)))))

XGBoost-CV AUC-Score: 0.5095212698464541
Extra Tree-CV AUC-Score: 0.5000787952006845
RandomForest-CV AUC-Score: 0.5000989361875929
CatBoost-CV AUC-Score: 0.6440038288579975


In [102]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

In [103]:
print("{},{}".format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [119]:
logistic_regression = RandomForestClassifier()
logistic_regression.fit(x_train,y_train)

In [120]:
y_log_train = logistic_regression.predict(x_train)

In [124]:
sum(y_train)

24825

In [121]:
sum(y_log_train)

24803

In [126]:
print("RandomForest-Stacking Recall-Score: {}".format((recall_score(y_train, y_log_train))))
print("RandomForest-Stacking Precision-Score: {}".format((precision_score(y_train, y_log_train))))
print("RandomForest-Stacking F1-Score: {}".format((f1_score(y_train, y_log_train))))
print("RandomForest-Stacking AUC-Score: {}".format((roc_auc_score(y_train, y_log_train))))

RandomForest-Stacking Recall-Score: 0.9991137965760323
RandomForest-Stacking Precision-Score: 1.0
RandomForest-Stacking F1-Score: 0.9995567018618523
RandomForest-Stacking AUC-Score: 0.9995568982880161


In [127]:
print("RandomForest-Stacking AUC-Score: {}".format((confusion_matrix(y_train, y_log_train))))

RandomForest-Stacking AUC-Score: [[282686      0]
 [    22  24803]]
