# Use Credit Risk Analytics Notebook Template

In [1]:
from snowflake.snowpark import Session
from snowflake.connector.pandas_tools import write_pandas
# Data Science Libs
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import gc

import configparser

# FosforIO to read data from Snowflake
from fosforio import snowflake
# FosforML to register Model on FDC
from joblib import dump, load
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.




In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

In [3]:
# To get snowflake connection object with a default snowflake connection created by the user, if available.
#snowflake.get_connection()

# To get snowflake connection object with a specific connection name
snowflake.get_connection(connection_name="FDC_Banking_FS_Snowflake")

Connection object created: <snowflake.connector.connection.SnowflakeConnection object at 0x7f0a80474f10>
Please close the connection after use!


<snowflake.connector.connection.SnowflakeConnection at 0x7f0a80474f10>

In [4]:
application_train_sf  = snowflake.get_dataframe("CRA_APPLICATION_TRAIN_DETAILS")
application_test_sf  = snowflake.get_dataframe("CRA_APPLICATION_TEST_DETAILS")
previous_application_sf  = snowflake.get_dataframe("CRA_PREVIOUS_APPLICATION_DETAILS")

# Convert Snowflake data into Pandas dataframes

In [5]:
data = application_train_sf.copy()
test = application_test_sf.copy()
prev = previous_application_sf.copy()

In [6]:
data.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
test.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
prev.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)

In [7]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [8]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [9]:
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [10]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [11]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [12]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [13]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [14]:
x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [15]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]

In [16]:
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]

In [17]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))
    
    def predict_proba(self, x):
        return self.gbdt.predict_proba(xgb.DMatrix(x))[:,1]

In [18]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [19]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'random_state':0
}

In [20]:
rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'random_state':0
}

In [21]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

In [22]:
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False,
    'random_seed':0
}

In [23]:
#xg = XgbWrapper(seed=SEED, params=xgb_params)
#et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
#rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
#cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)

# XGBoost Classifier

In [61]:
#xg_oof_train, xg_oof_test = get_oof(xg)

In [24]:
param = xgb_params
param['seed'] = 0
nrounds = 250

dtrain = xgb.DMatrix(x_train, label=y_train)
gbdt = xgb.train(param, dtrain, nrounds)

Parameters: { "nrounds", "silent" } are not used.



In [None]:
#y_prob = gbdt.predict(xgb.DMatrix(x))
#y_pred = np.round(y_prob)

In [25]:
y_prob = gbdt.predict(xgb.DMatrix(x_train))

In [26]:
y_pred = np.round(y_prob)

In [27]:
y_prob

array([0.10488835, 0.18760933, 0.03474117, ..., 0.01431217, 0.0234681 ,
       0.17675298], dtype=float32)

In [28]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [29]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    y_pred = model.predict(xgb.DMatrix(data))
    #y_prob = model.predict_proba(xgb.DMatrix(data))[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_pred }
    return temp_dict

In [30]:
payload  = x_train.iloc[0].to_dict()
payload

{'NAME_CONTRACT_TYPE_x': 0.0,
 'CODE_GENDER': 0.0,
 'FLAG_OWN_CAR': 0.0,
 'FLAG_OWN_REALTY': 0.0,
 'CNT_CHILDREN': 2.0,
 'AMT_INCOME_TOTAL': 450000.0,
 'AMT_CREDIT_x': 450000.0,
 'AMT_ANNUITY_x': 20979.0,
 'AMT_GOODS_PRICE_x': 450000.0,
 'NAME_TYPE_SUITE_x': 0.0,
 'NAME_INCOME_TYPE': 0.0,
 'NAME_EDUCATION_TYPE': 0.0,
 'NAME_FAMILY_STATUS': 0.0,
 'NAME_HOUSING_TYPE': 0.0,
 'REGION_POPULATION_RELATIVE': 0.00733,
 'DAYS_BIRTH': -10860.0,
 'DAYS_EMPLOYED': -215.0,
 'DAYS_REGISTRATION': -4606.0,
 'DAYS_ID_PUBLISH': -3485.0,
 'OWN_CAR_AGE': 6.0,
 'FLAG_MOBIL': 1.0,
 'FLAG_EMP_PHONE': 1.0,
 'FLAG_WORK_PHONE': 0.0,
 'FLAG_CONT_MOBILE': 1.0,
 'FLAG_PHONE': 0.0,
 'FLAG_EMAIL': 0.0,
 'OCCUPATION_TYPE': 0.0,
 'CNT_FAM_MEMBERS': 4.0,
 'REGION_RATING_CLIENT': 2.0,
 'REGION_RATING_CLIENT_W_CITY': 2.0,
 'WEEKDAY_APPR_PROCESS_START_x': 0.0,
 'HOUR_APPR_PROCESS_START_x': 10.0,
 'REG_REGION_NOT_LIVE_REGION': 1.0,
 'REG_REGION_NOT_WORK_REGION': 1.0,
 'LIVE_REGION_NOT_WORK_REGION': 0.0,
 'REG_CITY_NOT_LIVE

In [31]:
print ('{ "payload": ', payload, "}")

{ "payload":  {'NAME_CONTRACT_TYPE_x': 0.0, 'CODE_GENDER': 0.0, 'FLAG_OWN_CAR': 0.0, 'FLAG_OWN_REALTY': 0.0, 'CNT_CHILDREN': 2.0, 'AMT_INCOME_TOTAL': 450000.0, 'AMT_CREDIT_x': 450000.0, 'AMT_ANNUITY_x': 20979.0, 'AMT_GOODS_PRICE_x': 450000.0, 'NAME_TYPE_SUITE_x': 0.0, 'NAME_INCOME_TYPE': 0.0, 'NAME_EDUCATION_TYPE': 0.0, 'NAME_FAMILY_STATUS': 0.0, 'NAME_HOUSING_TYPE': 0.0, 'REGION_POPULATION_RELATIVE': 0.00733, 'DAYS_BIRTH': -10860.0, 'DAYS_EMPLOYED': -215.0, 'DAYS_REGISTRATION': -4606.0, 'DAYS_ID_PUBLISH': -3485.0, 'OWN_CAR_AGE': 6.0, 'FLAG_MOBIL': 1.0, 'FLAG_EMP_PHONE': 1.0, 'FLAG_WORK_PHONE': 0.0, 'FLAG_CONT_MOBILE': 1.0, 'FLAG_PHONE': 0.0, 'FLAG_EMAIL': 0.0, 'OCCUPATION_TYPE': 0.0, 'CNT_FAM_MEMBERS': 4.0, 'REGION_RATING_CLIENT': 2.0, 'REGION_RATING_CLIENT_W_CITY': 2.0, 'WEEKDAY_APPR_PROCESS_START_x': 0.0, 'HOUR_APPR_PROCESS_START_x': 10.0, 'REG_REGION_NOT_LIVE_REGION': 1.0, 'REG_REGION_NOT_WORK_REGION': 1.0, 'LIVE_REGION_NOT_WORK_REGION': 0.0, 'REG_CITY_NOT_LIVE_CITY': 1.0, 'REG_CIT

In [32]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(gbdt, y_req)

{'Prediction: ': array([0.], dtype=float32),
 'Probability: ': array([0.10488835], dtype=float32)}

In [33]:
## registering the model in Fosfor.
model_reg = register_model(gbdt,
               score, 
               name="Credit_Risk_XGB_Classifier", 
               description="Credit Risk XGBoost Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred,
               prob=y_prob,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [34]:
import pickle

In [37]:
with open('/data/Output/CreditRisk_XGBclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(gbdt, f) # serialize the list

In [39]:
with open('model_artifacts/CreditRisk_XGBclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(gbdt, f) # serialize the list

# RandomForest Classifier

In [40]:
RFC = RandomForestClassifier(**rf_params)
RFC.fit(x_train, y_train)

In [41]:
y_pred = RFC.predict(x_train)

In [42]:
y_prob = RFC.predict_proba(x_train)[:,1]

In [43]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [44]:
y_prob

array([0.14765435, 0.19342677, 0.04328996, ..., 0.02141991, 0.02572166,
       0.17335446])

In [45]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    y_pred = model.predict(data)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_prob }
    return temp_dict

In [46]:
payload  = x_train.iloc[0].to_dict()
print ('{ "payload": ', payload, "}")

{ "payload":  {'NAME_CONTRACT_TYPE_x': 0.0, 'CODE_GENDER': 0.0, 'FLAG_OWN_CAR': 0.0, 'FLAG_OWN_REALTY': 0.0, 'CNT_CHILDREN': 2.0, 'AMT_INCOME_TOTAL': 450000.0, 'AMT_CREDIT_x': 450000.0, 'AMT_ANNUITY_x': 20979.0, 'AMT_GOODS_PRICE_x': 450000.0, 'NAME_TYPE_SUITE_x': 0.0, 'NAME_INCOME_TYPE': 0.0, 'NAME_EDUCATION_TYPE': 0.0, 'NAME_FAMILY_STATUS': 0.0, 'NAME_HOUSING_TYPE': 0.0, 'REGION_POPULATION_RELATIVE': 0.00733, 'DAYS_BIRTH': -10860.0, 'DAYS_EMPLOYED': -215.0, 'DAYS_REGISTRATION': -4606.0, 'DAYS_ID_PUBLISH': -3485.0, 'OWN_CAR_AGE': 6.0, 'FLAG_MOBIL': 1.0, 'FLAG_EMP_PHONE': 1.0, 'FLAG_WORK_PHONE': 0.0, 'FLAG_CONT_MOBILE': 1.0, 'FLAG_PHONE': 0.0, 'FLAG_EMAIL': 0.0, 'OCCUPATION_TYPE': 0.0, 'CNT_FAM_MEMBERS': 4.0, 'REGION_RATING_CLIENT': 2.0, 'REGION_RATING_CLIENT_W_CITY': 2.0, 'WEEKDAY_APPR_PROCESS_START_x': 0.0, 'HOUR_APPR_PROCESS_START_x': 10.0, 'REG_REGION_NOT_LIVE_REGION': 1.0, 'REG_REGION_NOT_WORK_REGION': 1.0, 'LIVE_REGION_NOT_WORK_REGION': 0.0, 'REG_CITY_NOT_LIVE_CITY': 1.0, 'REG_CIT

In [47]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(RFC, y_req)

{'Prediction: ': array([0], dtype=int8), 'Probability: ': array([0.14765435])}

In [48]:
## registering the model in Fosfor.
model_reg = register_model(RFC,
               score, 
               name="Credit_Risk_RandomForest_Classifier", 
               description="Credit Risk RandomForest Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred,
               prob=y_prob,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [49]:
with open('/data/Output/CreditRisk_RFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC, f) # serialize the list

In [50]:
with open('model_artifacts/CreditRisk_RFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC, f) # serialize the list

# Extra Tree Classifier

In [51]:
ETC = ExtraTreesClassifier(**et_params)
ETC.fit(x_train, y_train)

In [52]:
y_pred = ETC.predict(x_train)
y_prob = ETC.predict_proba(x_train)[:,1]

In [53]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [54]:
y_prob

array([0.12784927, 0.14551793, 0.03972889, ..., 0.02458349, 0.02757475,
       0.14668218])

In [55]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    y_pred = model.predict(data)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_prob }
    return temp_dict

In [56]:
payload  = x_train.iloc[0].to_dict()
print ('{ "payload": ', payload, "}")

{ "payload":  {'NAME_CONTRACT_TYPE_x': 0.0, 'CODE_GENDER': 0.0, 'FLAG_OWN_CAR': 0.0, 'FLAG_OWN_REALTY': 0.0, 'CNT_CHILDREN': 2.0, 'AMT_INCOME_TOTAL': 450000.0, 'AMT_CREDIT_x': 450000.0, 'AMT_ANNUITY_x': 20979.0, 'AMT_GOODS_PRICE_x': 450000.0, 'NAME_TYPE_SUITE_x': 0.0, 'NAME_INCOME_TYPE': 0.0, 'NAME_EDUCATION_TYPE': 0.0, 'NAME_FAMILY_STATUS': 0.0, 'NAME_HOUSING_TYPE': 0.0, 'REGION_POPULATION_RELATIVE': 0.00733, 'DAYS_BIRTH': -10860.0, 'DAYS_EMPLOYED': -215.0, 'DAYS_REGISTRATION': -4606.0, 'DAYS_ID_PUBLISH': -3485.0, 'OWN_CAR_AGE': 6.0, 'FLAG_MOBIL': 1.0, 'FLAG_EMP_PHONE': 1.0, 'FLAG_WORK_PHONE': 0.0, 'FLAG_CONT_MOBILE': 1.0, 'FLAG_PHONE': 0.0, 'FLAG_EMAIL': 0.0, 'OCCUPATION_TYPE': 0.0, 'CNT_FAM_MEMBERS': 4.0, 'REGION_RATING_CLIENT': 2.0, 'REGION_RATING_CLIENT_W_CITY': 2.0, 'WEEKDAY_APPR_PROCESS_START_x': 0.0, 'HOUR_APPR_PROCESS_START_x': 10.0, 'REG_REGION_NOT_LIVE_REGION': 1.0, 'REG_REGION_NOT_WORK_REGION': 1.0, 'LIVE_REGION_NOT_WORK_REGION': 0.0, 'REG_CITY_NOT_LIVE_CITY': 1.0, 'REG_CIT

In [57]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(ETC, y_req)

{'Prediction: ': array([0], dtype=int8), 'Probability: ': array([0.12784927])}

In [58]:
## registering the model in Fosfor.
model_reg = register_model(ETC,
               score, 
               name="Credit_Risk_ExtraTree_Classifier", 
               description="Credit Risk ExtraTree Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred,
               prob=y_prob,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [59]:
with open('/data/Output/CreditRisk_ETclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(ETC, f) # serialize the list

In [60]:
with open('model_artifacts/CreditRisk_ETclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(ETC, f) # serialize the list

# CATBoost Classifier

In [61]:
CBC = CatBoostClassifier(**catboost_params)
CBC.fit(x_train, y_train)

0:	total: 74.6ms	remaining: 14.8s
1:	total: 97ms	remaining: 9.6s
2:	total: 123ms	remaining: 8.04s
3:	total: 146ms	remaining: 7.17s
4:	total: 171ms	remaining: 6.67s
5:	total: 195ms	remaining: 6.29s
6:	total: 220ms	remaining: 6.06s
7:	total: 242ms	remaining: 5.82s
8:	total: 268ms	remaining: 5.69s
9:	total: 291ms	remaining: 5.52s
10:	total: 313ms	remaining: 5.38s
11:	total: 336ms	remaining: 5.26s
12:	total: 359ms	remaining: 5.16s
13:	total: 381ms	remaining: 5.06s
14:	total: 408ms	remaining: 5.04s
15:	total: 433ms	remaining: 4.98s
16:	total: 455ms	remaining: 4.9s
17:	total: 480ms	remaining: 4.86s
18:	total: 504ms	remaining: 4.8s
19:	total: 536ms	remaining: 4.83s
20:	total: 560ms	remaining: 4.78s
21:	total: 591ms	remaining: 4.78s
22:	total: 620ms	remaining: 4.77s
23:	total: 648ms	remaining: 4.75s
24:	total: 675ms	remaining: 4.73s
25:	total: 705ms	remaining: 4.72s
26:	total: 731ms	remaining: 4.69s
27:	total: 756ms	remaining: 4.64s
28:	total: 778ms	remaining: 4.59s
29:	total: 799ms	remaining:

<catboost.core.CatBoostClassifier at 0x7f0995ac5fa0>

In [62]:
y_pred = CBC.predict(x_train)
y_prob = CBC.predict_proba(x_train)[:,1]

In [63]:
y_pred

array([0, 1, 0, ..., 0, 0, 1])

In [64]:
y_prob

array([0.48134981, 0.59034187, 0.11299618, ..., 0.06110293, 0.07483319,
       0.60933042])

In [65]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    y_pred = model.predict(data)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_prob }
    return temp_dict

In [66]:
payload  = x_train.iloc[0].to_dict()
print ('{ "payload": ', payload, "}")

{ "payload":  {'NAME_CONTRACT_TYPE_x': 0.0, 'CODE_GENDER': 0.0, 'FLAG_OWN_CAR': 0.0, 'FLAG_OWN_REALTY': 0.0, 'CNT_CHILDREN': 2.0, 'AMT_INCOME_TOTAL': 450000.0, 'AMT_CREDIT_x': 450000.0, 'AMT_ANNUITY_x': 20979.0, 'AMT_GOODS_PRICE_x': 450000.0, 'NAME_TYPE_SUITE_x': 0.0, 'NAME_INCOME_TYPE': 0.0, 'NAME_EDUCATION_TYPE': 0.0, 'NAME_FAMILY_STATUS': 0.0, 'NAME_HOUSING_TYPE': 0.0, 'REGION_POPULATION_RELATIVE': 0.00733, 'DAYS_BIRTH': -10860.0, 'DAYS_EMPLOYED': -215.0, 'DAYS_REGISTRATION': -4606.0, 'DAYS_ID_PUBLISH': -3485.0, 'OWN_CAR_AGE': 6.0, 'FLAG_MOBIL': 1.0, 'FLAG_EMP_PHONE': 1.0, 'FLAG_WORK_PHONE': 0.0, 'FLAG_CONT_MOBILE': 1.0, 'FLAG_PHONE': 0.0, 'FLAG_EMAIL': 0.0, 'OCCUPATION_TYPE': 0.0, 'CNT_FAM_MEMBERS': 4.0, 'REGION_RATING_CLIENT': 2.0, 'REGION_RATING_CLIENT_W_CITY': 2.0, 'WEEKDAY_APPR_PROCESS_START_x': 0.0, 'HOUR_APPR_PROCESS_START_x': 10.0, 'REG_REGION_NOT_LIVE_REGION': 1.0, 'REG_REGION_NOT_WORK_REGION': 1.0, 'LIVE_REGION_NOT_WORK_REGION': 0.0, 'REG_CITY_NOT_LIVE_CITY': 1.0, 'REG_CIT

In [67]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(CBC, y_req)

{'Prediction: ': array([0]), 'Probability: ': array([0.48134981])}

In [68]:
## registering the model in Fosfor.
model_reg = register_model(CBC,
               score, 
               name="Credit_Risk_CATBoost_Classifier", 
               description="Credit Risk CatBoost Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred,
               prob=y_prob,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [69]:
with open('/data/Output/CreditRisk_CatBoostclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(CBC, f) # serialize the list

In [70]:
with open('model_artifacts/CreditRisk_CatBoostclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(CBC, f) # serialize the list

# Version 1 of Stacking Classifier

In [71]:
x_train_XGB = gbdt.predict(xgb.DMatrix(x_train))
x_train_RFC = RFC.predict_proba(x_train)[:,1]
x_train_ETC = ETC.predict_proba(x_train)[:,1]
x_train_CBC = CBC.predict_proba(x_train)[:,1]

In [72]:
x_test_XGB = gbdt.predict(xgb.DMatrix(x_test))
x_test_RFC = RFC.predict_proba(x_test)[:,1]
x_test_ETC = ETC.predict_proba(x_test)[:,1]
x_test_CBC = CBC.predict_proba(x_test)[:,1]

In [73]:
x_train_XGBT = x_train_XGB.reshape(-1,1)
x_train_RFCT = x_train_RFC.reshape(-1,1)
x_train_ETCT = x_train_ETC.reshape(-1,1)
x_train_CBCT = x_train_CBC.reshape(-1,1)

In [74]:
x_train_stk = np.concatenate((x_train_XGBT, x_train_RFCT, x_train_ETCT, x_train_CBCT),axis=1)

In [75]:
x_train_stk.shape

(307511, 4)

In [76]:
x_test_XGBT = x_test_XGB.reshape(-1,1)
x_test_RFCT = x_test_RFC.reshape(-1,1)
x_test_ETCT = x_test_ETC.reshape(-1,1)
x_test_CBCT = x_test_CBC.reshape(-1,1)

In [77]:
x_test_stk = np.concatenate((x_test_XGBT, x_test_RFCT, x_test_ETCT, x_test_CBCT),axis=1)
x_test_stk.shape

(48744, 4)

In [78]:
RFC_Stack = RandomForestClassifier()
RFC_Stack.fit(x_train_stk,y_train)

In [79]:
y_pred_stk = RFC_Stack.predict(x_train_stk)
y_prob_stk = RFC_Stack.predict_proba(x_train_stk)[:,1]

In [80]:
ytest_pred_stk = RFC_Stack.predict(x_test_stk)
ytest_prob_stk = RFC_Stack.predict_proba(x_test_stk)[:,1]

In [81]:
y_pred_stk

array([0, 1, 0, ..., 0, 0, 0], dtype=int8)

In [82]:
y_prob_stk

array([0.07, 0.92, 0.03, ..., 0.  , 0.  , 0.02])

In [83]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    
    data_XGB = gbdt.predict(xgb.DMatrix(data))
    data_RFC = RFC.predict_proba(data)[:,1]
    data_ETC = ETC.predict_proba(data)[:,1]
    data_CBC = CBC.predict_proba(data)[:,1]
    
    data_XGBT = data_XGB.reshape(-1,1)
    data_RFCT = data_RFC.reshape(-1,1)
    data_ETCT = data_ETC.reshape(-1,1)
    data_CBCT = data_CBC.reshape(-1,1)
    
    data = np.concatenate((data_XGBT, data_RFCT, data_ETCT, data_CBCT), axis=1)
    
    y_pred = model.predict(data)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_prob }
    return temp_dict

In [84]:
payload  = x_train.iloc[0].to_dict()
print ('{ "payload": ', payload, "}")

{ "payload":  {'NAME_CONTRACT_TYPE_x': 0.0, 'CODE_GENDER': 0.0, 'FLAG_OWN_CAR': 0.0, 'FLAG_OWN_REALTY': 0.0, 'CNT_CHILDREN': 2.0, 'AMT_INCOME_TOTAL': 450000.0, 'AMT_CREDIT_x': 450000.0, 'AMT_ANNUITY_x': 20979.0, 'AMT_GOODS_PRICE_x': 450000.0, 'NAME_TYPE_SUITE_x': 0.0, 'NAME_INCOME_TYPE': 0.0, 'NAME_EDUCATION_TYPE': 0.0, 'NAME_FAMILY_STATUS': 0.0, 'NAME_HOUSING_TYPE': 0.0, 'REGION_POPULATION_RELATIVE': 0.00733, 'DAYS_BIRTH': -10860.0, 'DAYS_EMPLOYED': -215.0, 'DAYS_REGISTRATION': -4606.0, 'DAYS_ID_PUBLISH': -3485.0, 'OWN_CAR_AGE': 6.0, 'FLAG_MOBIL': 1.0, 'FLAG_EMP_PHONE': 1.0, 'FLAG_WORK_PHONE': 0.0, 'FLAG_CONT_MOBILE': 1.0, 'FLAG_PHONE': 0.0, 'FLAG_EMAIL': 0.0, 'OCCUPATION_TYPE': 0.0, 'CNT_FAM_MEMBERS': 4.0, 'REGION_RATING_CLIENT': 2.0, 'REGION_RATING_CLIENT_W_CITY': 2.0, 'WEEKDAY_APPR_PROCESS_START_x': 0.0, 'HOUR_APPR_PROCESS_START_x': 10.0, 'REG_REGION_NOT_LIVE_REGION': 1.0, 'REG_REGION_NOT_WORK_REGION': 1.0, 'LIVE_REGION_NOT_WORK_REGION': 0.0, 'REG_CITY_NOT_LIVE_CITY': 1.0, 'REG_CIT

In [85]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(RFC_Stack, y_req)

{'Prediction: ': array([0], dtype=int8), 'Probability: ': array([0.07])}

In [86]:
## registering the model in Fosfor.
model_reg = register_model(RFC_Stack,
               score, 
               name="Credit_Risk_StackedRF_Classifier", 
               description="Credit Risk Stacked RF Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred,
               prob=y_prob,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [87]:
with open('/data/Output/CreditRisk_StackedRFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

In [88]:
with open('model_artifacts/CreditRisk_StackedRFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

# Version 2 of Stacking Classifier

In [89]:
x_train_stk = np.concatenate((x_train_RFCT, x_train_ETCT, x_train_CBCT),axis=1)
x_test_stk = np.concatenate(( x_test_RFCT, x_test_ETCT, x_test_CBCT),axis=1)

RFC_Stack = RandomForestClassifier()
RFC_Stack.fit(x_train_stk,y_train)

y_pred_stk = RFC_Stack.predict(x_train_stk)
y_prob_stk = RFC_Stack.predict_proba(x_train_stk)[:,1]

ytest_pred_stk = RFC_Stack.predict(x_test_stk)
ytest_prob_stk = RFC_Stack.predict_proba(x_test_stk)[:,1]

y_pred_stk
y_prob_stk

array([0.26, 0.84, 0.01, ..., 0.  , 0.  , 0.04])

In [90]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    
    #data_XGB = gbdt.predict(xgb.DMatrix(data))
    data_RFC = RFC.predict_proba(data)[:,1]
    data_ETC = ETC.predict_proba(data)[:,1]
    data_CBC = CBC.predict_proba(data)[:,1]
    
    #data_XGBT = data_XGB.reshape(-1,1)
    data_RFCT = data_RFC.reshape(-1,1)
    data_ETCT = data_ETC.reshape(-1,1)
    data_CBCT = data_CBC.reshape(-1,1)
    
    #data = np.concatenate((data_XGBT, data_RFCT, data_ETCT, data_CBCT), axis=1)
    data = np.concatenate((data_RFCT, data_ETCT, data_CBCT), axis=1)
    
    y_pred = model.predict(data)
    y_prob = model.predict_proba(data)[:,1]
    temp_dict = {"Prediction: ": np.round(y_pred), "Probability: ": y_prob }
    return temp_dict

In [91]:
## registering the model in Fosfor with correct output (y_pred_stk, y_prob_stk.
model_reg = register_model(RFC_Stack,
               score, 
               name="Credit_Risk_StackedRF_Classifier", 
               description="Credit Risk Stacked RF Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_train,
               y_pred=y_pred_stk,
               prob=y_prob_stk,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [94]:
with open('/data/Output/CreditRisk_StackedRFclassifier_v2.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

In [95]:
with open('model_artifacts/CreditRisk_StackedRFclassifier_v2.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

# Create Sample Dataframe/Table for Monitoring Setup 

In [96]:
x_train.shape, y_train.shape, y_pred_stk.shape, y_prob_stk.shape

((307511, 156), (307511,), (307511,), (307511,))

In [97]:
x_test.shape ,  ytest_pred_stk.shape, ytest_prob_stk.shape

((48744, 156), (48744,), (48744,))

In [98]:
type(x_train), type(y_train), type(y_pred_stk), type(y_prob_stk)

(pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 numpy.ndarray,
 numpy.ndarray)

In [99]:
df_pred_stk = pd.DataFrame(y_pred_stk, columns=['Prediction'])

In [100]:
df_prob_stk = pd.DataFrame(y_prob_stk, columns=['Probability'])

In [101]:
df_train = pd.DataFrame(y_train, columns=['TARGET'])

In [102]:
temp_df = pd.concat([x_train, df_train, df_pred_stk, df_prob_stk], axis=1)

In [103]:
temp_df.to_csv('/data/Output/application_train_output.csv')

In [104]:
temp_df = pd.read_csv('/data/Output/application_train_output.csv')

In [105]:
temp_df_1 = temp_df.sample(frac = 0.7)
temp_df_2 = temp_df.drop(temp_df_1.index)

In [106]:
print (temp_df.shape)
print (temp_df_1.shape)
print (temp_df_2.shape)

(307511, 160)
(215258, 160)
(92253, 160)


In [107]:
type(temp_df_1)

pandas.core.frame.DataFrame

In [15]:
df_model=session.createDataFrame(
        temp_df_1.values.tolist(),
        schema=temp_df_1.columns.tolist())
df_model.write.mode("overwrite").save_as_table("FDC_Banking_FS.PUBLIC.CRA_APPLICATION_OUTPUT_BATCH1")

In [16]:
df_model=session.createDataFrame(
        temp_df_2.values.tolist(),
        schema=temp_df_2.columns.tolist())
df_model.write.mode("overwrite").save_as_table("FDC_Banking_FS.PUBLIC.CRA_APPLICATION_OUTPUT_BATCH2")