# Use Credit Risk Analytics Notebook Template

In [1]:
from snowflake.snowpark import Session
from snowflake.connector.pandas_tools import write_pandas
# Data Science Libs
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import gc

# FosforML to register Model on FDC
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
from fosforml import register_model

import pickle
from joblib import dump, load
import requests

In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

In [3]:
table_name = 'CRA_APPLICATION_TRAIN_DETAILS'

sf_df = my_session.sql("select * from {}".format(table_name))
application_train_sf = sf_df.to_pandas()

In [4]:
table_name = 'CRA_APPLICATION_TEST_DETAILS'

sf_df = my_session.sql("select * from {}".format(table_name))
application_test_sf = sf_df.to_pandas()

In [5]:
table_name = 'CRA_PREVIOUS_APPLICATION_DETAILS'

sf_df = my_session.sql("select * from {}".format(table_name))
previous_application_sf = sf_df.to_pandas()

In [60]:
#application_train_sf  = snowflake.get_dataframe("CRA_APPLICATION_TRAIN_DETAILS")
#application_test_sf  = snowflake.get_dataframe("CRA_APPLICATION_TEST_DETAILS")
#previous_application_sf  = snowflake.get_dataframe("CRA_PREVIOUS_APPLICATION_DETAILS")

# Convert Snowflake data into Pandas dataframes

In [10]:
data = application_train_sf.copy()
test = application_test_sf.copy()
prev = previous_application_sf.copy()

In [11]:
data.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
test.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)
prev.drop(['CREATED_BY','CREATED_AT'], axis=1, inplace=True)

In [12]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [13]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [14]:
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [15]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [16]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [17]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

In [20]:
x_train.columns = x_train.columns.str.upper()
x_test.columns = x_test.columns.str.upper()

In [21]:
x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [22]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [23]:
x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [24]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]

In [25]:
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]

In [26]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))
    
    def predict_proba(self, x):
        return self.gbdt.predict_proba(xgb.DMatrix(x))[:,1]

In [27]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [28]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'random_state':0
}

In [29]:
rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'random_state':0
}

In [30]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

In [31]:
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False,
    'random_seed':0
}

In [32]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)

# XGBoost Classifier

In [25]:
xg_oof_train, xg_oof_test = get_oof(xg)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



In [26]:
param = xgb_params
param['seed'] = 0
nrounds = 250

dtrain = xgb.DMatrix(x_train, label=y_train)
gbdt = xgb.train(param, dtrain, nrounds)

Parameters: { "silent" } are not used.



In [82]:
#y_prob = gbdt.predict(xgb.DMatrix(x))
#y_pred = np.round(y_prob)

In [27]:
y_prob = gbdt.predict(xgb.DMatrix(x_train))

In [28]:
y_pred = np.round(y_prob)

In [29]:
y_prob

array([0.0776754 , 0.0133677 , 0.05679303, ..., 0.1400755 , 0.05083345,
       0.03295279], dtype=float32)

In [30]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [40]:
y_train_df = pd.DataFrame(y_train)
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTION'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

In [43]:
type(x_train),type(x_test), type(y_train_df),type(y_pred_df), type(y_prob_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [45]:
from fosforml import register_model

In [46]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=gbdt, 
    session=my_session,
    x_train=x_train,
    y_train=y_train_df,
    x_test=x_train,
    y_test=y_train_df,
    y_pred=y_pred_df,
    prob=y_prob_df,
    source="Notebook",
    dataset_name="CRA_APPLICATION_TRAIN_DETAILS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Credit_Risk_XGB_Classifier",
    description="Credit Risk XGBoost Classification Model",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

Exception: Failed to get model performance metrics. 'feature_names'

In [51]:
import pickle

In [53]:
with open('./CreditRisk_XGBclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(gbdt, f) # serialize the list

In [94]:
#with open('model_artifacts/CreditRisk_XGBclassifier_v1.pkl', 'wb') as f:  # open a text file
#    pickle.dump(gbdt, f) # serialize the list

# RandomForest Classifier

In [33]:
RFC = RandomForestClassifier(**rf_params)
RFC.fit(x_train, y_train)

In [34]:
y_pred = RFC.predict(x_train)

In [35]:
y_prob = RFC.predict_proba(x_train)[:,1]

In [36]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [37]:
y_prob

array([0.06920236, 0.02455619, 0.04306713, ..., 0.14702164, 0.07165822,
       0.04874642])

In [38]:
y_train_df = pd.DataFrame(y_train)
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTION'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

In [39]:
type(x_train),type(x_test), type(y_train_df),type(y_pred_df), type(y_prob_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [40]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=RFC, 
    session=my_session,
    x_train=x_train,
    y_train=y_train_df,
    x_test=x_train,
    y_test=y_train_df,
    y_pred=y_pred_df,
    prob=y_prob_df,
    source="Notebook",
    dataset_name="CRA_APPLICATION_TRAIN_DETAILS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Credit_Risk_RandomForest_Classifier",
    description="Credit Risk RandomForest Classification Model",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_confusion_matrix_computer.<locals>.ConfusionMatrixComputer'>. Proceeding without creating optional arguments
Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.metrics_utils.register_accumulator_udtf.<locals>.Accumulator'>. Proceeding without creating optional arguments


Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%


Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_multilabel_confusion_matrix_computer.<locals>.MultilabelConfusionMatrixComputer'>. Proceeding without creating optional arguments
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.
DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.
Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_multilabel_confusion_matrix_computer.<locals>.MultilabelConfusionMatrixComputer'>. Proceeding without creating optional arguments
The version of package 'scikit-learn' in the local environment is 1.

Calculating build time metrics

Progress: ██████████████████████████████████████████                             60.0%


The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'snowflake-snowpark-python' in the local environment is 1.21.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.
Got error object of type 'NoneType' has no len() when trying to read default values from function: <function roc_curve.<locals>.roc_curve_anon_sproc at 0x7f589c1caf70>. Proceeding without creating optional arguments
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local enviro

Calculating build time metrics

Progress: ████████████████████████████████████████████████████████               80.0%
Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


"Model 'MODEL_E5DDB3DE_670C_4AAA_ABD4_6EEA047DAB71_FDC_CREDIT_RISK_RANDOMFOREST_CLASSIFIER' registered successfully."

In [42]:
with open('./CreditRisk_RFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC, f) # serialize the list

In [None]:
#with open('model_artifacts/CreditRisk_RFclassifier_v1.pkl', 'wb') as f:  # open a text file
#    pickle.dump(RFC, f) # serialize the list

# Extra Tree Classifier

In [43]:
ETC = ExtraTreesClassifier(**et_params)
ETC.fit(x_train, y_train)

In [44]:
y_pred = ETC.predict(x_train)
y_prob = ETC.predict_proba(x_train)[:,1]

In [45]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [46]:
y_prob

array([0.0804109 , 0.02471077, 0.03529438, ..., 0.13456295, 0.07011287,
       0.03994063])

In [47]:
y_train_df = pd.DataFrame(y_train)
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTION'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

In [48]:
type(x_train),type(x_test), type(y_train_df),type(y_pred_df), type(y_prob_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [49]:
from fosforml import register_model
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=ETC, 
    session=my_session,
    x_train=x_train,
    y_train=y_train_df,
    x_test=x_train,
    y_test=y_train_df,
    y_pred=y_pred_df,
    prob=y_prob_df,
    source="Notebook",
    dataset_name="CRA_APPLICATION_TRAIN_DETAILS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Credit_Risk_ExtraTree_Classifier",
    description="Credit Risk ExtraTree Classification Model",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)



Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%




Calculating build time metrics

Progress: ██████████████████████████████████████████                             60.0%




Calculating build time metrics

Progress: ████████████████████████████████████████████████████████               80.0%
Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


"Model 'MODEL_E5DDB3DE_670C_4AAA_ABD4_6EEA047DAB71_FDC_CREDIT_RISK_EXTRATREE_CLASSIFIER' registered successfully."

In [50]:
import pickle
with open('./CreditRisk_ETclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(ETC, f) # serialize the list

In [None]:
#with open('model_artifacts/CreditRisk_ETclassifier_v1.pkl', 'wb') as f:  # open a text file
#    pickle.dump(ETC, f) # serialize the list

# CATBoost Classifier

In [35]:
CBC = CatBoostClassifier(**catboost_params)
CBC.fit(x_train, y_train)

0:	total: 101ms	remaining: 20.1s
1:	total: 160ms	remaining: 15.8s
2:	total: 211ms	remaining: 13.8s
3:	total: 264ms	remaining: 12.9s
4:	total: 317ms	remaining: 12.4s
5:	total: 375ms	remaining: 12.1s
6:	total: 443ms	remaining: 12.2s
7:	total: 510ms	remaining: 12.2s
8:	total: 568ms	remaining: 12.1s
9:	total: 632ms	remaining: 12s
10:	total: 708ms	remaining: 12.2s
11:	total: 765ms	remaining: 12s
12:	total: 828ms	remaining: 11.9s
13:	total: 899ms	remaining: 11.9s
14:	total: 958ms	remaining: 11.8s
15:	total: 1s	remaining: 11.6s
16:	total: 1.06s	remaining: 11.4s
17:	total: 1.11s	remaining: 11.2s
18:	total: 1.15s	remaining: 11s
19:	total: 1.21s	remaining: 10.9s
20:	total: 1.26s	remaining: 10.8s
21:	total: 1.34s	remaining: 10.8s
22:	total: 1.43s	remaining: 11s
23:	total: 1.5s	remaining: 11s
24:	total: 1.55s	remaining: 10.8s
25:	total: 1.6s	remaining: 10.7s
26:	total: 1.66s	remaining: 10.7s
27:	total: 1.71s	remaining: 10.5s
28:	total: 1.76s	remaining: 10.4s
29:	total: 1.81s	remaining: 10.3s
30:	t

<catboost.core.CatBoostClassifier at 0x7f891068e3a0>

In [36]:
y_pred = CBC.predict(x_train)
y_prob = CBC.predict_proba(x_train)[:,1]

In [37]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [38]:
y_prob

array([0.25499246, 0.05174261, 0.23989301, ..., 0.43857352, 0.15645137,
       0.12447822])

In [39]:
y_train_df = pd.DataFrame(y_train)
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTION'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

In [40]:
type(x_train),type(x_test), type(y_train_df),type(y_pred_df), type(y_prob_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [41]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=CBC, 
    session=my_session,
    x_train=x_train,
    y_train=y_train_df,
    x_test=x_train,
    y_test=y_train_df,
    y_pred=y_pred_df,
    prob=y_prob_df,
    source="Notebook",
    dataset_name="CRA_APPLICATION_TRAIN_DETAILS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Credit_Risk_CATBoost_Classifier",
    description="Credit Risk CATBoost Classification Model",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

Exception: Failed to get model performance metrics. 'feature_names'

In [42]:
with open('./CreditRisk_CatBoostclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(CBC, f) # serialize the list

In [None]:
#with open('model_artifacts/CreditRisk_CatBoostclassifier_v1.pkl', 'wb') as f:  # open a text file
#    pickle.dump(CBC, f) # serialize the list

# Version 1 of Stacking Classifier

In [51]:
x_train_RFC = RFC.predict_proba(x_train)[:,1]
x_train_ETC = ETC.predict_proba(x_train)[:,1]

In [None]:
x_train_XGB = gbdt.predict(xgb.DMatrix(x_train))
x_train_CBC = CBC.predict_proba(x_train)[:,1]

In [52]:
x_test_RFC = RFC.predict_proba(x_test)[:,1]
x_test_ETC = ETC.predict_proba(x_test)[:,1]

In [None]:
x_test_XGB = gbdt.predict(xgb.DMatrix(x_test))
x_test_CBC = CBC.predict_proba(x_test)[:,1]

In [53]:
x_train_RFCT = x_train_RFC.reshape(-1,1)
x_train_ETCT = x_train_ETC.reshape(-1,1)

In [None]:
x_train_XGBT = x_train_XGB.reshape(-1,1)
x_train_CBCT = x_train_CBC.reshape(-1,1)

In [54]:
#x_train_stk = np.concatenate((x_train_XGBT, x_train_RFCT, x_train_ETCT, x_train_CBCT),axis=1)
x_train_stk = np.concatenate((x_train_RFCT, x_train_ETCT),axis=1)

In [55]:
x_train_stk.shape

(307511, 2)

In [56]:
x_test_RFCT = x_test_RFC.reshape(-1,1)
x_test_ETCT = x_test_ETC.reshape(-1,1)

In [None]:
x_test_XGBT = x_test_XGB.reshape(-1,1)
x_test_CBCT = x_test_CBC.reshape(-1,1)

In [57]:
#x_test_stk = np.concatenate((x_test_XGBT, x_test_RFCT, x_test_ETCT, x_test_CBCT),axis=1)
x_test_stk = np.concatenate((x_test_RFCT, x_test_ETCT),axis=1)
x_test_stk.shape

(48744, 2)

In [58]:
RFC_Stack = RandomForestClassifier()
RFC_Stack.fit(x_train_stk,y_train)

In [59]:
y_pred_stk = RFC_Stack.predict(x_train_stk)
y_prob_stk = RFC_Stack.predict_proba(x_train_stk)[:,1]

In [60]:
ytest_pred_stk = RFC_Stack.predict(x_test_stk)
ytest_prob_stk = RFC_Stack.predict_proba(x_test_stk)[:,1]

In [61]:
ytest_pred_stk

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [62]:
ytest_prob_stk

array([0.  , 0.04, 0.  , ..., 0.  , 0.  , 0.06])

In [66]:
x_train_stk

array([[0.06920236, 0.0804109 ],
       [0.02455619, 0.02471077],
       [0.04306713, 0.03529438],
       ...,
       [0.14702164, 0.13456295],
       [0.07165822, 0.07011287],
       [0.04874642, 0.03994063]])

In [67]:
x_train_df = pd.DataFrame(x_train_stk, columns=['A','B'])

In [63]:
y_train_df = pd.DataFrame(y_train)
y_pred_df = pd.DataFrame(ytest_pred_stk, columns=['PREDICTION'])
y_prob_df = pd.DataFrame(ytest_prob_stk, columns=['PROBABILITY'])

In [69]:
type(x_train_df),type(y_train_df),type(y_pred_df), type(y_prob_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [70]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=RFC_Stack, 
    session=my_session,
    x_train=x_train_df,
    y_train=y_train_df,
    x_test=x_train_df,
    y_test=y_train_df,
    y_pred=y_pred_df,
    prob=y_prob_df,
    source="Notebook",
    dataset_name="CRA_APPLICATION_TRAIN_DETAILS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Credit_Risk_StackedRF_Classifier",
    description="Credit Risk Stacked RF Classification Model",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)



Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%




(1300) (1304): 01b6dd68-0710-dac5-0072-f30310ab54be: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and



(1300) (1304): 01b6dd68-0710-dac5-0072-f30310ab54de: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/ea8ed5a0e1ade0c32e9d9f9496043fbbfc23d3c483d0ea40d43816b8ac9cc5ca/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and



Error in while calculating roc_auc 
Calculating build time metrics

Progress: ████████████████████████████████████████████████████████               80.0%
Error in while calculating feature_importance 
Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


"Model 'MODEL_E5DDB3DE_670C_4AAA_ABD4_6EEA047DAB71_FDC_CREDIT_RISK_STACKEDRF_CLASSIFIER' registered successfully."

In [75]:
x_train_df_1 = x_train.head(199999)

In [None]:
df_model=my_session.createDataFrame(
        x_train_df_1.values.tolist(),
        schema=x_train_df_1.columns.tolist())
df_model.write.mode("overwrite").save_as_table("FDC_BANKING_FS.BFS_CREDIT_RISK_SCHEMA.CRA_APPLICATION_STACK_INPUT")

In [None]:
with open('./CreditRisk_StackedRFclassifier_v1.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

In [None]:
#with open('model_artifacts/CreditRisk_StackedRFclassifier_v1.pkl', 'wb') as f:  # open a text file
#    pickle.dump(RFC_Stack, f) # serialize the list

# Version 2 of Stacking Classifier

In [None]:
x_train_stk = np.concatenate((x_train_RFCT, x_train_ETCT, x_train_CBCT),axis=1)
x_test_stk = np.concatenate(( x_test_RFCT, x_test_ETCT, x_test_CBCT),axis=1)

RFC_Stack = RandomForestClassifier()
RFC_Stack.fit(x_train_stk,y_train)

y_pred_stk = RFC_Stack.predict(x_train_stk)
y_prob_stk = RFC_Stack.predict_proba(x_train_stk)[:,1]

ytest_pred_stk = RFC_Stack.predict(x_test_stk)
ytest_prob_stk = RFC_Stack.predict_proba(x_test_stk)[:,1]

y_pred_stk
y_prob_stk

In [None]:
## registering the model in Fosfor with correct output (y_pred_stk, y_prob_stk.
model_reg = register_model(RFC_Stack,
               score, 
               name="Credit_Risk_StackedRF_Classifier", 
               description="Credit Risk Stacked RF Classification Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               #init_script="pip install snowflake-ml-python==1.0.11",
               init_script="\\n pip install fosforml \\n pip install fosforio[snowflake] \\n pip install seaborn \\n pip install plotly \\n pip install statsmodels \\n pip install xgboost \\n pip install catboost \\n pip install snowflake-connector-python[pandas]",                              
               y_true=y_train,
               y_pred=y_pred_stk,
               prob=y_prob_stk,
               features=x_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=x_train, 
               x_test=x_train, 
               y_train=y_train.tolist(),
               y_test=y_train.tolist(),
               feature_names=x_train.columns.tolist(),
               original_features=x_train.columns.tolist(),
               feature_ids=x_train.columns,
               kyd=True, kyd_score = True)

In [None]:
with open('/data/Output/CreditRisk_StackedRFclassifier_v2.pkl', 'wb') as f:  # open a text file
    pickle.dump(RFC_Stack, f) # serialize the list

In [None]:
#with open('model_artifacts/CreditRisk_StackedRFclassifier_v2.pkl', 'wb') as f:  # open a text file
#    pickle.dump(RFC_Stack, f) # serialize the list

# Create Sample Dataframe/Table for Monitoring Setup 

In [None]:
x_train.shape, y_train.shape, y_pred_stk.shape, y_prob_stk.shape

In [None]:
x_test.shape ,  ytest_pred_stk.shape, ytest_prob_stk.shape

In [None]:
type(x_train), type(y_train), type(y_pred_stk), type(y_prob_stk)

In [None]:
df_pred_stk = pd.DataFrame(y_pred_stk, columns=['Prediction'])

In [None]:
df_prob_stk = pd.DataFrame(y_prob_stk, columns=['Probability'])

In [None]:
df_train = pd.DataFrame(y_train, columns=['TARGET'])

In [None]:
temp_df = pd.concat([x_train, df_train, df_pred_stk, df_prob_stk], axis=1)

In [None]:
temp_df.to_csv('/data/Output/application_train_output.csv')

In [None]:
temp_df = pd.read_csv('/data/Output/application_train_output.csv')

In [None]:
temp_df_1 = temp_df.sample(frac = 0.7)
temp_df_2 = temp_df.drop(temp_df_1.index)

In [None]:
print (temp_df.shape)
print (temp_df_1.shape)
print (temp_df_2.shape)

In [None]:
type(temp_df_1)

In [None]:
<Suppose to fail here>

In [None]:
df_model=session.createDataFrame(
        temp_df_1.values.tolist(),
        schema=temp_df_1.columns.tolist())
df_model.write.mode("overwrite").save_as_table("FDC_Banking_FS.PUBLIC.CRA_APPLICATION_OUTPUT_BATCH1")

In [None]:
df_model=session.createDataFrame(
        temp_df_2.values.tolist(),
        schema=temp_df_2.columns.tolist())
df_model.write.mode("overwrite").save_as_table("FDC_Banking_FS.PUBLIC.CRA_APPLICATION_OUTPUT_BATCH2")