# Importing Libraries

In [1]:
#Snowpark lib
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd

# create_temp_table warning suppression
import warnings; warnings.simplefilter('ignore')

#ConfigParser to read ini file
import configparser

from fosforio import snowflake

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# Fetching Data

In [2]:
# Name under which the dataset is published on the Snowflake connection
DATASET_NAME = "MASTER"

# Fetch the published dataset into `df`, then preview its first five rows
df = snowflake.get_dataframe(DATASET_NAME)
df.head(n=5)

Unnamed: 0,ID,GENDER,DOB,CITY,STATE,AGE,MONTHLY_INCOME,EMPLOYER_NAME,SALARY_ACCOUNT,LOAN_AMOUNT_APPLIED,...,QUARTER,MOBILE_VERIFIED,FILLED_FORM,DEVICE_TYPE,VAR2,SOURCE,VAR4,VAR5,VAR1,DISBURSED
0,ID000977Z13,Male,10/1/1988,New York,New York,36,1313.242865,Hcl Global Systems,Bank of Hope,900000.0,...,4,Y,Y,Mobile,E,S127,7.0,14,HBXX,1.0
1,ID000626U25,Male,4/1/1992,New York,New York,32,1270.880192,Synechron,Bank of Marin,900000.0,...,2,Y,N,Web-browser,G,S151,5.0,10,HBXB,1.0
2,ID019473Z30,Male,02/1/1996,New York,New York,28,500.0,Syntel,Exchange Bank,10000.0,...,2,Y,N,Web-browser,B,S144,4.0,11,HBXC,0.0
3,ID000062G50,Male,3/6/1989,New York,New York,35,1497.0,Uber Technologies,Bank of Marin,900000.0,...,1,Y,Y,Web-browser,B,S133,4.0,0,HAXB,1.0
4,ID000935I02,Female,30/8/1996,New York,New York,28,903.0,Hcl America,CIT Bank,700000.0,...,4,N,Y,Web-browser,B,S160,1.0,3,HBXX,0.0


# Preprocessing

In [3]:
# Build the modelling frame: remove the columns that are not used as model inputs
data = df.drop(
    columns=[
        "ID",
        "LEAD_CREATION_DATE",
        "DEVICE_TYPE",
        "SALARY_ACCOUNT",
        "CITY",
        "DOB",
        "EMPLOYER_NAME",
        "YEAR",
        "QUARTER",
        "MONTH",
    ]
)

In [4]:
# Preview the first five rows of the reduced frame
data.head(n=5)

Unnamed: 0,GENDER,STATE,AGE,MONTHLY_INCOME,LOAN_AMOUNT_APPLIED,LOAN_TENURE_APPLIED,EXISTING_EMI,MOBILE_VERIFIED,FILLED_FORM,VAR2,SOURCE,VAR4,VAR5,VAR1,DISBURSED
0,Male,New York,36,1313.242865,900000.0,3.0,139.0,Y,Y,E,S127,7.0,14,HBXX,1.0
1,Male,New York,32,1270.880192,900000.0,3.0,0.0,Y,N,G,S151,5.0,10,HBXB,1.0
2,Male,New York,28,500.0,10000.0,0.0,100.0,Y,N,B,S144,4.0,11,HBXC,0.0
3,Male,New York,35,1497.0,900000.0,3.0,140.0,Y,Y,B,S133,4.0,0,HAXB,1.0
4,Female,New York,28,903.0,700000.0,3.0,0.0,N,Y,B,S160,1.0,3,HBXX,0.0


In [5]:
# Keep only the rows that have no missing value in any remaining column
data = data.dropna(axis=0, how="any")

In [6]:
# Column holding the label; it is excluded from both feature lists below
target = "DISBURSED"

# (column, dtype) pairs for every column except the target
feature_dtypes = [
    (column, dtype)
    for column, dtype in dict(data.dtypes).items()
    if column != target
]

# Object-dtype columns are treated as categorical, every other dtype as numeric
cat_col = [column for column, dtype in feature_dtypes if dtype == "O"]
num_col = [column for column, dtype in feature_dtypes if not (dtype == "O")]

# Train Test Split

In [7]:
# Features are every column except the target; the target column is the label
used_cols = [column for column in data.columns.tolist() if column != target]
X, y = data[used_cols], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Building and Training Model Pipeline

In [8]:
# Numeric branch: robust scaling wrapped in its own single-step pipeline
scaler = RobustScaler()
num_transformer = make_pipeline(scaler)

# Categorical branch: one-hot encoding that ignores unknown categories
# instead of raising on them
encoder = OneHotEncoder(handle_unknown="ignore")
cat_transformer = make_pipeline(encoder)

# Route each column group to its transformer
column_routes = [
    ("num", num_transformer, num_col),
    ("cat", cat_transformer, cat_col),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [9]:
# Classifier: random forest with fixed hyper-parameters and a fixed seed
model_name = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    max_samples=0.6,
    random_state=25,
)

# Chain preprocessing and classifier into one estimator and fit it on the training split
pipe = make_pipeline(preprocessor, model_name)
pipe.fit(X_train, y_train)

# Prediction

In [10]:
# Predict labels for the held-out test rows with the fitted pipeline
# (preprocessing + classifier); `y_pred` holds one prediction per row of X_test.
y_pred = pipe.predict(X_test)


# Model Registration

In [15]:
# !pip install fosforml

In [11]:
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

In [None]:
## Registering the model in Fosfor.

# Scoring hook handed to `register_model`. `scoring_func` is expected to be
# provided by the `from fosforml import *` cell.
# NOTE(review): the request contract used below (a JSON body whose "payload"
# entry holds one record, or a list of records, keyed by feature name) is an
# assumption -- confirm against the fosforml serving documentation.
@scoring_func
def score(model, request):
    """Return the predicted label(s) for the record(s) carried by ``request``.

    Args:
        model: estimator passed in by the caller; it must expose ``predict``.
        request: object whose ``json["payload"]`` holds the input record(s).

    Returns:
        list: one predicted label per input record.
    """
    # Local import so the function does not rely on the notebook's globals.
    import pandas as pd

    payload = request.json["payload"]
    records = payload if isinstance(payload, list) else [payload]
    # The pipeline's ColumnTransformer is configured with column names, so the
    # records are turned back into a DataFrame with named columns.
    frame = pd.DataFrame(records)
    return model.predict(frame).tolist()


# Class probabilities for the test rows, from the same fitted pipeline that
# produced `y_pred`.
# NOTE(review): assumes `prob` expects the full predict_proba matrix -- confirm.
y_prob = pipe.predict_proba(X_test)

# The fitted estimator in this notebook is `pipe` and its test predictions are
# `y_pred`; the previous `clf` / `clf_prediction` names were never defined here.
# NOTE(review): name/description mention "Customer_Segmentation_Dtree_Cluster_Classifier"
# while this notebook trains a RandomForestClassifier on "DISBURSED"; they are
# kept unchanged -- confirm the intended registry name.
model_reg = register_model(pipe,
               score, 
               name="Customer_Segmentation_Dtree_Cluster_Classifier", 
               description="Customer_Segmentation_Dtree_Cluster_Classifier",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="\\n pip install fosforml \\n pip install fosforio[snowflake] \\n pip install seaborn \\n pip install snowflake-connector-python[pandas]",
               y_true=y_test,
               y_pred=y_pred,
               prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train,
               y_test=y_test,
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)