In [2]:
# !pip install fosforml

# Importing Libraries

In [1]:
#Snowpark lib
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd

# create_temp_table warning suppression
import warnings; warnings.simplefilter('ignore')

#ConfigParser to read ini file
import configparser

from fosforio import snowflake

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# Fetching Data

In [11]:
# Read the published dataset named "MASTER" into `df`.
# Note: `snowflake` here is the module imported from `fosforio`, not the
# top-level `snowflake` package that provides `Session`.
df = snowflake.get_dataframe("MASTER")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ID,GENDER,DOB,CITY,STATE,AGE,MONTHLY_INCOME,EMPLOYER_NAME,SALARY_ACCOUNT,LOAN_AMOUNT_APPLIED,...,QUARTER,MOBILE_VERIFIED,FILLED_FORM,DEVICE_TYPE,VAR2,SOURCE,VAR4,VAR5,VAR1,DISBURSED
0,ID000084J50,Female,4/3/1986,San Francisco,California,38,1197.0,Cisco Systems,United Security Bank,900000.0,...,1,Y,Y,Web-browser,E,S157,7.0,15,HBXD,1.0
1,ID000873Y34,Female,12/4/1985,Los Angeles,California,39,633.0,Oath Holdings,Citizens Business Bank,500000.0,...,4,N,N,Web-browser,B,S144,4.0,0,HBXX,0.0
2,ID000873Y34,Female,12/4/1985,Los Angeles,California,39,633.0,Oath Holdings,Citizens Business Bank,700000.0,...,4,N,N,Web-browser,B,S144,4.0,0,HBXX,0.0
3,ID000873Y34,Female,12/4/1985,Los Angeles,California,39,633.0,Oath Holdings,Citizens Business Bank,500000.0,...,2,Y,Y,Mobile,E,S161,1.0,8,HBXA,1.0
4,ID000873Y34,Female,12/4/1985,Los Angeles,California,39,633.0,Oath Holdings,Citizens Business Bank,700000.0,...,2,Y,Y,Mobile,E,S161,1.0,8,HBXA,1.0


In [12]:
# Drop every row that has a missing value in any column (this includes the
# identifier/date columns that are later excluded from the model features).
df = df.dropna()

# Preprocessing

In [13]:
# Identifier, date and other descriptive columns that are kept out of the
# model features; they are set aside in `temp_data` so they can be re-attached
# to the scored output later.
non_feature_cols = [
    "ID", "LEAD_CREATION_DATE", "DEVICE_TYPE", "SALARY_ACCOUNT", "CITY",
    "DOB", "EMPLOYER_NAME", "YEAR", "QUARTER", "MONTH",
]
temp_data = df[non_feature_cols]
data = df.drop(columns=non_feature_cols)

In [14]:
data.head()

Unnamed: 0,GENDER,STATE,AGE,MONTHLY_INCOME,LOAN_AMOUNT_APPLIED,LOAN_TENURE_APPLIED,EXISTING_EMI,MOBILE_VERIFIED,FILLED_FORM,VAR2,SOURCE,VAR4,VAR5,VAR1,DISBURSED
0,Female,California,38,1197.0,900000.0,3.0,0.0,Y,Y,E,S157,7.0,15,HBXD,1.0
1,Female,California,39,633.0,500000.0,3.0,0.0,N,N,B,S144,4.0,0,HBXX,0.0
2,Female,California,39,633.0,700000.0,3.0,0.0,N,N,B,S144,4.0,0,HBXX,0.0
3,Female,California,39,633.0,500000.0,3.0,0.0,Y,Y,E,S161,1.0,8,HBXA,1.0
4,Female,California,39,633.0,700000.0,3.0,0.0,Y,Y,E,S161,1.0,8,HBXA,1.0


In [15]:
temp_data

Unnamed: 0,ID,LEAD_CREATION_DATE,DEVICE_TYPE,SALARY_ACCOUNT,CITY,DOB,EMPLOYER_NAME,YEAR,QUARTER,MONTH
0,ID000084J50,27/2/2024,Web-browser,United Security Bank,San Francisco,4/3/1986,Cisco Systems,2024,1,2
1,ID000873Y34,1/11/2023,Web-browser,Citizens Business Bank,Los Angeles,12/4/1985,Oath Holdings,2023,4,11
2,ID000873Y34,1/11/2023,Web-browser,Citizens Business Bank,Los Angeles,12/4/1985,Oath Holdings,2023,4,11
3,ID000873Y34,6/5/2022,Mobile,Citizens Business Bank,Los Angeles,12/4/1985,Oath Holdings,2022,2,5
4,ID000873Y34,6/5/2022,Mobile,Citizens Business Bank,Los Angeles,12/4/1985,Oath Holdings,2022,2,5
...,...,...,...,...,...,...,...,...,...,...
212710,ID116921Z10,27/7/2024,Web-browser,Mechanics Bank,Los Angeles,16/1/1994,Microsoft,2024,3,7
212711,ID004818I30,05/5/2024,Web-browser,Bank of Stockton,Los Angeles,09/6/1996,Compunnel Software Group,2024,2,5
212712,ID039279T40,01/6/2024,Web-browser,Ever Trust Bank,San Diego,21/7/1994,Hcl America,2024,2,6
212713,ID000226X81,24/4/2024,Web-browser,Westamerica Bank,New York,19/3/1989,Jpmorgan Chase,2024,2,4


In [6]:
# data = data.dropna()

In [16]:
target = "DISBURSED"

# Column name -> dtype for every column except the target.
feature_dtypes = {
    name: dtype for name, dtype in dict(data.dtypes).items() if name != target
}

# Object-dtype columns are treated as categorical, all others as numeric.
cat_col = [name for name, dtype in feature_dtypes.items() if dtype == "O"]
num_col = [name for name, dtype in feature_dtypes.items() if not (dtype == "O")]

# Train Test Split

In [17]:
# Features are every column of `data` except the target.
used_cols = [col for col in data.columns if col != target]
X, y = data[used_cols], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Building and Training Model Pipeline

In [18]:
# Per-type preprocessing: numeric columns are scaled with RobustScaler,
# categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time instead of raising).
scaler = RobustScaler()
encoder = OneHotEncoder(handle_unknown="ignore")
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(encoder)

# Route each column group to its transformer.
column_steps = [
    ('num', num_transformer, num_col),
    ('cat', cat_transformer, cat_col),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Random-forest hyper-parameters, gathered in one place so they are easy to tune.
rf_params = {
    "n_estimators": 500,
    "max_depth": 12,
    "max_samples": 0.6,
    "random_state": 25,
}
model_name = RandomForestClassifier(**rf_params)

# Chain preprocessing and the classifier, then fit on the training split.
pipe = make_pipeline(preprocessor, model_name)
pipe.fit(X_train, y_train)

# Prediction

In [None]:
# Score the held-out test split with the fitted pipeline:
#   y_pred - hard class predictions
#   y_prob - per-class probability estimates (one column per class)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)


# Model Evaluation

In [None]:
# importing Libraries 
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [None]:
# Threshold-based metrics are computed from the hard class predictions.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC/AUC must be computed from continuous scores, not from the 0/1
# predictions: with hard labels the "curve" has a single operating point and
# the reported AUC does not reflect the model's ranking quality.
# Column 1 of `y_prob` is used as the positive-class probability, matching how
# the scoring code in this notebook reads `predict_proba` output
# (assumes the classes are ordered [0, 1] - TODO confirm via `pipe.classes_`).
positive_class_scores = y_prob[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)
roc_auc = auc(fpr, tpr)

print("Confusion Matrix:")
print(cm)
print("Accuracy:", round(accuracy,2))
print("Precision:", round(precision,2))
print("Recall:", round(recall,2))
print("F1-Score:", round(f1,2))
print("ROC AUC:", round(roc_auc,2))

In [None]:

import matplotlib.pyplot as plt

# Draw the ROC curve on an explicit figure/axes pair.
fig, ax = plt.subplots()

# Model curve, labelled with its AUC.
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label='ROC curve (area = %0.2f)' % roc_auc)

# Dashed diagonal = performance of a random classifier, for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Axis limits, labels and title.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')

ax.legend(loc='lower right')

# Render the figure.
plt.show()

# Model Registration

In [15]:
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

In [32]:
@scoring_func
def score(model, request):
    """Score a batch of leads and return labels plus conversion probabilities.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``. It is used
        for scoring instead of the notebook-global ``pipe``, so the function
        does not depend on notebook state.
    request
        Object whose ``json["payload"]`` is the string form of a
        column-oriented dict (``{column: {row_index: value}}``), as produced
        by ``str(DataFrame.to_dict())``.

    Returns
    -------
    str
        ``str()`` of a dict with two columns: ``Lead_Status``
        ("Converted" / "Not Converted") and ``Probability`` (probability of
        class index 1, rounded to 4 decimals).
    """
    import ast

    # SECURITY: the original used eval() on the request payload, which executes
    # arbitrary code supplied by the caller. literal_eval only accepts Python
    # literals (dicts, lists, strings, numbers, ...). NOTE(review): payloads
    # containing non-literal tokens (e.g. `nan`) are rejected - confirm that no
    # client relies on that.
    payload_dict = ast.literal_eval(request.json["payload"])
    data = pd.DataFrame.from_dict(payload_dict)

    result = model.predict(data)
    probability = model.predict_proba(data)

    prediction = pd.DataFrame({"Lead_Status": result})
    prediction["Lead_Status"] = prediction["Lead_Status"].apply(
        lambda x: "Converted" if x == 1 else "Not Converted"
    )
    # Column 1 of predict_proba is taken as the "converted" class probability.
    prediction["Probability"] = [round(k[1], 4) for k in probability]
    return str(prediction.to_dict())

In [33]:
# Local smoke test of the scoring function: build a request-like object whose
# `json` attribute carries the first 100 test rows as a stringified dict, the
# format `score` parses.
payload = X_test.head(100).to_dict()
req = requests.Request()
req.json = {"payload": str(payload)}

# The request is passed directly. The original cell first rebound `y` to the
# request, which overwrote the target Series `y` created in the train/test
# split cell and broke re-running that cell.
yo = score(pipe, req)
yo

"{'Lead_Status': {0: 'Not Converted', 1: 'Converted', 2: 'Converted', 3: 'Not Converted', 4: 'Not Converted', 5: 'Not Converted', 6: 'Not Converted', 7: 'Converted', 8: 'Not Converted', 9: 'Not Converted', 10: 'Not Converted', 11: 'Not Converted', 12: 'Not Converted', 13: 'Not Converted', 14: 'Not Converted', 15: 'Not Converted', 16: 'Converted', 17: 'Not Converted', 18: 'Not Converted', 19: 'Not Converted', 20: 'Converted', 21: 'Not Converted', 22: 'Not Converted', 23: 'Not Converted', 24: 'Converted', 25: 'Not Converted', 26: 'Converted', 27: 'Converted', 28: 'Converted', 29: 'Converted', 30: 'Not Converted', 31: 'Not Converted', 32: 'Converted', 33: 'Converted', 34: 'Not Converted', 35: 'Not Converted', 36: 'Not Converted', 37: 'Not Converted', 38: 'Converted', 39: 'Not Converted', 40: 'Not Converted', 41: 'Converted', 42: 'Not Converted', 43: 'Not Converted', 44: 'Converted', 45: 'Not Converted', 46: 'Not Converted', 47: 'Converted', 48: 'Not Converted', 49: 'Not Converted', 50: 'N

In [34]:
## registering the model in Fosfor.
model_reg = register_model(pipe,
               score, 
               name="Lead_Conversion_model", 
               description="Lead_conversion_RandomForest_Model",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="\\n pip install fosforml \\n pip install fosforio[snowflake] \\n pip install sklearn\\n pip install snowflake-connector-python[pandas]",
               y_true=y_test,
               y_pred=y_pred,
               prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train,
               y_test=y_test,
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

# Predicted File to snowflake 

In [15]:
import os

# Non-secret connection settings come from the [Snowflake] section of
# credentials.ini; the password is read from the environment so it is never
# stored in the file or the notebook.
config = configparser.ConfigParser()
config.read("credentials.ini")
snowflake_section = config["Snowflake"]

connection_parameters = {
    "user": snowflake_section["user"],
    "password": os.getenv('Snowflake_password'),
    "account": snowflake_section["account"],
    "WAREHOUSE": snowflake_section["WAREHOUSE"],
    "DATABASE": snowflake_section["DATABASE"],
    "SCHEMA": snowflake_section["SCHEMA"],
}


def snowflake_connector(conn):
    """Open a Snowpark session from a dict of connection parameters.

    Parameters
    ----------
    conn : dict
        Connection settings passed to ``Session.builder.configs``.

    Returns
    -------
    The created session object.

    Raises
    ------
    ValueError
        If the session cannot be created. The underlying error is chained as
        ``__cause__`` so the real failure reason stays visible.
    """
    try:
        session = Session.builder.configs(conn).create()
    except Exception as exc:
        # `except Exception` (not a bare `except:`) so KeyboardInterrupt and
        # SystemExit still propagate; `from exc` preserves the root cause.
        raise ValueError("error while connecting with db") from exc
    else:
        print("connection successful!")
        return session

# Open the Snowpark session used later to write the scored table.
session = snowflake_connector(connection_parameters)

connection successful!


In [None]:
# Score every lead using only the model's feature columns. The original cell
# passed the whole frame (target column included and, for the second call, the
# freshly added Lead_Status column) and relied on the preprocessor to discard
# the extras; selecting `used_cols` makes the model input explicit and stable
# however many output columns `data` has gained.
feature_frame = data[used_cols]
predicted_labels = pipe.predict(feature_frame)
probability = pipe.predict_proba(feature_frame)

data["Lead_Status"] = [
    "Converted" if label == 1 else "Not Converted" for label in predicted_labels
]
# Column 1 of predict_proba is taken as the "converted" class probability.
data["Probability"] = [round(k[1],4) for k in probability]

In [None]:
data.head()

In [None]:
# Re-attach the identifier/date columns that were set aside in `temp_data`
# before modelling, so the scored output carries them again.
restored_columns = [
    "ID", "LEAD_CREATION_DATE", "DEVICE_TYPE", "SALARY_ACCOUNT", "CITY",
    "DOB", "EMPLOYER_NAME", "YEAR", "QUARTER", "MONTH",
]
data[restored_columns] = temp_data[restored_columns]

In [None]:
# Prefix the quarter with "Q" (e.g. 1 -> "Q1"). The value is derived from the
# untouched `temp_data` copy rather than from `data["QUARTER"]` itself, so
# re-running this cell cannot stack prefixes ("QQ1").
data["QUARTER"] = "Q" + temp_data["QUARTER"].astype(str)

In [None]:
def change_date_format(x):
    """Reverse the '/'-separated parts of a date string and join them with '-'.

    Example: "27/2/2024" -> "2024-2-27". Parts are neither validated nor
    zero-padded; a string without '/' is returned unchanged.
    """
    return "-".join(reversed(x.split("/")))

# Convert both date columns from '/'-separated to reversed '-'-separated form.
for date_col in ("LEAD_CREATION_DATE", "DOB"):
    data[date_col] = data[date_col].apply(change_date_format)


In [None]:
def first_of_month(x):
    """Replace the last '-'-separated part of a string with "1".

    Example: "2024-2-27" -> "2024-2-1". A string without '-' becomes "1".
    """
    *leading_parts, _ = x.split("-")
    return "-".join([*leading_parts, "1"])
# Set the last component of every lead-creation date to "1".
data["LEAD_CREATION_DATE"] = data["LEAD_CREATION_DATE"].apply(first_of_month)

In [None]:
len(data.columns)

In [None]:
data.describe()

In [31]:
data["LOAN_AMOUNT_APPLIED"].min()

0.04

In [30]:
data.astype('object').describe()

Unnamed: 0,GENDER,STATE,AGE,MONTHLY_INCOME,LOAN_AMOUNT_APPLIED,LOAN_TENURE_APPLIED,EXISTING_EMI,MOBILE_VERIFIED,FILLED_FORM,VAR2,...,ID,LEAD_CREATION_DATE,DEVICE_TYPE,SALARY_ACCOUNT,CITY,DOB,EMPLOYER_NAME,YEAR,QUARTER,MONTH
count,212659,212659,212659,212659.0,212659.0,212659.0,212659.0,212659,212659,212659,...,212659,212659,212659,212659,212659,212659,212659,212659,212659,212659
unique,2,51,42,4816.0,268.0,10.0,3619.0,2,2,7,...,157779,36,2,32,679,11841,98,3,4,12
top,Male,California,28,610.0,900000.0,3.0,0.0,Y,N,B,...,ID000553H19,2024-7-1,Web-browser,Mechanics Bank,New York,1989-11-11,Cisco Systems,2024,Q2,7
freq,112367,79444,12310,3629.0,61295.0,150121.0,108304.0,126778,122670,64612,...,256,21635,131646,51920,42107,294,5897,104836,73646,30713


In [28]:
data["STATE"].unique()

array(['California', 'New York', 'Iowa', 'Pennsylvania', 'Texas',
       'Washington', 'North Carolina', 'Tennessee', 'Kansas', 'Arizona',
       'New Mexico', 'District of Columbia', 'Illinois', 'Minnesota',
       'Nevada', 'Oregon', 'Connecticut', 'Ohio', 'Florida', 'Arkansas',
       'Oklahoma', 'Utah', 'Colorado', 'Indiana', 'Kentucky',
       'Massachusetts', 'Rhode Island', 'Louisiana', 'Michigan',
       'Alabama', 'Alaska', 'Virginia', 'Idaho', 'Maine', 'Missouri',
       'New Jersey', 'Georgia', 'Wisconsin', 'New Hampshire', 'Hawaii',
       'South Dakota', 'Maryland', 'Montana', 'West Virginia',
       'South Carolina', 'Mississippi', 'Delaware', 'Nebraska', 'Wyoming',
       'North Dakota', 'Vermont'], dtype=object)

In [22]:
# Hand the scored pandas frame to Snowpark as plain row lists plus the
# matching column names.
rows = data.values.tolist()
column_names = data.columns.tolist()
df_snowflake = session.createDataFrame(rows, schema=column_names)

# Write the scored leads to the target table using "overwrite" mode.
df_snowflake.write.mode("overwrite").save_as_table("FDC_BANKING_FS.BFS_LEAD_CONV_SCHEMA.MASTER_DATA")