# Modeling

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import mlflow
from sklearn.metrics import precision_recall_fscore_support as score
from imblearn.over_sampling import SMOTE


In [28]:
mlflow.set_tracking_uri("file:///C:/Users/msi/Desktop/mlops/mlruns")
mlflow.set_experiment("loan_approval_prediction")
mlflow.sklearn.autolog(disable=True)


In [4]:
df = pd.read_csv("../data/loan_data_1.csv")
df.drop(columns='Unnamed: 0',inplace=True)
df

Unnamed: 0,State,BankState,ApprovalFY,Term,NoEmp,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,Industry,IsFranchise,NewBusiness,Default,DisbursementFY,DaysToDisbursement,SBA_AppvPct,AppvDisbursed,RealEstate,GreatRecession
0,IN,OH,1997,84,4,0,0,1,60000.0,60000.0,Retail_trade,0,1,0,1999,730,0.80,1,0,0
1,IN,IN,1997,60,2,0,0,1,40000.0,40000.0,Accom/Food_serv,0,1,0,1997,92,0.80,1,0,0
2,IN,IN,1997,180,7,0,0,0,287000.0,287000.0,Healthcare/Social_assist,0,0,0,1997,306,0.75,1,0,1
3,OK,OK,1997,60,2,0,0,1,35000.0,35000.0,Unknown,0,0,0,1997,122,0.80,1,0,0
4,FL,FL,1997,240,14,0,0,0,229000.0,229000.0,Unknown,0,0,0,1997,75,1.00,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590496,TX,DC,1997,84,5,0,0,1,79000.0,79000.0,Unknown,0,1,0,1997,123,0.80,1,0,0
590497,OH,IL,1997,60,6,0,1,0,85000.0,85000.0,Retail_trade,0,0,0,1997,246,0.50,1,0,0
590498,CA,CA,1997,108,26,0,0,0,300000.0,300000.0,Manufacturing,0,0,0,1997,215,0.75,1,0,0
590499,HI,HI,1997,60,6,0,0,1,75000.0,75000.0,Unknown,0,0,1,1997,32,0.80,1,0,0


In [30]:
import json
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Dictionary to store mappings for all categorical columns
mappings = {}

# Loop through object (categorical) columns
for column in df.select_dtypes(include='object').columns:
    # Encode the column
    df[column] = encoder.fit_transform(df[column])
    
    # Create the mapping from category to index
    category_to_index = {category: index for index, category in enumerate(encoder.classes_)}
    
    # Store the mapping in the dictionary
    mappings[column] = category_to_index

# Save the dictionary to a JSON file
with open("../backend/src/store.json", "w") as file:
    json.dump(mappings, file, indent=4)


In [5]:
df.columns

Index(['State', 'BankState', 'ApprovalFY', 'Term', 'NoEmp', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'DisbursementGross', 'GrAppv', 'Industry',
       'IsFranchise', 'NewBusiness', 'Default', 'DisbursementFY',
       'DaysToDisbursement', 'SBA_AppvPct', 'AppvDisbursed', 'RealEstate',
       'GreatRecession'],
      dtype='object')

### Establish target and feature fields


In [32]:
y = df['Default']
X = df.drop('Default', axis=1)

### Scale the feature values prior to modeling


In [33]:
scale = StandardScaler()
X_scaled = scale.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.25)

In [34]:
X_scaled.shape

(590501, 19)

In [35]:
method= SMOTE()
X_resampled, y_resampled = method.fit_resample(X_train, y_train)

# LogisticRegression:

In [36]:
# Initialize model
log_reg = LogisticRegression(random_state=2)

# Train the model and make predictions
log_reg.fit(X_train, y_train)
y_logpred = log_reg.predict(X_val)

# Print the results
print(classification_report(y_val, y_logpred, digits=3))

              precision    recall  f1-score   support

           0      0.874     0.961     0.915    120243
           1      0.696     0.390     0.500     27383

    accuracy                          0.855    147626
   macro avg      0.785     0.676     0.708    147626
weighted avg      0.841     0.855     0.838    147626



In [37]:

with mlflow.start_run(run_name='LogisticRegression'):
    mlflow.log_param("data_version",1)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    #model fitting and training
    lr=LogisticRegression()
    mlflow.set_tag(key= "model",value="LogisticRegression")
    params = lr.get_params()
    mlflow.log_params(params)
    lr.fit(X_train,y_train)
    train_features_name = f'{X_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=lr.predict(X_val)
    precision,recall,fscore,support=score(y_val,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.sklearn.log_model(lr,artifact_path="ML_models")




# XGBoost:

In [39]:
xgboost = XGBClassifier(random_state=2)

xgboost.fit(X_train, y_train)
y_xgbpred = xgboost.predict(X_val)

# Print the results
print(classification_report(y_val, y_xgbpred, digits=3))

              precision    recall  f1-score   support

           0      0.969     0.980     0.975    120243
           1      0.907     0.863     0.885     27383

    accuracy                          0.958    147626
   macro avg      0.938     0.922     0.930    147626
weighted avg      0.958     0.958     0.958    147626



In [40]:

with mlflow.start_run(run_name='XGBoost'):
    mlflow.log_param("data_version",1)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    xg = XGBClassifier()
    params = xg.get_params()
    mlflow.set_tag(key= "model", value="XGBClassifier")
    mlflow.log_params(params)
    xg.fit(X_train,y_train)
    train_features_name = f'{X_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=xg.predict(X_val)
    precision,recall,fscore,support=score(y_val,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.xgboost.log_model(xg,artifact_path="ML_models")




### List the importance of each feature

In [41]:

for name, importance in sorted(zip(X.columns, xgboost.feature_importances_)):
    print(name, "=", importance)

ApprovalFY = 0.20596862
AppvDisbursed = 0.03901559
BankState = 0.029654756
DaysToDisbursement = 0.01547745
DisbursementFY = 0.03427445
DisbursementGross = 0.011691744
GrAppv = 0.032201372
GreatRecession = 0.010401577
Industry = 0.008969399
IsFranchise = 0.006186307
LowDoc = 0.018806655
NewBusiness = 0.02222905
NoEmp = 0.008265215
RealEstate = 0.0
RevLineCr = 0.054240208
SBA_AppvPct = 0.104090095
State = 0.021454057
Term = 0.3345423
UrbanRural = 0.042531196


### Build pipeling for feature selection and modeling; SelectKBest defaults to top 10 features


In [42]:
xgb_featimp = XGBClassifier(random_state=2)

pipe = Pipeline(steps=[
    ('feature_selection', SelectKBest()),
    ('model', xgb_featimp)
])

pipe.fit(X_train, y_train)
y_featimppred = pipe.predict(X_val)

print(classification_report(y_val, y_featimppred, digits=3))

              precision    recall  f1-score   support

           0      0.964     0.976     0.970    120243
           1      0.888     0.842     0.864     27383

    accuracy                          0.951    147626
   macro avg      0.926     0.909     0.917    147626
weighted avg      0.950     0.951     0.951    147626



In [44]:

with mlflow.start_run(run_name='XGBoost2'):
    mlflow.log_param("data_version",2)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    xgb_featimp = XGBClassifier(random_state=2)
    pipe = Pipeline(steps=[
    ('feature_selection', SelectKBest()),
    ('model', xgb_featimp)
    ])
    params = xgb_featimp.get_params()
    mlflow.set_tag(key= "model", value="XGBClassifier")
    mlflow.log_params(params)
    pipe.fit(X_train, y_train)
    train_features_name = f'{X_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=pipe.predict(X_val)
    precision,recall,fscore,support=score(y_val,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.xgboost.log_model(xg,artifact_path="ML_models")




### List the importance of each feature

In [45]:

for name, importance in sorted(zip(X.columns, xgb_featimp.feature_importances_)):
    print(name, "=", importance)

ApprovalFY = 0.061979383
BankState = 0.3998053
DisbursementGross = 0.040910725
GrAppv = 0.0
LowDoc = 0.114158794
NoEmp = 0.013915403
RevLineCr = 0.037644003
State = 0.23328649
Term = 0.0658888
UrbanRural = 0.032411136


# RandomFoest:

In [46]:
with mlflow.start_run(run_name='RandomForest'):
    mlflow.log_param("data_version",3)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    rf = RandomForestClassifier(random_state=5)
    mlflow.set_tag(key="model", value = "RandomForest")
    params = rf.get_params()
    mlflow.log_params(params)
    rf.fit(X_resampled,y_resampled)
    train_features_name = f'{X_resampled=}'.split('=')[0]
    train_label_name = f'{y_resampled=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=rf.predict(X_val)
    precision,recall,fscore,support=score(y_val,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.sklearn.log_model(rf,artifact_path="ML_models")



In [47]:
rf = RandomForestClassifier(random_state=5)
rf.fit(X_resampled,y_resampled)
predicted=rf.predict(X_val)

In [48]:
print('Classification report:\n', classification_report(y_val, predicted))
conf_mat = confusion_matrix(y_true=y_val, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
print('Share of Non-Fraud in Test Data:', 1-round(y_val.sum()/len(y_val),4))

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97    120243
           1       0.88      0.87      0.87     27383

    accuracy                           0.95    147626
   macro avg       0.92      0.92      0.92    147626
weighted avg       0.95      0.95      0.95    147626

Confusion matrix:
 [[116900   3343]
 [  3561  23822]]
Share of Non-Fraud in Test Data: 0.8145


# MLFLOW Best Model:

In [49]:

#Reading Pandas Dataframe from mlflow
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']


In [50]:
df_mlflow


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Precision_test,metrics.F1_score_test,metrics.Recall_test,params.criterion,...,params.multi_class,params.fit_intercept,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.runName,tags.train_features_name,tags.mlflow.log-model.history,tags.model,tags.mlflow.user,tags.train_label_name
0,8eb9cdac8f6a4e458f0a9bb7502ef1f4,195371603636612681,FINISHED,file:///c:/Users/msi/Desktop/mlops/mlruns/1953...,2024-11-29 18:46:35.077000+00:00,2024-11-29 18:50:23.490000+00:00,0.923688,0.922375,0.921077,gini,...,,,LOCAL,c:\Users\msi\anaconda3\envs\mlops\lib\site-pac...,RandomForest,X_resampled,"[{""run_id"": ""8eb9cdac8f6a4e458f0a9bb7502ef1f4""...",RandomForest,msi,y_resampled
1,12c870f4ff854d6c8c8f993ebc88e385,195371603636612681,FINISHED,file:///c:/Users/msi/Desktop/mlops/mlruns/1953...,2024-11-29 18:46:20.121000+00:00,2024-11-29 18:46:25.692000+00:00,0.926399,0.917307,0.908876,,...,,,LOCAL,c:\Users\msi\anaconda3\envs\mlops\lib\site-pac...,XGBoost2,X_train,"[{""run_id"": ""12c870f4ff854d6c8c8f993ebc88e385""...",XGBClassifier,msi,y_train
2,508f98877c244ee58b2ee59373384b32,195371603636612681,FINISHED,file:///c:/Users/msi/Desktop/mlops/mlruns/1953...,2024-11-29 18:42:45.425000+00:00,2024-11-29 18:42:51.701000+00:00,0.938174,0.929597,0.921592,,...,,,LOCAL,c:\Users\msi\anaconda3\envs\mlops\lib\site-pac...,XGBoost,X_train,"[{""run_id"": ""508f98877c244ee58b2ee59373384b32""...",XGBClassifier,msi,y_train
3,b2e477bb332d4032a160ae87fd702add,195371603636612681,FINISHED,file:///c:/Users/msi/Desktop/mlops/mlruns/1953...,2024-11-29 18:41:16.600000+00:00,2024-11-29 18:41:26.862000+00:00,0.785017,0.707742,0.675689,,...,deprecated,True,LOCAL,c:\Users\msi\anaconda3\envs\mlops\lib\site-pac...,LogisticRegression,X_train,"[{""run_id"": ""b2e477bb332d4032a160ae87fd702add""...",LogisticRegression,msi,y_train


In [51]:
from mlflow.client import *
client = MlflowClient()
model_name='ML_models'
# Create the model URI
model_uri = f"runs:/{run_id}/artifacts/{model_name}"

# Register the model
reg = mlflow.register_model(model_uri, model_name)

# Get the model version
model_version = reg.version  # Get the registered model version

# Transition the model version to Staging
new_stage = "Staging"

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=True
)

print(f"Model {model_name} version {model_version} transitioned to {new_stage} stage.")

Model ML_models version 5 transitioned to Staging stage.


Registered model 'ML_models' already exists. Creating a new version of this model...
Created version '5' of model 'ML_models'.
  client.transition_model_version_stage(


In [52]:

#let's call the model from the model registry ( in production stage)
import mlflow.pyfunc

logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

# Predict on a Pandas DataFrame.

loaded_model.predict(X_val)


mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.xgboost
  run_id: 508f98877c244ee58b2ee59373384b32



array([0, 1, 0, ..., 0, 0, 0])