## Experiment tracking with MLFlow

### Data

In [7]:
import pandas as pd

In [8]:
## Dataset link - https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset
DATASET = './loan_approval_dataset.csv'

In [9]:
df = pd.read_csv(DATASET)
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [10]:
# Renaming the column
df = df.rename(columns = {col_name: col_name.strip() for col_name in df.columns}) # Stripping the column name
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [11]:
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [12]:
df['education'].value_counts()

education
Graduate        2144
Not Graduate    2125
Name: count, dtype: int64

### Data Preprocessing

In [13]:
from sklearn.model_selection import train_test_split 

In [14]:
df.loan_status.value_counts()

loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

In [15]:
df_processed = df.copy()

In [16]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'graduate':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['education'] = df_processed['education'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [17]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'approved':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['loan_status'] = df_processed['loan_status'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [18]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'yes':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['self_employed'] = df_processed['self_employed'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


### Feature Engineering

In [19]:
df_processed['total_asset'] = df_processed['luxury_assets_value'] + df_processed['commercial_assets_value'] + df_processed['residential_assets_value'] 
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_asset
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1,42700000
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0,13700000
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0,44900000
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0,44800000
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0,50000000


### Setting up mlflow

In [20]:
import mlflow
import os

In [21]:
experiment_name = "loan approval - ray deployment - 1"
ml_path = os.path.join(os.getcwd(),"mlruns")
mlflow.set_tracking_uri(uri=ml_path)
# mlflow.set_tracking_uri(uri="http://localhost:5000")
# experiment_id = mlflow.set_experiment(experiment_name).experiment_id 
if not mlflow.get_experiment_by_name(experiment_name):
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
print(experiment_id)

os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"

442153692775467660


In [22]:
mlflow.get_artifact_uri()

2025/01/29 12:02:48 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


'mlflow-artifacts:/0/cf3c751017194faa92e26fed865f3a7f/artifacts'

In [23]:
mlflow.get_experiment_by_name(experiment_name)

<Experiment: artifact_location='/home/thebeginner86/code/ml/ops/mlruns/442153692775467660', creation_time=1738131500291, experiment_id='442153692775467660', last_update_time=1738131500291, lifecycle_stage='active', name='loan approval - ray deployment - 1', tags={}>

In [24]:
from xgboost import XGBClassifier 
import mlflow
from sklearn.metrics import accuracy_score, precision_score, f1_score
from mlflow.models import infer_signature 
import warnings
warnings.filterwarnings('ignore')

In [25]:
dataset = mlflow.data.from_pandas(
    df_processed, name="Loan Approval Dataset - 1", targets="loan_status"
)

In [26]:
test_size = 0.2 
train_df, val_df = train_test_split(df_processed, stratify=df.loan_status, test_size=test_size, random_state=1234) 

In [27]:
X_train = train_df.drop(columns = ['loan_status'], axis = 1) 
y_train = train_df['loan_status']

In [28]:
X_val = val_df.drop(columns = ['loan_status'], axis = 1) 
y_val = val_df['loan_status']

In [29]:
xgb_classifier = XGBClassifier(
    n_estimators=10, 
    max_depth=3, 
    learning_rate=1, 
    objective="binary:logistic", 
    random_state=123, 
)


In [30]:
import time
mlflow.end_run()
with mlflow.start_run(experiment_id=experiment_id, run_name="exp-4", log_system_metrics=True) as run:
    time.sleep(15)
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.log_input(dataset, context="training")
    xgb_classifier.fit(X_train, y_train) 
    clf_params = xgb_classifier.get_xgb_params() 
    mlflow.log_params(clf_params) 
    signature = infer_signature(X_train, xgb_classifier.predict(X_train))   # What is this infer_signature
    model_info = mlflow.xgboost.log_model(
        xgb_model=xgb_classifier, artifact_path="loan-classification-29-01-25", signature=signature
    ) 
    y_pred = xgb_classifier.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("f1", f1)

print("Accuracy achieved: ", acc)
print("Precision achieved: ", precision)
print("F1 achieved: ", f1)

print(mlflow.MlflowClient().get_run(run.info.run_id).data)


2025/01/29 12:02:53 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/01/29 12:02:53 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2025/01/29 12:02:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/01/29 12:03:16 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/01/29 12:03:16 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run exp-4 at: http://localhost:5000/#/experiments/442153692775467660/runs/1def860380b2486b8d4276fb20f2192c
🧪 View experiment at: http://localhost:5000/#/experiments/442153692775467660
Accuracy achieved:  0.9812646370023419
Precision achieved:  0.9813084112149533
F1 achieved:  0.9849906191369606
<RunData: metrics={'accuracy': 0.9812646370023419,
 'f1': 0.9849906191369606,
 'precision': 0.9813084112149533,
 'system/cpu_utilization_percentage': 3.4,
 'system/disk_available_megabytes': 919001.0,
 'system/disk_usage_megabytes': 107107.8,
 'system/disk_usage_percentage': 10.4,
 'system/network_receive_megabytes': 0.40102999999999156,
 'system/network_transmit_megabytes': 0.4011779999999874,
 'system/system_memory_usage_megabytes': 7111.1,
 'system/system_memory_usage_percentage': 86.8}, params={'base_score': 'None',
 'booster': 'None',
 'colsample_bylevel': 'None',
 'colsample_bynode': 'None',
 'colsample_bytree': 'None',
 'device': 'None',
 'eval_metric': 'None',
 'gamma': 'None',
 '

In [49]:
print(model_info.model_uri)
model_uri = mlflow.get_artifact_uri("loan-classification-29-01-25")
print(model_uri)

runs:/1def860380b2486b8d4276fb20f2192c/loan-classification-29-01-25
mlflow-artifacts:/0/96e0be9355e34d1d92b6028e9d975e7f/artifacts/loan-classification-29-01-25


In [32]:
model_uri

'mlflow-artifacts:/0/96e0be9355e34d1d92b6028e9d975e7f/artifacts/loan-classification-29-01-25'

In [33]:
result = mlflow.register_model(
    model_uri, "xgboost-model-29-01-25"
)

Registered model 'xgboost-model-29-01-25' already exists. Creating a new version of this model...
2025/01/29 12:03:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost-model-29-01-25, version 2
Created version '2' of model 'xgboost-model-29-01-25'.


In [34]:
import pickle
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(xgb_classifier, file)

In [35]:
# loan_classifier = mlflow.xgboost.load_model(model_uri)
with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [36]:
y_pred

array([1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,

## Deployment

### Setting up ray

In [51]:
import ray
from fastapi import FastAPI
from ray import serve
import requests
from starlette.requests import Request

In [52]:
if ray.is_initialized():
    ray.shutdown()

ray.init()

2025-01-29 12:09:53,977	INFO worker.py:1781 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.34.0


In [53]:
ray.cluster_resources()

{'CPU': 8.0,
 'node:__internal_head__': 1.0,
 'object_store_memory': 820698316.0,
 'memory': 1641396635.0,
 'node:192.168.183.173': 1.0}

In [54]:
num_workers = 1 
resources_per_worker = {"CPU": 1}
MLFLOW_TRACKING_URI = "http://localhost:5000"

In [55]:
from starlette.requests import Request

import ray
from ray import serve
import numpy as np 
import seaborn as sns
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score

In [56]:
train_df, val_df = train_test_split(df_processed, stratify=df.loan_status, test_size=0.4, random_state=1234)
X_train = train_df.drop(columns = ['loan_status'], axis = 1) 
y_train = train_df['loan_status']
X_val = val_df.drop(columns = ['loan_status'], axis = 1) 
y_val = val_df['loan_status']

In [57]:
@serve.deployment(num_replicas=1)
class Model:
    def __init__(self):
        mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
        self.predictor = mlflow.xgboost.load_model(model_uri)
        
    def predict(self, input_data: list):
        # Perform prediction using the loaded model
        prediction = self.predictor.predict_proba(input_data)
        return {"prediction": prediction.tolist()}

    async def __call__(self, http_request: Request) -> str:
        data: str = await http_request.json() 
        return self.predict(data)




In [58]:
loan_predictor = Model.bind()

In [59]:
serve.run(loan_predictor)

Exception: Failed to submit task to actor ActorID(109d62967a16247e5eebe62001000000) due to b"Can't find actor 109d62967a16247e5eebe62001000000. It might be dead or it's from a different cluster"

In [54]:
with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name="inference"):
    
    loaded_model.fit(X_train, y_train)   #, eval_set=[(X_val, y_valid)], early_stopping_rounds=10, verbose=False)

    artifact_path = "loan-classification-1"
    
    
    preds = loaded_model.predict(X_val)
    # print(preds)
    preds = [1 if x>0.5 else 0 for x in preds]
    acc = accuracy_score(y_val, preds)
    print("Accuracy achieved: ", acc)

    model_uri = mlflow.get_artifact_uri(artifact_path)

2025/01/29 10:11:05 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


2025/01/29 10:11:05 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/01/29 10:11:05 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Accuracy achieved:  0.9754098360655737
🏃 View run inference at: http://localhost:5000/#/experiments/675914172608211943/runs/72b8f017268f40b4a353116452b30f99
🧪 View experiment at: http://localhost:5000/#/experiments/675914172608211943


In [55]:
import json
input_data = json.dumps({'input_data': X_val.iloc[0].values.reshape(1, -1).tolist()})