## Experiment tracking with MLFlow

### Data

In [54]:
import pandas as pd

In [55]:
## Dataset link - https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset
DATASET = './loan_approval_dataset.csv'

In [56]:
df = pd.read_csv(DATASET)
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [57]:
# Renaming the column
df = df.rename(columns = {col_name: col_name.strip() for col_name in df.columns}) # Stripping the column name
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [58]:
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [59]:
df['education'].value_counts()

education
Graduate        2144
Not Graduate    2125
Name: count, dtype: int64

### Data Preprocessing

In [60]:
from sklearn.model_selection import train_test_split 

In [61]:
df.loan_status.value_counts()

loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

In [62]:
df_processed = df.copy()

In [63]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'graduate':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['education'] = df_processed['education'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [64]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'approved':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['loan_status'] = df_processed['loan_status'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [65]:
# One hot encoding 
def convert_to_onehot(x):
    if x.strip().lower() == 'yes':    # 1 - Graduate, 0 Not Graduate    
        return 1 
    return 0 

df_processed['self_employed'] = df_processed['self_employed'].apply(convert_to_onehot)
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


### Feature Engineering

In [66]:
df_processed['total_asset'] = df_processed['luxury_assets_value'] + df_processed['commercial_assets_value'] + df_processed['residential_assets_value'] 
df_processed.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_asset
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1,42700000
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0,13700000
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0,44900000
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0,44800000
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0,50000000


### Setting up mlflow

In [19]:
import mlflow
import os

In [105]:
experiment_name = "loan approval - ray deployment"
# ml_path = os.path.join(os.getcwd(),"mlruns1")
# mlflow.set_tracking_uri(uri=ml_path)
mlflow.set_tracking_uri(uri="http://localhost:5000")
# experiment_id = mlflow.set_experiment(experiment_name).experiment_id 
if not mlflow.get_experiment_by_name(experiment_name):
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
print(experiment_id)

os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"

954289996000994963


In [106]:
mlflow.get_experiment_by_name(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/954289996000994963', creation_time=1734069527513, experiment_id='954289996000994963', last_update_time=1734069527513, lifecycle_stage='active', name='loan approval - ray deployment', tags={}>

In [107]:
from xgboost import XGBClassifier 
import mlflow
from sklearn.metrics import accuracy_score, precision_score, f1_score
from mlflow.models import infer_signature 
import warnings
warnings.filterwarnings('ignore')

In [108]:
dataset = mlflow.data.from_pandas(
    df_processed, name="Loan Approval Dataset", targets="loan_status"
)

In [109]:
test_size = 0.2 
train_df, val_df = train_test_split(df_processed, stratify=df.loan_status, test_size=test_size, random_state=1234) 

In [110]:
X_train = train_df.drop(columns = ['loan_status'], axis = 1) 
y_train = train_df['loan_status']

In [111]:
X_val = val_df.drop(columns = ['loan_status'], axis = 1) 
y_val = val_df['loan_status']

In [112]:
xgb_classifier = XGBClassifier(
    n_estimators=10, 
    max_depth=3, 
    learning_rate=1, 
    objective="binary:logistic", 
    random_state=123, 
)


In [114]:
import time
with mlflow.start_run(experiment_id=experiment_id, run_name="exp-1", log_system_metrics=True) as run:
    time.sleep(15)
    mlflow.log_input(dataset, context="training")
    xgb_classifier.fit(X_train, y_train) 
    clf_params = xgb_classifier.get_xgb_params() 
    mlflow.log_params(clf_params) 
    signature = infer_signature(X_train, xgb_classifier.predict(X_train))   # What is this infer_signature
    model_info = mlflow.xgboost.log_model(
        xgb_model=xgb_classifier, artifact_path="loan-classification", signature=signature
    ) 
    y_pred = xgb_classifier.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("f1", f1)

print("Accuracy achieved: ", acc)
print("Precision achieved: ", precision)
print("F1 achieved: ", f1)

print(mlflow.MlflowClient().get_run(run.info.run_id).data)


2024/12/13 11:29:12 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/12/13 11:29:29 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/12/13 11:29:29 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run exp-1 at: http://localhost:5000/#/experiments/954289996000994963/runs/6c319b3f15014744a41a5888c83bd5cc
🧪 View experiment at: http://localhost:5000/#/experiments/954289996000994963
Accuracy achieved:  0.9812646370023419
Precision achieved:  0.9813084112149533
F1 achieved:  0.9849906191369606
<RunData: metrics={'accuracy': 0.9812646370023419,
 'f1': 0.9849906191369606,
 'precision': 0.9813084112149533,
 'system/cpu_utilization_percentage': 7.4,
 'system/disk_available_megabytes': 920538.7,
 'system/disk_usage_megabytes': 105570.1,
 'system/disk_usage_percentage': 10.3,
 'system/network_receive_megabytes': 0.0036000000000058208,
 'system/network_transmit_megabytes': 0.0036000000000058208,
 'system/system_memory_usage_megabytes': 5679.8,
 'system/system_memory_usage_percentage': 69.3}, params={'base_score': 'None',
 'booster': 'None',
 'colsample_bylevel': 'None',
 'colsample_bynode': 'None',
 'colsample_bytree': 'None',
 'device': 'None',
 'eval_metric': 'None',
 'gamma': 'None

In [115]:
print(model_info.model_uri)
model_uri = mlflow.get_artifact_uri("loan-classification")
print(model_uri)

2024/12/13 11:29:49 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


runs:/6c319b3f15014744a41a5888c83bd5cc/loan-classification
mlflow-artifacts:/0/b1acfddeacf2407b9cc8db2b8d208f18/artifacts/loan-classification


In [124]:
result = mlflow.register_model(
    model_uri, "xgboost-model"
)

Successfully registered model 'xgboost-model'.
2024/12/13 11:33:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost-model, version 1
Created version '1' of model 'xgboost-model'.


In [116]:
import pickle
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(xgb_classifier, file)

In [117]:
# loan_classifier = mlflow.xgboost.load_model(model_uri)
with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [118]:
y_pred

array([1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,

## Deployment

### Setting up ray

In [51]:
from starlette.requests import Request

import ray
from ray import serve
import numpy as np 
import seaborn as sns
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score

In [137]:
train_df, val_df = train_test_split(df_processed, stratify=df.loan_status, test_size=0.4, random_state=1234)
X_train = train_df.drop(columns = ['loan_status'], axis = 1) 
y_train = train_df['loan_status']
X_val = val_df.drop(columns = ['loan_status'], axis = 1) 
y_val = val_df['loan_status']

In [138]:
with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name="inference"):
    
    loaded_model.fit(X_train, y_train)   #, eval_set=[(X_val, y_valid)], early_stopping_rounds=10, verbose=False)

    artifact_path = "loan-classification"
    
    
    preds = loaded_model.predict(X_val)
    # print(preds)
    preds = [1 if x>0.5 else 0 for x in preds]
    acc = accuracy_score(y_val, preds)
    print("Accuracy achieved: ", acc)

    model_uri = mlflow.get_artifact_uri(artifact_path)

2024/12/13 12:03:02 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


2024/12/13 12:03:02 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/12/13 12:03:02 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Accuracy achieved:  0.9754098360655737
🏃 View run inference at: http://localhost:5000/#/experiments/954289996000994963/runs/17a4c18aa27f4fe681812f868380ac4c
🧪 View experiment at: http://localhost:5000/#/experiments/954289996000994963


In [134]:
import json
input_data = json.dumps({'input_data': X_val.iloc[0].values.reshape(1, -1).tolist()})