In [6]:
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import warnings
# Suppress every warning category for the remainder of the notebook.
# NOTE(review): this blanket filter also hides library warnings (for example
# about ignored or deprecated estimator parameters) — consider narrowing it.
warnings.filterwarnings('ignore')

In [7]:
# Display the installed MLflow version so the environment is recorded alongside the results.
mlflow.__version__

'2.20.3'

### Reading the CSV files created in the preparation part

In [8]:
# Load the training and validation splits, in that order
train_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Assignment-1/sms+spam+collection/train.csv',
        '../Assignment-1/sms+spam+collection/valid.csv',
    )
)

In [9]:
# Some SMS fields are empty, so drop every training row that contains a missing value
train_df = train_df.dropna()

### Processing the text

In [10]:
# Bag-of-words vectoriser that tokenises each message by splitting on whitespace
bow_transformer = CountVectorizer(analyzer=lambda message: message.split())

# Learn the vocabulary from the training messages, then encode those same messages
bow_transformer.fit(train_df['processed sms'])
sms_bow = bow_transformer.transform(train_df['processed sms'])
sms_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 35911 stored elements and shape (4171, 6952)>

In [11]:
# Fit the TF-IDF weighting on the training bag-of-words counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(sms_bow)

### Creating TF-IDF embeddings for the train and validation datasets

In [12]:
# Encode the training target as integers: ham -> 0, spam -> 1
y_train = train_df['label'].map({'ham': 0, 'spam': 1}).to_numpy()

# Re-weight the training counts with the fitted TF-IDF transformer
train_sms_tfidf = tfidf_transformer.transform(sms_bow)
train_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35911 stored elements and shape (4171, 6952)>

In [13]:
# Encode the validation target as integers: ham -> 0, spam -> 1
y_true_val = valid_df['label'].map({'ham': 0, 'spam': 1}).to_numpy()

# Count tokens with the already-fitted vectoriser, then apply the fitted TF-IDF weights
valid_sms_bow = bow_transformer.transform(valid_df['processed sms'])
valid_sms_tfidf = tfidf_transformer.transform(valid_sms_bow)
valid_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5134 stored elements and shape (697, 6952)>

### Define the models

In [14]:
# Define benchmark models.
# Hyper-parameters that scikit-learn ignores for the chosen configuration are
# left out so each definition states only what actually affects the fit.
models = {
    # penalty=None disables regularisation, so C has no effect and is omitted.
    "logistic_regression": LogisticRegression(penalty=None),
    # NOTE(review): the key says "random_forest" but the estimator is a single
    # DecisionTreeClassifier. The key is kept because it is used as the MLflow
    # run name and registered-model name; rename it if nothing depends on it.
    "random_forest": DecisionTreeClassifier(random_state=0, criterion='entropy', max_features=None, splitter='random'),
    # degree and gamma are ignored by the linear kernel and are omitted.
    # probability=True relies on randomly shuffled internal cross-validation,
    # so random_state is pinned to make the validation scores reproducible.
    "svc": SVC(C=10, kernel='linear', probability=True, random_state=0),
}

### Start MLflow Experiment

In [15]:
mlflow.set_experiment("Benchmark Models")

# Train, evaluate, log and register each benchmark model in its own MLflow run.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        print(f"Training {model_name}...")

        # Fit on the training TF-IDF features, then score the validation set.
        # Column 1 of predict_proba is the probability of class 1 (spam).
        model.fit(train_sms_tfidf, y_train)
        y_pred_prob = model.predict_proba(valid_sms_tfidf)[:, 1]

        # Compute AUCPR (average precision) on the validation set
        aucpr = average_precision_score(y_true_val, y_pred_prob)
        print(f"{model_name} AUCPR: {aucpr:.4f}")

        # Log the model label, the estimator's full hyper-parameter set and the
        # metric, so each run can be reproduced from the tracking data alone.
        mlflow.log_param("model_type", model_name)
        mlflow.log_params(model.get_params())
        mlflow.log_metric("AUCPR", aucpr)

        # Log the fitted model and register it under the same name in one call;
        # this replaces building the runs:/ URI by hand and registering separately.
        mlflow.sklearn.log_model(
            model,
            model_name,
            input_example=train_sms_tfidf[0],
            registered_model_name=model_name,
        )

2025/03/19 14:38:25 INFO mlflow.tracking.fluent: Experiment with name 'Benchmark Models' does not exist. Creating a new experiment.


Training logistic_regression...
logistic_regression AUCPR: 0.9513


Successfully registered model 'logistic_regression'.
Created version '1' of model 'logistic_regression'.


Training random_forest...
random_forest AUCPR: 0.7396


Successfully registered model 'random_forest'.
Created version '1' of model 'random_forest'.


Training svc...
svc AUCPR: 0.9671


Successfully registered model 'svc'.
Created version '1' of model 'svc'.


## Retrieving the experiment and printing the AUCPR metric for every benchmark model

In [16]:
client = MlflowClient()
experiment = client.get_experiment_by_name("Benchmark Models")

# get_experiment_by_name returns None for an unknown experiment; fail with a
# clear message instead of an AttributeError on the lookup below.
if experiment is None:
    raise RuntimeError('MLflow experiment "Benchmark Models" not found; run the training cell first.')

print("AUCPR Scores for Registered Models:")
for run in client.search_runs([experiment.experiment_id]):
    model_name = run.data.params.get("model_type")
    aucpr = run.data.metrics.get("AUCPR")
    # Runs that failed or were interrupted before logging have no parameter or
    # metric recorded; skip them rather than raising KeyError.
    if model_name is None or aucpr is None:
        continue
    print(f"{model_name}: AUCPR = {aucpr:.4f}")

AUCPR Scores for Registered Models:
svc: AUCPR = 0.9671
random_forest: AUCPR = 0.7396
logistic_regression: AUCPR = 0.9513
