#### **Importing Required Libraries**

In [1]:
import random
import warnings
from urllib.parse import urlparse
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    auc,
    precision_score, recall_score, f1_score, average_precision_score,confusion_matrix,
    precision_recall_curve,
)
from sklearn.pipeline import make_pipeline

# Set random seed for reproducibility
random.seed(21)

# Suppress warnings
warnings.filterwarnings("ignore")

#### Loading the datasets

In [2]:
# Load datasets
train = pd.read_csv("train.csv").dropna()
test = pd.read_csv("test.csv").dropna()
val = pd.read_csv("validation.csv").dropna()

# Extract features and labels
train_X, train_y = train["Message"], train["Label"]
test_X, test_y = test["Message"], test["Label"]
val_X, val_y = val["Message"], val["Label"]

In [3]:
print("Train Samples:", train_X.shape[0])
print("Test Samples:", test_X.shape[0])
print("Validation Samples:", val_X.shape[0])

Train Samples: 3063
Test Samples: 1021
Validation Samples: 1021


#### Defining AUCPR

In [4]:
def AUCPR(predictions, y_test):
    precision, recall, _ = precision_recall_curve(y_test, predictions)
    return auc(recall, precision)

##### Define benchmark models

In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Support Vector Machine": SVC(probability=True)
}

- ##### Code to clear past mlruns

In [None]:
# import os
# import mlflow

# # Define MLflow tracking directory with correct Windows path
# mlflow_tracking_dir = "E:\Sem 3\AppliedMachineLearning\Assignment 2\mlruns"

# # Set MLflow tracking URI (use `os.path.abspath()` for Windows compatibility)
# mlflow.set_tracking_uri(f"file:///{os.path.abspath(mlflow_tracking_dir).replace('\\', '/')}")

# # Explicitly create or set an experiment
# experiment_name = "Spam Detection Benchmark Models"

# mlflow.set_experiment(experiment_name)

# print(f"MLflow is tracking experiments in: {mlflow_tracking_dir}")

#### Function to evaluate and print metrics

In [7]:
def evaluate_model(model, X_val, y_val, model_name,set_name):
    predictions = model.predict(X_val)
    pred_probs = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None
    
    accuracy = accuracy_score(y_val, predictions) * 100
    precision = precision_score(y_val, predictions) * 100
    recall = recall_score(y_val, predictions) * 100
    f1 = f1_score(y_val, predictions) * 100
    aucpr = average_precision_score(y_val, pred_probs) * 100 if pred_probs is not None else "N/A"
    
    print(f"{model_name}\n\n")
    print(f"On {set_name} Dataset:")
    print(f"Accuracy : {accuracy:.2f}%")
    print(f"Precision : {precision:.2f}%")
    print(f"Recall : {recall:.2f}%")
    print(f"F1 Score : {f1:.2f}%")
    print(f"**** AUCPR : {aucpr}% ****")
    print("\n" + "-" * 50 + "\n")


#### Traning and testing (along with validation set )

In [8]:
# Support Vector Classifier (SVC)
pipeline_svc = make_pipeline(TfidfVectorizer(), SVC(probability=True, random_state=21))
pipeline_svc.fit(train_X, train_y)
evaluate_model(pipeline_svc, test_X, test_y, "Support Vector Classifier (SVC)",set_name="Test")
evaluate_model(pipeline_svc, val_X, val_y, "Support Vector Classifier (SVC)",set_name="validation")

# Logistic Regression
pipeline_lr = make_pipeline(TfidfVectorizer(), LogisticRegression(random_state=21))
pipeline_lr.fit(train_X, train_y)
evaluate_model(pipeline_lr, test_X, test_y, "Logistic Regression",set_name="Test")
evaluate_model(pipeline_lr, val_X, val_y, "Logistic Regression",set_name="validation")

# Random Forest Classifier
pipeline_rf = make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=21, max_depth=60, n_jobs=-1))
pipeline_rf.fit(train_X, train_y)
evaluate_model(pipeline_rf, test_X, test_y, "Random Forest Classifier",set_name="Test")
evaluate_model(pipeline_rf, val_X, val_y, "Random Forest Classifier",set_name="validation")

Support Vector Classifier (SVC)


On Test Dataset:
Accuracy : 97.16%
Precision : 98.02%
Recall : 78.57%
F1 Score : 87.22%
**** AUCPR : 97.07321454294792% ****

--------------------------------------------------

Support Vector Classifier (SVC)


On validation Dataset:
Accuracy : 96.96%
Precision : 100.00%
Recall : 75.59%
F1 Score : 86.10%
**** AUCPR : 96.52037009982595% ****

--------------------------------------------------

Logistic Regression


On Test Dataset:
Accuracy : 95.30%
Precision : 96.43%
Recall : 64.29%
F1 Score : 77.14%
**** AUCPR : 92.84592507038877% ****

--------------------------------------------------

Logistic Regression


On validation Dataset:
Accuracy : 94.71%
Precision : 96.20%
Recall : 59.84%
F1 Score : 73.79%
**** AUCPR : 94.10158012973164% ****

--------------------------------------------------

Random Forest Classifier


On Test Dataset:
Accuracy : 95.89%
Precision : 100.00%
Recall : 66.67%
F1 Score : 80.00%
**** AUCPR : 96.44248703025087% ****

---------

### logging the models

In [9]:
def log_model(pipeline, X_test, y_test, X_val, y_val, model_name):
    with mlflow.start_run(run_name=model_name):

        y_pred_test = pipeline.predict(X_test)
        y_pred_val = pipeline.predict(X_val)
        
        train_accuracy = pipeline.score(X_test, y_test)
        val_accuracy = pipeline.score(X_val, y_val)

        print(f"Training Accuracy: {train_accuracy}")
        print(f"Validation Accuracy: {val_accuracy}")

        mlflow.log_param("model_name", model_name)

        # Logging test metrics
        test_accuracy = accuracy_score(y_test, y_pred_test)
        aucpr_test = AUCPR(y_test, y_pred_test)
        
        mlflow.log_metric("accuracy", test_accuracy)
        mlflow.log_metric("precision", precision_score(y_test, y_pred_test, average='weighted'))
        mlflow.log_metric("recall", recall_score(y_test, y_pred_test, average='weighted'))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred_test, average='weighted'))
        mlflow.log_metric("AUCPR", aucpr_test)
        mlflow.log_dict(np.array(confusion_matrix(y_test, y_pred_test)).tolist(), "confusion_matrix.json")

        print(f"\nModel ({model_name}):")
        print(f"Accuracy: {test_accuracy}")
        print(f"AUCPR: {aucpr_test}")
        print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred_test)} \n")

        # Logging validation AUCPR
        aucpr_val = AUCPR(y_val, y_pred_val)
        mlflow.log_metric("Validation AUCPR", aucpr_val)
        print(f"Validation AUCPR: {aucpr_val}")

        input_example = pd.DataFrame(X_test[:1])  # Ensuring input_example is a DataFrame
        mlflow.sklearn.log_model(pipeline, "model", input_example=input_example)
        mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/{model_name}_val", model_name)


# Logging models Version -1 

In [10]:
# Logging models
log_model(pipeline_svc, test_X, test_y, val_X, val_y, "Support Vector Classifier")
log_model(pipeline_lr, test_X, test_y, val_X, val_y, "Logistic Regression")
log_model(pipeline_rf, test_X, test_y, val_X, val_y, "Random Forest Classifier")

Training Accuracy: 0.9715964740450539
Validation Accuracy: 0.9696376101860921

Model (Support Vector Classifier):
Accuracy: 0.9715964740450539
AUCPR: 0.8839355846876138
Confusion Matrix:
 [[893   2]
 [ 27  99]] 

Validation AUCPR: 0.8779527559055118


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'Support Vector Classifier'.
Created version '1' of model 'Support Vector Classifier'.


Training Accuracy: 0.9529872673849168
Validation Accuracy: 0.9471106758080313

Model (Logistic Regression):
Accuracy: 0.9529872673849168
AUCPR: 0.8050405764656499
Confusion Matrix:
 [[892   3]
 [ 45  81]] 

Validation AUCPR: 0.7816944045472664


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'Logistic Regression'.
Created version '1' of model 'Logistic Regression'.


Training Accuracy: 0.9588638589618022
Validation Accuracy: 0.9657198824681684

Model (Random Forest Classifier):
Accuracy: 0.9588638589618022
AUCPR: 0.8333333333333333
Confusion Matrix:
 [[895   0]
 [ 42  84]] 

Validation AUCPR: 0.8622047244094488


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'Random Forest Classifier'.
Created version '1' of model 'Random Forest Classifier'.


# Logging models Version -2 ( By update in some parameters)


In [11]:
# Update parameters for Version 2
pipeline_svc.set_params(svc__C=3.0, svc__kernel="poly")  # Change C value and use polynomial kernel
pipeline_lr.set_params(logisticregression__C=2.0, logisticregression__solver="saga", logisticregression__max_iter=250)  # Adjust C, solver, and iterations
pipeline_rf.set_params(randomforestclassifier__n_estimators=300, 
                        randomforestclassifier__max_depth=100, 
                        randomforestclassifier__min_samples_split=10)  # More trees, deeper splits

# Retrain models with updated parameters
pipeline_svc.fit(train_X, train_y)
pipeline_lr.fit(train_X, train_y)
pipeline_rf.fit(train_X, train_y)

# Evaluate updated models
evaluate_model(pipeline_svc, test_X, test_y, "Support Vector Classifier (V2)", set_name="Test")
evaluate_model(pipeline_svc, val_X, val_y, "Support Vector Classifier (V2)", set_name="Validation")

evaluate_model(pipeline_lr, test_X, test_y, "Logistic Regression (V2)", set_name="Test")
evaluate_model(pipeline_lr, val_X, val_y, "Logistic Regression (V2)", set_name="Validation")

evaluate_model(pipeline_rf, test_X, test_y, "Random Forest Classifier (V2)", set_name="Test")
evaluate_model(pipeline_rf, val_X, val_y, "Random Forest Classifier (V2)", set_name="Validation")


Support Vector Classifier (V2)


On Test Dataset:
Accuracy : 92.75%
Precision : 100.00%
Recall : 41.27%
F1 Score : 58.43%
**** AUCPR : 95.60903160517886% ****

--------------------------------------------------

Support Vector Classifier (V2)


On Validation Dataset:
Accuracy : 92.16%
Precision : 100.00%
Recall : 37.01%
F1 Score : 54.02%
**** AUCPR : 96.1220593803229% ****

--------------------------------------------------

Logistic Regression (V2)


On Test Dataset:
Accuracy : 96.08%
Precision : 95.74%
Recall : 71.43%
F1 Score : 81.82%
**** AUCPR : 94.77895184749521% ****

--------------------------------------------------

Logistic Regression (V2)


On Validation Dataset:
Accuracy : 96.28%
Precision : 96.84%
Recall : 72.44%
F1 Score : 82.88%
**** AUCPR : 95.2483951622973% ****

--------------------------------------------------

Random Forest Classifier (V2)


On Test Dataset:
Accuracy : 96.57%
Precision : 96.91%
Recall : 74.60%
F1 Score : 84.30%
**** AUCPR : 95.87654617589419% ****

##### Log Version 2 in MLflow

In [12]:
# Log Version 2 in MLflow
log_model(pipeline_svc, test_X, test_y, val_X, val_y, "Support Vector Classifier")
log_model(pipeline_lr, test_X, test_y, val_X, val_y, "Logistic Regression")
log_model(pipeline_rf, test_X, test_y, val_X, val_y, "Random Forest Classifier")

Training Accuracy: 0.9275220372184133
Validation Accuracy: 0.921645445641528

Model (Support Vector Classifier):
Accuracy: 0.9275220372184133
AUCPR: 0.7063492063492063
Confusion Matrix:
 [[895   0]
 [ 74  52]] 

Validation AUCPR: 0.6850393700787402


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Support Vector Classifier' already exists. Creating a new version of this model...
Created version '2' of model 'Support Vector Classifier'.


Training Accuracy: 0.960822722820764
Validation Accuracy: 0.9627815866797258

Model (Logistic Regression):
Accuracy: 0.960822722820764
AUCPR: 0.8378251252571381
Confusion Matrix:
 [[891   4]
 [ 36  90]] 

Validation AUCPR: 0.8478843986194596


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Logistic Regression' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic Regression'.


Training Accuracy: 0.9657198824681684
Validation Accuracy: 0.9676787463271302

Model (Random Forest Classifier):
Accuracy: 0.9657198824681684
AUCPR: 0.8590211033843211
Confusion Matrix:
 [[892   3]
 [ 32  94]] 

Validation AUCPR: 0.8687281062023396


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Random Forest Classifier' already exists. Creating a new version of this model...
Created version '2' of model 'Random Forest Classifier'.


#### Best Model Selection Based on AUCPR

In [None]:
from mlflow.tracking import MlflowClient

def select_best_model(model_names):
    client = MlflowClient()
    print("\n=== Best Model Selection Based on AUCPR ===")
    
    overall_best_model = None
    overall_best_aucpr = -1  # Track the highest AUCPR across all models
    
    for model_name in model_names:
        versions = client.search_model_versions(f"name='{model_name}'")
        best_version = None
        best_aucpr = -1  # Start with a very low AUCPR for each model
        
        for version in versions:
            if version.current_stage in ["Staging", "None"]:  # Include these stages
                run = client.get_run(version.run_id)
                aucpr = float(run.data.metrics.get("AUCPR", 0))  # Get AUCPR
                
                if aucpr > best_aucpr:
                    best_aucpr = aucpr
                    best_version = version  # Store best version
        
        # Print the best AUCPR per model
        if best_version:
            print(f"🔹 Best AUCPR for {model_name}: {best_aucpr}")
            
            # Update overall best model
            if best_aucpr > overall_best_aucpr:
                overall_best_aucpr = best_aucpr
                overall_best_model = (model_name, best_version.version)
        else:
            print(f"⚠️ No valid versions found for {model_name}")

    # Print overall best model
    if overall_best_model:
        print(f"\n🏆 Overall Best Model: {overall_best_model[0]} (Version {overall_best_model[1]}) with AUCPR: {overall_best_aucpr}")
    else:
        print("\n⚠️ No valid models found!")

# Define models to check
model_names = ["Logistic Regression", "Random Forest Classifier", "Support Vector Classifier"]
select_best_model(model_names)



=== Best Model Selection Based on AUCPR ===
🔹 Best AUCPR for Logistic Regression: 0.8378251252571381
🔹 Best AUCPR for Random Forest Classifier: 0.8590211033843211
🔹 Best AUCPR for Support Vector Classifier: 0.8839355846876138

🏆 Overall Best Model: Support Vector Classifier (Version 1) with AUCPR: 0.8839355846876138
