In [1]:
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, classification_report, ConfusionMatrixDisplay, confusion_matrix, accuracy_score, precision_recall_curve, auc
from sklearn.naive_bayes import MultinomialNB
import mlflow
from urllib.parse import urlparse
import numpy as np
from sklearn.svm import SVC
import mlflow
from mlflow.tracking import MlflowClient
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

import json
import warnings

warnings.simplefilter("ignore")

In [2]:
# Loading the datasets
train = pd.read_csv("train.csv")
train_X, train_y = train.transformed_text, train.Spam

test = pd.read_csv("test.csv")
test_X, test_y = test.transformed_text, test.Spam

val = pd.read_csv("validation.csv")
val_X, val_y = val.transformed_text, val.Spam

In [3]:
def eval_metrics(actual_y, pred_y):
    precision, recall, _ = precision_recall_curve(actual_y, pred_y)
    return auc(recall, precision)

In [4]:
def training_and_benchmarking_with_mlflow(model, model_name, train_X=train_X, train_y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, test_y=test_y):
    #Trains and benchmarks model
    print(f"\nTraining and Evaluating {model_name}:\n")
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000)),
        ('clf', model),
    ])

    pipeline.fit(train_X, train_y)

    train_score = pipeline.score(train_X, train_y)
    validation_score = pipeline.score(val_X, val_y)

    print(f'Training Accuracy: {train_score}')
    print(f'Validation Accuracy: {validation_score}')

    y_pred = pipeline.predict(test_X)
    aucpr = eval_metrics(test_y, y_pred)
    acc = accuracy_score(test_y, y_pred)
    conf = confusion_matrix(test_y, y_pred)

    with mlflow.start_run(run_name=f"{model_name}"):
        mlflow.log_param("tfidf__max_features", 2000)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("AUCPR",aucpr)
        mlflow.log_dict(np.array(conf).tolist(), "confusion_matrix.json")
        mlflow.sklearn.log_model(pipeline, "model")

        print("\nModel (tfidf__max_features={:f}):".format(2000))
        print(f"Accuracy: {acc}")
        print(f"AUCPR: {aucpr} ")
        print(f"Confusion Matrix:\n {conf} \n \n")

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="sklearn-model",
            registered_model_name=f"{model_name}-classification-model"
        )
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(pipeline, "model", registered_model_name=f"{model_name}")
        else:
            mlflow.sklearn.log_model(pipeline, "model")



In [5]:
# Defining models
LR=LogisticRegression(random_state=42, max_iter=2000)
NB=MultinomialNB()
SVM=SVC(random_state=42, probability=True)

In [6]:
mlflow.set_tracking_uri("http://localhost:5000")

In [7]:
training_and_benchmarking_with_mlflow(NB, "Naive Bayes")


Training and Evaluating Naive Bayes:

Training Accuracy: 0.9812920927912198
Validation Accuracy: 0.9755529685681025

Model (tfidf__max_features=2000.000000):
Accuracy: 0.986046511627907
AUCPR: 0.9727191413237926 
Confusion Matrix:
 [[659   6]
 [  6 189]] 
 



Registered model 'Naive Bayes-classification-model' already exists. Creating a new version of this model...
2024/02/23 15:31:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Naive Bayes-classification-model, version 2
Created version '2' of model 'Naive Bayes-classification-model'.
Registered model 'Naive Bayes' already exists. Creating a new version of this model...
2024/02/23 15:31:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Naive Bayes, version 2
Created version '2' of model 'Naive Bayes'.


In [8]:
training_and_benchmarking_with_mlflow(LR,"Logistic Regression")


Training and Evaluating Logistic Regression:

Training Accuracy: 0.9942629084559741
Validation Accuracy: 0.9778812572759022

Model (tfidf__max_features=2000.000000):
Accuracy: 0.9918604651162791
AUCPR: 0.9854721777610395 
Confusion Matrix:
 [[664   1]
 [  6 189]] 
 



Registered model 'Logistic Regression-classification-model' already exists. Creating a new version of this model...
2024/02/23 15:31:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression-classification-model, version 2
Created version '2' of model 'Logistic Regression-classification-model'.
Registered model 'Logistic Regression' already exists. Creating a new version of this model...
2024/02/23 15:31:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression, version 2
Created version '2' of model 'Logistic Regression'.


In [9]:
training_and_benchmarking_with_mlflow(SVM,"Support Vector Machine")


Training and Evaluating Support Vector Machine:

Training Accuracy: 0.9995011224744326
Validation Accuracy: 0.9848661233993015

Model (tfidf__max_features=2000.000000):
Accuracy: 0.9941860465116279
AUCPR: 0.9888972391789462 
Confusion Matrix:
 [[663   2]
 [  3 192]] 
 



Registered model 'Support Vector Machine-classification-model' already exists. Creating a new version of this model...
2024/02/23 15:32:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine-classification-model, version 2
Created version '2' of model 'Support Vector Machine-classification-model'.
Registered model 'Support Vector Machine' already exists. Creating a new version of this model...
2024/02/23 15:32:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine, version 2
Created version '2' of model 'Support Vector Machine'.


Support Vector Machine gives us the highest Accuracy and AUCPR score and is hence the best choice of model for this experiment.
