# Assignment 2
## Training and testing
### Muhammed Jassim
### MDS202220

### Importing Libraries

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, accuracy_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

### Load, Vectorize and split data

In [10]:
# Load train, validation, and test data
train_data = pd.read_csv('./data/train.csv')
validation_data = pd.read_csv('./data/validation.csv')
test_data = pd.read_csv('./data/test.csv')

# Initializing the vectorizer
# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# Extract features and target variable
X_train, y_train = vectorizer.fit_transform(train_data['text']), train_data['spam']
X_val, y_val = vectorizer.transform(validation_data['text']), validation_data['spam']
X_test, y_test = vectorizer.transform(test_data['text']), test_data['spam']

### Declare the benchmark model names

In [11]:
models = {
    "random_forest" : RandomForestClassifier,
    "naive_bayes" : MultinomialNB,
    "log_regression" : LogisticRegression 
}

### Define `mlflow` tracking function

In [12]:
def track_mlflow(model_name, x_train, y_train, x_val, y_val):
    with mlflow.start_run(run_name=model_name):
        print(f"Starting {model_name} run")

        # Fitting the model
        model = models[model_name]()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)
        model_results = {
            'accuracy' : accuracy_score(y_val, y_pred),
            'aucpr' : average_precision_score(y_val, y_pred)
        }
        print(f"{model_name} performance:\n {model_results}")

        # Logging model parameters
        mlflow.log_params(model.get_params())

        # Logging model performance metrics
        for metric, value in model_results.items():
            mlflow.log_metric(f"{model_name}_{metric}", value)
        
        # Logging the model
        client = MlflowClient()
        mlflow.sklearn.log_model(model, model_name)

        curr_run_id = mlflow.active_run().info.run_id
        model_uri = f"runs:/{curr_run_id}/{model_name}"

        try:
            client.create_registered_model(model_name)
        except Exception:
            print(f"A model named {model_name} is already registered.")
        
        model_details = client.create_model_version(model_name, model_uri, curr_run_id)
        print(f">>> Model {model_name} (version {model_details.version}) has been registered.")

### Random Forest Classifier

In [13]:
track_mlflow('random_forest', X_train, y_train, X_val, y_val)

Starting random_forest run
random_forest performance:
 {'accuracy': 0.9705561613958561, 'aucpr': 0.901481753296087}
A model named random_forest is already registered.
>>> Model random_forest (version 2) has been registered.


### Multinomial Naive Bayes

In [14]:
track_mlflow('naive_bayes', X_train, y_train, X_val, y_val)

Starting naive_bayes run
naive_bayes performance:
 {'accuracy': 0.88113413304253, 'aucpr': 0.6022781892323515}
A model named naive_bayes is already registered.
>>> Model naive_bayes (version 2) has been registered.


### Logistic Regression

In [15]:
track_mlflow('log_regression', X_train, y_train, X_val, y_val)

Starting log_regression run
log_regression performance:
 {'accuracy': 0.9738276990185387, 'aucpr': 0.910346453190559}
A model named log_regression is already registered.
>>> Model log_regression (version 2) has been registered.


### Testing on Test dataset

In [18]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
model_results = {
    'accuracy' : accuracy_score(y_test, y_pred),
    'aucpr' : average_precision_score(y_test, y_pred)
    }
print(f"Logistic Regression performance on text data: \n{model_results}")

Logistic Regression performance on text data: 
{'accuracy': 0.9729493891797557, 'aucpr': 0.9111268013133108}


After conducting experiments on our dataset, we observed that the *Logistic Regression model* outperformed both the *Multinomial Naive Bayes* and *Random Forest model*. The AUC-PR values achieved of the models were:
* Random Forest Classifier : $0.901481753296087$
* Multinomial Naive Bayes : $0.6022781892323515$
* Logistic Regression : $0.910346453190559$

This indicates that the *Logistic Regression* algorithm is more effective in capturing the complexities of the data and providing better predictive performance, particularly in the context of AUC-PR, a critical metric for imbalanced classification tasks. The performance metrics of this model on the test set is given below:
* Accuracy : $0.9729493891797557$
* AUC-PR : $0.9111268013133108$