# Binary Spam Classifier
The purpose of this notebook is to evaluate scikit-learn models for binary spam classification.


In [14]:
import os
import uuid

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Single seed reused by every stochastic component in this notebook (the MLP,
# the random forest and the K-fold shuffle) so that results are reproducible.
RANDOM_STATE = 123

## Import Data

In [None]:
# Name of the pickled dataset inside the data directory.
# NOTE(review): "PLACEHOLDER.pkl" looks like a stand-in - point it at the real file before running.
DATASET_FILENAME = "PLACEHOLDER.pkl"
# The data directory is a sibling of the directory the notebook runs from.
DATA_PATH = os.path.join(os.pardir, "data")

In [None]:
# Load the pickled dataset into a DataFrame.
# read_pickle takes a single path as its first argument (the second positional
# argument is `compression`), so the directory and filename must be joined first.
# NOTE: unpickling executes arbitrary code - only load files from a trusted source.
df = pd.read_pickle(os.path.join(DATA_PATH, DATASET_FILENAME))
df.head()

## Define Models
- Use statistical models and a simple neural network for variety
- Notice that we are not using SVC due to computational cost
- Save the models in a dictionary for easier indexing

In [12]:
# Candidate classifiers. Each one must accept the scipy sparse matrices that the
# text vectorizers produce.
logreg = LogisticRegression()
# BernoulliNB replaces GaussianNB here: GaussianNB.fit requires dense input and
# raises on sparse matrices, and densifying text features (HashingVectorizer
# defaults to 2**20 columns) is not practical. BernoulliNB accepts sparse input
# and binarizes features at 0, so it also tolerates the signed values emitted by
# HashingVectorizer's default alternate_sign=True (MultinomialNB would reject them).
nb = BernoulliNB()
knn = KNeighborsClassifier(n_neighbors=5)
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE)
rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE)

# Short name -> estimator; the key is what identifies the model in the evaluation
# results and in the MLflow run names.
models = {"logreg": logreg, "nb": nb, "knn": knn, "nn": nn, "rf": rf}

## Define Vectorizers
- Text input must be converted into numerical input (vectors) before classification
- For computational efficiency, we'll use non-neural vectorizers

In [16]:
# Text -> numeric feature extractors, keyed by the name used to identify them
# during evaluation. All three are created with their default settings.
vectorizers = {
    "count_vectorizer": CountVectorizer(),
    "tfidf_vectorizer": TfidfVectorizer(),
    "hashing_vectorizer": HashingVectorizer(),
}

# Keep the individual instances reachable by name as well.
count_vectorizer = vectorizers["count_vectorizer"]
tfidf_vectorizer = vectorizers["tfidf_vectorizer"]
hashing_vectorizer = vectorizers["hashing_vectorizer"]

## Config MLflow

In [None]:
# Random identifier appended to every run name so that runs produced by one
# execution of this notebook can be told apart from earlier executions.
RUN_UID = uuid.uuid4()
EXPERIMENT_NAME = "binary_classifier_corpus_0"

# Log to the tracking server at this address.
# NOTE(review): assumes an MLflow server is already listening on localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Look the experiment up again and echo its identity as a sanity check.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
for label, value in (
    ("Experiment ID:", experiment.experiment_id),
    ("Experiment Name:", experiment.name),
):
    print(label, value)


## Evaluate Models


- Split the input features (issue content) and target feature (label)
- For binary classification, convert all non-"spam" issues into "not-spam"

In [None]:
# One text document per issue: title and body joined by a space.
# Missing titles/bodies are replaced with '' first, because string + NaN yields
# NaN and the vectorizers raise on NaN documents.
X = df['issue_title'].fillna('') + ' ' + df['issue_body'].fillna('')
# Binary target: anything that is not exactly 'spam' becomes 'not-spam'.
y = df['target'].apply(lambda label: 'spam' if label == 'spam' else 'not-spam')

Evaluate model performance using [K-fold cross validation](https://www.analyticsvidhya.com/blog/2022/02/k-fold-cross-validation-technique-and-its-essentials/#:~:text=K%2Dfold%20cross%2Dvalidation%20is,folds%20are%20used%20for%20training.) to obtain an unbiased estimate of performance before training the final classifier on the full dataset.
- For each classifier model and vectorizer, train and evaluate + save results in MLflow

In [None]:
# Shuffled 5-fold splitter. With an integer random_state every call to
# kf.split() yields the same folds, so all model/vectorizer pairs are scored on
# identical partitions.
# NOTE(review): KFold does not stratify by label; if 'spam' is rare, a fold may
# contain few or no positives - consider StratifiedKFold.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
results = {name: [] for name in models}

# Vectorize each fold once per vectorizer, up front, instead of once per model.
# The vectorizer is fitted on the training texts only and then applied to the
# held-out texts, so no vocabulary/IDF information leaks from the test fold
# into training.
folds_by_vectorizer = {}
for vectorizer_name, vectorizer in vectorizers.items():
    fold_data = []
    for train_index, test_index in kf.split(X):
        X_train = vectorizer.fit_transform(X.iloc[train_index])
        X_test = vectorizer.transform(X.iloc[test_index])
        fold_data.append((X_train, X_test, y.iloc[train_index], y.iloc[test_index]))
    folds_by_vectorizer[vectorizer_name] = fold_data

for model_name, model in models.items():
    for vectorizer_name, fold_data in folds_by_vectorizer.items():
        # Per-fold scores; the keys become the MLflow metric names (prefixed with "avg_").
        fold_scores = {"accuracy": [], "precision": [], "recall": [], "f1_score": []}

        for X_train, X_test, y_train, y_test in fold_data:
            model.fit(X_train, y_train)  # Train the model on this fold's training split
            y_pred = model.predict(X_test)  # Predict on the held-out split

            # 'spam' is the positive class for precision/recall/F1
            fold_scores["accuracy"].append(accuracy_score(y_test, y_pred))
            fold_scores["precision"].append(precision_score(y_test, y_pred, pos_label='spam'))
            fold_scores["recall"].append(recall_score(y_test, y_pred, pos_label='spam'))
            fold_scores["f1_score"].append(f1_score(y_test, y_pred, pos_label='spam'))

        # Average each metric over the folds
        avg_scores = {
            f"avg_{metric}": float(np.mean(values))
            for metric, values in fold_scores.items()
        }

        # One MLflow run per (model, vectorizer) pair, holding the fold-averaged metrics
        with mlflow.start_run(run_name=f"{model_name}_{vectorizer_name}_{RUN_UID}"):
            mlflow.log_params({"model": model_name, "vectorizer": vectorizer_name})
            mlflow.log_metrics(avg_scores)

        # Store the result
        results[model_name].append(avg_scores["avg_accuracy"])

# Print each model's accuracy averaged over all vectorizers
for name, scores in results.items():
    print(f"{name}: {np.mean(scores):.2f}")