# Binary Spam Classifier
The purpose of this notebook is to evaluate scikit-learn models for binary spam classification.


In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import issparse


from dotenv import load_dotenv
import os
import mlflow
import mlflow.sklearn
import time

In [2]:
RANDOM_STATE = 123

## Import Data

In [3]:
DATASET_FILENAME = "dataset-v1.pkl"
DATA_PATH = os.path.join("..", "data", DATASET_FILENAME)

In [4]:
df = pd.read_pickle(DATA_PATH)
df.head()

Unnamed: 0,id,url,author,title,body,labels
0,2315914373,https://api.github.com/repos/conda-forge/libav...,traversaro,Missing migrations for libavif pinned in conda...,### Comment:\n\nI was debugging some strange r...,[question]
1,2277197953,https://api.github.com/repos/Gravitate-Health/...,joofio,[UI] MVP2 feedback,UAG\n* search button - no need for click on se...,"[question, MVP3]"
2,2281363246,https://api.github.com/repos/manoj1689/HtmlToP...,manoj1689,All Issues resolved.,"""Welcome to the Repository"".",[bug]
3,2324288726,https://api.github.com/repos/chester-hill-solu...,sai-sy,"Create account rules when broken, no error prompt",,"[bug, design]"
4,2309225227,https://api.github.com/repos/tylerapritchard/l...,tylerapritchard,Nan is Search Results table,Just a general question - is there something w...,[question]


## Preprocess data
- Split the input features (issue content) and target feature (label)
- For binary classification, convert all non-"spam" issues into "not-spam"

In [5]:
X = df['title'] + ' ' + df['body']
y = df['labels'].apply(lambda x: 'spam' if 'spam' in x else 'not-spam')

## Config MLflow

In [6]:
mlflow.set_tracking_uri("http://localhost:5000")

EXPERIMENT_NAME = "binary_classifier_corpus_0"
RUN_UID = str(int(time.time()))
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
print("Experiment ID:", experiment.experiment_id)
print("Experiment Name:", experiment.name)


2024/06/15 00:15:26 INFO mlflow.tracking.fluent: Experiment with name 'binary_classifier_corpus_0' does not exist. Creating a new experiment.


Experiment ID: 1
Experiment Name: binary_classifier_corpus_0


## Define Models
- Use statistical models and a simple neural network for variety
- Notice that we are not using SVC due to computational cost
- Save the models in a dictionary with their corresponding hyperparameters for easier indexing

In [7]:
logreg_hyperparameters = {
    'penalty': 'l2',
    'solver': 'lbfgs',
    'random_state': RANDOM_STATE
}
nn_hyperparameters = {
    'learning_rate': 'adaptive',
    'alpha': 0.0001,
    'max_iter': 1000,
    'activation': 'relu',
    'solver': 'adam',
    'random_state': RANDOM_STATE
}
rf_hyperparameters = {
    'max_depth': None,
    'n_estimators': 100, 
    'max_features': 'sqrt',
    'random_state': RANDOM_STATE
}

logreg = LogisticRegression(**logreg_hyperparameters)
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
nn = MLPClassifier(**nn_hyperparameters)
rf = RandomForestClassifier(**rf_hyperparameters)

models = {
    "logreg": (logreg, logreg_hyperparameters),
    "nb": (nb, {}),
    "knn": (knn, {"n_neighbors": 5}),
    "nn": (nn, nn_hyperparameters),
    "rf": (rf, rf_hyperparameters)
}

## Define Vectorizers
- Text input must be converted into numerical input (vectors) before classification
- For computational efficiency, we'll use non-neural vectorizers Bag-of-Words and TFIDF

In [8]:
count_vectorizer = CountVectorizer() # Bag-of-Words vectorizer
tfidf_vectorizer = TfidfVectorizer()

vectorizers = {"count_vectorizer": count_vectorizer, "tfidf_vectorizer": tfidf_vectorizer}

## Run Experiments
- Evaluate model performance using [K-fold cross validation](https://www.analyticsvidhya.com/blog/2022/02/k-fold-cross-validation-technique-and-its-essentials/#:~:text=K%2Dfold%20cross%2Dvalidation%20is,folds%20are%20used%20for%20training.) as an unbiased measure of performance before training final classifier on full dataset
- For each classifier model and vectorizer, train and evaluate + save results in MLflow

In [9]:
N_SPLITS = 10

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
results = {(model_name, vectorizer_name): [] for model_name in models for vectorizer_name in vectorizers}

for model_name, (model, hyperparams) in models.items():
    for vectorizer_name, vectorizer in vectorizers.items():
        print(f"Evaluating {model_name} with {vectorizer_name}")
        X_vectorized = vectorizer.fit_transform(X)  # Vectorize the input text
        
        if issparse(X_vectorized) and isinstance(model, GaussianNB):
            X_vectorized = X_vectorized.toarray()  # Convert to dense array

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_vectorized)):
            X_train, X_test = X_vectorized[train_index], X_vectorized[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)  # Train the model
            y_pred = model.predict(X_test)  # Predict on the test set

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, pos_label='spam')
            recall = recall_score(y_test, y_pred, pos_label='spam')
            f1 = f1_score(y_test, y_pred, pos_label='spam')

            # Accumulate metrics
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)

        # Log final metrics for the model after all folds
        with mlflow.start_run(run_name=f"{model_name}_{vectorizer_name}_{RUN_UID}"):
            avg_accuracy = np.mean(accuracies)
            avg_precision = np.mean(precisions)
            avg_recall = np.mean(recalls)
            avg_f1 = np.mean(f1_scores)

            mlflow.log_param("model", model_name)
            mlflow.log_param("vectorizer", vectorizer_name)
            mlflow.log_metrics({
                "avg_accuracy": avg_accuracy,
                "avg_precision": avg_precision,
                "avg_recall": avg_recall,
                "avg_f1_score": avg_f1
            })

            # Log hyperparameters
            mlflow.log_params(hyperparams)

            # Store the result
            results[(model_name, vectorizer_name)].append(avg_accuracy)

# Print the average accuracy for each model and vectorizer
for (model_name, vectorizer_name), accuracies in results.items():
    print(f"Model: {model_name}, Vectorizer: {vectorizer_name}, Average Accuracy: {np.mean(accuracies)}")

Evaluating logreg with count_vectorizer


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Evaluating logreg with tfidf_vectorizer
Evaluating nb with count_vectorizer
Evaluating nb with tfidf_vectorizer
Evaluating knn with count_vectorizer
Evaluating knn with tfidf_vectorizer
Evaluating nn with count_vectorizer
Evaluating nn with tfidf_vectorizer
Evaluating rf with count_vectorizer
Evaluating rf with tfidf_vectorizer
Model: logreg, Vectorizer: count_vectorizer, Average Accuracy: 0.9621059268600252
Model: logreg, Vectorizer: tfidf_vectorizer, Average Accuracy: 0.9637512760463579
Model: nb, Vectorizer: count_vectorizer, Average Accuracy: 0.9352068696330992
Model: nb, Vectorizer: tfidf_vectorizer, Average Accuracy: 0.9264216657659281
Model: knn, Vectorizer: count_vectorizer, Average Accuracy: 0.7836425869212753
Model: knn, Vectorizer: tfidf_vectorizer, Average Accuracy: 0.947817210112292
Model: nn, Vectorizer: count_vectorizer, Average Accuracy: 0.9686993334534317
Model: nn, Vectorizer: tfidf_vectorizer, Average Accuracy: 0.9725334774515103
Model: rf, Vectorizer: count_vectoriz