# Binary Spam Classifier Experiments
The purpose of this notebook is to evaluate scikit-learn models for binary spam classification.


In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import issparse


from dotenv import load_dotenv
import os
import mlflow
import mlflow.sklearn
import time

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import string

In [2]:
RANDOM_STATE = 123

## Import Data

In [3]:
DATASET_FILENAME = "dataset-v2.pkl"
DATA_PATH = os.path.join("..", "data", DATASET_FILENAME)

In [4]:
df = pd.read_pickle(DATA_PATH)
df.head()

Unnamed: 0,id,url,author,title,body,labels
0,2172615896,https://api.github.com/repos/lindsaypj/portfol...,lindsaypj,Clicking accordion topic updates terminal text...,,"[bug, pending MERGE]"
1,2320286757,https://api.github.com/repos/JosefNemec/Playni...,fm-117,[IGDB Metadata] Some games cannot be found,When looking for metadata using the IGDB searc...,[bug]
2,2242364830,https://api.github.com/repos/nomicflux/scrpg-h...,nomicflux,EnergyChoice needs to handle Physical for Powe...,,"[bug, enhancement]"
3,2309230756,https://api.github.com/repos/fluentcms/FluentC...,pournasserian,Why should we have `StopPropagation` property ...,,"[enhancement, question]"
4,2229948881,https://api.github.com/repos/hpsaturn/ESPNowCa...,beniroquai,Many-to-one connection,"Hey @hpsaturn, I stumbled upon your project wh...","[enhancement, question]"


## Preprocess data
- Split the input features (issue content) and target feature (label)
- For binary classification, convert all non-"spam" issues into "not-spam"

In [5]:
X = df['title'] + ' ' + df['body']
y = df['labels'].apply(lambda x: 'spam' if 'spam' in x else 'not-spam')

## Config MLflow

In [6]:
EXPERIMENT_NAME = "binary_classifier_corpus_2"
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

EXPERIMENT_ID = experiment.experiment_id
RUN_UID = str(int(time.time()))

print("Experiment ID:",EXPERIMENT_ID)
print("Experiment Name:", EXPERIMENT_NAME)

Experiment ID: 2
Experiment Name: binary_classifier_corpus_2


## Define Models
- Use statistical models and a simple neural network for variety
- Notice that we are not using SVC due to computational cost
- Save the models in a dictionary with their corresponding hyperparameters for easier indexing

In [7]:
logreg_hyperparameters = {
    'penalty': 'l2',
    'solver': 'lbfgs',
    'random_state': RANDOM_STATE
}
nn_hyperparameters = {
    'learning_rate': 'adaptive',
    'alpha': 0.0001,
    'max_iter': 200,
    'activation': 'relu',
    'solver': 'adam',
    'random_state': RANDOM_STATE
}
rf_hyperparameters = {
    'max_depth': None,
    'n_estimators': 100, 
    'max_features': 'sqrt',
    'random_state': RANDOM_STATE
}

logreg = LogisticRegression(**logreg_hyperparameters)
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
nn = MLPClassifier(**nn_hyperparameters)
rf = RandomForestClassifier(**rf_hyperparameters)

models = {
    "logreg": logreg, 
    # "nb": nb, 
    "knn": knn, 
    "nn": nn, 
    "rf": rf
}

## Define Vectorizers
- Text input must be converted into numerical input (vectors) before classification
- For computational efficiency, we'll use non-neural vectorizers Bag-of-Words and TFIDF

In [8]:
STOP_WORDS = set(stopwords.words('english') + list(string.punctuation))
STOP_WORDS = list(STOP_WORDS)

vectorizer_params = {
    'lowercase': True,
    'min_df': 5,
    'max_features': 25000
}
count_vectorizer = CountVectorizer(stop_words=STOP_WORDS, **vectorizer_params) # Bag-of-Words vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=STOP_WORDS, **vectorizer_params)

vectorizers = {"count_vectorizer": count_vectorizer, "tfidf_vectorizer": tfidf_vectorizer}

## Run Experiments
- Evaluate model performance using [K-fold cross validation](https://www.analyticsvidhya.com/blog/2022/02/k-fold-cross-validation-technique-and-its-essentials/#:~:text=K%2Dfold%20cross%2Dvalidation%20is,folds%20are%20used%20for%20training.) as an unbiased measure of performance before training final classifier on full dataset
- For each classifier model and vectorizer, train and evaluate + save results in MLflow

In [9]:
N_SPLITS = 10

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
results = {(model_name, vectorizer_name): [] for model_name in models for vectorizer_name in vectorizers}

for model_name, model in models.items():
    for vectorizer_name, vectorizer in vectorizers.items():
        print(f"Evaluating {model_name} with {vectorizer_name}")
        X_vectorized = vectorizer.fit_transform(X)  # Vectorize the text input

        if issparse(X_vectorized) and isinstance(model, GaussianNB):
            X_vectorized = X_vectorized.toarray()  # Convert to dense array

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_vectorized)):
            print("fold", fold)
            X_train, X_test = X_vectorized[train_index], X_vectorized[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)  # Train the model
            y_pred = model.predict(X_test)  # Predict on the test set

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, pos_label='spam')
            recall = recall_score(y_test, y_pred, pos_label='spam')
            f1 = f1_score(y_test, y_pred, pos_label='spam')

            # Accumulate metrics
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)

        # Log final metrics for the model after all folds
        with mlflow.start_run(run_name=f"{model_name}_{vectorizer_name}_{RUN_UID}") as run:
            avg_accuracy = np.mean(accuracies)
            avg_precision = np.mean(precisions)
            avg_recall = np.mean(recalls)
            avg_f1 = np.mean(f1_scores)

            mlflow.log_param("model", model_name)
            mlflow.log_param("vectorizer", vectorizer_name)
            mlflow.log_metrics({
                "avg_accuracy": avg_accuracy,
                "avg_precision": avg_precision,
                "avg_recall": avg_recall,
                "avg_f1_score": avg_f1
            })

            # Log hyperparameters
            model_params = model.get_params()
            mlflow.log_params(model_params)
            if model_name != 'rf':
                mlflow.log_params(vectorizer_params) # vectorizer params conflict with rf params

            results[(model_name, vectorizer_name)].append(avg_accuracy)

for (model_name, vectorizer_name), accuracies in results.items():
    print(f"Model: {model_name}, Vectorizer: {vectorizer_name}, Average Accuracy: {np.mean(accuracies)}")

Evaluating rf with count_vectorizer
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Evaluating rf with tfidf_vectorizer
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model: rf, Vectorizer: count_vectorizer, Average Accuracy: 0.9830426693512878
Model: rf, Vectorizer: tfidf_vectorizer, Average Accuracy: 0.9834627339999408
