# Binary Spam Classifier
The purpose of this notebook is to train scikit-learn binary spam classification models on the full corpus for deployment.


In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import issparse
from sklearn.pipeline import Pipeline


from dotenv import load_dotenv
import os
import mlflow
import mlflow.sklearn
import joblib
import time

In [2]:
# Seed passed as `random_state` to the stochastic estimators defined below,
# so that repeated runs of the notebook train the same models.
RANDOM_STATE = 123

## Import Data

In [3]:
# Pickled DataFrame holding the issue corpus; the path is resolved relative to
# the notebook's working directory (one level up, inside `data/`).
DATASET_FILENAME = "dataset-v1.pkl"
DATA_PATH = os.path.join("..", "data", DATASET_FILENAME)

In [4]:
# NOTE(review): unpickling can execute arbitrary code, so DATA_PATH must only
# ever point at a dataset file produced by a trusted source.
df = pd.read_pickle(DATA_PATH)
# Preview the first rows as a sanity check on the loaded columns.
df.head()

Unnamed: 0,id,url,author,title,body,labels
0,2315914373,https://api.github.com/repos/conda-forge/libav...,traversaro,Missing migrations for libavif pinned in conda...,### Comment:\n\nI was debugging some strange r...,[question]
1,2277197953,https://api.github.com/repos/Gravitate-Health/...,joofio,[UI] MVP2 feedback,UAG\n* search button - no need for click on se...,"[question, MVP3]"
2,2281363246,https://api.github.com/repos/manoj1689/HtmlToP...,manoj1689,All Issues resolved.,"""Welcome to the Repository"".",[bug]
3,2324288726,https://api.github.com/repos/chester-hill-solu...,sai-sy,"Create account rules when broken, no error prompt",,"[bug, design]"
4,2309225227,https://api.github.com/repos/tylerapritchard/l...,tylerapritchard,Nan is Search Results table,Just a general question - is there something w...,[question]


## Preprocess data
- Split the input features (issue content) and target feature (label)
- For binary classification, convert all non-"spam" issues into "not-spam"

In [5]:
# Build one text feature per issue from its title and body.
# Missing values (null rather than an empty string) are replaced with '' first:
# otherwise the concatenation yields NaN for that row, and the text vectorizers
# cannot fit on a non-string document.
X = df['title'].fillna('') + ' ' + df['body'].fillna('')
# Binary target: an issue is 'spam' when 'spam' appears among its labels,
# and 'not-spam' otherwise.
y = df['labels'].apply(lambda labels: 'spam' if 'spam' in labels else 'not-spam')

## Configure MLflow

In [6]:
# Name of the MLflow experiment that the runs in this notebook are logged under.
EXPERIMENT_NAME = "binary_classifier_corpus_0"
# Point the MLflow client at a tracking server on localhost, port 5000.
# This must happen before the experiment is selected below.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

EXPERIMENT_ID = experiment.experiment_id
# Current Unix time in whole seconds, as a string; used as a run-name suffix.
RUN_UID = str(int(time.time()))

print("Experiment ID:",EXPERIMENT_ID)
print("Experiment Name:", EXPERIMENT_NAME)


Experiment ID: 1
Experiment Name: binary_classifier_corpus_0


## Define Models
- Use statistical models and a simple neural network for variety
- Note that SVC is not included because of its computational cost
- Store the models in a dictionary keyed by a short name for easier lookup; their hyperparameters are kept in separate dictionaries

In [7]:
# Hyperparameters are kept in their own dicts (rather than inlined in the
# constructors) so they can be passed around alongside the estimator.
logreg_hyperparameters = dict(
    penalty='l2',
    solver='lbfgs',
    random_state=RANDOM_STATE,
)
nn_hyperparameters = dict(
    learning_rate='adaptive',
    alpha=0.0001,
    max_iter=1000,
    activation='relu',
    solver='adam',
    random_state=RANDOM_STATE,
)
rf_hyperparameters = dict(
    max_depth=None,
    n_estimators=100,
    max_features='sqrt',
    random_state=RANDOM_STATE,
)

# One estimator instance per model family.
logreg = LogisticRegression(**logreg_hyperparameters)
# NOTE(review): GaussianNB is believed to require dense input, while the text
# vectorizers below produce sparse matrices -- confirm before selecting it.
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
nn = MLPClassifier(**nn_hyperparameters)
rf = RandomForestClassifier(**rf_hyperparameters)

# Lookup table: short model name -> estimator instance.
models = dict(logreg=logreg, nb=nb, knn=knn, nn=nn, rf=rf)

## Define Vectorizers
- Text input must be converted into numerical input (vectors) before classification
- For computational efficiency, we'll use the non-neural vectorizers Bag-of-Words and TF-IDF

In [8]:
# Bag-of-Words: raw token counts per document.
count_vectorizer = CountVectorizer()
# TF-IDF weighted token features.
tfidf_vectorizer = TfidfVectorizer()

# Lookup table: vectorizer name -> vectorizer instance.
vectorizers = dict(
    count_vectorizer=count_vectorizer,
    tfidf_vectorizer=tfidf_vectorizer,
)

## Train Model On Full Dataset
- Select the best-performing classifier-vectorizer pair from the experiments
- Save it as a pipeline in the MLflow artifacts folder

In [9]:
def train_and_log_model(X, y, model, model_name, model_params, vectorizer, vectorizer_name):
    """Fit a vectorizer + classifier pipeline and log it to MLflow.

    The pipeline is fitted on ``(X, y)``, then logged and registered as
    ``"<model_name>_<vectorizer_name>"`` in a new run of the experiment
    identified by the notebook-level ``EXPERIMENT_ID``. The run name is the
    registered model name suffixed with the notebook-level ``RUN_UID``.

    Args:
        X: Raw text documents passed to the pipeline's ``fit``.
        y: Target labels passed to the pipeline's ``fit``.
        model: Estimator used as the final ('model') pipeline step.
        model_name: Short model identifier; logged as a parameter and used
            as the artifact path of the logged pipeline.
        model_params: Hyperparameters the caller associates with ``model``.
            Kept for interface compatibility; the parameters actually logged
            are read from the estimator itself via ``model.get_params()``.
        vectorizer: Text vectorizer used as the first ('vectorizer') step.
        vectorizer_name: Short vectorizer identifier; logged as a parameter.

    Returns:
        The ID of the MLflow run the pipeline was logged under.
    """
    registered_model_name = f"{model_name}_{vectorizer_name}"
    run_name = f"{registered_model_name}_{RUN_UID}"

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model)
    ])

    # Training happens before the run is opened, so a failing fit does not
    # create a run.
    pipeline.fit(X, y)

    # The context manager ends the run on exit; no explicit end_run() needed.
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=run_name) as run:
        # Log and register the fitted pipeline (vectorizer + model together).
        mlflow.sklearn.log_model(pipeline, model_name, registered_model_name=registered_model_name)

        # Log the estimator's own parameters without shadowing the
        # `model_params` argument.
        mlflow.log_params(model.get_params())
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("vectorizer_name", vectorizer_name)

        run_id = run.info.run_id

    return run_id

In [10]:
# Chosen classifier: the MLP, together with the name and hyperparameters
# expected by train_and_log_model.
model_params = dict(
    model=nn,
    model_name='nn',
    model_params=nn_hyperparameters,
)
# Chosen vectorizer: TF-IDF.
vectorizer_params = dict(
    vectorizer=tfidf_vectorizer,
    vectorizer_name='tfidf_vectorizer',
)
# Train on the full corpus and keep the MLflow run ID for loading the model later.
run_id = train_and_log_model(X, y, **model_params, **vectorizer_params)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Successfully registered model 'nn_tfidf_vectorizer'.
2024/06/15 13:58:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nn_tfidf_vectorizer, version 1
Created version '1' of model 'nn_tfidf_vectorizer'.


## Import and run the classifier
- Load the trained binary classifier pipeline from the MLflow artifacts of the run above
- Run predictions using `.predict()`

In [11]:
# Load the logged pipeline through MLflow instead of reading model.pkl from a
# hand-built 'mlartifacts/...' path. The runs:/ URI is resolved through the
# configured tracking server, so loading does not depend on the notebook's
# working directory or on how the server lays out its artifact store.
# The artifact path is the model name used when the pipeline was logged.
model_uri = f"runs:/{run_id}/{model_params['model_name']}"
binary_classifier = mlflow.sklearn.load_model(model_uri)

In [12]:
SAMPLE_SPAM_TEXT = "get free bitcoin here"
# predict() expects a collection of documents, so the single sample is wrapped
# in a list and the first (only) prediction is taken.
sample_batch = [SAMPLE_SPAM_TEXT]
spam_prediction = binary_classifier.predict(sample_batch)[0]
spam_prediction

'spam'