# Benchmarking of various models for spam detection.

In [90]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/EECS_6448/Data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/EECS_6448/Data


In [91]:

from __future__ import annotations
import os
import warnings
import pandas as pd
from typing import Callable, Tuple

warnings.filterwarnings('ignore')

# Data Preprocessing & NLP
import nltk
import unicodedata
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Download NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
# Function to clean and preprocess text
def clean_text(text: str) -> str:
    STOPWORDS = set(stopwords.words('english'))
    TOK = nltk.tokenize.toktok.ToktokTokenizer()
    PORTER_STEMMER = nltk.PorterStemmer()
    WHITELIST = string.digits + string.whitespace + string.ascii_letters

    # Replace accented chars with normal form
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').strip()

    # Remove double space
    while '  ' in text:
        text = text.replace('  ', ' ')

    # Keep only A-Z, a-z, 0-9, space
    text = ''.join(c for c in text if c in WHITELIST)

    tokens = [t.strip() for t in TOK.tokenize(text.lower())]
    tokens = [t for t in tokens if t not in STOPWORDS]
    text = ' '.join([PORTER_STEMMER.stem(word) for word in tokens])

    return text

In [93]:
# Function to load and preprocess data
def load_and_preprocess_data(file_path: str, target_column: str) -> pd.DataFrame:
    df = pd.read_csv(file_path, names=["App", "Source", "Author", "ReviewID", "Language", "Unnamed", "Star", "Date", "User", "Summary", "Review", "Category"])
    df = df.drop("Unnamed", axis=1)
    df['Review'] = df['Summary'] + ' ' + df['Review']
    df.fillna('nan', inplace=True)
    df['Review_proc'] = df['Review'].apply(clean_text)
    df = df.drop("Summary", axis=1)
    df['Target'] = df[target_column].apply(lambda x: 0 if 's' in str(x) else 1)
    return df

In [94]:
# Function to balance the dataset
def balance_dataset(df: pd.DataFrame, target_column: str) -> pd.DataFrame:
    class_counts = df[target_column].value_counts()
    min_count = class_counts.min()
    df = pd.concat([df[df[target_column] == label].sample(min_count) for label in df[target_column].unique()])
    df = df.sample(frac=1).reset_index(drop=True)
    return df

In [95]:
# Function to prepare data for modeling
def prepare_data_for_modeling(df: pd.DataFrame, target_column: str) -> Tuple:
    X = df['Review_proc']
    y = df['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
    tfidf_vec.fit(df['Review_proc'].to_numpy())

    X_train = tfidf_vec.transform(X_train.to_numpy())
    y_train = y_train.to_numpy()

    X_test = tfidf_vec.transform(X_test.to_numpy())
    y_test = y_test.to_numpy()

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.fit_transform(y_test)

    return X_train, X_test, y_train, y_test

In [96]:
# Function to train and evaluate a model
def train_and_evaluate_model(model: Callable, X_train, y_train, X_test, y_test) -> float:
    classifier = model()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [97]:
def test_multiple_models(file_path: str, category_column: str, target_column: str):
    df = load_and_preprocess_data(file_path, category_column)
    df = balance_dataset(df, target_column)
    X_train, X_test, y_train, y_test = prepare_data_for_modeling(df, target_column)

    models = {
        "RandomForest": RandomForestClassifier,
        "MultinomialNB": MultinomialNB,
        "SVM": SVC,
        "LogisticRegression": LogisticRegression,
        "KNeighbors": KNeighborsClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "AdaBoost": AdaBoostClassifier,
        "MLPClassifier": MLPClassifier,
    }

    for model_name, model_class in models.items():
        print(f"Training {model_name}...")
        classifier = model_class()
        classifier.fit(X_train, y_train)

        print("Making predictions on the test set...")
        y_pred = classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} - Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))

# Example usage
test_multiple_models('Data/spam.csv', 'Category', 'Target')


Training RandomForest...
Making predictions on the test set...
RandomForest - Accuracy: 0.7255
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.96      0.78        79
           1       0.92      0.47      0.62        74

    accuracy                           0.73       153
   macro avg       0.79      0.72      0.70       153
weighted avg       0.79      0.73      0.71       153

Training MultinomialNB...
Making predictions on the test set...
MultinomialNB - Accuracy: 0.7320
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.71      0.73        79
           1       0.71      0.76      0.73        74

    accuracy                           0.73       153
   macro avg       0.73      0.73      0.73       153
weighted avg       0.73      0.73      0.73       153

Training SVM...
Making predictions on the test set...
SVM - Accuracy: 0.6209
Classification Report:
        