# Spam Detection

## Initialization

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Seed NumPy's legacy global random number generator so that any NumPy-based
# randomness in the cells below is repeatable across runs.
np.random.seed(42)

In [4]:
# Read the raw CSV and keep only its first two columns, then give them
# descriptive names: the label ('class') and the message text ('email').
data = pd.read_csv('spam.csv', encoding='ISO-8859-1').iloc[:, :2]
data.columns = ['class', 'email']
data

Unnamed: 0,class,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
## Encode spam as the positive class (1) and ham as the negative class (0).
## A false positive is then a normal/ham message flagged as spam -- the costly error,
## because an important message could be filtered out and missed by the user.

label_by_class = {'ham': 0, 'spam': 1}
data['class'] = data['class'].map(label_by_class)

# Plain NumPy arrays of message texts (X) and their integer labels (y).
X = np.array(data['email'])
y = np.array(data['class'])

In [6]:
# Hold out 20% of the data as the final test set (the split is not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Carve a validation set (15% of the remaining training data) out of the training
# portion. Models are compared on it, since the test set is treated as unavailable
# during model selection.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Drop repeated messages from the training set, keeping the first occurrence
# (and its label) of every distinct text.
# NOTE: only the training split is filtered here; the validation and test
# splits are left as they are.

# dicts preserve insertion order, so the keys come out in first-seen order and
# setdefault() never overwrites the label recorded for the first occurrence.
first_label_by_text = {}
for text, label in zip(X_train, y_train):
    first_label_by_text.setdefault(text, label)

X_train = list(first_label_by_text)
y_train = list(first_label_by_text.values())

## Text Pre-Processing for Modeling

In [8]:
# One-time setup: uncomment and run once to install spaCy and its small English model.
#%pip install spacy
#!python3 -m spacy download en_core_web_sm

In [9]:
import re
import spacy

# Import the stop-word set by its full module path instead of reaching through
# the `spacy.lang.en.stop_words` attribute chain, which only resolves when that
# submodule has already been imported by some other code.
from spacy.lang.en.stop_words import STOP_WORDS

# Load the small English pipeline (requires the en_core_web_sm model to be installed).
nlp = spacy.load("en_core_web_sm")

# English stop words that are filtered out during preprocessing.
stop_words = STOP_WORDS

# Preprocessing function
def preprocess_text(X_train):
    """Tokenize every text into lowercase word tokens with stop words removed.

    Parameters
    ----------
    X_train : iterable of str
        The raw texts to process. Despite its name, this is whichever
        collection the caller passes in, not necessarily the training split.

    Returns
    -------
    list of list of str
        One token list per input text, in the same order as the input.
    """
    def tokenize(text):
        # Lowercase first, then pull out runs of word characters.
        words = re.findall(r'\b\w+\b', text.lower())
        return [word for word in words if word not in stop_words]

    return [tokenize(text) for text in X_train]

# Replace every split's raw texts with their token lists.
X_train, X_test, X_valid = (
    preprocess_text(split) for split in (X_train, X_test, X_valid)
)

In [10]:
# Constructing Bag of Words manually

# Collect every distinct token that occurs in the training documents.
word_set = set()
for document in X_train:
    word_set.update(document)

# Creating a vocabulary mapping each word to a unique index.
# The words are sorted first: iteration order of a set of strings can differ
# between interpreter runs (string hash randomization), which would otherwise
# give the feature columns a different layout on every kernel restart.
word_to_index = {word: index for index, word in enumerate(sorted(word_set))}

def get_bag_of_words(texts, vocabulary=None):
    """Turn tokenized texts into bag-of-words count vectors.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents.
    vocabulary : dict mapping str to int, optional
        Word -> column index. When omitted, the module-level ``word_to_index``
        mapping is used, which matches the original behavior.

    Returns
    -------
    list of list of int
        One count vector per document, each of length ``len(vocabulary)``.
        Words that are not in the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = word_to_index

    # Take the vector length from the same mapping that supplies the indices,
    # so the two can never get out of sync.
    vocab_size = len(vocabulary)

    numeric_texts = []
    for text in texts:
        bow_vector = [0] * vocab_size
        for word in text:
            index = vocabulary.get(word)
            if index is not None:
                bow_vector[index] += 1
        numeric_texts.append(bow_vector)
    return numeric_texts
    
# Vectorize every split against the vocabulary and store the counts as NumPy arrays.
X_train_numeric, X_test_numeric, X_valid_numeric = (
    np.array(get_bag_of_words(split)) for split in (X_train, X_test, X_valid)
)

## Model Evaluation

- Logistic Regression
- Naive Bayes

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit logistic regression on the bag-of-words counts of the training split.
clf = LogisticRegression(random_state=42)
clf.fit(X_train_numeric, y_train)

# Score the training split.
y_pred_train = clf.predict(X_train_numeric)
f1_train = f1_score(y_train, y_pred_train)

# Score the validation split.
y_pred_valid = clf.predict(X_valid_numeric)
f1_valid = f1_score(y_valid, y_pred_valid)

print("Logistic Regression")
print("Training F1 Score:", f1_train)
print("Validation F1 Score:", f1_valid)

Logistic Regression
Training F1 Score: 0.987012987012987
Validation F1 Score: 0.9078947368421053


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes on the same training features.
nb_classifier = MultinomialNB().fit(X_train_numeric, y_train)

# Predict on both splits, then compute F1 for each.
y_pred_train, y_pred_valid = (
    nb_classifier.predict(X_train_numeric),
    nb_classifier.predict(X_valid_numeric),
)
f1_train, f1_valid = (
    f1_score(y_train, y_pred_train),
    f1_score(y_valid, y_pred_valid),
)

print("Multinomial Naive Bayes")
print("Training F1 Score:", f1_train)
print("Validation F1 Score:", f1_valid)

Multinomial Naive Bayes
Training F1 Score: 0.9850746268656716
Validation F1 Score: 0.9308176100628931


In [13]:
# Final check of the selected model on the held-out test split.
y_pred_test = nb_classifier.predict(X_test_numeric)
f1_test = f1_score(y_true=y_test, y_pred=y_pred_test)

print("Multinomial Naive Bayes")
print("Test F1 Score:", f1_test)

Multinomial Naive Bayes
Test F1 Score: 0.9383561643835616


## Summary

Multinomial Naive Bayes was chosen as the final model. Although logistic regression scored slightly higher on the training set (F1 ≈ 0.987 vs. 0.985), Multinomial NB generalized better, with a higher F1 score on the validation set (≈ 0.931 vs. 0.908). On the held-out test set, Multinomial NB reaches an F1 score of ≈ 0.938.