# E-mail Spam Filtering


Importing the required libraries.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV
import os
import time

# 1) Data Preparation
Downloading and extracting the Ling-Spam dataset.

In [None]:
# Download the Ling-Spam archive with wget.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for
# the download; drop the flag once the host's certificate chain validates in this
# environment.
!wget http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz --no-check-certificate
# Extract (-x) the gzip-compressed (-z) archive file (-f) into the working directory.
!tar -xzf lingspam_public.tar.gz

--2023-11-01 00:41:21--  http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www.aueb.gr (www.aueb.gr)... 195.251.255.156
Connecting to www.aueb.gr (www.aueb.gr)|195.251.255.156|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://pages.aueb.gr/users/ion/data/lingspam_public.tar.gz [following]
--2023-11-01 00:41:21--  http://pages.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving pages.aueb.gr (pages.aueb.gr)... 195.251.255.230
Connecting to pages.aueb.gr (pages.aueb.gr)|195.251.255.230|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz [following]
--2023-11-01 00:41:22--  https://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www2.aueb.gr (www2.aueb.gr)... 195.251.255.230
Connecting to www2.aueb.gr (www2.aueb.gr)|195.251.255.230|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request 

Preparing the data: extracting the emails and their labels from the files in the dataset.

In [None]:
# Path to the parent directory whose sub-folders (e.g. "part10") hold the message files
parent_dir = "lingspam_public/lemm_stop"

# Parallel lists: one entry per e-mail message and its label (1 = spam, 0 = not spam)
train_emails = []
train_labels = []
test_emails = []
test_labels = []

# Iterate over each sub-folder. The listing is sorted because os.listdir returns
# entries in arbitrary order, which would make the sample order irreproducible.
for folder in sorted(os.listdir(parent_dir)):
    folder_path = os.path.join(parent_dir, folder)

    # Skip anything at the top level that is not a directory
    if not os.path.isdir(folder_path):
        continue

    # If the folder is "part10", use it for testing, otherwise use it for training
    if folder == "part10":
        email_bucket, label_bucket = test_emails, test_labels
    else:
        email_bucket, label_bucket = train_emails, train_labels

    # Iterate over each file in the sub-folder
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Skip nested directories and other non-regular entries
        if not os.path.isfile(file_path):
            continue

        # Each file holds ONE message, so it is read whole and stored as a single
        # sample. Splitting it with readlines() would turn the subject line, the
        # blank separator line and the body into separate "emails", inflating the
        # sample count and adding empty documents that share the message's label.
        # The encoding is explicit so the result does not depend on the platform
        # default; undecodable bytes are replaced instead of aborting the load.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            email_bucket.append(f.read())

        # If the filename starts with 'spmsg', it is considered spam, otherwise it's not
        label_bucket.append(1 if filename.startswith("spmsg") else 0)


# 2) Feature selection using the information gain
Feature selection using the information gain (IG) metric.

In [None]:
# Vectorize the emails into raw term counts.
# NOTE: despite its name, this object is a CountVectorizer, not a TF-IDF vectorizer;
# the name is kept so that any other cell referring to it keeps working.
tfidf_vectorizer = CountVectorizer()
X = tfidf_vectorizer.fit_transform(train_emails)

# Vocabulary terms indexed by column of X. This does not change between
# iterations, so it is fetched once instead of inside the loop.
feature_names = tfidf_vectorizer.get_feature_names_out()

# The mutual-information score of each term does not depend on N, so the
# (expensive) scoring pass is done once and reused for every N below.
mi_scores = mutual_info_classif(X, train_labels)

# Perform feature selection, keeping the N highest-scoring terms
for N in [10, 100, 1000]:
    # SelectKBest accepts a score function that returns a single array of scores;
    # handing it the precomputed scores avoids re-running mutual_info_classif.
    selector = SelectKBest(lambda _X, _y: mi_scores, k=N)
    X_new = selector.fit_transform(X, train_labels)

    # Map the selected column indices back to their terms
    selected_features = [feature_names[i] for i in selector.get_support(indices=True)]
    print(f"Top {N} features: {selected_features}")


Top 10 features: ['business', 'click', 'free', 'language', 'linguistic', 'market', 'money', 'our', 'remove', 'university']
Top 100 features: ['100', '20', 'abstract', 'ad', 'address', 'advertise', 'amaze', 'anywhere', 'best', 'bonus', 'bulk', 'business', 'buy', 'card', 'cash', 'cd', 'check', 'click', 'com', 'company', 'conference', 'cost', 'credit', 'customer', 'day', 'discussion', 'dollar', 'earn', 'easy', 'edu', 'email', 'english', 'ever', 'every', 'fax', 'financial', 'free', 'fun', 'grammar', 'guarantee', 'here', 'home', 'hour', 'hundred', 'income', 'instruction', 'internet', 'investment', 'language', 'linguist', 'linguistic', 'linguistics', 'list', 'live', 'll', 'mail', 'mailing', 'market', 'million', 'money', 'month', 'name', 'need', 'offer', 'online', 'order', 'our', 'over', 'papers', 'pay', 'product', 'profit', 'program', 'purchase', 'receive', 'remove', 'report', 'sale', 'save', 'sell', 'send', 'service', 'simply', 'speaker', 'start', 'success', 'syntax', 'theory', 'thousand', 

# 3) Implementing Classifiers
Implementing the following classifiers:

- Bernoulli NB classifier with binary features;

- Multinomial NB with binary features; and

- Multinomial NB with term frequency (TF) features.


In [None]:
# Vectorize the emails with binary (term present / absent) features
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_emails)
X_test = vectorizer.transform(test_emails)

# Vectorize the emails with term frequency (raw count) features
vectorizer_tf = CountVectorizer()
X_train_tf = vectorizer_tf.fit_transform(train_emails)
X_test_tf = vectorizer_tf.transform(test_emails)

# Define the classifiers
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "MultinomialNB_binary": MultinomialNB(),
    "MultinomialNB_tf": MultinomialNB()
}

# Explicitly pair each classifier with the (train, test) matrices it must use.
# Overwriting X_train / X_test inside the loop instead would only give correct
# results while the TF classifier happens to be iterated last.
feature_sets = {
    "BernoulliNB": (X_train, X_test),
    "MultinomialNB_binary": (X_train, X_test),
    "MultinomialNB_tf": (X_train_tf, X_test_tf),
}

# For each classifier and for N = {10, 100, 1000}
for classifier_name, classifier in classifiers.items():
    print(f"Classifier: {classifier_name}")
    train_matrix, test_matrix = feature_sets[classifier_name]

    # Train on the training folders and report spam precision and spam recall
    # on the held-out test set (no cross-validation is performed here).
    for N in [10, 100, 1000]:
        # The reported latency covers feature selection, training, prediction
        # and scoring for this (classifier, N) combination.
        start_time = time.time()
        selector = SelectKBest(mutual_info_classif, k=N)
        X_train_new = selector.fit_transform(train_matrix, train_labels)
        # Apply the selection learned on the training data to the test data
        X_test_new = selector.transform(test_matrix)
        classifier.fit(X_train_new, train_labels)
        y_pred = classifier.predict(X_test_new)
        # pos_label=1 makes both metrics refer to the spam class
        precision = precision_score(test_labels, y_pred, pos_label=1)
        recall = recall_score(test_labels, y_pred, pos_label=1)
        latency = time.time() - start_time
        print(f"Top {N} features:")
        print(f"Precision = {precision}, Recall = {recall}, Latency = {latency} seconds")


Classifier: BernoulliNB
Top 10 features:
Precision = 0.8846153846153846, Recall = 0.3129251700680272, Latency = 124.10886311531067 seconds
Top 100 features:
Precision = 0.9565217391304348, Recall = 0.29931972789115646, Latency = 119.53114986419678 seconds
Top 1000 features:
Precision = 1.0, Recall = 0.30612244897959184, Latency = 119.71840333938599 seconds
Classifier: MultinomialNB_binary
Top 10 features:
Precision = 0.8846153846153846, Recall = 0.3129251700680272, Latency = 121.85881352424622 seconds
Top 100 features:
Precision = 0.9811320754716981, Recall = 0.35374149659863946, Latency = 120.03866958618164 seconds
Top 1000 features:
Precision = 1.0, Recall = 0.46258503401360546, Latency = 124.37408089637756 seconds
Classifier: MultinomialNB_tf
Top 10 features:
Precision = 0.8666666666666667, Recall = 0.35374149659863946, Latency = 123.65640473365784 seconds
Top 100 features:
Precision = 0.95, Recall = 0.3877551020408163, Latency = 122.05910515785217 seconds
Top 1000 features:
Precisi

# 4) SVM Spam Filter
Designing a Support Vector Machine (SVM)-based spam filter.

In [None]:
# Vectorize the emails with TF-IDF features
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_emails)
X_test = vectorizer.transform(test_emails)

# Define the parameter grid.
# Only C is searched: 'gamma' is a kernel coefficient for the rbf/poly/sigmoid
# kernels and is ignored by the linear kernel used below, so including it would
# multiply the number of fits without changing any score.
param_grid = {'C': [0.1, 1, 10, 100]}

# Initialize the SVM classifier
svm = SVC(kernel='linear')  # Linear Kernel

# Use GridSearchCV for hyperparameter tuning (5-fold CV, selecting by accuracy)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')

# The reported latency covers the grid search (including the final refit),
# prediction on the test set and scoring.
start_time = time.time()
grid_search.fit(X_train, train_labels)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_svm = grid_search.best_estimator_

# Predict the test set results
y_pred = best_svm.predict(X_test)

# Calculate precision and recall for the spam class (label 1)
precision = precision_score(test_labels, y_pred, pos_label=1)
recall = recall_score(test_labels, y_pred, pos_label=1)
latency = time.time() - start_time

# Print precision, recall and latency
print(f"SVM Classifier: Precision = {precision}, Recall = {recall}, Latency = {latency} seconds")
print(f"Best parameters: {best_params}")


SVM Classifier: Precision = 0.9861111111111112, Recall = 0.48299319727891155, Latency = 220.54181170463562 seconds
Best parameters: {'C': 10, 'gamma': 1}


Methodology:

To begin, text data undergoes a transformation into TF-IDF feature vectors, assessing the significance of words within the documents.

Following this, a grid search with 5-fold cross-validation is employed to tune the SVM classifier's hyperparameters, selecting the setting that maximizes classification accuracy. The SVM classifier employs a linear kernel, so the hyperparameter that matters is the 'C' regularization parameter; the 'gamma' kernel parameter applies only to non-linear kernels (such as RBF) and has no effect on a linear-kernel SVM.

Once the hyperparameter tuning process is complete, the best hyperparameters are determined, along with the corresponding SVM estimator. Subsequently, the trained SVM model is utilized to make predictions for the test dataset, and precision and recall scores are computed to assess its performance.

Ultimately, the results, encompassing precision, recall, and the total elapsed time (grid search, prediction on the test set, and scoring), are printed alongside the best hyperparameters.
