In [1]:
#%pip install torch
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

### Read in the file of only HAM emails (previously obtained from the CEAS_08_cleaned.csv dataset.

In [2]:
df_ham = pd.read_csv("ham.csv")
df_ham

Unnamed: 0,sender,subject,body,label,urls
0,Michael Parker <ivqrnai@pobox.com>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
1,qydlqcws-iacfym@issues.apache.org,[Bug 5780] URI processing turns uuencoded stri...,http://issues.apache.org/SpamAssassin/show_bug...,0,1
2,Racing <uqyrmo@sailing.ie>,RE: Trial IRC Certificate Application,"\nPlelim,\n\nJust to remind you that if a cert...",0,1
3,Aaron Kulkis <cmiqlkx91@hotpop.com>,"Re: [opensuse] Why can't I use ""shutdown now"" ...",Carlos E. R. wrote: > -----BEGIN PGP SIGNED ME...,0,1
4,Aaron Kulkis <cmiqlkx91@hotpop.com>,Re: Fwd: [opensuse] Re: openSUSE Boxed Editions,Steve Jacobs wrote: > ---------- Forwarded mes...,0,1
...,...,...,...,...,...
17307,robert healy <vrcjauctt@gmail.com>,I want to cancel my account,How do I cancel my account. I want to erase i...,0,0
17308,Nick Zeljkovic <kppyozizjt@site5.com>,RE: [opensuse] Apache and SSL,\nI don't use virtual hosts. Here is the entry...,0,1
17309,Abhijit Vyas <xpojhbz@gmail.com>,Slideshow viewer,Hello there ! \nGreat work on the slide show v...,0,0
17310,Joseph Brennan <vupzesm@columbia.edu>,Note on 2-digit years,"\nMail from sender , coming from intuit.com\ns...",0,0


### Read in the file of only SPAM emails.

In [3]:
df_spam = pd.read_csv("spam.csv")
df_spam

Unnamed: 0,sender,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Gretchen Suggs <externalsep1@loanofficertool.com>,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1
4,Caroline Aragon <dwthaidomainnamesm@thaidomain...,From Caroline Aragon,\n\n\n\n\nYo wu urS mo ou go rc ebo eForM rgi ...,1,0
...,...,...,...,...,...
21837,Amandalee <tamekut_1991@smarttech.com>,Be larger than ever after 2 months,The cure for dysfunction and impotency availab...,1,1
21838,CNN Alerts <tidhamsu_2002@tyler.sprnet.org>,CNN Alerts: My Custom Alert,\n\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\...,1,0
21839,Amee Zeisler <dlintentions@ctk.cz>,Patients can access Our online health shop is ...,\n\nLab-tested female sensual leverage!\n\nPat...,1,0
21840,CNN Alerts <charlene-detecton@btcmarketing.com>,CNN Alerts: My Custom Alert,\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\n ...,1,0


### Remove the \n characters in HAM. Replace them with spaces, since they don't add anything to the analysis.

In [4]:
df_ham['body'] = df_ham['body'].str.replace(r'\n', ' ', regex=True)
df_ham

Unnamed: 0,sender,subject,body,label,urls
0,Michael Parker <ivqrnai@pobox.com>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
1,qydlqcws-iacfym@issues.apache.org,[Bug 5780] URI processing turns uuencoded stri...,http://issues.apache.org/SpamAssassin/show_bug...,0,1
2,Racing <uqyrmo@sailing.ie>,RE: Trial IRC Certificate Application,"Plelim, Just to remind you that if a certifi...",0,1
3,Aaron Kulkis <cmiqlkx91@hotpop.com>,"Re: [opensuse] Why can't I use ""shutdown now"" ...",Carlos E. R. wrote: > -----BEGIN PGP SIGNED ME...,0,1
4,Aaron Kulkis <cmiqlkx91@hotpop.com>,Re: Fwd: [opensuse] Re: openSUSE Boxed Editions,Steve Jacobs wrote: > ---------- Forwarded mes...,0,1
...,...,...,...,...,...
17307,robert healy <vrcjauctt@gmail.com>,I want to cancel my account,How do I cancel my account. I want to erase i...,0,0
17308,Nick Zeljkovic <kppyozizjt@site5.com>,RE: [opensuse] Apache and SSL,I don't use virtual hosts. Here is the entry ...,0,1
17309,Abhijit Vyas <xpojhbz@gmail.com>,Slideshow viewer,Hello there ! Great work on the slide show vi...,0,0
17310,Joseph Brennan <vupzesm@columbia.edu>,Note on 2-digit years,"Mail from sender , coming from intuit.com ser...",0,0


### Similarly remove the \n characters in SPAM and replace them with spaces.

In [5]:
df_spam['body'] = df_spam['body'].str.replace(r'\n', ' ', regex=True)
df_spam

Unnamed: 0,sender,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,Befriend Jenna Jameson,Upgrade your sex and pleasures with these tec...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Gretchen Suggs <externalsep1@loanofficertool.com>,SpecialPricesPharmMoreinfo,WelcomeFastShippingCustomerSupport http://7iw...,1,1
4,Caroline Aragon <dwthaidomainnamesm@thaidomain...,From Caroline Aragon,Yo wu urS mo ou go rc ebo eForM rgi oreWo...,1,0
...,...,...,...,...,...
21837,Amandalee <tamekut_1991@smarttech.com>,Be larger than ever after 2 months,The cure for dysfunction and impotency availab...,1,1
21838,CNN Alerts <tidhamsu_2002@tyler.sprnet.org>,CNN Alerts: My Custom Alert,CNN Alerts: My Custom Alert Aler...,1,0
21839,Amee Zeisler <dlintentions@ctk.cz>,Patients can access Our online health shop is ...,Lab-tested female sensual leverage! Patient...,1,0
21840,CNN Alerts <charlene-detecton@btcmarketing.com>,CNN Alerts: My Custom Alert,CNN Alerts: My Custom Alert Alert...,1,0


### There were some null elements in the subject and body, and these must be removed before TfIdfVectorization.
### I also concatenated the Subject and Body together, and placed the combined text in a new column called "text".

In [6]:
# Replace null values with empty strings
df_ham['subject'] = df_ham['subject'].fillna('')
df_ham['body'] = df_ham['body'].fillna('')
df_spam['subject'] = df_spam['subject'].fillna('')
df_spam['body'] = df_spam['body'].fillna('')

# Concatenate the 'subject' and 'body' columns
df_ham['text'] = df_ham['subject'] + ' ' + df_ham['body']
df_spam['text'] = df_spam['subject'] + ' ' + df_spam['body']

# Confirm the absence of null values
print(df_ham['text'].isnull().sum())
print(df_spam['text'].isnull().sum())

0
0


### For the test set, I needed a combination of HAM and SPAM emails, so I concatenated the SPAM and HAM dataframes, mixed them up, and re-indexed the result.

In [7]:
df_combined = pd.concat([df_ham, df_spam], ignore_index=True)    # Combine the HAM and SPAM dataframes.
df_combined = df_combined.sample(frac=1).reset_index(drop=True)  # Randomize the rows, then reset the index.
df_combined

Unnamed: 0,sender,subject,body,label,urls,text
0,Alessandro Antonucci <dtqekbsagi@idsia.ch>,[UAI] SIPTA Newsletter Announcement - New issue,We would like to briefly indicate that the lat...,0,1,[UAI] SIPTA Newsletter Announcement - New issu...
1,Luis Villa <cvov@tieguy.org>,Re: [FoRK] It saddens and disturbs me.,"(applause) On 11/2/07, Damien Morton wrote: ...",0,1,Re: [FoRK] It saddens and disturbs me. (applau...
2,Daily Top 10 <moranj-etnemlit@rsac.com>,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...
3,Daily Top 10 <tenshuts@manning.com>,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...
4,Daily Top 10 <imentijd_1955@mejiaslaw.com>,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...
...,...,...,...,...,...,...
39149,Charlesetta Morisky <jqpresumption@discountdom...,You do not want to buy unknown them at strange...,Man's improving formula effective for for 90%...,1,0,You do not want to buy unknown them at strange...
39150,Jörgen Sköld <mxatwq.uhlgy@teknikkompaniet.se>,SV: Triptracker Slideshow - translation,Hi! Thanx for the answers - I've tried and ...,0,1,SV: Triptracker Slideshow - translation Hi! ...
39151,Google-AdWords <reactivation@google.com>,Your AdWords Google Account is stoped.,------------------------------- --------------...,1,0,Your AdWords Google Account is stoped. -------...
39152,Daily Top 10 <Arturo-mulpapen@beliefnet.com>,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...


### The autoencoder will be trained on the HAM dataset, but first it needs to be vectorized.  The test data will be a combination of HAM and SPAM.

In [8]:
# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Data preprocessing
train_data = vectorizer.fit_transform(df_ham["text"])
test_data = vectorizer.transform(df_combined["text"])

### Define the autoencoder model. I used a single ReLU encoder and a single ReLU decoder. 

In [9]:
# Autoencoder architecture
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, encoding_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim)
        )
        
    def forward(self, x):
        encoding = self.encoder(x)
        decoded = self.decoder(encoding)
        return decoded

### Define the hyperparameters. I selected 10 epochs to star. Encoding dimension was set at 32 (arbitrary). Adam optimzer. MSE was used to assess reconstruction error. 

In [10]:
# Initialize the autoencoder
input_dim = train_data.get_shape()[1]
encoding_dim = test_data.get_shape()[1]
criterion = nn.MSELoss()
num_epochs = 10
encoding_dim = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder = Autoencoder(input_dim, encoding_dim).to(device)
optimizer = torch.optim.Adam(autoencoder.parameters())

# Training loop
for epoch in range(num_epochs):
    inputs = train_data  # Use the entire sparse matrix as input

    # Convert sparse matrix to PyTorch tensor with the same data type as the model
    inputs = torch.from_numpy(inputs.toarray()).to(device=device, dtype=torch.get_default_dtype())

    outputs = autoencoder(inputs)
    loss = criterion(outputs, inputs)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the reconstruction error for this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Reconstruction Error: {loss.item():.4f}')

Epoch [1/10], Reconstruction Error: 0.0024
Epoch [2/10], Reconstruction Error: 0.0011
Epoch [3/10], Reconstruction Error: 0.0005
Epoch [4/10], Reconstruction Error: 0.0003
Epoch [5/10], Reconstruction Error: 0.0002
Epoch [6/10], Reconstruction Error: 0.0002
Epoch [7/10], Reconstruction Error: 0.0002
Epoch [8/10], Reconstruction Error: 0.0003
Epoch [9/10], Reconstruction Error: 0.0002
Epoch [10/10], Reconstruction Error: 0.0002


### Make predictions on the test set. Create an array of reconstruction errors, and save them with the ground_truth 'label' column values, so we can later compare them to the defined threshold.

In [11]:
# Compute reconstruction errors for test set
test_errors = []
test_labels = []  # Initialize a list to store true labels
for idx, email in enumerate(test_data):
    # Convert sparse matrix to dense numpy array
    dense_email = email.toarray()
    
    # Create a PyTorch tensor from the dense numpy array
    input_tensor = torch.Tensor(dense_email).to(device)
    
    output_tensor = autoencoder(input_tensor)
    reconstruction_error = criterion(output_tensor, input_tensor)
    test_errors.append(reconstruction_error.item())
    
    # Append the true label from df_combined
    test_labels.append(df_combined.iloc[idx]['label'])

print ("Computations complete.")

Computations complete.


### Set the threshold of SPAM determination at 85% percentile of the HAM-trained autoregression reconstruction error distribution. 

In [12]:
# Assuming non-spam emails are the majority class
non_spam_errors = [err for err, label in zip(test_errors, test_labels) if label == 0]
anomaly_threshold = np.percentile(non_spam_errors, 85)

### We'll call it SPAM if the reconstruction error is > threshold value of the distribution of HAM reconstruction errors. 

In [13]:
# Predict labels based on the anomaly threshold
y_pred = [1 if err > anomaly_threshold else 0 for err in test_errors]
y_true = test_labels

### Calculate model performance stats

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.7368
Precision: 0.8448
Recall: 0.6470
F1-score: 0.7328
