In [1]:
import kagglehub
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


path = kagglehub.dataset_download("ozlerhakan/spam-or-not-spam-dataset")


df = pd.read_csv(f"{path}/spam_or_not_spam.csv")
print(df.head())

  from .autonotebook import tqdm as notebook_tqdm


                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [2]:
df.dropna(inplace=True)


label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])


vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['email']).toarray()
y = df['label'].values


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [3]:
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


train_dataset = SpamDataset(X_train, y_train)
test_dataset = SpamDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [4]:
class MLPModel(nn.Module):
    def __init__(self, input_size):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # First layer
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)  # Second layer
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 2)  # Output layer (2 classes)
        self.softmax = nn.LogSoftmax(dim=1)  # Log softmax for classification

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return self.softmax(x)


input_size = X.shape[1]  
model = MLPModel(input_size)


In [5]:
criterion = nn.NLLLoss()  
optimizer = optim.Adam(model.parameters(), lr=0.001) 

In [6]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Epoch 1/10, Loss: 30.8237
Epoch 2/10, Loss: 3.6467
Epoch 3/10, Loss: 0.5733
Epoch 4/10, Loss: 0.2395
Epoch 5/10, Loss: 0.1504
Epoch 6/10, Loss: 0.1260
Epoch 7/10, Loss: 0.1087
Epoch 8/10, Loss: 0.1032
Epoch 9/10, Loss: 0.0882
Epoch 10/10, Loss: 0.0912


In [7]:
model.eval()
correct, total = 0, 0

with torch.no_grad():  
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        predicted = torch.argmax(outputs, dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 99.00%


In [8]:
def predict_email(model, email_text, vectorizer):
    model.eval()  

    # Convert email text to numerical format using the same TF-IDF vectorizer
    email_vector = vectorizer.transform([email_text]).toarray()
    email_tensor = torch.tensor(email_vector, dtype=torch.float32)

    
    with torch.no_grad():
        output = model(email_tensor)
        predicted_label = torch.argmax(output, dim=1).item()

    # Convert 0 and 1 into "spam" or "not spam"
    label_mapping = {0: "not spam", 1: "spam"}
    prediction = label_mapping[predicted_label]

    return prediction


In [9]:
email1 = "Congratulations! You have won a $1000 Walmart gift card. Click here to claim your prize."
email2 = "Hey John, can we reschedule our meeting to next Monday?"
email3 = "Hurry up! Your Netflix account has been suspended due to suspicious activity. Click here to restore access."

print(f"Email: {email1}\nPrediction: {predict_email(model, email1, vectorizer)}\n")
print(f"Email: {email2}\nPrediction: {predict_email(model, email2, vectorizer)}\n")
print(f"Email: {email3}\nPrediction: {predict_email(model, email3, vectorizer)}\n")


Email: Congratulations! You have won a $1000 Walmart gift card. Click here to claim your prize.
Prediction: spam

Email: Hey John, can we reschedule our meeting to next Monday?
Prediction: not spam

Email: Hurry up! Your Netflix account has been suspended due to suspicious activity. Click here to restore access.
Prediction: spam

