<a href="https://colab.research.google.com/github/mervegb/deep-learning/blob/main/spam_or_ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Steps
- Preprocessing => tokenize the text and prepare the data for training
- Word Embeddings => use pre-trained word embeddings
- Model => create a simple model with a linear layer
- Training => train the model
- Evaluation => evaluate the model performance

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim

# 0 is ham, 1 is spam
data = [("Buy two get one free", 1),
        ("Call you later", 0),
        ("Congratulations, you won", 1),
        ("Meeting at 3 PM", 0),
        ("Click this link", 1),
        ("How are you?", 0),
        ("You have won a prize", 1),
        ("Dinner at 7?", 0),
        ("URGENT! Your account is compromised", 1),
        ("Looking forward to seeing you", 0),
        ("Earn money fast", 1),
        ("Can we reschedule?", 0)]

In [68]:
# Create a vocabulary
vocab = set(word for text, _ in data for word in text.split())
word_to_index = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)


In [69]:
# Convert text to numbers (Tokenization)
X = [[word_to_index[word] for word in text.split()] for text, _ in data]
y = [label for _, label in data]

In [70]:
# Model
class SpamHamClassifier(nn.Module):
  def __init__(self, vocab_size, embed_size):
    super(SpamHamClassifier, self).__init__()
    self.embedding = nn.Embedding(vocab_size,embed_size)
    self.linear = nn.Linear(embed_size,2)

  def forward(self,x):
    out = self.embedding(x).mean(dim=1)
    out = self.linear(out)
    return out

In [71]:
# Initialize model, loss, and optimizer
model = SpamHamClassifier(vocab_size,10)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [72]:
# Find the maximum sentence length
max_len = max(len(x) for x in X)

# Pad the shorter sentences with zeros
X_padded = [x + [0]*(max_len - len(x)) for x in X]

# Convert padded data to PyTorch tensors
X_tensor = torch.tensor(X_padded, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

# Padding
Padding is a technique used in data preprocessing to make sequences have the same length. This is particulary important when you're feeding the data into models like neural networks that require fixed-size input.

Neural networks often expect input of a constant size. In this scenario, you'd pad the shorter sentences with extra 'empty' values (often zeros) to match the longest one:

"I love AI." -> "I love AI. PAD PAD PAD PAD PAD"
"I'm a junior to mid software developer." -> "I'm a junior to mid software developer."
"I want to switch careers." -> "I want to switch careers. PAD PAD PAD"

In [86]:
for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_tensor)

    loss = loss_fn(output, y_tensor)
    loss.backward()

    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.0006097842706367373
Epoch 2, Loss: 0.000609248352702707
Epoch 3, Loss: 0.0006087521323934197
Epoch 4, Loss: 0.0006082063191570342
Epoch 5, Loss: 0.0006076901918277144
Epoch 6, Loss: 0.0006071741809137166
Epoch 7, Loss: 0.0006066581117920578
Epoch 8, Loss: 0.0006061222520656884
Epoch 9, Loss: 0.0006056160200387239
Epoch 10, Loss: 0.0006050899974070489


In [94]:
with torch.no_grad():
    test_text = "You won a gift from us!"

    # Tokenize and pad the test sentence
    test_token = [word_to_index.get(word, 0) for word in test_text.split()]
    test_token_padded = test_token + [0] * (max_len - len(test_token))

    # Convert to a tensor
    test_tensor = torch.tensor([test_token_padded], dtype=torch.long)

    # Run the model
    output = model(test_tensor)

    # Get the prediction
    _, predicted = torch.max(output, 1)
    print("Spam" if predicted.item() == 1 else "Ham")


Spam
