# Embeddings
We saw in our previous notebook that the one-hot vectors and the bag-of-words vectors were very sparse (i.e. they had a lot of zeros), and thus didn't contain much information. In particular, every word has the same distance to all other words in one-hot encodings. We would gain a lot if we were able to encode our words such that similar words are close together.

## Prepare Data

We start by downloading the data and preparing it for our purposes.

In [None]:
url = "https://raw.githubusercontent.com/mattminder/nlp_intro/refs/heads/main/data/sms_spam_collection/SMSSpamCollection"

import pandas as pd
import urllib.request

# Load the file directly from GitHub for compatibility with Colab.
# The context manager closes the HTTP response once all lines are read.
with urllib.request.urlopen(url) as data:
    # every line is split at tab characters into its fields
    lines_split = [
        line.decode().strip().split("\t")
        for line in data
    ]
df = pd.DataFrame(lines_split, columns=["label", "text"])

We do tokenization and stemming in the same way as before:

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every punctuation character replaced by a space."""
    punctuation = '".,;:!?()_*'
    # translate all punctuation characters to a space in a single pass
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    return text.translate(to_space)

def rudimentary_stemming(word_list):
    """Strip a few common English suffixes from every word.

    The suffixes "s", "ing" and "ed" are tried in this order. Each one is
    removed at most once, and only if the (possibly already shortened) word
    ends with it. A new list is returned; ``word_list`` is left untouched.
    """
    suffixes = ("s", "ing", "ed")  # plural suffix first, then verb endings

    stemmed = []
    for word in word_list:
        for suffix in suffixes:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
        stemmed.append(word)
    return stemmed

def preprocessing(text):
    """Turn a raw message into a list of lower-cased, stemmed tokens."""
    # lower-case, replace punctuation by spaces, then split at any whitespace
    tokens = remove_punctuation(text.lower()).split()
    return rudimentary_stemming(tokens)

# tokenize and stem every message
df["word_list"] = df["text"].map(preprocessing)


Again, we create a dictionary that maps frequent words to a number. 

In [None]:
def get_frequent_word_dictionary(word_list, minimum_count=10):
    """Create a mapping from frequent words to an integer.

    Args:
        word_list: Flat list of words, one entry per occurrence.
        minimum_count: A word is kept if it occurs at least this often.

    Returns:
        A dictionary that maps every kept word to a unique integer in
        ``range(number_of_kept_words)``. The numbers follow the sorted order
        of the words, so the mapping is identical on every run.
    """
    # create a dictionary with the number of occurrences of every word
    word_count = pd.Series(word_list).value_counts().to_dict()

    # Identify the words that are frequent enough. They are sorted because
    # numbering them in set-iteration order would depend on string hashing,
    # which is randomized between interpreter runs.
    relevant_words = sorted(
        word for word, count in word_count.items() if count >= minimum_count
    )

    # number the words consecutively
    return {word: i for i, word in enumerate(relevant_words)}


# Build the dictionary from the words of all messages taken together
frequent_word_dictionary = get_frequent_word_dictionary(
    [word for words in df["word_list"] for word in words]
)

vocabulary_size = 1 + len(frequent_word_dictionary)

# Translate every message into a list of word numbers.
# NOTE: words missing from the dictionary get the number `vocabulary_size`,
# i.e. len(frequent_word_dictionary) + 1, so the number
# len(frequent_word_dictionary) itself is never assigned.
df["word_number_list"] = df["word_list"].map(
    lambda words: [
        frequent_word_dictionary.get(word, vocabulary_size)
        for word in words
    ]
)

In [None]:
# Drop very short messages: only texts with more than three words are kept
keep = df["word_number_list"].map(len) > 3
df = df.loc[keep]

## Skip-Gram Model
One way to calculate such an encoding is to use a skip-gram model. It takes a word and wants to predict all words surrounding the input word in a given sentence.

For example in the sentence:
"I go out to *take* *the* **dog** *for* *a* walk".

If we provide the word **dog** as an input, we would want to correctly predict all the words marked in italics.

In [None]:
import torch

class SkipGram(torch.nn.Module):
    """Skip-gram model consisting of two learnable weight matrices.

    The embedding matrix has shape (vocabulary_size, embedding_dim) and the
    context matrix has shape (embedding_dim, vocabulary_size).
    """

    def __init__(self, vocabulary_size, embedding_dim):
        super().__init__()

        # Both matrices start from standard-normal noise
        embedding_init = torch.randn(vocabulary_size, embedding_dim)
        context_init = torch.randn(embedding_dim, vocabulary_size)

        self.embedding_matrix = torch.nn.Parameter(embedding_init)
        self.context_matrix = torch.nn.Parameter(context_init)

    def forward(self, x):
        """Multiply ``x`` by the embedding matrix, then by the context matrix.

        No softmax is applied here; the raw scores are returned.
        """
        embedded = torch.matmul(x, self.embedding_matrix)
        return torch.matmul(embedded, self.context_matrix)


Next, we have to define how we load data:

In [None]:
import random
import functools

class SkipGramData(torch.utils.data.Dataset):
    """Dataset that yields random (query word, context words) pairs.

    Every item is drawn from one message: a random word of the message is
    the query, and the words at most two positions away form the target.

    Args:
        df: DataFrame with a "word_number_list" column that holds one list
            of word numbers per message.
        vocabulary_size: Length of the one-hot vectors. Every word number
            has to be smaller than this value.
    """

    # offsets of the context words relative to the query word
    _CONTEXT_OFFSETS = (-2, -1, 1, 2)

    def __init__(self, df, vocabulary_size):
        self.df = df
        self.vocabulary_size = vocabulary_size

    def _number_to_one_hot(self, number):
        """Return ``number`` as float one-hot tensor of shape (1, vocabulary_size)."""
        # Build the index directly as an integer tensor: going through the
        # float32 `torch.Tensor` constructor would round very large numbers.
        index = torch.tensor([number], dtype=torch.long)
        return torch.nn.functional.one_hot(index, self.vocabulary_size).float()

    def __getitem__(self, idx):
        """Return a ``(query, target)`` pair for the message at ``idx``.

        ``query`` is the one-hot vector of a randomly chosen word. ``target``
        has a one for every word that occurs at most two positions away from
        the query and zeros elsewhere. Both have shape (1, vocabulary_size).
        For a message with a single word, the target is all zeros.

        Raises:
            ValueError: If the selected message contains no words.
        """
        row_ix = idx % len(self.df)
        word_number_list = self.df["word_number_list"].iloc[row_ix]
        if len(word_number_list) == 0:
            raise ValueError(f"message in row {row_ix} contains no words")

        # we randomly choose the position of the input word
        word_number = random.randrange(len(word_number_list))
        query = word_number_list[word_number]

        # Start from zeros, so that a message without any context word
        # (a single-word message) still gets a well-defined target.
        target = torch.zeros(1, self.vocabulary_size)
        for delta in self._CONTEXT_OFFSETS:
            position = word_number + delta
            if 0 <= position < len(word_number_list):
                target = target + self._number_to_one_hot(word_number_list[position])

        # a word that appears several times in the context still counts once
        target = torch.clip(target, 0, 1)

        return (
            self._number_to_one_hot(query),
            target,
        )

    def __len__(self):
        """Return the number of messages."""
        return len(self.df)


Split into training and validation data:

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the messages; the fixed seed keeps the split reproducible
train, test = train_test_split(df, test_size=0.2, random_state=123)

Defining the model and the data loaders:

In [None]:
# one dataset per split, both with the same one-hot length
skip_gram_train_data, skip_gram_test_data = (
    SkipGramData(split, vocabulary_size + 1)
    for split in (train, test)
)

def my_collate(batch):
    """Merge a list of (query, target) pairs into two batch tensors.

    All queries are concatenated along the first axis, and so are all targets.
    """
    queries, targets = zip(*batch)
    return torch.cat(queries, dim=0), torch.cat(targets, dim=0)

# Both splits are served in shuffled batches of 32 items
train_data_loader, test_data_loader = (
    torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=my_collate,
    )
    for dataset in (skip_gram_train_data, skip_gram_test_data)
)

Training the model:

In [None]:
# we embed into two dimensions only
skip_gram_model = SkipGram(vocabulary_size + 1, 2)
optimizer = torch.optim.AdamW(skip_gram_model.parameters(), lr=.001)

n_epochs = 200
train_losses = []  # mean training loss of every epoch
val_losses = []  # mean validation loss of every epoch

for i in range(n_epochs):
    # train during a single epoch
    train_loss_epoch = []
    for queries, targets in train_data_loader:
        optimizer.zero_grad()

        outputs = skip_gram_model(queries)

        loss = torch.nn.functional.cross_entropy(outputs, targets)
        loss.backward()
        train_loss_epoch.append(loss.item())

        optimizer.step()

    # Validation run on the held-out data. No gradients are needed here.
    val_loss_epoch = []
    with torch.no_grad():
        for queries, targets in test_data_loader:
            outputs = skip_gram_model(queries)
            val_loss = torch.nn.functional.cross_entropy(outputs, targets)
            # store a plain float, as is done for the training loss
            val_loss_epoch.append(val_loss.item())

    train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
    val_losses.append(sum(val_loss_epoch) / len(val_loss_epoch))

Looking at the loss:

In [None]:
from matplotlib import pyplot as plt
# one curve per loss, with one point per epoch
plt.plot(train_losses, label="train loss")
plt.plot(val_losses, label="test loss")

plt.legend()


## Visualizing Embeddings
Now, we can have a look at the embeddings that the model has learned.

In [None]:
# Wrap the learned embedding matrix in a DataFrame for inspection
# (presumably one row per word number and one column per embedding
# dimension, following the matrix shape set up in SkipGram)
embeddings = pd.DataFrame(skip_gram_model.embedding_matrix.data)
embeddings.head()

Let's construct the inverse dictionary, to know what the entries correspond to:

In [None]:
# number -> word, the reverse direction of frequent_word_dictionary
inverse_dict = dict(
    zip(frequent_word_dictionary.values(), frequent_word_dictionary.keys())
)
# label every row with its word; numbers without a word become "__unknown"
embeddings.index = [
    inverse_dict.get(number, "__unknown")
    for number in range(len(embeddings))
]
embeddings.head()

In [None]:
plt.figure(figsize=(12, 12))
plt.scatter(embeddings[0], embeddings[1])
# write every word next to its point
for word, x, y in zip(embeddings.index, embeddings[0], embeddings[1]):
    plt.text(x, y, word)

We see that the embeddings did not learn a lot of useful signal. The only words that are consistently close to each other are "sorry", "i'll", "call", and "later".

This was somewhat to be expected:
- With 5'600 messages, our corpus is very small for natural language processing.
- Messages are very short, so we only have few words per document.
- Text quality is poor, since there are many typos, slang and abbreviations.

All of this means that we don't have many example usages for all of the words in our corpus. We therefore don't see them in enough situations to properly learn what context they are used in. The only exception is the combination sorry, call and later which is sufficiently abundant for our model to learn that these words are often used together.

## Bigger Embeddings
Let's see if using a bigger embedding size can improve our model.

In [None]:
embedding_size = 20

skip_gram_model = SkipGram(vocabulary_size + 1, embedding_size)
optimizer = torch.optim.AdamW(skip_gram_model.parameters())

n_epochs = 200
train_losses = []  # mean training loss of every epoch
val_losses = []  # mean validation loss of every epoch

for i in range(n_epochs):
    # train during a single epoch
    train_loss_epoch = []
    for queries, targets in train_data_loader:
        optimizer.zero_grad()

        outputs = skip_gram_model(queries)

        loss = torch.nn.functional.cross_entropy(outputs, targets)
        loss.backward()
        train_loss_epoch.append(loss.item())

        optimizer.step()

    # Validation run on the held-out data. No gradients are needed here.
    val_loss_epoch = []
    with torch.no_grad():
        for queries, targets in test_data_loader:
            outputs = skip_gram_model(queries)
            val_loss = torch.nn.functional.cross_entropy(outputs, targets)
            # store a plain float, as is done for the training loss
            val_loss_epoch.append(val_loss.item())

    train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
    val_losses.append(sum(val_loss_epoch) / len(val_loss_epoch))

Looking at the loss:

In [None]:
from matplotlib import pyplot as plt
# one curve per loss, with one point per epoch
plt.plot(train_losses, label="train loss")
plt.plot(val_losses, label="test loss")

plt.legend()


In [None]:
# validation loss of the last epoch
val_losses[-1]

Using the larger model slightly improved the loss. However, it is not until we use the embedding on a downstream task that we see if our embedding is any good.

## Using these embeddings to detect spam

To see whether our embedding is useful, we use it as an input to our spam classification. We will use a small neural network for the classifier, because we do not expect that we can separate the messages linearly.

As was the case with bag of words, we again face the challenge of somehow having to aggregate the embeddings of sentences with different lengths to always create a vector of the same size. We do this by calculating for every sentence the average value of the embedding. This is done in the code below (note that this isn't the most efficient way to do this, but hopefully the clearest to read).

In [None]:
# Wrap the embedding matrix of the most recently trained model in a
# DataFrame and show its dimensions
embeddings = pd.DataFrame(skip_gram_model.embedding_matrix.data)
embeddings.shape

In [None]:
import numpy as np
mean_embedding = []

# Go through the messages one by one
for word_number_list in df["word_number_list"]:
    # only word numbers that have a row in the embedding table are used
    known_numbers = [
        number for number in word_number_list if number in embeddings.index
    ]

    # add up the embedding vectors of these words
    summed = np.zeros((embeddings.shape[1]), dtype=float)
    for number in known_numbers:
        summed = summed + embeddings.loc[number]

    # Dividing by the number of words gives the mean embedding of the
    # message. A message without any usable word keeps the all-zero vector.
    if known_numbers:
        mean_embedding.append(summed / len(known_numbers))
    else:
        mean_embedding.append(summed)

# Collect everything in a DataFrame with one row per message
mean_embedding_df = pd.DataFrame(mean_embedding)


We prepare the data for training, by using the same train-test-split as before:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

# binary target: True for spam messages, False for all others
target = df["label"].eq("spam")

train_x, test_x, train_y, test_y = train_test_split(
    mean_embedding_df, target, test_size=0.2, random_state=123
)

# Convert the features to float tensors
train_x_tensor, test_x_tensor = (
    torch.tensor(part.to_numpy()).float()
    for part in (train_x, test_x)
)
# The labels additionally get a trailing axis of size one
train_y_tensor, test_y_tensor = (
    torch.tensor(part.to_numpy()).float().unsqueeze(-1)
    for part in (train_y, test_y)
)

Defining the model and the optimizers:

In [None]:
def create_model(input_dimensions, hidden_dimensions, number_hidden):
    """Build a feed-forward neural network with ReLU activations.

    The network consists of an input layer that maps ``input_dimensions`` to
    ``hidden_dimensions``, followed by ``number_hidden`` further layers of
    width ``hidden_dimensions``, each with a ReLU after it. A final linear
    layer produces a single output value.
    """
    layers = [
        torch.nn.Linear(input_dimensions, hidden_dimensions),
        torch.nn.ReLU(),
    ]
    for _ in range(number_hidden):
        layers.append(torch.nn.Linear(hidden_dimensions, hidden_dimensions))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Linear(hidden_dimensions, 1))

    return torch.nn.Sequential(*layers)

# the classifier input is the mean embedding of a message
model = create_model(
    embedding_size,
    hidden_dimensions=20,
    number_hidden=5,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

Training the model:

In [None]:
# binary cross-entropy that is applied directly to the raw model outputs
ce_loss = torch.nn.BCEWithLogitsLoss()

learning_curve = []  # training loss of every step

# Every step uses the complete training set at once
for epoch in range(10000):
    optimizer.zero_grad()

    output = model(train_x_tensor)
    loss = ce_loss(output, train_y_tensor)
    loss.backward()

    optimizer.step()
    learning_curve.append(loss.item())


Let's look at the learning curve:

In [None]:
# training loss per optimization step
plt.plot(learning_curve);

Let's calculate the performance:

In [None]:
# Calculate the predictions on the test set. The model returns raw scores,
# and a score above zero is counted as a spam prediction. Gradients are not
# needed for the evaluation, so they are switched off.
with torch.no_grad():
    predictions = (model(test_x_tensor) > 0).numpy()

print("Precision:", precision_score(test_y, predictions))
print("Recall:", recall_score(test_y, predictions))

We see that the performance is quite a bit lower than what we obtain from the bag-of-words-model. However, note that we achieved this performance by reducing the 1'000 dimensions of bag-of-words to just 20. By using a larger embedding size, we would expect our model to perform a bit better.

Moreover, as we have seen, the small amount of text that we have is not sufficient to learn a rich embedding from. We would expect a higher performance if we used embeddings trained on more text.