This notebook is used to learn the embeddings of words in a dataset using the word2vec model.

In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import nltk
from nltk.corpus import stopwords
from datasets import load_from_disk
import numpy as np
import tqdm
import pandas as pd
from datasets import Dataset
import collections

In [None]:
# Seed the random number generators used in this notebook so that reruns are repeatable.
seed = 257

np.random.seed(seed)  # NumPy's global RNG
torch.manual_seed(seed)  # PyTorch's RNG
torch.cuda.manual_seed(seed)  # RNG of the current CUDA device
torch.backends.cudnn.deterministic = True  # ask cuDNN to pick deterministic algorithms

## Prepare the data

We begin by tokenizing and cleaning the data. This process consists of removing punctuation, numbers, and stop words.

In [None]:
# load the dataset
# A list is passed as `split`, so one dataset per requested split comes back;
# they are unpacked into the train and test data here.
train_data, test_data = load_dataset("yelp_polarity", split=["train", "test"])

In [None]:
# tokenize the dataset
# torchtext's "basic_english" tokenizer; the functions below call it with a
# string and iterate over / slice the tokens it returns.
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")


def tokenize(obs, tokenizer, max_length=512):
    """
    Split the text of one observation into tokens.

    obs: mapping holding the raw text under the "text" key
    tokenizer: callable that turns a string into a sequence of tokens
    max_length: keep at most this many leading tokens
    Returns a dict with the truncated tokens under "tokens".
    """
    all_tokens = tokenizer(obs["text"])
    truncated = all_tokens[:max_length]
    return {"tokens": truncated}

In [None]:
# remove stopwords and punctuation
# Kept as a set: the cleaning functions test `word not in stop_words` once per
# token, which is a constant-time hash lookup on a set but a full scan on a list.
stop_words = set(stopwords.words("english"))


def remove_stopwords(obs):
    """
    Drop every token of obs["tokens"] that appears in stop_words.

    The observation is updated in place and also returned.
    """
    kept = []
    for token in obs["tokens"]:
        if token in stop_words:
            continue
        kept.append(token)
    obs["tokens"] = kept
    return obs


def remove_punctuation(obs):
    """
    Keep only the purely alphabetic tokens of obs["tokens"].

    Tokens such as punctuation marks or numbers fail str.isalpha() and are
    dropped. The observation is updated in place and also returned.
    """
    alphabetic = []
    for token in obs["tokens"]:
        if token.isalpha():
            alphabetic.append(token)
    obs["tokens"] = alphabetic
    return obs


def tokenize_and_clean(obs):
    """
    Turn the raw text of one observation into a list of cleaned tokens.

    Only the first 512 characters (not tokens) of obs["text"] are tokenized.
    Stop words and tokens that are not purely alphabetic (punctuation,
    numbers) are then discarded.
    Returns a dict with the cleaned tokens under "tokens".
    """
    snippet = obs["text"][:512]
    cleaned = [
        token
        for token in tokenizer(snippet)
        if token not in stop_words and token.isalpha()
    ]
    return {"tokens": cleaned}


# train_data = train_data.map(remove_stopwords)

In [None]:
# tokenizer(train_data[0]["text"][:512])
# Run tokenize_and_clean on every example of both splits; the dict it returns
# provides the "tokens" entry used by the later cells.
train_data = train_data.map(tokenize_and_clean)
test_data = test_data.map(tokenize_and_clean)

In [None]:
# train_data.save_to_disk("/datasets/yelp_polarity_train")
# train_data = load_from_disk("/datasets/yelp_polarity_train/")

Now that our data has been tokenized and cleaned, we can create a validation set.

In [None]:
# validation data
# Hold out 25% of the training examples for validation. The notebook's seed is
# passed explicitly so the split is reproducible across runs and does not
# depend on implicit random state.
train_valid_data = train_data.train_test_split(test_size=0.25, seed=seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

From the training data, we now proceed to create a vocabulary comprised of the training data's unique words (if they appear at least 75 times).

In [None]:
# creating the vocabulary
# "<unk>" is the placeholder entry for tokens that are not in the vocabulary.
special_tokens = ["<unk>"]

# setting a minimum frequency for the tokens ... 75 times in 420,000 sentences is not a lot
# Only the training tokens are used, so validation/test words never leak into the vocabulary.
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"], specials=special_tokens, min_freq=75
)
# Out-of-vocabulary lookups resolve to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])
# Last expression of the cell: displays the vocabulary size in the notebook.
len(vocab)

Now that we have the vocabulary, we can numerically encode the words in the training data.

In [None]:
def numericalize_example(obs, vocab):
    """
    Map the tokens of one observation to their vocabulary indices.

    obs: mapping holding the tokens under "tokens"
    vocab: object exposing lookup_indices(tokens)
    Returns a dict with the indices under "ids".
    """
    tokens = obs["tokens"]
    return {"ids": vocab.lookup_indices(tokens)}


# Numericalize every split with the same vocabulary (the one built from the
# training tokens); each example gains its indices under "ids".
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

Now that we have numericalized the data, we can create word pairs for the skip-gram model. To finalize the numericalization process, we'll transform `x` and `y` into PyTorch tensors.

In [None]:
def get_word_pairs(sentence, window_size=3):
    """
    Yield (center, context) pairs for a skip-gram model.

    Every position of `sentence` acts as the center once. For each distance
    from 1 to `window_size`, the element that far to the right is paired
    first, then the element that far to the left, whenever it exists.
    """
    for position, center in enumerate(sentence):
        for offset in range(1, window_size + 1):
            right = position + offset
            left = position - offset
            if right < len(sentence):
                yield (center, sentence[right])
            if left >= 0:
                yield (center, sentence[left])


def extract_pairs(dataset):
    """
    Lazily yield the word pairs of every observation in `dataset`.

    Each observation must provide its token indices under "ids"; the pairs
    come from get_word_pairs with its default window size.
    """
    for example in dataset:
        for pair in get_word_pairs(example["ids"]):
            yield pair

In [None]:
def _pairs_to_torch_dataset(split):
    """
    Build a torch-formatted Dataset of word pairs from one data split.

    The pairs produced by extract_pairs are collected in a DataFrame with the
    columns "x" and "y", which is then converted to a Dataset whose "x" and
    "y" columns are returned as torch tensors.
    """
    pair_frame = pd.DataFrame(extract_pairs(split), columns=["x", "y"])
    pair_dataset = Dataset.from_pandas(pair_frame)
    return pair_dataset.with_format(type="torch", columns=["x", "y"])


# word-pair datasets for the training, validation and test data
new_train = _pairs_to_torch_dataset(train_data)
new_valid = _pairs_to_torch_dataset(valid_data)
new_test = _pairs_to_torch_dataset(test_data)

In [None]:
# new_train.save_to_disk("/datasets/yelp_polarity_train_torchpairs")
# new_train = load_from_disk("/datasets/yelp_polarity_train_torchpairs")

# new_valid.save_to_disk("/datasets/yelp_polarity_valid_torchpairs")
# new_valid = load_from_disk("/datasets/yelp_polarity_valid_torchpairs")

# new_test.save_to_disk("/datasets/yelp_polarity_test_torchpairs")
# new_test = load_from_disk("/datasets/yelp_polarity_test_torchpairs")

The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have similar impact on the sentiment of the review are mapped close together in this dense vector space. For more information about word embeddings, see the original word2vec paper (Mikolov et al., 2013, "Efficient Estimation of Word Representations in Vector Space").

The final step in preparing the data is to create a DataLoader and batch the data. For each batch, we'll create a single tensor for the input data and another for the output data. The input data will contain the indexes of the center words and the output data will contain the indexes of the context words.

In [None]:
# collate function
def get_collate_fn():
    """
    Build the collate function that is handed to the DataLoader.
    """

    def collate_fn(batch):
        """
        Regroup a list of observations into one dict of two lists.

        The "x" values of all observations end up under "x" and the "y"
        values under "y", both in batch order.
        """
        return {
            "x": [obs["x"] for obs in batch],
            "y": [obs["y"] for obs in batch],
        }

    return collate_fn

Now we shall define a function which returns our actual data loader. 

In [None]:
def get_data_loader(dataset, batch_size=64, shuffle=False):
    """
    Wrap `dataset` in a torch DataLoader that batches with our collate function.

    dataset: the dataset to iterate over
    batch_size: number of observations per batch
    shuffle: whether the DataLoader shuffles the observations
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "collate_fn": get_collate_fn(),
    }
    return torch.utils.data.DataLoader(dataset, **loader_options)

Now we get the data loaders for each of our train, validation and test sets. After doing so, we can continue onto building our model.

In [None]:
# number of word pairs per batch, shared by all three loaders
batch_size = 64

# Only the training pairs are shuffled; validation and test keep their order.
train_loader = get_data_loader(new_train, batch_size=batch_size, shuffle=True)
valid_loader = get_data_loader(new_valid, batch_size=batch_size, shuffle=False)
test_loader = get_data_loader(new_test, batch_size=batch_size, shuffle=False)

## Building the Model

Our model will consist of an embedding layer that will represent words in a lower-dimensional space seeking to capture semantic relations between words. We can understand these as the "sentiment analysis features" we seek to extract. The embedding is followed by a small fully connected hidden layer with a ReLU activation. Finally, the output layer will be a log-softmax output layer of size `vocab_size`. The output of the model will eventually be discarded, as we are only interested in the embedding layer weights.

In [None]:
class Word2Vec(nn.Module):
    """
    Network that predicts a context word from a center word.

    Layers: embedding -> linear + ReLU -> linear -> log-softmax over the
    vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        """
        vocab_size: number of rows of the embedding table and size of the output
        embedding_dim: length of each embedding vector
        hidden_size: width of the hidden linear layer
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """
        Return log-probabilities over the vocabulary for each index in `x`.

        x: integer tensor of word indices; the output has the shape of `x`
        plus a trailing dimension of size vocab_size.
        """
        embeds = self.embeddings(x)
        out = F.relu(self.fc1(embeds))
        out = self.fc2(out)
        # Normalize over the last dimension, which always holds the vocabulary
        # scores. For a 1-D batch of indices this is the same as dim=1, and it
        # also works for a single (0-d) index or extra leading dimensions.
        log_probs = F.log_softmax(out, dim=-1)
        return log_probs

In [None]:
vocab_size = len(vocab)
embedding_dim = 10
hidden_size = 10

# The second argument is the embedding width. Passing vocab_size there would
# create a vocab_size x vocab_size embedding table and leave embedding_dim unused.
model = Word2Vec(vocab_size, embedding_dim, hidden_size)

Now we have to define the loss function and the optimizer. Because we have used a log softmax output, we will use the negative log likelihood loss. We will use the Adam optimizer.

In [None]:
# loss function and optimizer
# NLLLoss takes log-probabilities as input, which is what the model's
# log-softmax output provides.
criterion = nn.NLLLoss()
# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

Now we have to define the training loop.

In [None]:
# training function
def train(data_loader, model, criterion, optimizer, device):
    """
    Run one training pass over `data_loader` and update the model.

    Each batch provides lists of tensors under "x" (inputs) and "y"
    (targets); they are stacked and moved to `device` before the forward pass.
    Returns the mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    progress = tqdm.tqdm(data_loader, desc="Training...")
    for batch in progress:
        inputs = torch.stack(batch["x"]).to(device)
        targets = torch.stack(batch["y"]).to(device)
        loss = criterion(model(inputs), targets)
        # clear old gradients, backpropagate, then apply the update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return np.mean(batch_losses)

Now we can define a validation loop which we'll use to measure validation performance.

In [None]:
# validation function
def evaluate(data_loader, model, criterion, device):
    """
    Measure the loss of the model on `data_loader` without updating it.

    The model is put in eval mode and gradients are disabled for the pass.
    Returns the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    progress = tqdm.tqdm(data_loader, desc="Evaluating...")
    with torch.no_grad():
        for batch in progress:
            inputs = torch.stack(batch["x"]).to(device)
            targets = torch.stack(batch["y"]).to(device)
            loss = criterion(model(inputs), targets)
            batch_losses.append(loss.item())
    return np.mean(batch_losses)

Our model is ready to be trained. We can now train the model and measure its performance on the validation set.

In [None]:
num_epochs = 10
best_valid_loss = float("inf")  # starts above any finite loss

# per-epoch history of the mean training and validation losses
metrics = collections.defaultdict(list)

for epoch in range(num_epochs):
    # one pass over the training pairs, then one over the validation pairs
    train_loss = train(train_loader, model, criterion, optimizer, device)
    valid_loss = evaluate(valid_loader, model, criterion, device)
    metrics["train_loss"].append(train_loss)
    metrics["valid_loss"].append(valid_loss)

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "word2vec-model.pt")

    print(
        f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}"
    )

In [None]:
# Export the learned embedding table (one row per vocabulary entry) as a NumPy array.
# NOTE(review): these are the weights of the model as it is after the last
# epoch, which is not necessarily the checkpoint with the best validation loss.
# detach() replaces .data: it yields the same gradient-free tensor and is the
# form recommended by PyTorch.
embeddings = model.embeddings.weight.detach().cpu().numpy()
np.save("word2vec-embeddings.npy", embeddings)
