In [7]:
from collections import Counter
import re

In [4]:
# Read the entire corpus file as UTF-8 text and lower-case it in one step.
with open("/content/sample_data/shakespeare.txt", encoding="utf8") as f:
    text = f.read().lower()

In [14]:
# Tokenise the corpus: every run of word characters becomes one token, and
# every other non-whitespace character (e.g. punctuation) is a token by itself.
tokens = re.compile(r"\w+|[^\w\s]").findall(text)

In [20]:
# Vocabulary: the distinct tokens, sorted so that the ordering (and therefore
# every token id derived from it) does not depend on set iteration order.
vocab = list(set(tokens))
vocab.sort()

In [22]:
# Lookup tables in both directions between a token and its position in `vocab`.
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)

In [27]:
# Encode the token stream as vocabulary ids (raises KeyError for any token
# that is missing from `word2idx`).
indices = list(map(word2idx.__getitem__, tokens))

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

In [30]:
class NNLM_Dataset(Dataset):
    """Sliding-window dataset for next-token prediction.

    Item ``i`` pairs the ``context_size`` consecutive entries of ``data``
    starting at position ``i`` with the entry that immediately follows them.

    Args:
        data: Sequence of token ids (e.g. a list of ints or a 1-D tensor).
            It is converted to a tensor once at construction time, so later
            changes to a list passed here are not reflected in the samples.
        context_size: Number of tokens in each context window; must not be
            negative.

    Raises:
        ValueError: If ``context_size`` is negative.
    """

    def __init__(self, data, context_size):
        if context_size < 0:
            raise ValueError(f"context_size must be non-negative, got {context_size}")
        self.data = data
        self.context_size = context_size
        # Convert once up front so __getitem__ only slices an existing tensor
        # instead of building a new one from a Python list for every sample.
        self._ids = torch.as_tensor(data)

    def __len__(self):
        """Return the number of (context, target) pairs available."""
        # Clamp at zero: len() raises ValueError on a negative result, which
        # is what happened whenever the data was shorter than the context.
        return max(0, len(self.data) - self.context_size)

    def __getitem__(self, idx):
        """Return ``(context, target)`` tensors for window ``idx``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of length {size}")
        stop = idx + self.context_size
        # clone() hands back independent tensors, so in-place edits by a
        # caller cannot corrupt the shared token buffer.
        return self._ids[idx:stop].clone(), self._ids[stop].clone()

In [32]:
# Each sample is a window of 5 context tokens plus the token that follows it;
# samples are served in shuffled mini-batches of 64.
context_size = 5
dataset = NNLM_Dataset(data=indices, context_size=context_size)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [33]:
import torch.nn as nn

In [48]:
import torch
import torch.nn.functional as F
import torch.nn as nn

class NNLM(nn.Module):
    """Feed-forward neural language model.

    The embeddings of the context tokens are concatenated, passed through a
    tanh hidden layer, and mapped to one logit per vocabulary entry.

    Args:
        vocab_size: Number of rows of the projection matrix and number of
            output logits.
        context_size: Number of context tokens per example.
        embedding_dim: Width of each token's embedding vector.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, vocab_size, context_size, embedding_dim, hidden_dim):
        super().__init__()
        # projection matrix instead of Embedding layer
        self.W_proj = nn.Parameter(torch.randn(vocab_size, embedding_dim))
        self.fc1    = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.act    = nn.Tanh()
        self.fc2    = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits for a context of token ids.

        Args:
            x: Integer tensor of token ids, either ``(batch, context_size)``
                or a single unbatched ``(context_size,)`` context.

        Returns:
            Logits of shape ``(batch, vocab_size)``, or ``(vocab_size,)`` for
            an unbatched input.
        """
        # Looking up rows of W_proj yields the same values as
        # one_hot(x) @ W_proj, but without materialising a one-hot tensor of
        # width vocab_size for every token in the batch.
        emb = F.embedding(x, self.W_proj)
        # Concatenate the per-token embeddings; flattening only the last two
        # dims keeps any leading batch dim and also handles unbatched input.
        emb = emb.flatten(start_dim=-2)
        h   = self.act(self.fc1(emb))
        out = self.fc2(h)
        return out


In [49]:
# Model plus training tools: cross-entropy over the vocabulary logits,
# optimised with Adam.
model = NNLM(vocab_size=vocab_size, context_size=context_size, embedding_dim=100, hidden_dim=128)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    # Collect each batch's loss; the figure reported per epoch is their sum
    # over all batches (not an average).
    batch_losses = []
    for context, target in dataloader:
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 23158.1381
Epoch 2, Loss: 20473.6165
Epoch 3, Loss: 19123.2606
Epoch 4, Loss: 18010.4937
Epoch 5, Loss: 17013.4194


In [50]:
# Ask the model for the most likely word after a five-word context.
context_words = ["to", "be", "or", "not", "to"]
# Wrap the id list in an outer list so the tensor has shape (1, 5): a batch
# holding this single context.
context_ids = torch.tensor([list(map(word2idx.__getitem__, context_words))])
with torch.no_grad():  # inference only, so gradient tracking is skipped
    output = model(context_ids)
    predicted = output.argmax(dim=1)
    print("Next word prediction:", idx2word[predicted.item()])

Next word prediction: be


In [54]:
# Second query: same procedure as before with a different five-word context.
context_words = ["are", "all", "resolved", "rather", "to"]
# Map each word to its vocabulary id, then add a leading batch dimension.
word_ids = [word2idx[word] for word in context_words]
context_ids = torch.tensor([word_ids])
with torch.no_grad():  # no gradients are needed for prediction
    output = model(context_ids)
    # Index of the highest logit along the vocabulary dimension.
    predicted = output.argmax(dim=1)
    print("Next word prediction:", idx2word[predicted.item()])

Next word prediction: see
