In [8]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Download Penn Treebank dataset
import urllib.request
import os

# Create data directory if it doesn't exist
os.makedirs('data/ptb', exist_ok=True)

# Penn Treebank URLs
ptb_urls = {
    'train': 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
    'valid': 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt',
    'test': 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'
}

# Download files
for split, url in ptb_urls.items():
    filepath = f'data/ptb/ptb.{split}.txt'
    if not os.path.exists(filepath):
        print(f'Downloading {split} data...')
        urllib.request.urlretrieve(url, filepath)
        print(f'Saved to {filepath}')
    else:
        print(f'{split} data already exists at {filepath}')

Downloading train data...
Saved to data/ptb/ptb.train.txt
Downloading valid data...
Saved to data/ptb/ptb.valid.txt
Downloading test data...
Saved to data/ptb/ptb.test.txt
Saved to data/ptb/ptb.valid.txt
Downloading test data...
Saved to data/ptb/ptb.test.txt


In [1]:
# Load and tokenize the data
def load_ptb_data(filepath):
    """Load PTB data and return list of words"""
    with open(filepath, 'r') as f:
        # PTB has one sentence per line, words separated by spaces
        # <unk> is used for unknown words, <eos> marks end of sentence
        text = f.read()
        words = text.split()
    return words

# Load all splits
train_words = load_ptb_data('data/ptb/ptb.train.txt')
valid_words = load_ptb_data('data/ptb/ptb.valid.txt')
test_words = load_ptb_data('data/ptb/ptb.test.txt')

print(f'Train: {len(train_words)} words')
print(f'Valid: {len(valid_words)} words')
print(f'Test: {len(test_words)} words')
print(f'\nFirst 50 words: {train_words[:50]}')

Train: 887521 words
Valid: 70390 words
Test: 78669 words

First 50 words: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', 'pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'N', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group']


In [18]:
def build_vocabulary(words):
    words_set = set(words)
    word_to_idx = {word: i for i, word in enumerate(words_set)}
    idx_to_word = {i: word for (word, i) in word_to_idx.items()}

    return word_to_idx, idx_to_word

In [19]:
word_to_idx, idx_to_word = build_vocabulary(train_words+valid_words+test_words)
Vsize = len(word_to_idx)

In [26]:
Vsize

9999

In [20]:
def build_dataset(words):
    X, Y = [], []
    for i in range(len(words)-4):
        x1, x2, x3, y = words[i+1],words[i+2], words[i+3], words[i+4] 
        X.append((word_to_idx[x1],word_to_idx[x2],word_to_idx[x3]))
        Y.append(word_to_idx[y])
    
    return torch.tensor(X),torch.tensor(Y)


In [21]:
trainX, trainY = build_dataset(train_words)
devX, devY = build_dataset(valid_words)
testX, testY = build_dataset(test_words)

In [23]:
trainX.shape

torch.Size([887517, 3])

In [None]:
import torch.nn.functional as F

batch_size = (32,)
base_lr = 0.5
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((Vsize, 10), generator=g) * 0.01
W1 = torch.randn((30, 100), generator=g) * 0.01
b1 = torch.zeros((100,))
W2 = torch.randn((100, Vsize), generator=g) * 0.01
b2 = torch.zeros((Vsize,)) 
parameters = [C, W1, b1, W2, b2] 
for p in parameters:
    p.requires_grad = True

for i in range(200_000):
    #Forward pass
    ix = torch.randint(trainX.shape[0], batch_size, generator=g)
    emb = C[trainX[ix]] # (887517, 3, 10)
    emb = emb.view(emb.shape[0], -1) # (887517, 30)
    h = torch.tanh(emb @ W1 + b1) # (887517, 100)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, trainY[ix]) # Softmax & cross entropy loss compute

    if not i % 10_000:
        print(f"step: {i:7d}, loss = {loss.item():.4f}")
    
    for p in parameters:
        p.grad = None
    
    # Compute gradients
    loss.backward()
    
    # Update parameters
    lr = base_lr if i < 150_000 else base_lr * 0.1
    for p in parameters:
        p.data -= lr * p.grad

print("Final train loss: ", loss)

# Compute dev loss with the params optimized.
with torch.no_grad():
    emb = C[devX] # (887517, 3, 10)
    emb = emb.view(emb.shape[0], -1) # (887517, 30)
    h = torch.tanh(emb @ W1 + b1) # (887517, 100)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, devY) # Softmax & cross entropy loss 
    print("Final dev loss: ", loss)

    
    

step:       0, loss = 9.2103
step:   10000, loss = 5.9252
