In [3]:
# E01: train a trigram language model, i.e. take two characters as input to predict the third one.
# Feel free to use either counting or a neural net. Evaluate the loss: did it improve over a bigram model?

In [11]:
import torch
import torch.nn.functional as F

In [12]:
# Read the dataset, one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [13]:
# Character vocabulary: every distinct character in the dataset, sorted and
# numbered from 1; index 0 is reserved for the '.' boundary marker.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Reverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [14]:
# Build the trigram training set: each x encodes a two-character context and
# each y is the index of the character that follows it.
#
# Every word is padded with two leading '.' and one trailing '.', so 'emma'
# produces the examples:
#   .. -> e
#   .e -> m
#   em -> m
#   mm -> a
#   ma -> .
xs, ys = [], []
for word in words:
    padded = '..' + word + '.'
    for pos in range(len(padded) - 2):
        first, second, target = padded[pos:pos + 3]
        # Pack the context pair into a single base-27 index:
        # (0, 0) -> 0, (0, 1) -> 1, ..., (26, 26) -> 728
        xs.append(stoi[first] * 27 + stoi[second])
        ys.append(stoi[target])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# Initialize the 'network': a single weight matrix with one row per
# two-character context (27*27 rows) and one column per next character (27).
# The generator is seeded so the initial weights are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27 * 27, 27, generator=g, requires_grad=True)

number of examples:  228146


In [15]:
# gradient descent
# The one-hot encoding of the inputs does not depend on W, so it is built once
# here instead of being re-created on every iteration.
xenc = F.one_hot(xs, num_classes=27*27).float() # input to the network: one-hot encoding

for k in range(100):
    # forward pass
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # Loss = mean negative log-likelihood of the targets + L2 regularization.
    # The 0.01 * (W**2).mean() term pulls W towards zero; the closer W is to
    # zero, the smoother (more uniform) the predicted distributions become.
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * (W**2).mean()
    print(f'loss: {loss.item()}')

    # backward pass
    W.grad = None # reset the gradient before accumulating a new one
    loss.backward()

    # update: plain gradient step with learning rate 50, performed outside of
    # autograd tracking
    with torch.no_grad():
        W -= 50 * W.grad

loss: 3.8028225898742676
loss: 3.64870285987854
loss: 3.556884765625
loss: 3.4891085624694824
loss: 3.4331703186035156
loss: 3.384026527404785
loss: 3.3398447036743164
loss: 3.299639940261841
loss: 3.262803554534912
loss: 3.2289042472839355
loss: 3.1975934505462646
loss: 3.1685667037963867
loss: 3.1415536403656006
loss: 3.1163175106048584
loss: 3.0926547050476074
loss: 3.0703928470611572
loss: 3.0493879318237305
loss: 3.0295186042785645
loss: 3.01068377494812
loss: 2.9927961826324463
loss: 2.975781202316284
loss: 2.9595749378204346
loss: 2.944119453430176
loss: 2.9293642044067383
loss: 2.915264368057251
loss: 2.901778221130371
loss: 2.8888683319091797
loss: 2.8765010833740234
loss: 2.8646445274353027
loss: 2.853269577026367
loss: 2.842349052429199
loss: 2.831857204437256
loss: 2.8217711448669434
loss: 2.812068223953247
loss: 2.8027267456054688
loss: 2.7937276363372803
loss: 2.7850513458251953
loss: 2.7766804695129395
loss: 2.7685980796813965
loss: 2.7607884407043457
loss: 2.75323677062

In [24]:
# Sample 20 names from the trigram network using the explicit one-hot forward
# pass. The generator is re-seeded so the samples are reproducible.
g = torch.Generator().manual_seed(2147483647)

for _ in range(20):
    letters = []
    context = (0, 0)  # indices of the two previous characters; starts as '..'
    while True:
        # One-hot encode the base-27 context index as a 1 x (27*27) float row
        # (F.one_hot returns integers while W is float, hence the cast).
        row = torch.tensor([context[0] * 27 + context[1]])
        xenc = F.one_hot(row, num_classes=27*27).float()

        # Forward pass followed by a softmax written out by hand.
        counts = (xenc @ W).exp()
        p = counts / counts.sum(1, keepdims=True)

        # Draw the next character; index 0 ('.') terminates the name.
        nxt = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        if nxt == 0:
            break
        letters.append(itos[nxt])
        context = (context[1], nxt)

    print(''.join(letters))

cexzdfzjglkuriana
kayhhmvlzimjtna
nalkfdkzka
da
samiyaubjtbhrigotwxezgzjeqkgxojkwptedo
kaley
masidey
nkgvnrnfrftbspmhwcjdenvtahlvsuznsdrxdlngil
pynw
istnj
ra
danne
zktsder
jair
t
gbckajbhzabsvoth
khysxqevecmpjxhcayhrieen
xmvpfoqzmtrfvjbsdblmysox
laptjapxzqbgpqlhariyannk
ille


In [23]:
# Sample 20 names again, this time without building a one-hot vector:
# multiplying a one-hot row by W is the same as selecting a single row of W,
# so the logits are read directly by index.
g = torch.Generator().manual_seed(2147483647)

for _ in range(20):
    sampled = []
    prev2, prev1 = 0, 0  # both context slots start as '.'
    while True:
        # Logits for this context: one row of W, addressed in base 27.
        logits = W[prev2 * 27 + prev1]

        # Softmax over the 27 possible next characters.
        counts = logits.exp()
        p = counts / counts.sum()

        # Draw the next character; index 0 ('.') terminates the name.
        nxt = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        if nxt == 0:
            break
        sampled.append(itos[nxt])
        prev2, prev1 = prev1, nxt

    print(''.join(sampled))

cexzdfzjglkuriana
kayhhmvlzimjtna
nalkfdkzka
da
samiyaubjtbhrigotwxezgzjeqkgxojkwptedo
kaley
masidey
nkgvnrnfrftbspmhwcjdenvtahlvsuznsdrxdlngil
pynw
istnj
ra
danne
zktsder
jair
t
gbckajbhzabsvoth
khysxqevecmpjxhcayhrieen
xmvpfoqzmtrfvjbsdblmysox
laptjapxzqbgpqlhariyannk
ille


In [25]:
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
# Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [27]:
import torch
import random

# 1. Load (the context manager closes the file once it has been read)
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# 2. Shuffle (Crucial! Otherwise you might train on A->S and test on T-Z)
# The fixed seed makes the split reproducible.
random.seed(42)
random.shuffle(words)

# 3. Calculate split points
n1 = int(0.8 * len(words)) # 80% mark
n2 = int(0.9 * len(words)) # 90% mark

# 4. Slice
words_train = words[:n1]  # 0%-80%
words_dev = words[n1:n2] # 80%-90%
words_test = words[n2:]  # 90%-100%

print(len(words_train), len(words_dev), len(words_test))


25626 3203 3204


In [32]:
# Mappings: sorted character vocabulary -> integer ids starting at 1,
# with id 0 reserved for the '.' boundary marker.
chars = sorted(set(''.join(words)))
stoi = {ch: pos for pos, ch in enumerate(chars, start=1)}
stoi['.'] = 0

def build_dataset(words, char_to_ix=None):
    """Build trigram training examples from a list of words.

    Each word is padded with two leading '.' and one trailing '.'; every
    window of three consecutive characters gives one example whose input is
    the first two characters and whose target is the third.

    Args:
        words: iterable of strings.
        char_to_ix: optional character -> index mapping. When omitted, the
            notebook-level ``stoi`` mapping is used.

    Returns:
        (xs, ys): two 1-D int64 tensors of equal length. ``xs[i]`` is the
        context encoded as ``ix1 * 27 + ix2`` and ``ys[i]`` is the index of
        the target character.
    """
    lookup = stoi if char_to_ix is None else char_to_ix

    xs, ys = [], []
    for w in words:
        # Trigram context: start with two dots
        chs = ['.', '.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            # Trigram logic: (27 * ch1) + ch2
            xs.append(lookup[ch1] * 27 + lookup[ch2])
            ys.append(lookup[ch3])

    # Explicit dtype: torch.tensor([]) would otherwise default to float32,
    # which cannot be used for one-hot encoding or indexing.
    xs = torch.tensor(xs, dtype=torch.long)
    ys = torch.tensor(ys, dtype=torch.long)
    print(f'Dataset has {xs.nelement()} examples')
    return xs, ys

# Build the three sets, announcing each split before build_dataset reports
# its size.
tri_built = []
for banner, split_words in (
    ("Building Train...", words_train),
    ("Building Dev...", words_dev),
    ("Building Test...", words_test),
):
    print(banner)
    tri_built.append(build_dataset(split_words))

(Tri_Xtr, Tri_Ytr), (Tri_Xdev, Tri_Ydev), (Tri_Xte, Tri_Yte) = tri_built


Building Train...
Dataset has 182625 examples
Building Dev...
Dataset has 22655 examples
Building Test...
Dataset has 22866 examples


In [None]:
import torch.nn.functional as F
import torch.nn.functional as F
import numpy as np

# Sweep the regularization strength alpha for the trigram model: train on the
# training split only, then report the loss on the dev split.
#
# The one-hot encodings depend only on the data (not on W or alpha), so they
# are built once here instead of inside every training iteration.
xenc = F.one_hot(Tri_Xtr, num_classes=27*27).float()
xenc_dev = F.one_hot(Tri_Xdev, num_classes=27*27).float()

for alpha in np.arange(0.1, 0.5, 0.1):

    # Initialize network (Trigram: 729 inputs -> 27 outputs)
    # Re-seeding gives every alpha the same initial weights.
    g = torch.Generator().manual_seed(2147483647)
    W = torch.randn((27*27, 27), generator=g, requires_grad=True)

    # --- TRAINING (only on Xtr, Ytr) ---
    for k in range(100): # Increase this to ~1000 or more for good results
        # Forward pass
        logits = xenc @ W

        # Softmax
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)

        # Loss: mean negative log-likelihood + L2 regularization
        loss = -probs[torch.arange(len(Tri_Ytr)), Tri_Ytr].log().mean() + alpha * (W**2).mean()

        # Backward
        W.grad = None
        loss.backward()

        # Gradient step (learning rate 50), outside of autograd tracking
        with torch.no_grad():
            W -= 50 * W.grad

    # This is the loss computed in the last iteration, i.e. before the final
    # weight update.
    print(f"Training Loss: {loss.item()}")
    # --- EVALUATION (on Xdev, Ydev) ---
    # We do NOT optimize W here, we just calculate loss
    # NOTE(review): the reported dev loss includes the alpha-weighted
    # regularization term, so it is not a pure negative log-likelihood.
    with torch.no_grad(): # Tells torch we don't need gradients here
        logits_dev = xenc_dev @ W
        counts_dev = logits_dev.exp()
        probs_dev = counts_dev / counts_dev.sum(1, keepdims=True)
        loss_dev = -probs_dev[torch.arange(len(Tri_Ydev)), Tri_Ydev].log().mean() + alpha * (W**2).mean()

    print(f"Dev Loss: {loss_dev.item()}")

In [34]:
# --- EVALUATION (on Xdev, Ydev) ---
# Forward pass only: W is read, never updated.
# NOTE(review): this uses whatever W the kernel currently holds and a
# hard-coded regularization weight of 0.01 — confirm both match the training
# run being evaluated.
with torch.no_grad(): # no gradient bookkeeping needed for evaluation
    logits_dev = F.one_hot(Tri_Xdev, num_classes=27*27).float() @ W
    counts_dev = logits_dev.exp()
    row_totals = counts_dev.sum(1, keepdims=True)
    probs_dev = counts_dev / row_totals
    # Probability the model assigns to each true next character.
    target_probs = probs_dev[torch.arange(len(Tri_Ydev)), Tri_Ydev]
    loss_dev = -target_probs.log().mean() + 0.01 * (W**2).mean()

print(f"Dev Loss: {loss_dev.item()}")

Dev Loss: 2.5264275074005127


In [41]:
# Bigram 

In [35]:
# Mappings: each distinct character (sorted) gets an id from 1 upwards;
# id 0 is the '.' start/end marker.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

def build_dataset(words, char_to_ix=None):
    """Build bigram training examples from a list of words.

    Each word is padded with one leading and one trailing '.'; every pair of
    consecutive characters gives one example whose input is the first
    character and whose target is the second.

    Args:
        words: iterable of strings.
        char_to_ix: optional character -> index mapping. When omitted, the
            notebook-level ``stoi`` mapping is used.

    Returns:
        (xs, ys): two 1-D int64 tensors of equal length holding the input and
        target character indices.
    """
    lookup = stoi if char_to_ix is None else char_to_ix

    xs, ys = [], []
    for w in words:
        # Bigram context: start with a single dot
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2 in zip(chs, chs[1:]):
            xs.append(lookup[ch1])
            ys.append(lookup[ch2])

    # Explicit dtype: torch.tensor([]) would otherwise default to float32,
    # which cannot be used for one-hot encoding or indexing.
    xs = torch.tensor(xs, dtype=torch.long)
    ys = torch.tensor(ys, dtype=torch.long)

    print(f'Dataset has {xs.nelement()} examples')
    return xs, ys

# Build the three sets, announcing each split before build_dataset reports
# its size.
bi_built = []
for banner, split_words in (
    ("Building Train...", words_train),
    ("Building Dev...", words_dev),
    ("Building Test...", words_test),
):
    print(banner)
    bi_built.append(build_dataset(split_words))

(Bi_Xtr, Bi_Ytr), (Bi_Xdev, Bi_Ydev), (Bi_Xte, Bi_Yte) = bi_built

Building Train...
Dataset has 182625 examples
Building Dev...
Dataset has 22655 examples
Building Test...
Dataset has 22866 examples


In [43]:
import torch.nn.functional as F
import numpy as np

# Sweep the regularization strength alpha for the bigram model: train on the
# training split only, then report the loss on the dev split.
#
# The one-hot encodings depend only on the data (not on W or alpha), so they
# are built once here instead of inside every training iteration.
xenc = F.one_hot(Bi_Xtr, num_classes=27).float()
xenc_dev = F.one_hot(Bi_Xdev, num_classes=27).float()

for alpha in np.arange(0.1, 0.5, 0.1):
    # Initialize network (Bigram: 27 inputs -> 27 outputs)
    # Re-seeding gives every alpha the same initial weights.
    g = torch.Generator().manual_seed(2147483647)
    W = torch.randn((27, 27), generator=g, requires_grad=True)

    # --- TRAINING (only on Xtr, Ytr) ---
    for k in range(100): # Increase this to ~1000 or more for good results
        # Forward pass
        logits = xenc @ W

        # Softmax
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)

        # Loss: mean negative log-likelihood + L2 regularization
        loss = -probs[torch.arange(len(Bi_Ytr)), Bi_Ytr].log().mean() + alpha * (W**2).mean()

        # Backward
        W.grad = None
        loss.backward()

        # Gradient step (learning rate 50), outside of autograd tracking
        with torch.no_grad():
            W -= 50 * W.grad

    # This is the loss computed in the last iteration, i.e. before the final
    # weight update.
    print(f"Training Loss: {loss.item()}")
    # --- EVALUATION (on Xdev, Ydev) ---
    # We do NOT optimize W here, we just calculate loss
    # NOTE(review): the reported dev loss includes the alpha-weighted
    # regularization term, so it is not a pure negative log-likelihood.
    with torch.no_grad(): # Tells torch we don't need gradients here
        logits_dev = xenc_dev @ W
        counts_dev = logits_dev.exp()
        probs_dev = counts_dev / counts_dev.sum(1, keepdims=True)
        loss_dev = -probs_dev[torch.arange(len(Bi_Ydev)), Bi_Ydev].log().mean() + alpha * (W**2).mean()

    print(f"Dev Loss: {loss_dev.item()}")

Training Loss: 2.587466239929199
Dev Loss: 2.5862185955047607
Training Loss: 2.654571771621704
Dev Loss: 2.653724431991577
Training Loss: 2.7046115398406982
Dev Loss: 2.704026460647583
Training Loss: 2.744701862335205
Dev Loss: 2.7443368434906006
