# Exercises

## Trigram Language Model

1. With Counting
2. With Neural Net

### Trigram With Counting

In [753]:
words = open('../names.txt', 'r').read().splitlines()

In [754]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [755]:
import torch
N = torch.zeros((27, 27 ,27), dtype=torch.int32)

In [756]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [757]:
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1, ix2, ix3] += 1

In [758]:
P = (N+1).float()
P /= P.sum(2, keepdims=True)

In [760]:
def count_loss(input):
    log_likelihood = 0.0
    n = 0

    for w in input:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
            #print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

    print(f'{log_likelihood=}')
    nll = -log_likelihood
    print(f'{nll=}')
    print(f"{nll/n=}")

In [761]:
g = torch.Generator().manual_seed(42)

names = []
for i in range(5):
    out = []
    ix1 = 0  # Start with the token '.'
    
    # First character after the start token
    p = P[ix1, :].sum(0)  # Aggregate across all potential second characters
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])

    while True:
        p = P[ix1, ix2]  # Get the probability distribution for the next character
        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])
        if ix3 == 0:  # End token
            break
        ix1, ix2 = ix2, ix3  # Move to the next character in the trigram
    names.append(''.join(out))
    print(''.join(out))

print("=========")
print("LOSS")
count_loss(names)


anuee.
ova.
amarbidushante.
un.
illayley.
LOSS
log_likelihood=tensor(-87.3249)
nll=tensor(87.3249)
nll/n=tensor(2.3601)


### Trigram with NN

In [762]:
# create the dataset
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(42)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

number of examples:  392226


In [763]:
import torch.nn.functional as F
# gradient descent
for k in range(100):
  
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc.view(-1, 27*2) @ W # predict log-counts and merge both of the one-hot encoded character inputs
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(len(ys)), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())
    
    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()
    
    # update
    W.data += -50 * W.grad

4.220766544342041
3.4673116207122803
3.110934019088745
2.9119250774383545
2.7892463207244873
2.703580856323242
2.6405839920043945
2.592454671859741
2.5544941425323486
2.523717164993286
2.498292922973633
2.4769906997680664
2.4589340686798096
2.443451166152954
2.430039405822754
2.4183120727539062
2.407977342605591
2.398803472518921
2.390608072280884
2.3832430839538574
2.3765876293182373
2.370542049407959
2.365025043487549
2.3599677085876465
2.355313301086426
2.3510138988494873
2.3470299243927
2.343327045440674
2.3398759365081787
2.3366518020629883
2.3336329460144043
2.3308002948760986
2.3281373977661133
2.325629711151123
2.3232643604278564
2.3210294246673584
2.3189151287078857
2.3169119358062744
2.3150112628936768
2.3132054805755615
2.311487913131714
2.3098528385162354
2.308293581008911
2.3068056106567383
2.305384397506714
2.3040246963500977
2.3027234077453613
2.30147647857666
2.3002803325653076
2.2991323471069336
2.2980291843414307
2.296969175338745
2.2959482669830322
2.294965982437134


In [770]:
import torch
import torch.nn.functional as F

# Assuming W is initialized as a weight matrix with dimensions [54, 27]
g = torch.Generator().manual_seed(42)

for i in range(5):
    out = []
    ix1 = 0  # Start with the start token `.`
    
    # Find the second token based on the first token being the start token `.`
    p = (F.one_hot(torch.tensor([ix1]), num_classes=27).float() @ W[:27, :]).exp()
    p = p / p.sum(1, keepdim=True)
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])
    
    while True:
        # Create a one-hot encoded vector for the pair of previous characters
        xenc1 = F.one_hot(torch.tensor([ix1]), num_classes=27).float()
        xenc2 = F.one_hot(torch.tensor([ix2]), num_classes=27).float()
        
        # Concatenate the two one-hot encoded vectors
        xenc = torch.cat((xenc1, xenc2), dim=1)
        
        # Predict log-counts using the weight matrix
        logits = xenc @ W  # W should have dimensions [54, 27]
        counts = logits.exp()  # Convert log-counts to counts
        
        # Calculate probabilities
        p = counts / counts.sum(1, keepdims=True)
        
        # Sample the next character
        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])
        
        if ix3 == 0:  # End token
            break
        
        # Shift the indices for the next iteration
        ix1, ix2 = ix2, ix3
    
    print(''.join(out))


anugeenvi.
amarbian.
dan.
ubra.
silayley.
