# Exercises

## E01: Trigram Language Model

E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

1. With Counting
2. With Neural Net

### Trigram With Counting

In [197]:
words = open('../names.txt', 'r').read().splitlines()

In [198]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [199]:
import torch
N = torch.zeros((27, 27 ,27), dtype=torch.int32)

In [200]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [201]:
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1, ix2, ix3] += 1

In [202]:
P = (N+1).float()
P /= P.sum(2, keepdims=True)

In [203]:
def count_loss(input):
    log_likelihood = 0.0
    n = 0

    for w in input:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
            #print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

    print(f'{log_likelihood=}')
    nll = -log_likelihood
    print(f'{nll=}')
    print(f"{nll/n=}")

In [204]:
g = torch.Generator().manual_seed(2147483647)

names = []
for i in range(5):
    out = []
    ix1 = 0  # Start with the token '.'
    
    # First character after the start token
    p = P[ix1, :].sum(0)  # Aggregate across all potential second characters
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])

    while True:
        p = P[ix1, ix2]  # Get the probability distribution for the next character
        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])
        if ix3 == 0:  # End token
            break
        ix1, ix2 = ix2, ix3  # Move to the next character in the trigram
    names.append(''.join(out))
    print(''.join(out))

print("=========")
print("LOSS")
count_loss(names)


rexza.
ioulius.
ila.
ityharlonimittain.
luwak.
LOSS
log_likelihood=tensor(-105.8303)
nll=tensor(105.8303)
nll/n=tensor(2.5198)


### Trigram with NN

In [205]:
# create the dataset
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

number of examples:  392226


In [206]:
import torch.nn.functional as F
# gradient descent
for k in range(100):
  
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc.view(-1, 27*2) @ W # predict log-counts and merge both of the one-hot encoded character inputs
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(len(ys)), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())
    
    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()
    
    # update
    W.data += -50 * W.grad

4.195971488952637
3.3653788566589355
3.049534320831299
2.8784797191619873
2.7739577293395996
2.7012393474578857
2.6454999446868896
2.601283550262451
2.5652339458465576
2.535409688949585
2.510397434234619
2.4892261028289795
2.4711146354675293
2.455474853515625
2.4418272972106934
2.4298088550567627
2.4191300868988037
2.4095730781555176
2.400963306427002
2.3931643962860107
2.386066198348999
2.3795783519744873
2.373626470565796
2.368147611618042
2.3630881309509277
2.358403205871582
2.354053020477295
2.3500044345855713
2.3462281227111816
2.342698097229004
2.3393924236297607
2.3362908363342285
2.3333756923675537
2.330632209777832
2.3280460834503174
2.3256044387817383
2.3232970237731934
2.321112632751465
2.3190431594848633
2.317079782485962
2.3152151107788086
2.313441753387451
2.3117542266845703
2.3101463317871094
2.30861234664917
2.307147979736328
2.3057494163513184
2.3044114112854004
2.303130626678467
2.301903486251831
2.300727128982544
2.2995975017547607
2.298513412475586
2.297471523284912

In [207]:
import torch
import torch.nn.functional as F

# Assuming W is initialized as a weight matrix with dimensions [54, 27]
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    ix1 = 0  # Start with the start token `.`
    
    # Find the second token based on the first token being the start token `.`
    p = (F.one_hot(torch.tensor([ix1]), num_classes=27).float() @ W[:27, :]).exp()
    p = p / p.sum(1, keepdim=True)
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])
    
    while True:
        # Create a one-hot encoded vector for the pair of previous characters
        xenc1 = F.one_hot(torch.tensor([ix1]), num_classes=27).float()
        xenc2 = F.one_hot(torch.tensor([ix2]), num_classes=27).float()
        
        # Concatenate the two one-hot encoded vectors
        xenc = torch.cat((xenc1, xenc2), dim=1)
        
        # Predict log-counts using the weight matrix
        logits = xenc @ W  # W should have dimensions [54, 27]
        counts = logits.exp()  # Convert log-counts to counts
        
        # Calculate probabilities
        p = counts / counts.sum(1, keepdims=True)
        
        # Sample the next character
        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])
        
        if ix3 == 0:  # End token
            break
        
        # Shift the indices for the next iteration
        ix1, ix2 = ix2, ix3
    
    print(''.join(out))


dexze.
iogh.
urailaziayh.
elliimittain.
lusan.


## E02: Train-Dev-Test Set

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [208]:
# prepare the dataset
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=1234)
words_dev, words_test = train_test_split(words_test, test_size=0.5, random_state=1234)

### Training the Bigram Model

In [209]:
# create the dataset
x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2 in zip(chs, chs[1:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
        
            xs.append(ix1)
            ys.append(ix2)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [210]:
# gradient descent
for k in range(200):
  
  # forward pass
  xenc = F.one_hot(x_train, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(len(x_train)), y_train].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

3.767667293548584
3.3780815601348877
3.1604931354522705
3.026573657989502
2.9338600635528564
2.8665993213653564
2.8160202503204346
2.776515483856201
2.744630813598633
2.718219041824341
2.695909023284912
2.67679500579834
2.660250425338745
2.6458210945129395
2.6331593990325928
2.621990919113159
2.612090826034546
2.603271722793579
2.59537935256958
2.588282823562622
2.581873893737793
2.5760607719421387
2.570765733718872
2.5659241676330566
2.561481475830078
2.5573911666870117
2.5536131858825684
2.550114154815674
2.546865701675415
2.543842315673828
2.5410242080688477
2.538391351699829
2.535928249359131
2.533620834350586
2.5314555168151855
2.529421329498291
2.5275075435638428
2.525705337524414
2.524005889892578
2.522402048110962
2.5208864212036133
2.5194523334503174
2.518094301223755
2.5168075561523438
2.5155861377716064
2.5144262313842773
2.5133235454559326
2.5122740268707275
2.511274576187134
2.510321617126465
2.5094127655029297
2.508544921875
2.5077154636383057
2.5069220066070557
2.5061624

### Evaluating the Bigram Model

In [211]:
import numpy as np
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

def MLP_loss(x, y, W):
    losses = []
    for i in range(100):
        xenc = F.one_hot(x, num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
        loss = -probs[torch.arange(len(x)), y].log().mean() + 0.01*(W**2).mean()
        losses.append(loss)
    return sum(losses) / len(losses)
            

In [212]:
print(f"Train Loss: {MLP_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {MLP_loss(x_test, y_test, W):.4f}")

Train Loss: 2.4826
Dev Loss: 3.3618
Test Loss: 3.3559


### Training the Trigram Model

In [213]:
x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
        
            xs.append([ix1, ix2])
            ys.append(ix3)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys

In [214]:
# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [215]:
# gradient descent
for k in range(200):
  
  # forward pass
  xenc = F.one_hot(x_train, num_classes=27).float() # input to the network: one-hot encoding
  xenc = xenc.view(-1, 27*2)
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(len(x_train)), y_train].log().mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

4.186277866363525
3.3571949005126953
3.0424678325653076
2.8718788623809814
2.7675387859344482
2.694955587387085
2.6393167972564697
2.595170736312866
2.5591745376586914
2.5293869972229004
2.504401206970215
2.483247756958008
2.4651455879211426
2.4495060443878174
2.435850143432617
2.423814058303833
2.4131102561950684
2.4035205841064453
2.3948729038238525
2.3870320320129395
2.379887580871582
2.373351573944092
2.3673486709594727
2.3618173599243164
2.3567044734954834
2.3519647121429443
2.3475594520568848
2.3434550762176514
2.339622974395752
2.3360371589660645
2.3326756954193115
2.3295187950134277
2.3265492916107178
2.3237509727478027
2.321110963821411
2.3186166286468506
2.3162567615509033
2.314021348953247
2.311901330947876
2.309887647628784
2.3079748153686523
2.3061537742614746
2.3044190406799316
2.302765369415283
2.3011863231658936
2.299678325653076
2.2982358932495117
2.2968556880950928
2.2955329418182373
2.294265031814575
2.29304838180542
2.291879892349243
2.2907567024230957
2.28967666625

### Evaluating the Trigram Model

In [216]:
import numpy as np
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

def MLP_loss(x, y, W):
    losses = []
    for i in range(100):
        xenc = F.one_hot(x, num_classes=27).float()
        xenc = xenc.view(-1, 27*2)
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
        loss = -probs[torch.arange(len(x)), y].log().mean()
        losses.append(loss)
    return sum(losses) / len(losses)
            

In [217]:
print(f"Train Loss: {MLP_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {MLP_loss(x_test, y_test, W):.4f}")

Train Loss: 2.2483
Dev Loss: 3.3983
Test Loss: 3.4116


Both models overfit.

## E03: Smoothing/Regularization for Trigram Model on Dev Set
E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [218]:
smoothnesses = [0, 0.01, 0.02, 0.05, 0.1, 0.25, 0.5, 1.0]
epochs = 1000
for i, smoothness in enumerate(smoothnesses):
  W = torch.randn((27*2,27), requires_grad = True)
  for k in range(epochs):
      # forward pass
      xenc = F.one_hot(x_train, num_classes = 27).float()
      xenc = xenc.view(-1, 27*2)
      
      # probs is softmax
      logits = xenc @ W
      counts = torch.exp(logits)
      probs = counts / counts.sum(dim = 1, keepdim = True)
      
      # loss (normalized negative log likelihood)
      loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
      # add regularization
      loss += smoothness * W.pow(2).mean()

      if k == epochs - 1:
          print(f"smoothness: {smoothness} | Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(x_dev, y_dev, W):.4f}")

      # backward pass
      W.grad = None
      loss.backward()

      # update weights
      with torch.no_grad():
          W -= 50 * W.grad

smoothness: 0 | Train Loss: 2.2378 | Dev Loss 3.4837
smoothness: 0.01 | Train Loss: 2.2522 | Dev Loss 3.4109
smoothness: 0.02 | Train Loss: 2.2630 | Dev Loss 3.3704
smoothness: 0.05 | Train Loss: 2.2878 | Dev Loss 3.3030
smoothness: 0.1 | Train Loss: 2.3184 | Dev Loss 3.2437
smoothness: 0.25 | Train Loss: 2.3816 | Dev Loss 3.1590
smoothness: 0.5 | Train Loss: 2.4517 | Dev Loss 3.0962
smoothness: 1.0 | Train Loss: 2.5449 | Dev Loss 3.0422


Evaluate test set on all settings of smoothing

In [219]:
smoothnesses = [0, 0.01, 0.02, 0.05, 0.1, 0.25, 0.5, 1.0]
epochs = 1000
for i, smoothness in enumerate(smoothnesses):
  W = torch.randn((27*2,27), requires_grad = True)
  for k in range(epochs):
      # forward pass
      xenc = F.one_hot(x_train, num_classes = 27).float()
      xenc = xenc.view(-1, 27*2)
      
      # probs is softmax
      logits = xenc @ W
      counts = torch.exp(logits)
      probs = counts / counts.sum(dim = 1, keepdim = True)
      
      # loss (normalized negative log likelihood)
      loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
      # add regularization
      loss += smoothness * W.pow(2).mean()

      if k == epochs - 1:
          print(f"smoothness: {smoothness} | Test Loss: {MLP_loss(x_test, y_test, W):.4f}")


smoothness: 0 | Test Loss: 4.2632
smoothness: 0.01 | Test Loss: 4.3308
smoothness: 0.02 | Test Loss: 4.2541
smoothness: 0.05 | Test Loss: 4.1176
smoothness: 0.1 | Test Loss: 4.2537
smoothness: 0.25 | Test Loss: 4.3165
smoothness: 0.5 | Test Loss: 4.2331
smoothness: 1.0 | Test Loss: 4.4352


## E04: Replace F.one_hot With Indexing Into Rows Of W
E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

### Bigram

In [220]:
# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    print(ch1, ch2)
    xs.append(ix1)
    ys.append(ix2)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e
e m
m m
m a
a .


In [221]:
W = torch.randn((27,27), requires_grad = True)

print(xs)
xenc = F.one_hot(xs, num_classes = 27).float()
logits = xenc @ W
print(logits)

logits = W[xs]
print(logits)

tensor([ 0,  5, 13, 13,  1])
tensor([[-0.8201, -0.1828,  0.5611,  1.5837, -0.2356,  0.5476,  0.7776, -0.6826,
         -0.5316,  0.1229,  0.6429,  0.7000, -1.7489, -0.0120,  0.0371,  0.9434,
         -0.1368, -0.2018, -0.2666,  1.6872,  1.6887,  1.3726, -0.4450, -0.3318,
          1.7879, -1.2237,  0.1897],
        [ 0.1886,  0.6403, -0.3606, -0.9040, -1.0911,  0.8286,  0.7166, -0.3046,
         -0.3809, -0.9544,  0.2511,  0.3206, -0.4670, -0.9661, -1.3351, -1.1135,
          0.5519,  0.6228,  0.3322, -1.6020, -2.4841, -0.5358, -0.1532, -0.5253,
          1.0196,  0.0701, -0.5255],
        [-1.2861, -0.8518,  0.7178,  0.3687, -0.8201, -0.7434,  0.3805,  0.2550,
          0.8149,  1.0296, -0.1007,  0.4546, -1.1352, -1.9481, -0.3302,  0.0586,
         -0.0091, -0.0868, -0.3421,  1.5494,  0.3669,  0.3609, -0.0476,  0.5742,
          0.9386,  0.1258, -0.0483],
        [-1.2861, -0.8518,  0.7178,  0.3687, -0.8201, -0.7434,  0.3805,  0.2550,
          0.8149,  1.0296, -0.1007,  0.4546, -1.13

Trigram

In [222]:
# create the dataset
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)

number of examples:  392226


In [223]:
W = torch.randn((27*2,27), requires_grad = True)

xenc = F.one_hot(xs, num_classes = 27).float()
xenc = xenc.view(-1, 27*2)
logits = xenc @ W
print(logits)

logits = W[xs[:, 0]] + W[xs[:, 1]+27]
print(logits)

tensor([[ 0.1451, -1.9512,  0.8956,  ..., -0.1643,  0.2995,  2.1751],
        [-0.9944,  0.7112,  1.4689,  ..., -0.1858,  0.5872,  0.2478],
        [-3.0042,  1.0594, -0.6920,  ..., -0.9187, -0.8198,  0.8225],
        ...,
        [-2.0066, -0.4440, -1.7717,  ..., -0.9949, -1.5134,  2.6471],
        [-0.0695, -0.9170,  3.1367,  ..., -2.0284, -1.4078,  0.3709],
        [ 0.8476, -1.4958, -0.8498,  ...,  0.3662, -1.9539,  1.4925]],
       grad_fn=<MmBackward0>)
tensor([[ 0.1451, -1.9512,  0.8956,  ..., -0.1643,  0.2995,  2.1751],
        [-0.9944,  0.7112,  1.4689,  ..., -0.1858,  0.5872,  0.2478],
        [-3.0042,  1.0594, -0.6920,  ..., -0.9187, -0.8198,  0.8225],
        ...,
        [-2.0066, -0.4440, -1.7717,  ..., -0.9949, -1.5134,  2.6471],
        [-0.0695, -0.9170,  3.1367,  ..., -2.0284, -1.4078,  0.3709],
        [ 0.8476, -1.4958, -0.8498,  ...,  0.3662, -1.9539,  1.4925]],
       grad_fn=<AddBackward0>)


## E05: F.cross_entropy

E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [224]:
# create the dataset
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

number of examples:  392226


In [225]:
import torch.nn.functional as F
# gradient descent
for k in range(100):
  
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc.view(-1, 27*2) @ W # predict log-counts and merge both of the one-hot encoded character inputs
    # counts = logits.exp() # counts, equivalent to N
    # probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # loss = -probs[torch.arange(len(ys)), ys].log().mean() + 0.01*(W**2).mean()
    loss = F.cross_entropy(logits, ys)
    print(loss.item())
    
    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()
    
    # update
    W.data += -50 * W.grad

4.186270713806152
3.3573684692382812
3.04215145111084
2.871455192565918
2.7671945095062256
2.694681406021118
2.6390926837921143
2.5949814319610596
2.55900239944458
2.529222249984741
2.5042335987091064
2.483072519302368
2.464961290359497
2.4493143558502197
2.435654401779175
2.423619031906128
2.412919521331787
2.4033381938934326
2.394700527191162
2.386871337890625
2.379739999771118
2.3732173442840576
2.3672289848327637
2.3617119789123535
2.3566133975982666
2.3518881797790527
2.34749698638916
2.343407154083252
2.339588165283203
2.3360161781311035
2.332667589187622
2.3295228481292725
2.3265655040740967
2.3237788677215576
2.3211493492126465
2.3186655044555664
2.3163156509399414
2.314089059829712
2.3119773864746094
2.3099722862243652
2.3080663681030273
2.3062520027160645
2.304523468017578
2.302875518798828
2.301302433013916
2.2997987270355225
2.298360586166382
2.2969841957092285
2.2956655025482178
2.294400691986084
2.293186902999878
2.2920210361480713
2.290900230407715
2.2898218631744385
2.2

## E06: Meta-Exercise
E06: meta-exercise! Think of a fun/interesting exercise and complete it.

Reimplementing the MLP model using pytorch nn.Module

In [226]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [227]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(27*2, 27)
        # initialize weights with normal distribution with mean 0 and std 1
        nn.init.normal_(self.fc1.weight, mean=0, std=1)
        
    def forward(self, xs):
        # Create one-hot encodings
        xenc = F.one_hot(xs, num_classes=27).float()
        xenc = xenc.view(-1, 27*2)
        # Use the linear layer for the forward pass
        logits = self.fc1(xenc)
        return logits

In [233]:
model = MLP()
optimizer = torch.optim.SGD(model.parameters(), lr=2)

for k in range(200):
    # forward pass
    logits = model(xs)
    
    loss = F.cross_entropy(logits, ys)
    # add regularization
    loss += 0.2 * model.fc1.weight.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    optimizer.zero_grad()
    loss.backward()

    # update weights
    optimizer.step()

print("Final loss:", loss.item())

0: 4.4355
10: 3.5923
20: 3.3295
30: 3.1760
40: 3.0714
50: 2.9935
60: 2.9320
70: 2.8813
80: 2.8385
90: 2.8017
100: 2.7695
110: 2.7411
120: 2.7157
130: 2.6928
140: 2.6722
150: 2.6535
160: 2.6363
170: 2.6206
180: 2.6062
190: 2.5929
Final loss: 2.58174729347229
