<a href="https://colab.research.google.com/github/lizhieffe/language_model/blob/main/Name_Generation_LM_v1_Customized_Layers_%26_Training_Loops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Toggle read by the device-setup cell: True requests "cuda:0", False uses the CPU.
USE_GPU = False

# Set up GPU

In [None]:
# Pick the device every tensor in this notebook is moved to.
if USE_GPU:
  # Fail fast when CUDA is missing. Check availability directly: comparing a
  # torch.device against the string 'cpu' is never equal, so an assertion of
  # the form `device != 'cpu'` can never fire.
  assert torch.cuda.is_available(), "GPU is not available"
  device = torch.device("cuda:0")
else:
  # Use a torch.device in both branches so `device` always has the same type.
  device = torch.device("cpu")

print(device)

cpu


# Load data

In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

--2023-09-30 00:15:56--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-09-30 00:15:57 (9.03 MB/s) - ‘names.txt’ saved [228145/228145]



['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# Number of names read from names.txt.
len(words)

32033

# Build vocabulary

In [None]:
# Every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> index. Real characters are numbered from 1; index 0 is reserved
# for '.', the marker used for both padding and end-of-name.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Index -> character (inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

# The two maps must be exact inverses, and the vocabulary is expected to hold
# 27 symbols.
assert len(stoi) == len(itos)
assert len(stoi) == 27

vocab_size = len(stoi)

# Create dataset

In [None]:
# Context length: how many preceding characters are used to predict the next one.
# build_dataset pads the start of every name with this many 0 ('.') indices.
block_size = 7

In [None]:
def build_dataset(words):
  """Turn a list of names into (context, next-character) training pairs.

  Each name is followed by the '.' terminator. For every character (including
  the terminator) one example is emitted: the `block_size` indices that
  precede it, padded on the left with 0, and the index of the character.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    (X, Y): X holds one context row per example, Y the matching target index;
    both are tensors moved to `device`.
  """
  contexts, targets = [], []
  for word in words:
    window = [0] * block_size  # start from an all-padding context
    for target in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(target)
      # slide the window: drop the oldest index, append the newest
      window = [*window[1:], target]

  return torch.tensor(contexts).to(device), torch.tensor(targets).to(device)

In [None]:
import random

# Shuffle a *copy* with a dedicated, seeded RNG. Shuffling `words` in place
# made this cell non-idempotent: running it a second time reshuffled the
# already-shuffled list and silently changed the train/dev/test split.
# random.Random(42) yields the same permutation as random.seed(42) followed by
# random.shuffle, so a fresh run produces the same split as before.
shuffled_words = list(words)
random.Random(42).shuffle(shuffled_words)

# 60% train / 20% dev / 20% test, split at the name level.
n1 = int(0.6 * len(shuffled_words))
n2 = int(0.8 * len(shuffled_words))

print(f'total size = {len(words)}, n1 = {n1}, n2 = {n2}')

Xtr, Ytr = build_dataset(shuffled_words[:n1])
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])
Xte, Yte = build_dataset(shuffled_words[n2:])

# Every context row must have exactly one target.
assert Xtr.shape[0] == Ytr.shape[0]
assert Xdev.shape[0] == Ydev.shape[0]
assert Xte.shape[0] == Yte.shape[0]

print(f'{Xtr.shape=}, {Ytr.shape=}')
print(f'{Xdev.shape=}, {Ydev.shape=}')
print(f'{Xte.shape=}, {Yte.shape=}')

total size = 32033, n1 = 19219, n2 = 25626
Xtr.shape=torch.Size([137024, 7]), Ytr.shape=torch.Size([137024])
Xdev.shape=torch.Size([45601, 7]), Ydev.shape=torch.Size([45601])
Xte.shape=torch.Size([45521, 7]), Yte.shape=torch.Size([45521])


In [None]:
# Decode the first 20 training pairs back to characters as a sanity check.
for i in range(20):
  context_chars = ''.join(itos[idx] for idx in Xtr[i].tolist())
  target_char = itos[Ytr[i].item()]
  print(f"{context_chars} ---> {target_char}")

....... ---> y
......y ---> u
.....yu ---> h
....yuh ---> e
...yuhe ---> n
..yuhen ---> g
.yuheng ---> .
....... ---> d
......d ---> i
.....di ---> o
....dio ---> n
...dion ---> d
..diond ---> r
.diondr ---> e
diondre ---> .
....... ---> x
......x ---> a
.....xa ---> v
....xav ---> i
...xavi ---> e


# MLP Revisited

## Model params

In [None]:
n_emb = 10      # size of each character embedding
n_hidden = 400  # width of every hidden layer

g = torch.Generator().manual_seed(2147483647) # for reproducibility


def _randn(*shape):
  """Draw a standard-normal tensor of the given shape from `g`, then move it to `device`."""
  return torch.randn(shape, generator=g).to(device)


# The order of the draws below determines the values each tensor receives from
# the seeded generator, so it must stay as is.
C = _randn(vocab_size, n_emb)              # embedding table
W1 = _randn(block_size * n_emb, n_hidden)  # flattened context -> hidden
b1 = _randn(n_hidden)
W2 = _randn(n_hidden, n_hidden)
b2 = _randn(n_hidden)
W3 = _randn(n_hidden, n_hidden)
b3 = _randn(n_hidden)
W4 = _randn(n_hidden, n_hidden)
b4 = _randn(n_hidden)
Wl = _randn(n_hidden, vocab_size)          # hidden -> logits
bl = _randn(vocab_size)

parameters = [C, W1, b1, W2, b2, W3, b3, W4, b4, Wl, bl]
for p in parameters:
  print(f'shape = {p.shape}')

n_parameters = sum(p.numel() for p in parameters)
print(f'\ntotal parameters = {n_parameters}')

# Enable gradient tracking only after the tensors are on their final device.
for p in parameters:
  p.requires_grad_(True)

shape = torch.Size([27, 10])
shape = torch.Size([70, 400])
shape = torch.Size([400])
shape = torch.Size([400, 400])
shape = torch.Size([400])
shape = torch.Size([400, 400])
shape = torch.Size([400])
shape = torch.Size([400, 400])
shape = torch.Size([400])
shape = torch.Size([400, 27])
shape = torch.Size([27])

total parameters = 520697


## Training loop

In [None]:
# Number of SGD steps the training loop runs.
max_steps = 900_000
# Number of training examples sampled per step.
batch_size = 8224
# How many loss lines the training loop prints over the whole run.
total_loss_print_steps = 50

In [None]:
lossi = []  # log10 of the minibatch loss, one entry per step

# Integer logging interval. The previous float division only ever matched
# step 0 when max_steps was not an exact multiple of total_loss_print_steps;
# max(1, ...) also guards against a zero interval.
print_every = max(1, max_steps // total_loss_print_steps)

for i in range(max_steps):
  # sample a minibatch of training rows (with replacement)
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g).to(device)
  Xb, Yb = Xtr[ix], Ytr[ix]

  # forward pass: embed the context, flatten it, then four tanh layers
  xemb = C[Xb]
  h = torch.tanh(xemb.view(xemb.shape[0], -1) @ W1 + b1)
  h = torch.tanh(h @ W2 + b2)
  h = torch.tanh(h @ W3 + b3)
  h = torch.tanh(h @ W4 + b4)

  logits = h @ Wl + bl
  loss = F.cross_entropy(logits, Yb)
  lossi.append(loss.log10().item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # step decay of the learning rate
  lr = 0.1 if i < 20_000 else 0.01

  # plain SGD update; no_grad keeps the in-place step out of the autograd
  # graph without going through the discouraged `.data` attribute
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

  if i % print_every == 0:
    print(f'{i}/{max_steps}: {loss.item():.4f}')


0/900000: 36.7427
18000/900000: 3.4929


In [None]:
# Per-step training curve; lossi holds log10(loss) for every step.
plt.plot(lossi)

## Sample the model

In [None]:
g_gpu = torch.Generator(device=device).manual_seed(2147483647) # for reproducibility

out = []
with torch.no_grad():  # sampling only, no gradients needed
  for _ in range(30):
    context = [0] * block_size  # start from an all-padding context
    name = ''                   # avoid shadowing the builtin `str`

    while True:
      # Same forward pass as the training loop, including the W3/W4 layers;
      # skipping them would feed the output layer activations it was never
      # trained on. The context tensor is created on `device` so indexing C
      # also works when the parameters live on the GPU.
      xemb = C[torch.tensor([context], device=device)]
      h = torch.tanh(xemb.view(xemb.shape[0], -1) @ W1 + b1)
      h = torch.tanh(h @ W2 + b2)
      h = torch.tanh(h @ W3 + b3)
      h = torch.tanh(h @ W4 + b4)
      logits = h @ Wl + bl
      probs = F.softmax(logits, dim=1)

      # Draw the next character index as a plain int so `context` stays a
      # list of ints rather than a mix of ints and tensors.
      iy = torch.multinomial(probs, num_samples=1, replacement=True, generator=g_gpu).item()

      if iy == 0:
        # index 0 is '.', the end-of-name marker
        out.append(name)
        break

      context = context[1:] + [iy]
      name += itos[iy]

for w in out:
  print(w)


In [None]:
# Debug check: shape of the most recent embedding lookup left over from the sampling cell.
xemb.shape