<a href="https://colab.research.google.com/github/lizhieffe/language_model/blob/main/Name_Generation_LM_v2_Torch_Layers_%2B_Customized_Loops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# When True the device-setup cell below tries to use cuda:0; otherwise everything runs on the CPU.
USE_GPU = False
BLOCK_SIZE = 7 # Context length: how many chars do we take to predict the next one?

# Setup GPU

In [None]:
# Pick the device every tensor and layer in this notebook is placed on.
if USE_GPU:
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  # Compare the device *type*: a torch.device never compares equal to a plain
  # string, so `device != 'cpu'` would always be True and the check could
  # never fire.
  assert device.type != 'cpu', "GPU is not available"
else:
  # Use a torch.device here too so `device` has the same type in both branches.
  device = torch.device('cpu')

print(device)

cpu


# Util Functions

In [None]:
def _total_params(layers):
  """ Get the total parameter number.

  Args:
    layers: the list of layers of the model

  Returns:
    Number of total parameters
  """
  total_params = 0
  for l in layers:
    for p in l.parameters():
      total_params += p.data.nelement()
  return total_params

In [None]:
def _sample_one_batch(X, Y, batch_size, generator):
  """Sample from ds and generate a batch.

  Args:
    X: features of ds
    Y: labels of ds
    batch_size: batch size
    generator: a pseudorandom number generator for sampling
  Returns:
    Xb: batched features
    Yb: batched labels
  """
  ix = torch.randint(0, X.shape[0], (batch_size, ), generator=generator).to(device)
  # print(f'{ix.device=}')
  Xb, Yb = X[ix], Y[ix]
  return Xb, Yb

In [None]:
def _calculate_loss(Xb, Yb, layers):
  """Run a forward pass and return the cross-entropy loss.

  The batch is embedded through the module-level table `C`, flattened to one
  row per example, and passed through `layers` in order.

  Args:
    Xb: the feature batch (used to index into `C`)
    Yb: the label batch
    layers: the layers of the model, applied sequentially

  Returns:
    The cross-entropy between the final layer output and `Yb`.
  """
  embedded = C[Xb]
  # Keep dim 0 and flatten everything else into a single feature axis.
  activations = embedded.view(embedded.shape[0], -1)
  for layer in layers:
    activations = layer(activations)
  return F.cross_entropy(activations, Yb)

# Load data

In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

--2023-09-30 16:20:39--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.3’


2023-09-30 16:20:39 (6.37 MB/s) - ‘names.txt.3’ saved [228145/228145]



['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# Number of names read from names.txt.
len(words)

32033

# Build vocabulary

In [None]:
# Distinct characters appearing in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer id. Corpus characters get ids 1..len(chars);
# '.' gets id 0.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Integer id -> character (inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

assert len(stoi) == len(itos)
assert len(stoi) == 27

vocab_size = len(stoi)

# Create datasets (train / dev / test)

In [None]:
def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.' and left-padded with BLOCK_SIZE zeros, then
  every window of BLOCK_SIZE consecutive ids becomes one feature row whose
  label is the id that immediately follows the window.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`

  Returns:
    X: tensor of context windows, moved to `device`
    Y: tensor of next-character ids, moved to `device`
  """
  contexts, targets = [], []
  for word in words:
    # Padded id sequence: BLOCK_SIZE zeros, the word's ids, then the '.' id.
    encoded = [0] * BLOCK_SIZE + [stoi[ch] for ch in word + '.']
    for start in range(len(encoded) - BLOCK_SIZE):
      contexts.append(encoded[start:start + BLOCK_SIZE])
      targets.append(encoded[start + BLOCK_SIZE])

  X = torch.tensor(contexts).to(device)
  Y = torch.tensor(targets).to(device)
  return X, Y

In [None]:
import random

# Shuffle the words in place with a fixed seed so the split is repeatable
# from a fresh kernel.
random.seed(42)
random.shuffle(words)

# Split boundaries: first 60% train, next 20% dev, last 20% test.
n1 = int(0.6 * len(words))
n2 = int(0.8 * len(words))

print(f'total size = {len(words)}, n1 = {n1}, n2 = {n2}')

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

# Every split must have exactly one label per feature row.
assert len(Xtr) == len(Ytr)
assert len(Xdev) == len(Ydev)
assert len(Xte) == len(Yte)

print(f'{Xtr.shape=}, {Ytr.shape=}')
print(f'{Xdev.shape=}, {Ydev.shape=}')
print(f'{Xte.shape=}, {Yte.shape=}')

total size = 32033, n1 = 19219, n2 = 25626
Xtr.shape=torch.Size([137024, 7]), Ytr.shape=torch.Size([137024])
Xdev.shape=torch.Size([45601, 7]), Ydev.shape=torch.Size([45601])
Xte.shape=torch.Size([45521, 7]), Yte.shape=torch.Size([45521])


In [None]:
# Decode the first 20 training pairs back to text: context ---> next character.
for i in range(20):
  context_chars = [itos[ix] for ix in Xtr[i].tolist()]
  target_char = itos[Ytr[i].item()]
  print(f"{''.join(context_chars)} ---> {target_char}")

....... ---> y
......y ---> u
.....yu ---> h
....yuh ---> e
...yuhe ---> n
..yuhen ---> g
.yuheng ---> .
....... ---> d
......d ---> i
.....di ---> o
....dio ---> n
...dion ---> d
..diond ---> r
.diondr ---> e
diondre ---> .
....... ---> x
......x ---> a
.....xa ---> v
....xav ---> i
...xavi ---> e


# Model

In [None]:
n_embd = 10 # the dimensionality of the character embedding vectors (number of columns of C)
n_hidden = 100 # the number of neurons in each hidden Linear layer of the MLP
g = torch.Generator().manual_seed(2147483647) # seeded generator for reproducibility; used to initialise C and to sample batches

In [None]:
# Embedding table: one n_embd-dimensional row per vocabulary entry.
C = torch.randn((vocab_size, n_embd), generator=g).to(device)

# MLP: three (Linear -> Tanh) stages followed by a Linear projection onto the
# vocabulary. The first Linear consumes the flattened BLOCK_SIZE embeddings.
dims = [n_embd * BLOCK_SIZE, n_hidden, n_hidden, n_hidden]
layers = []
for fan_in, fan_out in zip(dims[:-1], dims[1:]):
  layers.append(torch.nn.Linear(fan_in, fan_out).to(device))
  layers.append(torch.nn.Tanh())  # parameter-free, so no device move is needed
layers.append(torch.nn.Linear(n_hidden, vocab_size).to(device))

print(f'total params = {_total_params(layers)}')

total params = 30027


In [None]:
max_steps = 200000
# NOTE(review): 32000 rows per step over 200000 steps is very expensive (the
# recorded run was interrupted after the first log line) — confirm this is
# intended rather than e.g. 32.
batch_size = 32000
lossi = []      # log10 of the training loss, one entry per step
lossi_dev = []  # log10 of the dev loss, one entry per step
ud = []

# The embedding table C takes part in the forward pass, so it has to be
# optimised together with the layer weights. Without this it stays at its
# random initial values for the whole run.
C.requires_grad_(True)
parameters = [C] + [p for l in layers for p in l.parameters()]

for i in range(max_steps):
  # Forward pass on a random training batch.
  Xb, Yb = _sample_one_batch(Xtr, Ytr, batch_size, g)
  loss = _calculate_loss(Xb, Yb, layers)

  # Backward pass: clear stale gradients first.
  for p in parameters:
    p.grad = None
  loss.backward()

  with torch.no_grad():
    # Dev loss is monitoring only, so no autograd graph is needed. It is
    # measured before the update, i.e. with the same weights as `loss`.
    Xb_dev, Yb_dev = _sample_one_batch(Xdev, Ydev, batch_size, g)
    loss_dev = _calculate_loss(Xb_dev, Yb_dev, layers)

    # Plain SGD update with step learning rate decay.
    lr = 0.1 if i < 15000 else 0.01
    for p in parameters:
      p -= lr * p.grad

  # Track status
  if i % 1000 == 0:
    print(f'{i}/{max_steps}: training loss={loss.item():.4f}, dev loss={loss_dev.item():.4f}')
  lossi.append(loss.log10().item())
  lossi_dev.append(loss_dev.log10().item())

0/200000: training loss=3.2814, dev loss=3.2837


KeyboardInterrupt: ignored

In [None]:
# Show where the embedding table and the last training batch live. Both go in
# one tuple because only the final expression of a cell is displayed.
C.device, Xb.device

In [None]:
# lossi holds log10 of the training loss, appended once per training step.
plt.plot(lossi)

In [None]:
# lossi_dev holds log10 of the dev loss, appended once per training step.
plt.plot(lossi_dev)

In [None]:
g_gpu = torch.Generator(device=device).manual_seed(2147483647) # for reproducibility

# Sample 30 names from the model, one character at a time.
out = []
with torch.no_grad():  # inference only: no autograd graph needed
  for _ in range(30):
    context = [0] * BLOCK_SIZE  # start from an all-'.' context

    # Named `name`, not `str`, so the builtin `str` is not shadowed for the
    # rest of the notebook.
    name = ''
    while True:
      # Forward pass for the single current context; build the index tensor
      # on the same device as C.
      xemb = C[torch.tensor([context], device=C.device)]
      x = xemb.view(xemb.shape[0], -1)
      for l in layers:
        x = l(x)
      probs = F.softmax(x, dim=1)

      # Draw the next character id and convert it to a plain int so that
      # `context` stays a list of ints rather than a mix of ints and tensors.
      iy = torch.multinomial(probs, num_samples=1, replacement=True, generator=g_gpu).item()

      if iy == 0:
        # Id 0 is '.', which ends the name.
        out.append(name)
        break
      context = context[1:] + [iy]
      name += itos[iy]

for w in out:
  print(w)
