<a href="https://colab.research.google.com/github/lizhieffe/language_model/blob/main/Name_Generation_LM_v3_Torch_Library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
USE_GPU = True # Whether the device-setup cell should select a CUDA device

BLOCK_SIZE = 7 # Context length: how many chars do we take to predict the next one?

# Set up GPU

In [None]:
# Select the device on which tensors and model layers are placed.
if USE_GPU:
  # Check CUDA availability directly: the earlier comparison of a torch.device
  # against the string 'cpu' did not reliably detect the silent CPU fallback.
  assert torch.cuda.is_available(), "GPU is not available"
  device = torch.device("cuda:0")
else:
  # Use a torch.device here as well so `device` has the same type in both branches.
  device = torch.device("cpu")

print(device)

cuda:0


# Util Functions

In [None]:
def _total_params(layers):
  """ Get the total parameter number.

  Args:
    layers: the list of layers of the model

  Returns:
    Number of total parameters
  """
  total_params = 0
  for l in layers:
    for p in l.parameters():
      total_params += p.data.nelement()
  return total_params

In [None]:
def _sample_one_batch(X, Y, batch_size, generator):
  """Sample from ds and generate a batch.

  Args:
    X: features of ds
    Y: labels of ds
    batch_size: batch size
    generator: a pseudorandom number generator for sampling
  Returns:
    Xb: batched features
    Yb: batched labels
  """
  ix = torch.randint(0, X.shape[0], (batch_size, ), generator=generator).to(device)
  # print(f'{ix.device=}')
  Xb, Yb = X[ix], Y[ix]
  return Xb, Yb

In [None]:
def _calculate_loss(Xb, Yb, layers):
  """ Calculate loss.

  Args:
    Xb: the feature batch
    Yb: the label batch
    layers: the layers of the model

  Returns:
    loss: the calculated loss
  """
  emb = C[Xb]
  # print(f'{emb.device=}')
  x = emb.view(emb.shape[0], -1)
  # print(f'{x.device=}')
  for l in layers:
    x = l(x)
    # print(f'{x.device=}')
  loss = F.cross_entropy(x, Yb)
  # print(f'{loss.device=}')
  return loss

# Load data

In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

--2023-09-30 16:21:44--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.5’


2023-09-30 16:21:44 (7.85 MB/s) - ‘names.txt.5’ saved [228145/228145]



['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# Number of names read from names.txt
len(words)

32033

# Build vocabulary

In [None]:
# Unique characters appearing in the names, in sorted order.
chars = sorted(set(''.join(words)))

# Char -> index: the characters take indices 1..len(chars); index 0 is the '.' token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> char: the inverse mapping of stoi.
itos = {idx: ch for ch, idx in stoi.items()}

# Sanity checks: the mapping is one-to-one and has the expected 27 entries.
assert len(itos) == len(stoi)
assert len(stoi) == 27

vocab_size = len(stoi)

# Create the dataset

In [None]:
def build_dataset(words):
  """Turn a list of words into (context, next-character) training examples.

  Each word is terminated with '.', and every character (including the
  terminator) yields one example whose context is the BLOCK_SIZE preceding
  token indices, left-padded with 0.

  Args:
    words: iterable of strings whose characters are keys of `stoi`

  Returns:
    X: tensor of context index rows, moved to `device`
    Y: tensor of target indices, moved to `device`
  """
  contexts, targets = [], []
  for word in words:
    # Start every word from an all-padding window
    window = [0] * BLOCK_SIZE
    for ch in word + '.':
      target = stoi[ch]
      contexts.append(window)
      targets.append(target)
      # Slide the window: drop the oldest index, append the newest
      window = window[1:] + [target]

  X = torch.tensor(contexts).to(device)
  Y = torch.tensor(targets).to(device)
  return X, Y

In [None]:
import random

# Shuffle a copy so that `words` keeps its original order. Shuffling `words`
# in place made this cell non-idempotent: every re-run reshuffled the
# already-shuffled list and silently produced different train/dev/test splits.
words_shuffled = list(words)
random.seed(42)
random.shuffle(words_shuffled)

# Split boundaries: 60% train / 20% dev / 20% test
n1 = int(0.6 * len(words_shuffled))
n2 = int(0.8 * len(words_shuffled))

print(f'total size = {len(words_shuffled)}, n1 = {n1}, n2 = {n2}')

Xtr, Ytr = build_dataset(words_shuffled[:n1])
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])
Xte, Yte = build_dataset(words_shuffled[n2:])

# Every split must have exactly one label per context row
assert Xtr.shape[0] == Ytr.shape[0]
assert Xdev.shape[0] == Ydev.shape[0]
assert Xte.shape[0] == Yte.shape[0]

print(f'{Xtr.shape=}, {Ytr.shape=}')
print(f'{Xdev.shape=}, {Ydev.shape=}')
print(f'{Xte.shape=}, {Yte.shape=}')

total size = 32033, n1 = 19219, n2 = 25626
Xtr.shape=torch.Size([137024, 7]), Ytr.shape=torch.Size([137024])
Xdev.shape=torch.Size([45601, 7]), Ydev.shape=torch.Size([45601])
Xte.shape=torch.Size([45521, 7]), Yte.shape=torch.Size([45521])


In [None]:
# Print the first 20 training examples as "context ---> next char".
for i in range(20):
  context_str = ''.join(itos[ix.item()] for ix in Xtr[i])
  target_str = itos[Ytr[i].item()]
  print(f"{context_str} ---> {target_str}")

....... ---> y
......y ---> u
.....yu ---> h
....yuh ---> e
...yuhe ---> n
..yuhen ---> g
.yuheng ---> .
....... ---> d
......d ---> i
.....di ---> o
....dio ---> n
...dion ---> d
..diond ---> r
.diondr ---> e
diondre ---> .
....... ---> x
......x ---> a
.....xa ---> v
....xav ---> i
...xavi ---> e


# Modeling

In [None]:
n_embd = 10 # the dimensionality of the character embedding vectors
N_HIDDEN = 100 # the number of neurons in each hidden layer of the MLP (read by Net)
# Seeded generator (default device) passed to _sample_one_batch in the training loop
g = torch.Generator().manual_seed(2147483647) # for reproducibility

## Model class

In [None]:
class Net(torch.nn.Module):
  """Character-level MLP: embedding -> 3 x (Linear + Tanh) -> Linear logits.

  Args:
    n_embd: dimensionality of each character embedding
    block_size: number of context characters fed to the model
    vocab_size: number of distinct tokens (embedding rows and output logits)
    generator: currently unused; kept so existing callers keep working
    n_hidden: width of the hidden layers; defaults to the module-level N_HIDDEN
    device: device to place the layers on; defaults to the module-level `device`
  """
  def __init__(self, n_embd, block_size, vocab_size, generator=None, n_hidden=None, device=None):
    super().__init__()

    # Fall back to the notebook-level settings only when not given explicitly.
    if n_hidden is None:
      n_hidden = N_HIDDEN
    if device is None:
      # The parameter shadows the global of the same name, hence the lookup.
      device = globals()['device']

    self.embd = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
    self.linear1 = torch.nn.Linear(block_size * n_embd, n_hidden, bias=True)
    self.tanh1 = torch.nn.Tanh()
    self.linear2 = torch.nn.Linear(n_hidden, n_hidden, bias=True)
    self.tanh2 = torch.nn.Tanh()
    self.linear3 = torch.nn.Linear(n_hidden, n_hidden, bias=True)
    self.tanh3 = torch.nn.Tanh()
    self.linear_logits = torch.nn.Linear(n_hidden, vocab_size, bias=True)

    # Plain lists are fine here: every layer is already registered as a
    # submodule through the attribute assignments above.
    self.ffn_layers = [
        self.linear1,
        self.tanh1,
        self.linear2,
        self.tanh2,
        self.linear3,
        self.tanh3,
        self.linear_logits,
    ]

    self.layers = self.ffn_layers + [self.embd]

    # Move all registered parameters in one call instead of once per layer.
    self.to(device)

  def forward(self, x):
    """Map integer contexts of shape (batch, block_size) to logits of shape (batch, vocab_size)."""
    xemb = self.embd(x)
    # Concatenate the per-character embeddings into one vector per example
    y = xemb.view(xemb.shape[0], -1)
    for l in self.ffn_layers:
      y = l(y)
    return y

# Instantiate the model with the notebook-level hyperparameters
net = Net(n_embd=n_embd, block_size=BLOCK_SIZE, vocab_size=vocab_size)

In [None]:
# Count the model's parameters. A separate variable name is used because
# assigning to `_total_params` would overwrite the helper function of that name.
n_total_params = sum(p.nelement() for p in net.parameters())

print(f'Total params = {n_total_params}')

Total params = 30297


## Define loss function and optimizer

In [None]:
import torch.optim as optim

# Cross-entropy loss applied to the model outputs and the label batch in the training loop
loss_fn = torch.nn.CrossEntropyLoss()
# SGD with momentum over all of net's parameters
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Train

In [None]:
max_steps = 200000
batch_size = 32000
lossi = []      # log10 of the training loss, one entry per step
lossi_dev = []  # log10 of the dev loss, one entry per step
ud = []         # not used by any cell in this notebook

# Running sums for the averaged losses printed every 1000 steps
running_loss = 0.0
running_loss_dev = 0.0
running_loss_steps = 0

for i in range(max_steps):

  # Forward
  Xb, Yb = _sample_one_batch(Xtr, Ytr, batch_size, generator=g)
  optimizer.zero_grad()
  outputs = net(Xb)

  # Loss
  loss = loss_fn(outputs, Yb)
  running_loss += loss.item()
  running_loss_steps += 1

  # Eval dev DS
  # The dev loss is only monitored, never back-propagated, so skip building
  # an autograd graph for it.
  with torch.no_grad():
    Xb_dev, Yb_dev = _sample_one_batch(Xdev, Ydev, batch_size, generator=g)
    outputs_dev = net(Xb_dev)
    loss_dev = loss_fn(outputs_dev, Yb_dev)
  running_loss_dev += loss_dev.item()

  # Update
  loss.backward()
  optimizer.step()

  # Track status: print the mean losses since the last report, then reset the sums
  if i % 1000 == 0:
    print(f'{i}/{max_steps}: training loss={running_loss/running_loss_steps:.4f}, dev loss={running_loss_dev/running_loss_steps:.4f}')
    running_loss = 0.0
    running_loss_dev = 0.0
    running_loss_steps = 0

  lossi.append(loss.log10().item())
  lossi_dev.append(loss_dev.log10().item())

0/200000: training loss=3.3001, dev loss=3.2996
1000/200000: training loss=2.8431, dev loss=2.8443
2000/200000: training loss=2.5777, dev loss=2.5776
3000/200000: training loss=2.5144, dev loss=2.5131
4000/200000: training loss=2.4738, dev loss=2.4711
5000/200000: training loss=2.4415, dev loss=2.4387
6000/200000: training loss=2.4119, dev loss=2.4097
7000/200000: training loss=2.3866, dev loss=2.3843
8000/200000: training loss=2.3657, dev loss=2.3637
9000/200000: training loss=2.3487, dev loss=2.3472
10000/200000: training loss=2.3338, dev loss=2.3330
11000/200000: training loss=2.3202, dev loss=2.3201
12000/200000: training loss=2.3079, dev loss=2.3081
13000/200000: training loss=2.2972, dev loss=2.2975
14000/200000: training loss=2.2871, dev loss=2.2884
15000/200000: training loss=2.2779, dev loss=2.2796
16000/200000: training loss=2.2699, dev loss=2.2720
17000/200000: training loss=2.2617, dev loss=2.2647
18000/200000: training loss=2.2548, dev loss=2.2580
19000/200000: training lo

KeyboardInterrupt: ignored

In [None]:
# lossi holds log10(training loss) for every step; label the axes so the
# figure can be read on its own. The trailing ';' hides the return-value repr.
fig, ax = plt.subplots()
ax.plot(lossi)
ax.set(title='Training loss', xlabel='step', ylabel='log10(loss)');

In [None]:
# lossi_dev holds log10(dev loss) for every step; label the axes so the
# figure can be read on its own. The trailing ';' hides the return-value repr.
fig, ax = plt.subplots()
ax.plot(lossi_dev)
ax.set(title='Dev loss', xlabel='step', ylabel='log10(loss)');

# Sample the model

In [None]:
g_gpu = torch.Generator(device=device).manual_seed(2147483647) # for reproducibility

# Collect the samples in their own list: reusing the name `words` would
# overwrite the training names loaded earlier in the notebook.
sampled_names = []
with torch.no_grad():  # sampling needs no gradients
  for _ in range(30):

    iys = []
    # Start from an all-padding context (index 0 is the '.' token)
    context = [0] * BLOCK_SIZE
    while True:
      x = torch.tensor([context]).to(device)
      logits = net(x)
      prob = F.softmax(logits, dim=1)
      # .item() turns the sampled (1, 1) tensor into a plain int, so `context`
      # stays a list of ints that torch.tensor can consume on the next step.
      iy = torch.multinomial(prob, num_samples=1, replacement=True, generator=g_gpu).item()

      # Index 0 ('.') marks the end of a name
      if iy == 0:
        sampled_names.append(''.join(itos[ix] for ix in iys))
        break
      iys.append(iy)
      context = context[1:] + [iy]

for name in sampled_names:
  print(name)