In [1]:
# work around torch not cooperating with my AMD GPU (this is a nightmare)
# NOTE(review): presumably this has to be set before torch is imported — confirm
import os
os.environ.update(HSA_OVERRIDE_GFX_VERSION='10.3.0')

In [2]:
# load the whole training corpus into memory as one string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# the vocabulary is every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


**Tokenization codebook trade-off:** There is a trade-off between codebook size and sequence length. A large codebook tends to produce shorter encodings, while a small codebook tends to produce longer ones.

Here, we are using a simple character-level tokenizer. In real-world practice, however, it is more common to use subword-based encodings.

In [3]:
# Encoder and decoder (tokenizer)

# character-based encoding: every distinct character gets one integer id,
# assigned in the order the characters appear in `chars`
stoi = {char: i for i, char in enumerate(chars)}
itos = {i: char for i, char in enumerate(chars)}


def encode(s):
  """Convert a string into a list of integer token ids (one per character).

  Raises KeyError if `s` contains a character that is not a key of `stoi`.
  """
  return [stoi[char] for char in s]


def decode(l):
  """Convert an iterable of integer token ids back into a string.

  Raises KeyError if an id is not a key of `itos`.
  """
  return "".join(itos[token_id] for token_id in l)


print(encode('hi there'))
print(decode(encode('hi there')))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [4]:
# encode the entire corpus once and hold the token ids on the GPU as int64
import torch
data = torch.tensor(encode(text), dtype=torch.int64).to('cuda')

In [None]:
# first 90% of the tokens are for training, the remainder for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]


In [None]:
# Define our "chunk" size for segments of training data to train
block_size = 8
# y is x shifted one position to the right, so y[t] is the token that
# follows the prefix x[:t+1]
x = train_data[:block_size]
y = train_data[1:block_size + 1]
print(x.tolist())
for prefix_len in range(1, block_size + 1):
  prefix = x[:prefix_len].tolist()
  next_token = y[prefix_len - 1].item()
  print(f"When input is {prefix} the target: {next_token}.")

In [None]:
# seed torch's global RNG before any batches are sampled
torch.manual_seed(1337)
# batch_size: how many sequences to process in parallel
# block_size: length of each of those sequences
batch_size, block_size = 4, 8

def get_batch(split):
  """Sample a random batch of input/target sequences.

  Args:
    split: 'train' draws from train_data; any other value draws from val_data.

  Returns:
    (x, y), each of shape (batch_size, block_size). y holds the same windows
    as x shifted one position to the right, i.e. the next token for every
    position in x.
  """
  if split == 'train':
    source = train_data
  else:
    source = val_data

  # one random start offset per sequence; the upper bound leaves room for the
  # extra token the shifted target window needs
  starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return inputs, targets

xb, yb = get_batch('train')
print("inputs:")
print(xb)
print("outputs:")
print(yb)

print(" --- ")

# spell out every (context -> next token) training example packed in the batch
for input_row, target_row in zip(xb.tolist(), yb.tolist()):
  for t, y in enumerate(target_row):
    x = input_row[:t + 1]
    print(f"when input is {x} the target is {y}")

In [None]:
# define the model

# torch is already imported by an earlier cell; repeated here alongside the
# nn / functional aliases the model class uses
import torch
import torch.nn as nn
from torch.nn import functional as F
# re-seed torch's global RNG right before the model is built in this cell
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram language model: next-token logits depend only on the current token.

  The whole model is one (vocab_size x vocab_size) embedding table; row ``i``
  holds the logits over the next token given that the current token is ``i``.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # each token id directly looks up its own row of next-token logits
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: (B, T) tensor of integer token ids.
      targets: optional (B, T) tensor of integer token ids to score against.

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, C) and loss is
      None. With targets, logits is returned flattened to (B*T, C) and loss is
      the scalar cross-entropy between logits and targets.
    """
    logits = self.token_embedding_table(idx)  # (B, T, C)

    if targets is None:
      return logits, None

    # F.cross_entropy takes (N, C) logits with (N,) class indices, so fold the
    # batch and time dimensions together
    B, T, C = logits.shape
    logits = logits.view(B * T, C)
    loss = F.cross_entropy(logits, targets.view(B * T))
    return logits, loss

  @torch.no_grad()  # sampling is not differentiable; skip autograd bookkeeping
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in idx by max_new_tokens sampled tokens.

    Args:
      idx: (B, T) tensor of integer token ids used as the starting context.
      max_new_tokens: number of tokens to append to every sequence.

    Returns:
      (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
    """
    for _ in range(max_new_tokens):
      # call the module itself rather than .forward so registered hooks run
      logits, _loss = self(idx)
      # only the last time step is needed to predict the next token
      last_logits = logits[:, -1, :]  # (B, C)
      probs = F.softmax(last_logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx
  
# build the bigram model, then move its parameters onto the GPU
model = BigramLanguageModel(vocab_size)
model = model.to('cuda')

In [None]:
# example of what logits are

# consider the first sequence of the batch
print("First sequence:")
seq_0 = xb[0]
print(seq_0.tolist())

print(" --- ")

# obtain the logits; because targets are passed, the model returns them
# flattened to (B*T, C), so the first len(seq_0) rows belong to seq_0.
# The module is called directly (not .forward) so nn.Module hooks run, and the
# loss goes to `_loss` rather than `_`, which IPython uses for the last output.
logits, _loss = model(xb, yb)

# for each character of the sequence we get one row of raw scores (logits),
# one score per vocabulary entry: higher means more likely, but they are
# unnormalised and only become probabilities after a softmax
import numpy as np

# slice by the actual sequence length instead of a hard-coded 8
seq_logits = logits[:len(seq_0)]
for idx in range(len(seq_0)):
  print(f"Char {seq_0[idx]}")
  print(f"Probabilities(logits): {np.round(seq_logits[idx].tolist(),2)}")

In [None]:
# generate next tokens, starting from a (1, 1) context holding token id 0
start_context = torch.zeros((1, 1), dtype=torch.long).cuda()
sampled = model.generate(idx=start_context, max_new_tokens=100)
decode(sampled[0].cpu().tolist())

In [None]:
# Define the optimizer for training: AdamW over every parameter of the model
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [None]:
# train on mini-batches of 32 sequences; get_batch reads this global
batch_size = 32
for steps in range(10000):
  # sample a fresh random mini-batch from the training split
  xb, yb = get_batch('train')

  # call the module itself (not .forward) so nn.Module hooks are honoured
  logits, loss = model(xb, yb)

  # clear stale gradients, backpropagate, then take one optimizer step
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# this is the loss of the final mini-batch only
print(loss.item())

In [None]:
# generate text again after training, from the same single-token (id 0) context
start_context = torch.zeros((1, 1), dtype=torch.long).cuda()
sampled = model.generate(idx=start_context, max_new_tokens=300)
print(decode(sampled[0].cpu().tolist()))