<a href="https://colab.research.google.com/github/phucb2/lm-hackers/blob/main/Gpt_dev_playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-01-05 08:27:43--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-01-05 08:27:44 (150 MB/s) - ‘input.txt’ saved [1115394/1115394]



# Self-Attention V1

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm
torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================
block_size = 8
vocab_size = len(chars)
batch_size = 32
n_embs = 32
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, n_embs, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.sa = Head(n_embs, head_dim=n_embs)
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.sa(x) # (B, T, C)

    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))


m = BigramLM()
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
for _ in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  xb, yb = xb.to(device), yb.to(device)
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if _ % eval_iter == 0:
    losses = estimate_loss(m)
    print(f'Epoch {_} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())

print(generate_text(m, 500))

Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.03 | Val loss: 4.03
Epoch 400 | Train loss: 2.49 | Val loss: 2.50
Epoch 800 | Train loss: 2.46 | Val loss: 2.48
Epoch 1200 | Train loss: 2.43 | Val loss: 2.46
Epoch 1600 | Train loss: 2.43 | Val loss: 2.45
Epoch 2000 | Train loss: 2.42 | Val loss: 2.45
Epoch 2400 | Train loss: 2.42 | Val loss: 2.45
Epoch 2800 | Train loss: 2.41 | Val loss: 2.44
Epoch 3200 | Train loss: 2.40 | Val loss: 2.43
Epoch 3600 | Train loss: 2.40 | Val loss: 2.43
Epoch 4000 | Train loss: 2.40 | Val loss: 2.42
Epoch 4400 | Train loss: 2.39 | Val loss: 2.42
Epoch 4800 | Train loss: 2.38 | Val loss: 2.41
2.412196159362793


ED:
We, bouy.

RDUKFEY: Cisand ourithencken ud p, til tirtee waves owe mid mutot thele isorwifeg bor'seet hy pitlot tcie GDomwis ht.


Whif aun tesis ot lch ryisow ming mpo,
Cis ow. CO:
Butit wsh ks lout and pat you ll?
 seset, ipl you, me.


BRTY ARII:
s? OMPALANGNoue yok
He gare:
PANBY defe, m waeveropour tto woh
Touser:
Segs? Bulansorsor uco bom quuur.

Fort ly illl e:

# Multi-head self attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 8
vocab_size = len(chars)
batch_size = 32
n_embs = 32

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.sa = MultiHead(n_embs // 4, 4)
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.sa(x) # (B, T, C)

    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
for iter in range(epoches):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())

print(generate_text(m, 500))

Number of parameters:  7649
Running on cuda
Mode:  Normal
Epoch 4999 | Train loss: 4.03 | Val loss: 4.03
Epoch 4999 | Train loss: 2.38 | Val loss: 2.39
Epoch 4999 | Train loss: 2.31 | Val loss: 2.34
Epoch 4999 | Train loss: 2.27 | Val loss: 2.31
Epoch 4999 | Train loss: 2.26 | Val loss: 2.31
Epoch 4999 | Train loss: 2.24 | Val loss: 2.31
Epoch 4999 | Train loss: 2.23 | Val loss: 2.29
Epoch 4999 | Train loss: 2.21 | Val loss: 2.29
Epoch 4999 | Train loss: 2.21 | Val loss: 2.28
Epoch 4999 | Train loss: 2.20 | Val loss: 2.27
Epoch 4999 | Train loss: 2.20 | Val loss: 2.27
Epoch 4999 | Train loss: 2.19 | Val loss: 2.28
Epoch 4999 | Train loss: 2.18 | Val loss: 2.25
Epoch 4999 | Train loss: 2.19 | Val loss: 2.27
2.277479648590088


EDWI:
Weicuy thainallw
ond
Y Mengest and my he plaveland.
Femin to oancrid mutur dem tionte Whe your's buth-
If loget, eis ofwit he of of fay
witesiecor langry sow minis pAtifis ow. COL:Bivied shaks lost andg:
No to le?
Now bur ip I ful me I to waint hals? O PALAN

# Feedforward

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 8
vocab_size = len(chars)
batch_size = 32
n_embs = 32

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs),
      nn.ReLU(),
    )

  def forward(self, x):
    return self.net(x)

class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.sa = MultiHead(n_embs // 4, 4)
    self.ffwd = Feedforward()
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.sa(x) # (B, T, C)
    x = self.ffwd(x) # (B, T, C)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 100 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())

print(generate_text(m, 500))

Number of parameters:  8705
Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.10 | Val loss: 4.10
Epoch 400 | Train loss: 2.36 | Val loss: 2.38
Epoch 800 | Train loss: 2.28 | Val loss: 2.33
Epoch 1200 | Train loss: 2.23 | Val loss: 2.28
Epoch 1600 | Train loss: 2.21 | Val loss: 2.28
Epoch 2000 | Train loss: 2.21 | Val loss: 2.26
Epoch 2400 | Train loss: 2.20 | Val loss: 2.25
Epoch 2800 | Train loss: 2.17 | Val loss: 2.25
Epoch 3200 | Train loss: 2.17 | Val loss: 2.24
Epoch 3600 | Train loss: 2.17 | Val loss: 2.25
Epoch 4000 | Train loss: 2.17 | Val loss: 2.24
Epoch 4400 | Train loss: 2.15 | Val loss: 2.22
Epoch 4800 | Train loss: 2.17 | Val loss: 2.25
Epoch 4999 | Train loss: 2.14 | Val loss: 2.22
2.1675026416778564


EDWIORDIA:
You finall sord
Mhe ofestigne my hand, wilang:
Fe way stoan miduncaut demorien! wiffe boreseet hat form kneied pomwitues.


DUKE Du thesiecome cheryisow ming mute fis of. COKEdefied shald loh, and fat grove me sest
Bolply fei me reame air I his?  are ialfouedsel
He gare.

ANDY defeng would rop therew would ushir r

# Blocks

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 8
vocab_size = len(chars)
batch_size = 32
n_embs = 32
num_heads = 4

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs),
      nn.ReLU(),
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self):
    super().__init__()
    self.sa = MultiHead(n_embs // num_heads, num_heads)
    self.ffwd = Feedforward()
  def forward(self, x):
    x = self.sa(x)
    x = self.ffwd(x)
    return x


class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.blocks = nn.Sequential(*[Block() for _ in range(4)])
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 100 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())
print(generate_text(m, 500))

Number of parameters:  21377
Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.11 | Val loss: 4.11
Epoch 400 | Train loss: 3.12 | Val loss: 3.12
Epoch 800 | Train loss: 3.12 | Val loss: 3.11
Epoch 1200 | Train loss: 3.15 | Val loss: 3.15
Epoch 1600 | Train loss: 3.20 | Val loss: 3.21
Epoch 2000 | Train loss: 3.31 | Val loss: 3.35
Epoch 2400 | Train loss: 3.31 | Val loss: 3.35
Epoch 2800 | Train loss: 3.32 | Val loss: 3.35
Epoch 3200 | Train loss: 3.31 | Val loss: 3.35
Epoch 3600 | Train loss: 3.31 | Val loss: 3.35
Epoch 4000 | Train loss: 3.31 | Val loss: 3.35
Epoch 4400 | Train loss: 3.31 | Val loss: 3.35
Epoch 4800 | Train loss: 3.31 | Val loss: 3.35
Epoch 4999 | Train loss: 3.31 | Val loss: 3.35
3.0589053630828857


bUi'retGbmuyEGhfinalgw
otd
ahe oseiIieneLec  d p,a, i e rtneeiAwrsdoaE r dancauion CheiiynrwWfygyg r
eebt hmt 
 lmgotcie GDokwl bet.ohe ,kf a
  desiecot lc.gryfsow  noidrMAeifiswoe. tOg bnoie  shtesdl h,oa ugf'ht tt ln?d sk
vt, ip Iyfei oemrramo airt  hysdd aoAoealNoheeyea
sr eLIe:eaA na d
fFnh  obeenrop H  t w rghwasusLrrrr

# Residual network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 256
vocab_size = len(chars)
batch_size = 32
n_embs = 32
num_heads = 4

# training parameters
epoches = 10000
# epoches = 100
learning_rate = 5e-3
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs * 4),
      nn.ReLU(),
      nn.Linear(n_embs * 4, n_embs)
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self):
    super().__init__()
    self.sa = MultiHead(n_embs // num_heads, num_heads)
    self.ffwd = Feedforward()
  def forward(self, x):
    x = x + self.sa(x)
    x = x + self.ffwd(x)
    return x


class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.blocks = nn.Sequential(*[Block() for _ in range(3)])
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 100 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())
print(generate_text(m, 500))

Number of parameters:  39041
Running on cuda
Mode:  Normal


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.24 | Val loss: 4.25
Epoch 400 | Train loss: 2.30 | Val loss: 2.32
Epoch 800 | Train loss: 2.20 | Val loss: 2.25
Epoch 1200 | Train loss: 2.15 | Val loss: 2.21
Epoch 1600 | Train loss: 2.12 | Val loss: 2.20
Epoch 2000 | Train loss: 2.09 | Val loss: 2.19
Epoch 2400 | Train loss: 2.07 | Val loss: 2.16
Epoch 2800 | Train loss: 2.04 | Val loss: 2.14
Epoch 3200 | Train loss: 2.03 | Val loss: 2.13
Epoch 3600 | Train loss: 2.01 | Val loss: 2.12
Epoch 4000 | Train loss: 2.01 | Val loss: 2.12
Epoch 4400 | Train loss: 2.01 | Val loss: 2.11
Epoch 4800 | Train loss: 1.98 | Val loss: 2.09
Epoch 5200 | Train loss: 1.97 | Val loss: 2.09
Epoch 5600 | Train loss: 1.98 | Val loss: 2.08
Epoch 6000 | Train loss: 1.98 | Val loss: 2.08
Epoch 6400 | Train loss: 1.97 | Val loss: 2.08
Epoch 6800 | Train loss: 1.96 | Val loss: 2.07
Epoch 7200 | Train loss: 1.97 | Val loss: 2.09
Epoch 7600 | Train loss: 1.95 | Val loss: 2.08
Epoch 8000 | Train loss: 1.95 | Val loss: 2.06
Epoch 8400 | Train

# Residual network V2

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 32 # number of characters to predict
vocab_size = len(chars)
batch_size = 64
n_embs = 32
num_heads = 4

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs * 4),
      nn.ReLU(),
      nn.Linear(n_embs * 4, n_embs)
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self):
    super().__init__()
    self.sa = MultiHead(n_embs // num_heads, num_heads)
    self.ffwd = Feedforward()
  def forward(self, x):
    x = x + self.sa(x)
    x = x + self.ffwd(x)
    return x


class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.blocks = nn.Sequential(*[Block() for _ in range(4)])
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

def save_model(m, path):
  torch.save(m.state_dict(), path)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 100 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())
print(generate_text(m, 500))

save_model(m, 'model.pt')

Number of parameters:  51329
Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.52 | Val loss: 4.52
Epoch 400 | Train loss: 2.04 | Val loss: 2.11
Epoch 800 | Train loss: 1.90 | Val loss: 2.03
Epoch 1200 | Train loss: 1.83 | Val loss: 1.98
Epoch 1600 | Train loss: 1.79 | Val loss: 1.93
Epoch 2000 | Train loss: 1.77 | Val loss: 1.92
Epoch 2400 | Train loss: 1.75 | Val loss: 1.90
Epoch 2800 | Train loss: 1.73 | Val loss: 1.89
Epoch 3200 | Train loss: 1.73 | Val loss: 1.89
Epoch 3600 | Train loss: 1.73 | Val loss: 1.88
Epoch 4000 | Train loss: 1.70 | Val loss: 1.87
Epoch 4400 | Train loss: 1.70 | Val loss: 1.88
Epoch 4800 | Train loss: 1.71 | Val loss: 1.87
Epoch 4999 | Train loss: 1.71 | Val loss: 1.88
1.724281907081604


EDWARD:
Thuugh finall sorn
You of it and my loop, will friend?

MIdowES:
Wancaus not this revoyt bar's but made loget, edged with thou cake a
 these could merrison on in pown,
The betch broth wraths loh, and father ble?
Where a colthful'd me?

RuEipt Shose mark in founds,
CalledInd. As a deffar would ropber towards
Touch frre

# Layer norm and dropout

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 64 # number of characters to predict
vocab_size = len(chars)
batch_size = 64
n_embs = 128
num_heads = 4
num_layers = 8
dropout_rate = 0.2

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
    self.dropout = nn.Dropout(dropout_rate)
  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(head_size * num_heads, n_embs)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(out)
    return self.proj(out)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs * 4),
      nn.ReLU(),
      nn.Linear(n_embs * 4, n_embs),
      nn.Dropout(dropout_rate)
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self):
    super().__init__()
    self.sa = MultiHead(n_embs // num_heads, num_heads)
    self.ffwd = Feedforward()
    self.ln1 = nn.LayerNorm(n_embs)
    self.ln2 = nn.LayerNorm(n_embs)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x


class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.blocks = nn.Sequential(*[Block() for _ in range(num_layers)])
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

def save_model(m, path):
  torch.save(m.state_dict(), path)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 1 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())
print(generate_text(m, 500))
# save_model(m, 'model.pt')

Number of parameters:  56065
Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 4.14 | Val loss: 4.16
Epoch 400 | Train loss: 2.09 | Val loss: 2.12
Epoch 800 | Train loss: 1.94 | Val loss: 2.03
Epoch 1200 | Train loss: 1.87 | Val loss: 1.98
Epoch 1600 | Train loss: 1.84 | Val loss: 1.95
Epoch 2000 | Train loss: 1.80 | Val loss: 1.94
Epoch 2400 | Train loss: 1.78 | Val loss: 1.92
Epoch 2800 | Train loss: 1.78 | Val loss: 1.91
Epoch 3200 | Train loss: 1.77 | Val loss: 1.90
Epoch 3600 | Train loss: 1.76 | Val loss: 1.89
Epoch 4000 | Train loss: 1.75 | Val loss: 1.89
Epoch 4400 | Train loss: 1.74 | Val loss: 1.88
Epoch 4800 | Train loss: 1.74 | Val loss: 1.87
Epoch 4999 | Train loss: 1.74 | Val loss: 1.87
1.8687546253204346

No, lord, swornt to or in, not must to of, he good
Than didopt hay have befanneran's 'two.

ROMEO:
All then flought: the not shall?
Go-for your Spopprang: I sweep
of msea, I shope?
I's prallling the suw; thei ful duad talk.

Mitidingg Northing whock drom his knerat dingclesson stroy,
Ort that strel at.

Clourt Calliodion Mure

# Scaling up

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

torch.manual_seed(203)

# -*- coding: utf-8 -*-

# Load data
with open('input.txt') as file:
  content = file.read()

chars = sorted(set(list(content)))
stoi = {c:i for i, c in enumerate(chars)}
itoc= {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s ]
decode = lambda e: ''.join([itoc[i] for i in e])

tensor = torch.tensor(encode(content), dtype=torch.long)
train_sz = int(len(tensor)*0.9)
# Split train and valid
train_ts = tensor[:train_sz]
valid_ts = tensor[train_sz:]

#===============================================================================
# Parameters
#===============================================================================

# model parameters
block_size = 64 # number of characters to predict
vocab_size = len(chars)
batch_size = 64
n_embs = 128
num_heads = 4
num_layers = 8
dropout_rate = 0.2

# training parameters
epoches = 5000
# epoches = 100
learning_rate = 1e-2
eval_iter = 400
# Dry run/Debug
dry_run = False
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#===============================================================================
# Utils
#===============================================================================
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epoches).to(device)
        for k in range(epoches):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_batch(ds:str = 'train'):
  data = train_ts if ds == 'train' else valid_ts
  ix = torch.randint(data.shape[0] - block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = xb.to(device), yb.to(device)
  return xb, yb


#===============================================================================
# Model definition
#===============================================================================

class Head(nn.Module):
  def __init__(self, head_dim):
    super().__init__()
    self.query = nn.Linear(n_embs, head_dim)
    self.key = nn.Linear(n_embs, head_dim)
    self.value = nn.Linear(n_embs, head_dim)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
    self.dropout = nn.Dropout(dropout_rate)
  def forward(self, x):
    B, T, C = x.shape

    q = self.query(x)
    k = self.key(x)

    head_dim = k.shape[-1]
    wei = q @ k.transpose(-1, -2) / (head_dim ** 0.5)
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x) # B, T, C
    out = wei @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
    return out

class MultiHead(nn.Module):
  def __init__(self, head_size, num_heads) -> None:
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(head_size * num_heads, n_embs)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(out)
    return self.proj(out)

class Feedforward(nn.Module):
  def __init__(self):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embs, n_embs * 4),
      nn.ReLU(),
      nn.Linear(n_embs * 4, n_embs),
      nn.Dropout(dropout_rate)
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self):
    super().__init__()
    self.sa = MultiHead(n_embs // num_heads, num_heads)
    self.ffwd = Feedforward()
    self.ln1 = nn.LayerNorm(n_embs)
    self.ln2 = nn.LayerNorm(n_embs)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x


class BigramLM(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd = nn.Embedding(vocab_size, n_embs)
    self.position_embd = nn.Embedding(block_size, n_embs)
    self.blocks = nn.Sequential(*[Block() for _ in range(num_layers)])
    self.lm_head = nn.Linear(n_embs, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_em = self.token_embd(idx) # (B, T, C) and C = vocab_size
    position_em = self.position_embd(torch.arange(T, device=idx.device)) # (T, C)
    x = token_em + position_em # (B, T, C)
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # (B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_generate):
    for _ in range(max_generate):
      idx_cond = idx[:,-block_size:] # B, T
      logits, _ = self(idx_cond) # B, T, C
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim=-1) # B, C
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

def generate_text(m, max_size):
  init = torch.zeros((1, 1), dtype=torch.long).to(device)
  o = m.generate(init, max_size)
  return decode(list(o[0].tolist()))

def estimate_params(m):
  return sum(p.numel() for p in m.parameters() if p.requires_grad)

def save_model(m, path):
  torch.save(m.state_dict(), path)

m = BigramLM()
m = m.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
print("Number of parameters: ", estimate_params(m))
print("Running on", device)
print("Mode: ", "Dry run" if dry_run else "Normal")
eval_iter = 1 if dry_run else eval_iter

for iter in tqdm(range(epoches)):
  xb, yb = get_batch('train')
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
  # Pretty print loss every 10 epoches
  if iter % eval_iter == 0 or iter == epoches - 1:
    losses = estimate_loss(m)
    print(f'Epoch {iter} | Train loss: {losses["train"]:.2f} | Val loss: {losses["val"]:.2f}')
    if dry_run:
      break



print(loss.item())
print(generate_text(m, 500))
# save_model(m, 'model.pt')

Number of parameters:  1611073
Running on cuda
Mode:  Normal


  0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 0 | Train loss: 18.76 | Val loss: 18.95
Epoch 400 | Train loss: 1.92 | Val loss: 2.02
Epoch 800 | Train loss: 1.71 | Val loss: 1.87
Epoch 1200 | Train loss: 1.61 | Val loss: 1.77
Epoch 1600 | Train loss: 1.55 | Val loss: 1.73
Epoch 2000 | Train loss: 1.52 | Val loss: 1.70
Epoch 2400 | Train loss: 1.49 | Val loss: 1.67
Epoch 2800 | Train loss: 1.47 | Val loss: 1.66
Epoch 3200 | Train loss: 1.47 | Val loss: 1.67
Epoch 3600 | Train loss: 1.46 | Val loss: 1.67
Epoch 4000 | Train loss: 1.45 | Val loss: 1.65
Epoch 4400 | Train loss: 1.45 | Val loss: 1.66
Epoch 4800 | Train loss: 1.44 | Val loss: 1.67
