<a href="https://colab.research.google.com/github/max-monty/llm_from_scratch/blob/master/Infinite_Shel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Imports

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
import torch
import tiktoken
from google.colab import drive

## Configs

In [31]:
# Hyperparameters consumed by the model classes and the training cells below.
GPT_CONFIG_128M = dict(
    tokenizer="gpt2",    # tiktoken encoding name
    vocab_size=50257,    # size of the token embedding table / output head
    n_layers=12,         # number of transformer blocks
    n_heads=12,          # attention heads per block
    emb_dim=768,         # embedding width
    context_len=256,     # maximum sequence length (positions / causal mask)
    drop_rate=0.1,       # dropout probability
    qkv_bias=False,      # whether the Q/K/V projections carry a bias term
)

# GPT_CONFIG_128M = {
#     "tokenizer": "gpt2",    # Tokenizer
#     "vocab_size": 50257,    # Vocabulary size
#     "n_layers": 4,         # Number of layers
#     "n_heads": 4,          # Number of attention heads
#     "emb_dim": 256,         # Embedding dimension
#     "context_len": 256,    # Context length
#     "drop_rate": 0.1,       # Dropout rate
#     "qkv_bias": False,      # Query-Key-Value bias
#     "batch_size": 16,       # Batch size
# }

## Raw Data

In [None]:
# Mount Google Drive at /content/drive so the text files referenced below can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the raw training texts on the mounted Google Drive.
path = '/content/drive/My Drive/Colab Notebooks/data/'
sidewalk_path = path + 'sidewalk.txt'
verdict_path = path + 'verdict.txt'

# Read with an explicit encoding so the decoded text does not depend on the
# platform's default locale.
with open(sidewalk_path, 'r', encoding='utf-8') as f:
    sidewalk = f.read()

with open(verdict_path, 'r', encoding='utf-8') as f:
    verdict = f.read()

# Classes

## DataLoader

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-token dataset built from a single text.

    Sample ``i`` is the pair ``(tokens[i:i + context_size],
    tokens[i + 1:i + context_size + 1])``: the target is the input shifted
    one token to the right. Consecutive windows advance by one token.
    """

    def __init__(self, text, tokenizer, context_size):
        """Tokenize ``text`` and build the input/target windows.

        Args:
            text: Raw string to encode.
            tokenizer: Object exposing ``encode(text)`` returning a list of
                integer token ids.
            context_size: Number of tokens in every window.
        """
        tokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)
        if tokens.numel() > context_size:
            # ``unfold`` yields overlapping *views* into the token tensor, so
            # the windows are not copied (the per-window Python loop used
            # ``context_size`` times more memory and was far slower).
            self.x = tokens[:-1].unfold(0, context_size, 1)
            self.y = tokens[1:].unfold(0, context_size, 1)
        else:
            # Text too short for even one (input, target) pair: expose an
            # empty dataset instead of failing inside ``torch.stack``.
            self.x = torch.empty((0, context_size), dtype=torch.long)
            self.y = torch.empty((0, context_size), dtype=torch.long)

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair of token-id tensors."""
        return self.x[idx], self.y[idx]

    def load_data(self, batch_size=32, shuffle=True):
        """Wrap this dataset in a ``DataLoader``.

        Incomplete trailing batches are dropped (``drop_last=True``).
        Shuffling is skipped for an empty dataset, because a random sampler
        cannot be built over zero samples.
        """
        return torch.utils.data.DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle and len(self) > 0,
            drop_last=True,
            num_workers=0,
        )

## Layers

### Layer Normalization

In [None]:
class LayerNorm(torch.nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, cfg):
        """Create scale/shift parameters of size ``cfg["emb_dim"]``."""
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(cfg["emb_dim"]))
        self.bias = torch.nn.Parameter(torch.zeros(cfg["emb_dim"]))
        self.eps = 1e-5

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Use the biased (population) variance and keep eps inside the square
        # root, matching the standard layer-norm definition. The unbiased std
        # used previously deviates from it and is NaN for a single feature.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        return normalized * self.weight + self.bias

### Token and Position Embedding

In [None]:
class EmbeddingLayer(torch.nn.Module):
    """Sum of a learned token embedding and a learned absolute position embedding."""

    def __init__(self, vocab_size, embed_size, max_len):
        """Allocate a ``vocab_size`` x ``embed_size`` token table and a
        ``max_len`` x ``embed_size`` position table."""
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_size)
        self.position_embedding = torch.nn.Embedding(max_len, embed_size)

    def forward(self, x):
        """Embed token ids ``x`` (indexed as batch, sequence) and add position vectors."""
        seq_len = x.size(1)
        # Positions are created on the same device as the token ids.
        positions = torch.arange(seq_len, device=x.device)
        token_vectors = self.token_embedding(x)
        position_vectors = self.position_embedding(positions)
        return token_vectors + position_vectors

### Multi-head Attention

In [None]:
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention with an output projection."""

    def __init__(self, d_in, d_out, context_length, n_heads, drop_rate, qkv_bias):
        """Build the projections and the causal mask.

        Args:
            d_in: Width of the input features.
            d_out: Total width of the attention output (all heads combined).
            context_length: Largest sequence length the causal mask covers.
            n_heads: Number of attention heads; must divide ``d_out``.
            drop_rate: Dropout probability applied to the attention weights.
            qkv_bias: Whether the query/key/value projections use a bias.

        Raises:
            ValueError: If ``d_out`` is not divisible by ``n_heads``.
        """
        super().__init__()
        if d_out % n_heads != 0:
            # Fail at construction time instead of with an opaque reshape
            # error on the first forward pass.
            raise ValueError(
                f"d_out ({d_out}) must be divisible by n_heads ({n_heads})"
            )
        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads
        self.q = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.k = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.v = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = torch.nn.Dropout(drop_rate)
        self.out_proj = torch.nn.Linear(d_out, d_out)
        # Ones strictly above the diagonal mark the future positions to hide.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (batch, seq, d_in).

        Returns a tensor of shape (batch, seq, d_out).
        """
        B, T, _ = x.shape
        # Project, then split the feature axis into heads: (B, n_heads, T, head_dim).
        Q = self.q(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        attention_scores = Q @ K.transpose(-2, -1)
        mask = self.mask[:T, :T]
        # -inf scores become zero weight after the softmax.
        attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_weights = torch.softmax(attention_scores / (self.head_dim ** 0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        context_vectors = attention_weights @ V
        # Merge the heads back using the *output* width; using the input width
        # here only worked when d_in happened to equal d_out.
        context_vectors = context_vectors.transpose(1, 2).contiguous().view(B, T, self.d_out)
        return self.out_proj(context_vectors)

### MLP

In [None]:
class MLP(torch.nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back."""

    def __init__(self, cfg):
        """Build the two linear layers from ``cfg["emb_dim"]``."""
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(width, hidden),
            torch.nn.GELU(),
            torch.nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Run ``x`` through the layers in order."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

### Transformer

In [None]:
class TransformerBlock(torch.nn.Module):
    """Transformer block: normalize, attend, residual; then normalize, MLP, residual."""

    def __init__(self, cfg):
        """Create the attention, feed-forward, norm and dropout sub-modules from ``cfg``."""
        super().__init__()
        width = cfg["emb_dim"]
        p_drop = cfg["drop_rate"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_len"],
            n_heads=cfg["n_heads"],
            drop_rate=p_drop,
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = MLP(cfg)
        self.norm_1 = LayerNorm(cfg)
        self.norm_2 = LayerNorm(cfg)
        self.dropout = torch.nn.Dropout(p_drop)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers, each with a residual connection."""
        # Attention sub-layer: the norm is applied before attention, the
        # un-normalized input is added back afterwards.
        x = self.dropout(self.att(self.norm_1(x))) + x
        # Feed-forward sub-layer, same pattern.
        x = self.dropout(self.ff(self.norm_2(x))) + x
        return x

## Model

In [None]:
class GPT(torch.nn.Module):
    """Decoder-only language model: embeddings, a stack of transformer blocks,
    a final layer norm and a linear head producing vocabulary logits."""

    def __init__(self, cfg):
        """Assemble the model from the hyperparameters in ``cfg``."""
        super().__init__()
        width, vocab = cfg["emb_dim"], cfg["vocab_size"]
        self.embedding_layer = EmbeddingLayer(vocab, width, cfg["context_len"])
        self.drop_emb = torch.nn.Dropout(cfg["drop_rate"])
        stack = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = torch.nn.Sequential(*stack)
        self.ln_f = LayerNorm(cfg)
        self.out_head = torch.nn.Linear(width, vocab, bias=False)

    def forward(self, x):
        """Map token ids ``x`` to per-position logits over the vocabulary."""
        hidden = self.embedding_layer(x)
        hidden = self.drop_emb(hidden)
        hidden = self.blocks(hidden)
        hidden = self.ln_f(hidden)
        logits = self.out_head(hidden)
        return logits

## Utils

### Load Data

In [40]:
def load_data(cfg, data, tokenizer, batch_size):
    """Tokenize ``data`` into ``cfg["context_len"]``-sized windows and return
    a shuffling DataLoader over them."""
    windows = Dataset(data, tokenizer, context_size=cfg["context_len"])
    return windows.load_data(batch_size, shuffle=True)

### Split Data

In [39]:
def split_data(data, train_ratio=0.9):
    """Split a sequence into a leading training part and a trailing validation part.

    Args:
        data: Any sliceable sequence with a length (e.g. a string of raw text).
        train_ratio: Fraction of ``data`` assigned to the training part.
            Defaults to 0.9, the split this notebook has always used.

    Returns:
        ``(train_data, val_data)``, where ``train_data`` is the first
        ``int(len(data) * train_ratio)`` items and ``val_data`` is the rest.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")
    # Compute the cut once so both halves are guaranteed to meet exactly.
    cut = int(len(data) * train_ratio)
    return data[:cut], data[cut:]

### Create Train-Val Sets

In [38]:
def train_val_split(cfg, data, tokenizer, batch_size):
    """Split ``data`` into train/validation parts and return a DataLoader for each.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    parts = split_data(data)
    # Build the training loader first, then the validation loader.
    train_dataloader, val_dataloader = (
        load_data(cfg, part, tokenizer, batch_size) for part in parts
    )
    return train_dataloader, val_dataloader

### Calculate Loss (Batch)

In [None]:
def calc_loss_batch(model, x, y, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    ``x`` and ``y`` are moved to ``device``; the first two axes of the logits
    are merged and the targets are flattened before the loss is computed.
    """
    inputs = x.to(device)
    logits = model(inputs)
    targets = y.to(device)
    flat_logits = torch.flatten(logits, start_dim=0, end_dim=1)
    flat_targets = torch.flatten(targets)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

### Calculate Loss (Loader)

In [None]:
def calc_loss_loader(model, dataloader, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``dataloader``.

    Args:
        model: Model passed through to ``calc_loss_batch``.
        dataloader: Sized iterable yielding ``(x, y)`` batches.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate; ``None`` means
            every batch. Values larger than ``len(dataloader)`` are clamped.

    Returns:
        The mean loss as a float, or ``nan`` when there is nothing to
        evaluate (empty loader or a non-positive ``num_batches``).
    """
    available = len(dataloader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)
    # Covers the empty loader and num_batches <= 0, which previously ended in
    # a division by zero.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    # zip with range stops after exactly num_batches batches, without
    # fetching an extra batch just to discard it.
    for _, (x, y) in zip(range(num_batches), dataloader):
        total_loss += calc_loss_batch(model, x, y, device).item()
    return total_loss / num_batches

### Evaluate Model

In [None]:
def evaluate_model(model, train_loader, val_loader, eval_iter, device):
    """Estimate the training and validation loss over at most ``eval_iter`` batches each.

    The model is put in eval mode and gradients are disabled for the
    measurement; it is switched to train mode before returning.

    Returns:
        ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # Training loader is evaluated first, then the validation loader.
        train_loss, val_loss = (
            calc_loss_loader(model, loader, device, eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return train_loss, val_loss

### Text to Tokens Ids

In [None]:
def text_to_tokens(text, tokenizer):
    """Encode ``text`` and return the token ids as a tensor with a leading batch axis of size 1.

    The ``<|endoftext|>`` marker is allowed to appear in ``text``.
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.unsqueeze(torch.tensor(token_ids), dim=0)

### Token Ids to Text

In [None]:
def tokens_to_text(tokens, tokenizer):
    """Drop the leading batch axis (when it has size 1) from ``tokens`` and decode the ids to a string."""
    token_ids = torch.squeeze(tokens, dim=0).tolist()
    return tokenizer.decode(token_ids)

### Generate Sample

In [None]:
def generate_text(model, idx, max_tokens, context_size):
    """Autoregressively sample ``max_tokens`` new token ids and append them to ``idx``.

    At each step the model sees at most the last ``context_size`` tokens, and
    the next token is drawn from the softmax distribution over the logits at
    the final position.

    Returns:
        ``idx`` extended along its second axis by ``max_tokens`` columns.
    """
    sequence = idx
    with torch.no_grad():
        for _ in range(max_tokens):
            window = sequence[:, -context_size:]
            last_logits = model(window)[:, -1, :]
            next_probs = torch.softmax(last_logits, dim=-1)
            sampled = torch.multinomial(next_probs, num_samples=1)
            sequence = torch.cat((sequence, sampled), dim=1)
    return sequence

In [None]:
def generate_sample(model, tokenizer, context, max_tokens, device):
    """Generate a text continuation of ``context`` with ``model``.

    Args:
        model: Model exposing ``embedding_layer.position_embedding``; the
            number of rows of that table is used as the context size.
        tokenizer: Tokenizer used to encode ``context`` and decode the result.
        context: Prompt string.
        max_tokens: Number of new tokens to sample.
        device: Device the prompt tokens are moved to.

    Returns:
        The decoded prompt plus the generated continuation.
    """
    # Sampling runs in eval mode, but the caller's mode must be restored
    # afterwards: leaving the model in eval mode silently disables dropout
    # for any training steps that follow.
    was_training = model.training
    model.eval()
    try:
        context_size = model.embedding_layer.position_embedding.weight.shape[0]
        idx = text_to_tokens(context, tokenizer).to(device)
        with torch.no_grad():
            idx = generate_text(model, idx, max_tokens, context_size)
        return tokens_to_text(idx, tokenizer)
    finally:
        model.train(was_training)

# Train

## Model Parameters

In [83]:
# Run configuration shared by the cells below.
cfg = GPT_CONFIG_128M
tokenizer = tiktoken.get_encoding(cfg["tokenizer"])
data = verdict #sidewalk
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed PyTorch before the model is created so weight initialisation, batch
# shuffling and sampling are reproducible across Restart & Run All.
seed = 123
torch.manual_seed(seed)
epochs = 10
eval_freq = 100          # evaluate every `eval_freq` optimizer steps
eval_iter = 5            # batches per loader used for each loss estimate
max_tokens = 10          # tokens generated for each progress sample
start_context = "Every effort moves you"
batch_size = 8

## Instantiate Model

In [84]:
# Build the GPT model from the config and move it to the selected device.
model = GPT(cfg).to(device)

## Initialize Optimizer

In [85]:
# AdamW over all model parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

## Prepare Data

In [86]:
# Split the selected text and wrap each part in a DataLoader (training loader first).
train, val = train_val_split(cfg, data, tokenizer, batch_size)

## Train Model

In [87]:
def train_model(model, train_loader, val_loader, optimizer, num_epochs, device,
                eval_freq, eval_iter, start_context, tokenizer, sample_tokens=None):
    """Train ``model`` and periodically report losses and a generated sample.

    Args:
        model: Model to optimize.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device used for the forward/backward passes.
        eval_freq: Evaluate every ``eval_freq`` optimizer steps (step 0 included).
        eval_iter: Maximum number of batches per loader for each loss estimate.
        start_context: Prompt used for the progress sample.
        tokenizer: Tokenizer used to encode/decode the progress sample.
        sample_tokens: Number of tokens to generate for each progress sample.
            When ``None`` the notebook-level ``max_tokens`` is used, which is
            what this function always read implicitly.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: one entry per
        evaluation step.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    model.train()
    for epoch in range(num_epochs):
        model.train()
        for x, y in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(model, x, y, device)
            loss.backward()
            optimizer.step()
            tokens_seen += x.numel()
            global_step += 1
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, eval_iter, device)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch: {epoch + 1}, Step: {global_step:06d}, Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}")
                # Resolved lazily so the global is only required when a
                # sample is actually produced and no explicit value was given.
                n_new_tokens = max_tokens if sample_tokens is None else sample_tokens
                print(generate_sample(model, tokenizer, start_context, n_new_tokens, device))
                # Sampling runs in eval mode; make sure dropout is active
                # again before the next optimizer step.
                model.train()
    return train_losses, val_losses, track_tokens_seen

In [None]:
# Run training; the returned lists hold the losses and token counts recorded at each evaluation step.
train_losses, val_losses, track_tokens_seen = train_model(model, train, val, optimizer, epochs, device, eval_freq, eval_iter, start_context, tokenizer)