In [15]:
# BPE is Byte Pair Encoding.
# Word-based tokenization is not really practical because there could be
# billions of combinations.
# With character-based tokens, however, there are only 256 distinct tokens,
# but the character level loses the meaning of the word.

# BPE is sub-word tokenization: the best of both worlds (word- and character-based).
# e.g. "boys" will be divided into "boy" and "s", so the split is meaningful.

# 1. Tokenization (word-based, character-based; mostly BPE is used)
# 2. Vector Embeddings
# 3. Positional Embeddings
# 4. Input Embeddings = Vector + Positional - Context Embeddings.
# 5. Input Embeddings are the input to the neural network.


import torch
import re, collections
import importlib.metadata
import tiktoken
from torch.utils.data import Dataset, DataLoader

# Mount Google Drive inside the Colab runtime so the training text is readable.
from google.colab import drive
drive.mount('/content/drive')

# Read the whole training corpus into memory as a single string.
with open('/content/drive/My Drive/Colab Notebooks/the-verdict.txt',
         'r', encoding='utf-8') as f:
  raw_text = f.read()

# Sanity check: show the first 99 characters of the corpus.
print (raw_text[:99])

# Model hyper-parameters consumed by GPTModel / TransformerBlock below.
# NOTE(review): despite the "124M" name, emb_dim and context_length here are
# small demo values — confirm this is intentional.
GPT_CONFIG_124M = {
    "emb_dim": 256, # embedding dimension
    "n_heads": 8, # number of attention heads
    "n_layers": 12, # number of transformer layers
    "context_length": 8, # how many tokens are processed at any given time
    "vocab_size": 50257, # vocabulary size
    "drop_rate": 0.1, # dropout probability
    "qkv_bias": False # NOTE(review): not read by any class visible in this cell
}



#raw_text = "Your journey start with one step"
# this is STEP1 & STEP2
class GPTDataset(Dataset):
  """Sliding-window dataset of (input, target) pairs for next-token prediction.

  The text is tokenized once; every window of ``max_length`` token ids becomes
  an input sample and the same window shifted right by one token becomes its
  target.

  Args:
    txt: Raw text to tokenize.
    tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
      returns a list of integer token ids.
    max_length: Number of tokens per sample.
    stride: Step, in tokens, between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    # Tokenize the entire text. The end-of-text marker must be spelled exactly
    # "<|endoftext|>" to be accepted as a special token.
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Slide a window over the token stream. The range stops at
    # len(token_ids) - max_length so the shifted target window always fits.
    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1:i + 1 + max_length]
      self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
      self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``idx``-th (input_ids, target_ids) tensor pair."""
    return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(dataset, batch_size=8, max_length=256, stride=128,
                      shuffle=True, last_drop=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) token batches.

    Args:
        dataset: Raw text to tokenize (passed to GPTDataset as its text).
        batch_size: Number of windows per batch.
        max_length: Tokens per window.
        stride: Step, in tokens, between consecutive windows.
        shuffle: Whether the loader shuffles the windows.
        last_drop: Whether to drop the final batch when it holds fewer than
            ``batch_size`` samples (forwarded as ``drop_last``).
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``GPTDataset``.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    token_dataset = GPTDataset(dataset, tokenizer, max_length, stride)
    # Forward the caller's options; they were previously accepted but ignored.
    return DataLoader(
        token_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=last_drop,
        num_workers=num_workers,
    )


# 1. Initialize the tokenizer.
# 2. Create the dataset.
# 3. drop_last=True drops the last batch if it is smaller than batch_size.
# 4. num_workers is the number of worker processes used for loading.

print ("PyTorch verson:", torch.__version__)
# Loader over the full text: windows of 8 tokens, each starting one token
# after the previous one (stride=1).
dataloader = create_dataloader (
    raw_text,
    batch_size=2,
    max_length=8,
    stride=1,
    shuffle=False)


# Pull a single batch; `inputs` is reused further down for the embedding demo.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
#print (len(dataloader))
#print (len(dataloader))

# train/validation data
# The split is made on characters of the raw string (90% / 10%), before
# tokenization.
train_ratio = 0.9
split_idx = int(train_ratio * len(raw_text))
train_data = raw_text[:split_idx]
val_data = raw_text[split_idx:]

# Fix the RNG seed before building the loaders.
torch.manual_seed(123)

# stride == max_length, so consecutive windows do not overlap.
train_dataloader = create_dataloader (
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    last_drop=True,
    shuffle=True,
    num_workers=0)

val_dataloader = create_dataloader (
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    last_drop=False,
    shuffle=False,
    num_workers=0)

# Print the shape of every batch as a sanity check.
print ("Train loader:")
for x, y in train_dataloader:
  print (x.shape, y.shape)
print ("Val loader:")
for x, y in val_dataloader:
  print (x.shape, y.shape)

# Number of batches in each loader.
print (len(train_dataloader))
print (len(val_dataloader))

vocab_size = 50257
output_dim = 256

#vector or token embeddings
# Look up one output_dim-sized vector per token id in `inputs`.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embedding = token_embedding_layer(inputs)
print("Token Embedding Size", token_embedding.shape)

#positional embeddings
# One learned vector per position 0..max_length-1.
context_length = max_length = 8
position_embedding_layer = torch.nn.Embedding(context_length, output_dim)
position_embedding = position_embedding_layer(torch.arange(max_length))
print("Position Embedding Size:", position_embedding.shape)

#input embedding = vector embedding + positional embedding.
# The positional table broadcasts across the batch dimension of token_embedding.
input_embedding = token_embedding + position_embedding
print("Input Embedding Shape:", input_embedding.shape)


# self attentions.
# 1. Query, Key, Value
# 2. Q = vector @ Query, K = vector @ keys, V = vector @ value
# 3. Attention score = Q * K^t
# 4. Attention normalized = Attention score/sqrt(key embedding dim)
# 5. Softmax (Attention normalized).
# 6. Context Vector = Softmax Attention * V

# causal attention (ensures each token attends only to the current or previous tokens)
# 1. use tril or triu to build the mask
# 2. set the upper triangle to -inf rather than zero.
# 3. after setting -inf, apply softmax; the remaining weights in each row sum to 1.
# 4. dropout


# Module-level settings used by the attention smoke tests below.
d_in = 256   # input feature size per token
d_out = 256  # output feature size per attention head
dropout = 0.5

print ("Input Shape is", inputs.shape)


# multi-head attention = many causal attention heads
class AttentionIsAllYouNeed(torch.nn.Module):
  """Single-head causal self-attention.

  Args:
    d_in: Size of each input token vector.
    d_out: Size of each query/key/value (and output) vector.
    context_length: Maximum sequence length; sets the size of the causal mask.
    dropout: Dropout probability applied to the attention weights.
  """

  def __init__(self, d_in, d_out, context_length, dropout):
    super().__init__()
    # Use the caller's dropout rate rather than a hard-coded value.
    self.dropout = torch.nn.Dropout(dropout)
    self.W_query = torch.nn.Linear(d_in, d_out, bias=False)
    self.W_key = torch.nn.Linear(d_in, d_out, bias=False)
    self.W_value = torch.nn.Linear(d_in, d_out, bias=False)
    # Ones strictly above the diagonal mark the "future" positions to hide.
    self.register_buffer(
        "mask",
        torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, X):
    """Return context vectors of shape (batch, num_tokens, d_out).

    Args:
      X: Tensor of shape (batch, num_tokens, d_in).
    """
    b, num_tokens, d_in = X.shape
    print ("X shape is", X.shape)

    keys = self.W_key(X)
    queries = self.W_query(X)
    values = self.W_value(X)
    print ("keys shape is", keys.shape)

    # (b, num_tokens, d_out) @ (b, d_out, num_tokens) -> (b, num_tokens, num_tokens)
    attn_scores = queries @ keys.transpose(1, 2)

    # Slice the mask in case the sequence is shorter than context_length.
    causal_mask = self.mask.bool()[:num_tokens, :num_tokens]
    attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)

    # Scale by the key dimension taken from the tensor itself, not from a
    # module-level global.
    attn_weight = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
    attn_weight = self.dropout(attn_weight)
    context_vector = attn_weight @ values  # (b, num_tokens, d_out)
    return context_vector

class MultiheadAttentionWrapper(torch.nn.Module):
  """Run several independent attention heads and concatenate their outputs.

  Each head is an AttentionIsAllYouNeed module built with the same arguments;
  the per-head results are joined along the last dimension.
  """

  def __init__(self, d_in, d_out, context_length, dropout, num_heads):
    super().__init__()
    self.heads = torch.nn.ModuleList()
    for _ in range(num_heads):
      self.heads.append(
          AttentionIsAllYouNeed(d_in, d_out, context_length, dropout))

  def forward(self, X):
    """Apply every head to X and concatenate the results on the last axis."""
    per_head = [attn(X) for attn in self.heads]
    return torch.cat(per_head, dim=-1)


# Smoke test: feed the token embeddings computed above through two stacked heads.
torch.manual_seed(789)
context_length = 8
# Test MultiheadAttentionWrapper
# Two heads of width d_out each are concatenated on the last axis of the output.
mha = MultiheadAttentionWrapper(d_in, d_out, context_length, dropout=0.5, num_heads=2)
mha_vec = mha(token_embedding)


# copied from vasuria used in chatgpt2.
class MultiheadAttention (torch.nn.Module):
  """Causal multi-head self-attention with a single fused projection per Q/K/V.

  The d_out-wide projections are split into ``num_heads`` heads of
  ``d_out // num_heads`` features each, attended independently, merged back
  and passed through an output projection.

  Args:
    d_in: Size of each input token vector.
    d_out: Total output size across all heads; must be divisible by num_heads.
    context_length: Maximum sequence length; sets the size of the causal mask.
    dropout: Dropout probability applied to the attention weights.
    num_heads: Number of attention heads.
    qkv_bias: Whether the query/key/value projections use a bias term.
      Defaults to False, which matches the previous hard-coded behavior.
  """

  def __init__(self, d_in, d_out, context_length, dropout, num_heads,
               qkv_bias=False):
    super().__init__()
    print ("d_out", d_out)
    print ("num_heads", num_heads)
    assert (d_out % num_heads == 0), \
      "d_out must be divible by num_heads"
    self.d_out = d_out
    self.num_heads = num_heads
    # Each head works on an equal slice of the d_out features.
    self.head_dim = d_out // num_heads
    self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)  # d_in -> d_out
    self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)  # d_in -> d_out
    self.W_value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)  # d_in -> d_out
    self.out_proj = torch.nn.Linear(d_out, d_out)  # mixes the merged heads
    self.dropout = torch.nn.Dropout(dropout)
    # Ones strictly above the diagonal mark the "future" positions to hide.
    self.register_buffer(
        "mask",
        torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, X):
    """Return context vectors of shape (batch, num_tokens, d_out).

    Args:
      X: Tensor of shape (batch, num_tokens, d_in).
    """
    b, num_tokens, d_in = X.shape
    queries = self.W_query(X)
    keys = self.W_key(X)
    values = self.W_value(X)

    # Split the feature axis: (b, num_tokens, d_out)
    #   -> (b, num_tokens, num_heads, head_dim)
    keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
    values = values.view(b, num_tokens, self.num_heads, self.head_dim)
    queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

    # Move the head axis forward: (b, num_heads, num_tokens, head_dim)
    keys = keys.transpose(1, 2)
    values = values.transpose(1, 2)
    queries = queries.transpose(1, 2)

    # Per-head scores: (b, num_heads, num_tokens, num_tokens)
    attn_scores = queries @ keys.transpose(2, 3)

    # Slice the mask in case the sequence is shorter than context_length.
    mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
    attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)

    attn_weight = torch.softmax(attn_scores / (self.head_dim ** 0.5), dim=-1)
    attn_weight = self.dropout(attn_weight)
    context_vector = attn_weight @ values

    # Merge heads back: (b, num_tokens, num_heads, head_dim) -> (b, num_tokens, d_out)
    context_vector = context_vector.transpose(1, 2)
    context_vector = context_vector.reshape(b, num_tokens, self.d_out)
    context_vector = self.out_proj(context_vector)
    return context_vector

torch.manual_seed(123)
# Test MultiheadAttention
# Use a tensor with shape (batch_size, num_tokens, d_in) for testing
batch_for_mha = torch.randn(2, 8, 256) # Example shape, adjust as needed
# d_in = d_out = 256, context_length = 8, two heads, dropout disabled.
mha2 = MultiheadAttention(256, 256, 8, dropout=0.0, num_heads=2)
mha2_vec = mha2(batch_for_mha)


# LLM architecture
# Transformer block
# 1. LayerNormalization
# 2. Multi-head attention
# 3. Dropout
# 4. shortcut (+)
# 5. LayerNormalization again
# 6. Feed forward
#     1. linear layer
#     2. GELU activation
#     3. Linear layer
# 7. Dropout
# 8. shortcut (+)
# 9. finally output layers


# Transformers.
# transformers have layers, meaning it has that many transformer layers.



# STEP 1.

import torch.nn as nn

class GPTModel(nn.Module):
  """Decoder-only transformer mapping token ids to next-token logits.

  Pipeline: token embedding + positional embedding -> dropout ->
  ``n_layers`` transformer blocks -> final layer norm -> linear head over
  the vocabulary.

  Args:
    config: Dict with the keys "vocab_size", "emb_dim", "context_length",
      "drop_rate" and "n_layers" (plus whatever TransformerBlock reads).
  """

  def __init__(self, config):
    super().__init__()
    vocab_size = config["vocab_size"]
    emb_dim = config["emb_dim"]

    self.tok_emb = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = nn.Embedding(config["context_length"], emb_dim)
    self.drop_emb = nn.Dropout(config["drop_rate"])

    blocks = [TransformerBlock(config) for _ in range(config["n_layers"])]
    self.trf_blocks = nn.Sequential(*blocks)

    self.final_norm = LayerNorm(emb_dim)
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, x):
    """Return logits of shape (batch, seq_len, vocab_size) for token ids x."""
    _, seq_len = x.shape
    # Positions are created on the same device as the input ids.
    positions = torch.arange(seq_len, device=x.device)
    hidden = self.tok_emb(x) + self.pos_emb(positions)
    hidden = self.drop_emb(hidden)
    hidden = self.trf_blocks(hidden)
    hidden = self.final_norm(hidden)
    return self.out_head(hidden)


class TransformerBlock(nn.Module):
  """Pre-norm transformer block: attention and feed-forward, each with a residual.

  Args:
    config: Dict with the keys "emb_dim", "context_length", "drop_rate"
      and "n_heads".
  """

  def __init__(self, config):
    super().__init__()
    emb_dim = config["emb_dim"]
    self.attn = MultiheadAttention(
        emb_dim,
        emb_dim,
        config["context_length"],
        config["drop_rate"],
        config["n_heads"],
    )
    self.ff = FeedForward(emb_dim, config["drop_rate"])
    self.ln1 = LayerNorm(emb_dim)
    self.ln2 = LayerNorm(emb_dim)
    self.drop_shortcut = nn.Dropout(config["drop_rate"])

  def forward(self, x):
    """Apply both sub-layers to x and return a tensor of the same shape."""
    # Attention sub-layer: norm -> attention -> dropout, added to the input.
    attn_out = self.drop_shortcut(self.attn(self.ln1(x)))
    x = x + attn_out

    # Feed-forward sub-layer: norm -> MLP -> dropout, added to the result.
    ff_out = self.drop_shortcut(self.ff(self.ln2(x)))
    return x + ff_out

class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with learnable scale and shift.

  Args:
    emb_dim: Size of the last dimension of the inputs to normalize.
  """

  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5  # added to the variance to avoid division by zero
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize x to zero mean and unit variance along the last axis."""
    mean = x.mean(dim=-1, keepdim=True)
    # Use the biased (population) variance, i.e. divide by N rather than N - 1.
    # The default unbiased estimator scales the result differently and is
    # NaN when emb_dim == 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * x + self.shift

class FeedForward(nn.Module):
  """Position-wise MLP: expand to 4x emb_dim, apply GELU, project back.

  Args:
    emb_dim: Input and output feature size.
    drop_rate: Accepted for interface compatibility; not used by this block.
  """

  def __init__(self, emb_dim, drop_rate):
    super().__init__()
    hidden_dim = 4 * emb_dim
    expand = nn.Linear(emb_dim, hidden_dim)
    activation = nn.GELU()
    contract = nn.Linear(hidden_dim, emb_dim)
    self.net = nn.Sequential(expand, activation, contract)

  def forward(self, x):
    """Apply the MLP to the last dimension of x."""
    out = self.net(x)
    return out


def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping (batch, n_tokens) token ids to logits of shape
            (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor holding the current context.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    for _step in range(max_new_tokens):
        # Keep only the most recent context_size tokens as model input.
        window = idx[:, -context_size:]

        # Inference only; no gradients are needed.
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the last position matters: (batch, vocab_size).
        next_probs = torch.softmax(all_logits[:, -1, :], dim=-1)

        # Greedy choice: the most probable vocabulary entry, shape (batch, 1).
        next_token = next_probs.argmax(dim=-1, keepdim=True)

        # Grow the running sequence by one column.
        idx = torch.cat([idx, next_token], dim=1)

    return idx



torch.manual_seed(123)
tokenizer = tiktoken.get_encoding("gpt2")
# Build a small batch from two prompts.
# NOTE(review): torch.stack needs both prompts to tokenize to the same length —
# confirm for these strings.
batch = []
text1 = "Every effort move you"
text2 = "Every day hold a"
batch.append(torch.tensor(tokenizer.encode(text1)))
batch.append(torch.tensor(tokenizer.encode(text2)))
batch = torch.stack(batch, dim=0)
#print (batch.shape)
#print (batch)


# Instantiate the model with freshly initialized weights and switch to eval
# mode, which disables dropout.
model = GPTModel(GPT_CONFIG_124M)
model.eval()
#out = model(batch)
#print("Input batch:\n", batch)
#print("\nOutput shape:", out.shape)
#print(out)


# Token ids for a short prompt.
start_text = "Hello, I am"
encoded = tokenizer.encode(start_text)
#print ("encoded", encoded)
##encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
#print("encoded_tensor.shape", encoded_tensor.shape)
#model.eval()
#out = generate_text_simple(model=model,idx=encoded_tensor,max_new_tokens=6,context_size=GPT_CONFIG_124M["context_length"])
#print("Output:", out)
#print("Output length:", len(out[0]))
#decoded_text = tokenizer.decode(out.squeeze(0).tolist())
#print("Decoded text:", decoded_text)


def text_to_token_ids (text, tokenizer):
  """Encode ``text`` and return its token ids as a (1, n_tokens) long tensor.

  Args:
    text: String to tokenize.
    tokenizer: Object exposing ``encode(text)`` that returns a list of ints.
  """
  # Encode the text passed in, rather than reading a module-level global.
  token_ids = tokenizer.encode(text)
  # unsqueeze(0) adds the leading batch dimension.
  encoded_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
  return encoded_tensor

def token_ids_to_text (token_ids, tokenizer):
  """Decode a tensor of token ids back into text.

  A leading dimension of size one is squeezed away before the ids are handed
  to ``tokenizer.decode`` as a plain Python list.
  """
  return tokenizer.decode(token_ids.squeeze(0).tolist())

# Generate 10 new tokens from a start prompt and print the decoded result.
# NOTE(review): no training step is visible in this cell, so the generated
# text is presumably meaningless — confirm.
start_context = "Every effor moves you"
tokenizer = tiktoken.get_encoding("gpt2")
model.eval()
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text1111:\n",token_ids_to_text(token_ids, tokenizer))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
PyTorch verson: 2.8.0+cu126
Train loader:
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.Size([2, 8]) torch.Size([2, 8])
torch.S