# DataLoader Tests

In [2]:
import sys
import os
sys.path.append('./lib')

from dataloader import create_dataloader
import torch
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Small multi-sentence corpus fed to the sliding-window dataloader.
# NOTE: the trailing space after "generate text." is part of the text and
# influences tokenisation; do not strip it.
sample_text = """
In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
These systems use architectures like transformers to learn complex patterns in textual data.
The tokenization process converts text into numbers that the model can process efficiently.
DataLoaders allow loading and processing data in batches during training.
The sliding window technique with stride helps create overlapping sequences for better learning.
"""

print("=== Testing DataLoader with different stride values ===\n")

# Settings shared by every stride under test
batch_size = 2
max_length = 20

# Window step sizes to compare
strides = [1, 5]


def preview_batch(index, inputs, targets):
    """Print the shape of a batch plus the leading token ids of its first two sequences.

    The first sequence is additionally decoded back to text with the
    module-level ``tokenizer`` so the target window can be read directly.
    """
    print(f"Batch {index}:")
    print(f"  Input shape: {inputs.shape}")

    first_inputs, first_targets = inputs[0], targets[0]
    print(f"  Input tokens (first seq): {first_inputs[:30].tolist()}...")
    print(f"  Target tokens (first seq): {first_targets[:30].tolist()}...")
    print(f"  Target text (first seq): {tokenizer.decode(first_targets.tolist())}")

    print(f"  Input tokens (second seq): {inputs[1][:10].tolist()}...")
    print(f"  Target tokens (second seq): {targets[1][:10].tolist()}...")
    print()


for stride in strides:
    print(f"--- STRIDE = {stride} ---")

    # Build a fresh, unshuffled loader for this stride so batches are comparable
    dataloader = create_dataloader(
        text=sample_text,
        stride=stride,
        max_length=max_length,
        batch_size=batch_size,
        shuffle=False,
    )

    print(f"Dataset size: {len(dataloader.dataset)} sequences")

    print(f"Number of batches: {len(dataloader)}")

    # Preview only the first two batches of each loader
    for i, (inputs, targets) in enumerate(dataloader):
        if i >= 2:
            break
        preview_batch(i, inputs, targets)
    

    

=== Testing DataLoader with different stride values ===

--- STRIDE = 1 ---
Dataset size: 63 sequences
Number of batches: 31
Batch 0:
  Input shape: torch.Size([2, 20])
  Input tokens (first seq): [198, 818, 262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13]...
  Target tokens (first seq): [818, 262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13, 220]...
  Target text (first seq): In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
  Input tokens (second seq): [818, 262, 995, 286, 11666, 4430, 11, 3303, 4981, 423]...
  Target tokens (second seq): [262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854]...

Batch 1:
  Input shape: torch.Size([2, 20])
  Input tokens (first seq): [262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13, 220, 198]...
  Target tokens (first seq): [995, 286, 11666, 443

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# Seed the global RNG so the randomly initialised embedding tables and the
# dropout mask below are identical on every "Restart & Run All".
torch.manual_seed(42)

# Token-id -> vector table, position -> vector table, and the dropout applied
# to their sum.
tok_emb = torch.nn.Embedding(GPT_CONFIG_124M["vocab_size"], GPT_CONFIG_124M["emb_dim"])
pos_emb = torch.nn.Embedding(GPT_CONFIG_124M["context_length"], GPT_CONFIG_124M["emb_dim"])
drop_emb = torch.nn.Dropout(GPT_CONFIG_124M["drop_rate"])

# NOTE: `dataloader` is not created in this cell; it is whatever loader the most
# recently executed cell left in scope (on a fresh top-to-bottom run, the loader
# built for the last stride in the first cell).
for i, (inputs, targets) in enumerate(dataloader):
    if i >= 2:  # Only walk through the first two batches
        break

    # Use a dedicated name so the notebook-level `batch_size` setting is not overwritten.
    n_seqs, seq_len = inputs.shape
    # pos_emb only has `context_length` rows; fail early with a clear message
    # instead of an index error inside the embedding lookup.
    assert seq_len <= GPT_CONFIG_124M["context_length"], (
        f"sequence length {seq_len} exceeds context_length "
        f"{GPT_CONFIG_124M['context_length']}"
    )
    print(f"--- Batch {i} ---")
    print(f"Batch size: {n_seqs}")
    print(f"Sequence length: {seq_len}")

    # Look up one embedding vector per token id
    tok_embeds = tok_emb(inputs)
    print(f"Token embeddings shape: {tok_embeds.shape}")
    print(tok_embeds)

    # Positions 0..seq_len-1 are shared by every sequence in the batch
    pos_indices = torch.arange(seq_len)
    print(f"Position indices: {pos_indices}")
    pos_embeds = pos_emb(pos_indices)
    print(f"Position embeddings shape: {pos_embeds.shape}")
    print(pos_embeds)

    # pos_embeds has no batch dimension and is broadcast across the batch
    final_embeds = tok_embeds + pos_embeds
    result = drop_emb(final_embeds)
    print(f"Final result shape: {result.shape}")
    print(result)
    print()

In [None]:
import torch.nn as nn
import torch
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with an optional step-by-step printed trace.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of ``head_dim = d_out // num_heads`` features each,
    combined through scaled dot-product attention under a causal
    (upper-triangular) mask, and finally passed through an output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible by
            ``num_heads``.
        context_length: Side length of the causal mask, i.e. the largest number
            of tokens a single forward call can process.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias term.
        verbose: When True (the default) ``forward`` prints a detailed trace of
            every intermediate step; when False it prints nothing.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, verbose=True):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim
        self.verbose = verbose

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        # 1 above the diagonal marks "future" positions that must not be attended to.
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    @staticmethod
    def _silent(*args, **kwargs):
        """Stand-in for ``print`` that discards its arguments (used when ``verbose`` is False)."""

    def forward(self, x):
        """Apply causal multi-head self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, num_tokens, d_in).

        Returns:
            Tensor of shape (batch, num_tokens, d_out).

        Raises:
            ValueError: If ``num_tokens`` is larger than the ``context_length``
                the module was built with.
        """
        b, num_tokens, d_in = x.shape

        # The causal mask only covers `context_length` positions; reject longer
        # inputs here rather than failing later inside masked_fill_.
        max_tokens = self.mask.shape[0]
        if num_tokens > max_tokens:
            raise ValueError(
                f"Sequence length {num_tokens} exceeds context_length {max_tokens}"
            )

        # Every trace line goes through `log`, so verbose=False silences them all.
        log = print if self.verbose else self._silent

        log("=" * 50)
        log("MULTIHEAD ATTENTION FORWARD PASS")
        log("=" * 50)
        log(f"INPUT: batch_size={b}, num_tokens={num_tokens}, d_in={d_in}")
        log(f"Input shape: {x.shape}")
        log(f"Configuration: d_out={self.d_out}, num_heads={self.num_heads}, head_dim={self.head_dim}")
        log()

        # Linear transformations
        log("1. LINEAR TRANSFORMATIONS (Q, K, V)")
        log("-" * 30)
        log("W_key weight matrix:")
        log(self.W_key.weight)
        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)
        log("After linear layers:")
        log(f"  Keys shape: {keys.shape}")
        log("  Keys tensor:")
        log(keys)
        log(f"  Queries shape: {queries.shape}")
        log(f"  Values shape: {values.shape}")
        log()

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        log("2. VIEW OPERATION - SPLIT INTO HEADS")
        log("-" * 30)
        log("Reshaping from (b, num_tokens, d_out) to (b, num_tokens, num_heads, head_dim)")
        log(f"  d_out={self.d_out} = num_heads={self.num_heads} × head_dim={self.head_dim}")

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        log("After view operation:")
        log(f"  Keys shape: {keys.shape}")
        log("  Keys AW:", keys)  # keys as they look right after the view
        log(f"  Queries shape: {queries.shape}")
        log(f"  Values shape: {values.shape}")
        log()

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        log("3. TRANSPOSE OPERATION")
        log("-" * 30)
        log("Transposing dimensions 1 and 2:")
        log("  (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)")

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        log("After transpose:")
        log(f"  Keys shape: {keys.shape}")
        log("  Keys AT:", keys)  # keys as they look right after the transpose
        log(f"  Queries shape: {queries.shape}")
        log(f"  Values shape: {values.shape}")
        log()

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        log("4. ATTENTION SCORES COMPUTATION")
        log("-" * 30)
        log("Computing Q @ K^T for each head:")
        log(f"  queries shape: {queries.shape}")
        log(f"  keys.transpose(2,3) shape: {keys.transpose(2, 3).shape}")

        scale = keys.shape[-1] ** 0.5  # sqrt(head_dim)
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
        log(f"Attention scores shape: {attn_scores.shape}")
        log(f"Scaling factor (sqrt(head_dim)): {scale}")
        log()

        # Original mask truncated to the number of tokens and converted to boolean
        log("5. CAUSAL MASK APPLICATION")
        log("-" * 30)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        log(f"Mask shape: {mask_bool.shape}")
        log("Mask (True=masked, False=allowed):")
        log(mask_bool)
        log()

        # Use the mask to fill attention scores: masked positions become -inf so
        # that softmax assigns them zero weight.
        log("Before masking - attention scores (first head of first batch):")
        log(attn_scores[0, 0])
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        log("After masking - attention scores (first head of first batch):")
        log(attn_scores[0, 0])
        log()

        log("6. SOFTMAX AND DROPOUT")
        log("-" * 30)
        attn_weights = torch.softmax(attn_scores / scale, dim=-1)
        log(f"Attention weights shape: {attn_weights.shape}")
        log("Attention weights (first head of first batch):")
        log(attn_weights[0, 0])

        attn_weights = self.dropout(attn_weights)
        log("After dropout applied")
        log()

        # Shape: (b, num_tokens, num_heads, head_dim)
        log("7. WEIGHTED VALUES COMPUTATION")
        log("-" * 30)
        log("Computing attention_weights @ values:")
        log(f"  attention_weights shape: {attn_weights.shape}")
        log(f"  values shape: {values.shape}")

        context_vec = (attn_weights @ values).transpose(1, 2)
        log(f"Context vector after transpose: {context_vec.shape}")
        log()

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        log("8. COMBINE HEADS (RESHAPE)")
        log("-" * 30)
        log(f"Reshaping from {context_vec.shape} to (b, num_tokens, d_out)")
        # reshape (not view): the transpose above left the tensor non-contiguous
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        log(f"Combined heads shape: {context_vec.shape}")

        log("9. OUTPUT PROJECTION")
        log("-" * 30)
        context_vec = self.out_proj(context_vec)  # optional projection
        log(f"Final output shape: {context_vec.shape}")
        log()

        return context_vec

In [None]:
# import sys
# import os
# sys.path.append('./lib')
# from multihead_attention import MultiHeadAttention
# import torch

# Seed BEFORE building the module: the nn.Linear layers inside
# MultiHeadAttention draw their initial weights from the global RNG when they
# are constructed, so the seed has to be in place first for the printed
# weights and outputs to be reproducible across runs.
torch.manual_seed(42)

# Create example with specified parameters
mha = MultiHeadAttention(
    d_in=6,
    d_out=6,
    context_length=4,
    num_heads=3,
    dropout=0.1,
    qkv_bias=False
)

# Create sample input data (batch_size=2, sequence_length=4, d_in=6)
sample_input = torch.tensor([
    # Batch 1
    [
        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],    # Token 1
        [0.5, 1.5, 2.5, 3.5, 4.5, 5.5],    # Token 2
        [2.0, 4.0, 6.0, 8.0, 10.0, 12.0],  # Token 3
        [1.0, 0.0, -1.0, 2.0, 3.0, -2.0]   # Token 4
    ],
    # Batch 2
    [
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],    # Token 1
        [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],    # Token 2
        [-1.0, 2.0, -3.0, 4.0, -5.0, 6.0], # Token 3
        [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]     # Token 4
    ]
])

print("Input shape:", sample_input.shape)
print("Input tensor:")
print(sample_input)
print()

# Forward pass through MultiHeadAttention (the module is left in training
# mode, so the 0.1 dropout on the attention weights is active here).
output = mha(sample_input)
print("Output shape:", output.shape)
print("Output tensor:")
print(output)

In [23]:
import sys
import os
sys.path.append('./lib')

from dataloader import create_dataloader
from gpt_model import GPTModel
import torch
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

print("Starting GPT test...")

# Test text
# NOTE: the trailing space after "generate text." is part of the text and
# influences tokenisation; do not strip it.
sample_text = """
In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
These systems use architectures like transformers to learn complex patterns in textual data.
The tokenization process converts text into numbers that the model can process efficiently.
DataLoaders allow loading and processing data in batches during training.
The sliding window technique with stride helps create overlapping sequences for better learning.
"""

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

print("Creating GPT model...")
model = GPTModel(GPT_CONFIG_124M)
# Inference only: switch off dropout so the prediction depends solely on the
# weights and the prompt. (Assumes GPTModel is a torch.nn.Module - TODO confirm.)
model.eval()
print("Model created successfully!")

print("Creating dataloader...")
dataloader = create_dataloader(
    text=sample_text,
    batch_size=1,
    max_length=50,
    stride=1,
    shuffle=False
)
print(f"Dataloader created! Dataset size: {len(dataloader.dataset)}")
print(f"Number of batches: {len(dataloader)}")

print("Starting forward pass...")
for i, (inputs, targets) in enumerate(dataloader):
    if i >= 11:  # Only process the first 11 batches
        break
    print(f"Processing Batch {i}:")
    print(f"Input shape: {inputs.shape}")

    # Best-effort demo: report a failure for this batch and move on to the next.
    try:
        prompt_ids = inputs[0].tolist()
        prompt_str = tokenizer.decode(prompt_ids)
        print("prompt_str", prompt_str)

        # No gradients are needed for a pure forward pass.
        with torch.no_grad():
            logits = model(inputs)
        print("logits:", logits)
        print(logits.shape)

        # logits is (batch, num_tokens, vocab_size). The next-token prediction
        # lives at the LAST position of the (single) sequence, so pick
        # logits[0, -1] and convert the winning index to a plain int for the
        # tokenizer.
        predicted_token_id = torch.argmax(logits[0, -1]).item()
        print("predicted_token_id", predicted_token_id)

        completed_ids = prompt_ids + [predicted_token_id]
        print(tokenizer.decode(completed_ids))
        print(f"Success! Output shape: {logits.shape}")
        print(f"Sample logits: {logits[0, 0, :5]}")
    except Exception as e:
        print(f"Error during forward pass: {e}")
    print()

Starting GPT test...
Creating GPT model...
Model created successfully!
Creating dataloader...
Dataloader created! Dataset size: 33
Number of batches: 33
Starting forward pass...
Processing Batch 0:
Input shape: torch.Size([1, 50])
prompt_str 
In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
These systems use architectures like transformers to learn complex patterns in textual data.
The tokenization process converts text into numbers that the model can
in_idx.shape() torch.Size([1, 50])
logits: tensor([[[ 0.7509,  0.1538,  0.0703,  ..., -0.5794, -0.5429, -0.3805],
         [-0.0838, -0.4568, -0.1122,  ...,  0.2135, -1.6031,  0.9259],
         [ 0.2893,  0.4958, -0.4716,  ..., -0.2217, -1.2560,  0.8927],
         ...,
         [ 0.0173,  0.6606, -0.1656,  ..., -0.3013, -0.9760, -0.1228],
         [ 0.1959,  0.0796,  0.5696,  ..., -0.0725, -0.7797,  0.0600],
         [-0.5975,  0.0665,  0.3491,  ...,  0.2148, -0.7986,  0.2971]