# DataLoader Tests

In [3]:
# Notebook setup: imports, import-path tweak, and the shared tokenizer.
import sys
import os
# Add the local ./lib directory to the module search path before importing
# `dataloader` (presumably that is where dataloader.py lives — confirm).
sys.path.append('./lib')

from dataloader import create_dataloader
import torch
import tiktoken
# The "gpt2" encoding from tiktoken; used below to decode target token ids
# back into text when previewing batches.
tokenizer = tiktoken.get_encoding("gpt2")

# Small multi-sentence corpus fed to create_dataloader for the demo.
sample_text = """
In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
These systems use architectures like transformers to learn complex patterns in textual data.
The tokenization process converts text into numbers that the model can process efficiently.
DataLoaders allow loading and processing data in batches during training.
The sliding window technique with stride helps create overlapping sequences for better learning.
"""

print("=== Testing DataLoader with different stride values ===\n")

# Settings shared by every run below.
batch_size = 2
max_length = 20

# Stride values to exercise (only one value is listed at the moment).
strides = [5]

for stride in strides:
    print(f"--- STRIDE = {stride} ---")

    # Build a loader for this stride; shuffle is disabled in the call.
    dataloader = create_dataloader(
        text=sample_text, batch_size=batch_size,
        max_length=max_length, stride=stride, shuffle=False,
    )

    print(f"Dataset size: {len(dataloader.dataset)} sequences")
    print(f"Number of batches: {len(dataloader)}")

    # Preview the first two batches only, then stop iterating.
    for i, (inputs, targets) in enumerate(dataloader):
        if i > 1:
            break

        print(f"Batch {i}:")
        print(f"  Input shape: {inputs.shape}")

        # First sequence of the batch: token ids plus the decoded target text.
        print(f"  Input tokens (first seq): {inputs[0, :30].tolist()}...")
        print(f"  Target tokens (first seq): {targets[0, :30].tolist()}...")
        print(f"  Target text (first seq): {tokenizer.decode(targets[0].tolist())}")

        # Second sequence of the batch: only the leading ten token ids.
        print(f"  Input tokens (second seq): {inputs[1, :10].tolist()}...")
        print(f"  Target tokens (second seq): {targets[1, :10].tolist()}...")
        print()
    

    

=== Testing DataLoader with different stride values ===

--- STRIDE = 5 ---
Dataset size: 13 sequences
Number of batches: 6
Batch 0:
  Input shape: torch.Size([2, 20])
  Input tokens (first seq): [198, 818, 262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13]...
  Target tokens (first seq): [818, 262, 995, 286, 11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13, 220]...
  Target text (first seq): In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
  Input tokens (second seq): [11666, 4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356]...
  Target tokens (second seq): [4430, 11, 3303, 4981, 423, 5854, 1143, 703, 356, 1429]...

Batch 1:
  Input shape: torch.Size([2, 20])
  Input tokens (first seq): [423, 5854, 1143, 703, 356, 1429, 290, 7716, 2420, 13, 220, 198, 4711, 3341, 779, 45619, 588, 6121, 364, 284]...
  Target tokens (first seq): [5854, 1143, 703, 3

In [8]:
# Hyperparameters for the GPT "124M" configuration used in this notebook.
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50257,
    # Context length, shortened here (the original value is 1024)
    "context_length": 256,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of layers
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Whether the query/key/value projections use a bias term
    "qkv_bias": False,
}


# Token-id lookup table: one emb_dim-sized row per vocabulary entry.
tok_emb = torch.nn.Embedding(
    num_embeddings=GPT_CONFIG_124M["vocab_size"],
    embedding_dim=GPT_CONFIG_124M["emb_dim"],
)
# Position lookup table: one emb_dim-sized row per position up to context_length.
pos_emb = torch.nn.Embedding(
    num_embeddings=GPT_CONFIG_124M["context_length"],
    embedding_dim=GPT_CONFIG_124M["emb_dim"],
)
# Dropout applied to the summed embeddings.
drop_emb = torch.nn.Dropout(p=GPT_CONFIG_124M["drop_rate"])


# Push the first two batches through token embedding, position embedding and
# dropout, printing each intermediate tensor along the way.
for i, (inputs, targets) in enumerate(dataloader):
    if i > 1:
        break

    # NOTE: this rebinds the notebook-level `batch_size` name.
    batch_size, seq_len = inputs.size()
    print(f"--- Batch {i} ---")
    print(f"Batch size: {batch_size}")
    print(f"Sequence length: {seq_len}")

    # Look up one embedding vector per token id.
    tok_embeds = tok_emb(inputs)
    print(f"Token embeddings shape: {tok_embeds.shape}")
    print(tok_embeds)

    # Positions 0 .. seq_len - 1, embedded once and shared by every sequence.
    pos_indices = torch.arange(0, seq_len)
    print(f"Position indices: {pos_indices}")
    pos_embeds = pos_emb(pos_indices)
    print(f"Position embeddings shape: {pos_embeds.shape}")
    print(pos_embeds)

    # The position embeddings broadcast across the batch dimension in the sum.
    final_embeds = torch.add(tok_embeds, pos_embeds)
    result = drop_emb(final_embeds)
    print(f"Final result shape: {result.shape}")
    print(result)
    print()

--- Batch 0 ---
Batch size: 2
Sequence length: 20
Token embeddings shape: torch.Size([2, 20, 768])
tensor([[[-6.6324e-01,  4.1704e-01, -1.2894e+00,  ..., -1.8791e+00,
           8.2641e-01,  9.7305e-01],
         [-1.4581e+00,  7.4356e-01, -5.9911e-02,  ...,  1.6649e+00,
          -9.2707e-01,  1.4055e+00],
         [ 3.9818e-01,  2.1474e+00, -1.0382e+00,  ...,  1.3475e-03,
          -8.9302e-01,  1.7736e+00],
         ...,
         [ 1.1110e+00,  1.2616e+00, -5.4483e-01,  ..., -1.9394e-01,
           1.6118e+00, -6.3609e-01],
         [ 2.0582e+00, -2.6760e-01,  1.7627e+00,  ..., -1.1155e+00,
          -1.6484e+00, -2.1745e-01],
         [ 5.8419e-01, -2.8842e-01, -1.4957e+00,  ...,  1.3730e+00,
           1.8477e+00,  1.3674e+00]],

        [[-6.2240e-01,  5.3057e-01, -3.9224e-02,  ...,  2.1672e+00,
           2.1350e-01, -3.8101e-02],
         [-9.7485e-01,  1.0174e+00,  1.0718e+00,  ...,  1.0746e+00,
          -3.1815e-01, -1.4813e+00],
         [-3.8978e-01,  9.1824e-02, -5.8360e-