# DataLoader Tests

In [None]:
# sys is needed to extend the module search path below; os is not referenced
# in the visible part of this file.
import sys
import os
# Make modules under ./lib importable (relative to the current working
# directory). This has to run before the `dataloader` import that follows.
sys.path.append('./lib')

# NOTE(review): `dataloader` is presumably the project module living in ./lib
# -- confirm against the repository layout.
from dataloader import create_dataloader
import torch
import tiktoken
# GPT-2 encoding from tiktoken; used further down to decode target token ids
# back into text.
tokenizer = tiktoken.get_encoding("gpt2")

# Test text: a short multi-line English passage that is handed to
# create_dataloader as its `text` argument.
sample_text = """
In the world of artificial intelligence, language models have revolutionized how we process and generate text. 
These systems use architectures like transformers to learn complex patterns in textual data.
The tokenization process converts text into numbers that the model can process efficiently.
DataLoaders allow loading and processing data in batches during training.
The sliding window technique with stride helps create overlapping sequences for better learning.
"""

# Smoke test: build a DataLoader over `sample_text` for each stride value and
# print the dataset size, the batch count, and a preview of the first batches.
print("=== Testing DataLoader with different stride values ===\n")

# Base configuration shared by every stride under test.
batch_size = 2
max_length = 20

# Stride values to exercise; append more entries to compare several strides.
strides = [5]

for stride in strides:
    print(f"--- STRIDE = {stride} ---")

    # Create the DataLoader; shuffle is disabled so repeated runs print the
    # batches in the same order.
    dataloader = create_dataloader(
        text=sample_text,
        batch_size=batch_size,
        max_length=max_length,
        stride=stride,
        shuffle=False,
    )

    print(f"Dataset size: {len(dataloader.dataset)} sequences")
    print(f"Number of batches: {len(dataloader)}")

    # Preview only the first 2 batches to keep the output short.
    for i, (inputs, targets) in enumerate(dataloader):
        if i >= 2:
            break
        print(f"Batch {i}:")
        print(f"  Input shape: {inputs.shape}")
        # The slices only cap how many token ids are printed; a sequence
        # shorter than the cap is printed in full.
        print(f"  Input tokens (first seq): {inputs[0][:30].tolist()}...")
        print(f"  Target tokens (first seq): {targets[0][:30].tolist()}...")
        print(f"  Target text (first seq): {tokenizer.decode(targets[0].tolist())}")
        # A batch can hold a single sequence (batch_size of 1, or a short
        # final batch if the loader keeps it), so indexing [1] unconditionally
        # would raise IndexError. Only print the second sequence when present.
        if inputs.shape[0] > 1:
            print(f"  Input tokens (second seq): {inputs[1][:10].tolist()}...")
            print(f"  Target tokens (second seq): {targets[1][:10].tolist()}...")
        print()

    print(f"{'='*50}\n")