# Experiments

> Notebook for running training, experiments, and analysis. This includes dataset acquisition, model training, and evaluation.

In [1]:
import sys
from pathlib import Path
import os

# --- Project Root Setup ---
# Find the directory that holds settings.ini so the library can be imported
# regardless of whether the kernel was started in the repo root or in nbs/.
project_root = Path(os.getcwd())

if project_root.name == 'nbs' and (project_root.parent / 'settings.ini').exists():
    # Started inside nbs/: the root is one level up.
    project_root = project_root.parent
elif (project_root / 'settings.ini').exists():
    # Already sitting in the root; nothing to adjust.
    pass
elif (project_root.parent / 'settings.ini').exists():
    # settings.ini lives in the parent directory.
    project_root = project_root.parent
else:
    # Last resort: cut the path just before its first 'nbs' component.
    current_parts = project_root.parts
    if 'nbs' in current_parts:
        nbs_index = current_parts.index('nbs')
        project_root = Path(*current_parts[:nbs_index])
    # Final check: warn (but keep going) when no settings.ini was located.
    if not (project_root / 'settings.ini').exists():
        print("Warning: Could not automatically determine project root based on 'settings.ini' or 'nbs' dir. Assuming current dir or its parent might be it, or sys.path is already correct.")

project_root_str = str(project_root.resolve())
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)
    # Also expose <project_root>/<lib_name> itself, with lib_name read from
    # settings.ini (falls back to 'lm' when the key is absent).
    # This step only runs when the root had to be added above.
    try:
        import configparser
        settings_file = project_root / 'settings.ini'
        config = configparser.ConfigParser()
        if settings_file.exists():
            config.read(settings_file)
            lib_name = config['DEFAULT'].get('lib_name', 'lm')
            lib_path_str = str((project_root / lib_name).resolve())
            if lib_path_str not in sys.path:
                sys.path.insert(0, lib_path_str)
    except Exception as e:
        # Best effort only: the root itself is already on sys.path.
        print(f"Note: Could not parse settings.ini to add specific lib_path: {e}")
# --- End Project Root Setup ---

In [2]:
import datasets
import os
from pathlib import Path
from typing import Optional, Iterable, Dict, List, Any # Added List, Any
import tqdm
import time # For measuring execution time
import tempfile # For quick verification test

# Try importing from the current project's structure first
# Import the tokenizer library from the `lm` package. If that fails, define
# placeholders that raise on first use so the rest of the notebook can still be
# loaded and read. (Retrying the very same imports cannot succeed, so there is
# no second attempt.)
try:
    from lm.bpe_training import train_bpe
    from lm.utils import save_tokenizer_components, load_tokenizer_components
    from lm.bpe_tokenizer_class import Tokenizer
    print("Successfully imported from 'bpe_tokenizer_scratch' library modules.")
except ImportError as e:
    print(f"Could not import from 'bpe_tokenizer_scratch' library. Error: {e}")
    print("Please ensure nbdev_export has been run and the library 'bpe_tokenizer_scratch' is in your PYTHONPATH or accessible.")

    # Dummy placeholders if imports fail, to allow notebook to load
    def train_bpe(*args, **kwargs):
        raise NotImplementedError("train_bpe not imported. Run nbdev_export for 02_bpe_training.ipynb")

    def save_tokenizer_components(*args, **kwargs):
        raise NotImplementedError("save_tokenizer_components not imported. Run nbdev_export for 00_utils.ipynb")

    def load_tokenizer_components(*args, **kwargs):
        raise NotImplementedError("load_tokenizer_components not imported. Run nbdev_export for 00_utils.ipynb")

    class Tokenizer:
        """Stand-in used only when the real Tokenizer could not be imported."""

        def __init__(self, *args, **kwargs):
            raise NotImplementedError("Tokenizer not imported. Run nbdev_export for 03_bpe_tokenizer_class.ipynb")

        @classmethod
        def from_files(cls, *args, **kwargs):
            raise NotImplementedError("Tokenizer.from_files not imported.")

        def encode(self, *args, **kwargs):
            raise NotImplementedError("Tokenizer.encode not imported.")

        def decode(self, *args, **kwargs):
            raise NotImplementedError("Tokenizer.decode not imported.")

import numpy as np

Successfully imported from 'bpe_tokenizer_scratch' library modules.


## 0. Quick Encode/Decode Verification

This section provides a quick way to verify the `encode` and `decode` functionality of the `Tokenizer` using a very small, self-contained corpus. This is useful for rapid checks without processing large datasets.

In [3]:
def run_quick_encode_decode_verification():
    """Train a tiny BPE tokenizer and check that encode/decode round-trips.

    A short in-memory corpus is written to a temporary file, `train_bpe` is run
    on it, a `Tokenizer` is built from the result, and several strings
    (including the corpus itself, unseen text, a non-ASCII character and the
    empty string) are encoded and decoded again. A PASSED/FAILED/ERRORED line
    is printed per string and an overall verdict at the end.

    Errors are caught and reported with a traceback rather than raised; the
    temporary corpus file is removed in all cases. Returns None.
    """
    import traceback

    def _preview(text, max_chars=100):
        # Single-line, length-capped rendering for log output. Built outside the
        # f-strings because backslashes inside f-string replacement fields are a
        # SyntaxError before Python 3.12.
        shown = text[:max_chars].replace('\n', '\\n')
        return shown + ('...' if len(text) > max_chars else '')

    print("--- Starting Quick Encode/Decode Verification ---")

    # 1. Define a small, multi-line string corpus
    sample_corpus = (
        "hello world.\n"
        "this is a test.\n"
        "hello again, world!\n"
        "special tokens like <|endoftext|> should be handled.\n"
        "another line for more content and merges. aabbcc aabb dd."
    )
    special_tokens_for_quick_test = ["<|endoftext|>"]
    quick_vocab_size = 256 + 15  # Base bytes + a few merges

    # Path of the temporary corpus file; stays None until the file exists so the
    # cleanup in `finally` knows whether there is anything to delete.
    temp_corpus_file = None
    try:
        # delete=False: the file must outlive the `with` so train_bpe can reopen it by path.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as tmp_f:
            tmp_f.write(sample_corpus)
            temp_corpus_file = tmp_f.name
        print(f"Small corpus written to temporary file: {temp_corpus_file}")

        # 2. Call train_bpe on this corpus
        print(f"Training BPE on small corpus with vocab_size={quick_vocab_size}...")
        quick_vocab, quick_merges = train_bpe(
            input_path=temp_corpus_file,
            vocab_size=quick_vocab_size,
            special_tokens=special_tokens_for_quick_test
        )
        print(f"Training complete. Vocab size: {len(quick_vocab)}, Merges: {len(quick_merges)}")

        # 3. Instantiate the Tokenizer
        quick_tokenizer = Tokenizer(quick_vocab, quick_merges, special_tokens_for_quick_test)
        print("Tokenizer instantiated.")

        # 4. Perform encode/decode roundtrip tests
        test_strings = [
            "hello world.",
            "this is a test.",
            "hello again, world!",
            "<|endoftext|> handled?",
            sample_corpus,  # Test with the full training corpus as well
            "new string not in training data aabbcc",
            "€uro symbol and some more text.",
            ""
        ]

        all_passed = True
        for i, text_to_test in enumerate(test_strings):
            print(f"\nTest string {i+1}: '{_preview(text_to_test)}'")
            try:
                encoded_ids = quick_tokenizer.encode(text_to_test)
                print(f"  Encoded IDs ({len(encoded_ids)}): {encoded_ids[:20]}{'...' if len(encoded_ids) > 20 else ''}")
                decoded_text = quick_tokenizer.decode(encoded_ids)
                print(f"  Decoded text: '{_preview(decoded_text)}'")

                if decoded_text == text_to_test:
                    print(f"  PASSED: Roundtrip successful.")
                else:
                    print(f"  FAILED: Decoded text does not match original.")
                    print(f"    Original: '{text_to_test!r}'")
                    print(f"    Decoded:  '{decoded_text!r}'")
                    all_passed = False
            except Exception as e:
                # One failing string must not stop the remaining checks.
                print(f"  ERRORED during encode/decode: {e}")
                traceback.print_exc()
                all_passed = False

        if all_passed:
            print("\nSUCCESS: All quick encode/decode verification tests passed.")
        else:
            print("\nFAILURE: Some quick encode/decode verification tests failed.")

    except Exception as e:
        print(f"An error occurred during quick verification: {e}")
        traceback.print_exc()
    finally:
        if temp_corpus_file and os.path.exists(temp_corpus_file):
            os.remove(temp_corpus_file)
            print(f"Cleaned up temporary corpus file: {temp_corpus_file}")

    print("--- Quick Encode/Decode Verification Finished ---")

## 1. Dataset Acquisition and Preparation

This section implements functions to download TinyStories and OpenWebText datasets using the `datasets` library from Hugging Face and prepares them into raw text files.

In [4]:
# Root folders for everything this notebook reads or writes.
# `project_root` is defined in the first cell.
DATA_RAW_PATH = project_root / "data" / "raw"
DATA_TOKENIZERS_PATH = project_root / "data" / "tokenizers"
DATA_TOKENIZED_DATASETS_PATH = project_root / "data" / "tokenized_datasets"

# Marker written between documents when they are concatenated into one file.
SPECIAL_TOKEN_DELIMITER = "<|endoftext|>"

# Raw text corpora (one concatenated file per dataset split).
TINYSTORIES_TRAIN_FILE = DATA_RAW_PATH / "tinystories_train.txt"
TINYSTORIES_DEV_FILE = DATA_RAW_PATH / "tinystories_dev.txt"
OPENWEBTEXT_TRAIN_FILE = DATA_RAW_PATH / "openwebtext_train.txt"
OPENWEBTEXT_DEV_FILE = DATA_RAW_PATH / "openwebtext_dev.txt"

# Serialized tokenizer components (vocabulary + merges) per dataset.
TINYSTORIES_VOCAB_FILE = DATA_TOKENIZERS_PATH / "tinystories_vocab.json"
TINYSTORIES_MERGES_FILE = DATA_TOKENIZERS_PATH / "tinystories_merges.json"
OPENWEBTEXT_VOCAB_FILE = DATA_TOKENIZERS_PATH / "openwebtext_vocab.json"
OPENWEBTEXT_MERGES_FILE = DATA_TOKENIZERS_PATH / "openwebtext_merges.json"

In [5]:
def _ensure_data_dir_exists(path: Path) -> None:
    """Ensures that the directory for the given path exists.

    Args:
        path (Path): The file path for which the parent directory needs to exist.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

def _process_and_save_dataset_split(
    dataset_split: Iterable[Dict[str, any]], 
    output_file: Path, 
    delimiter: str,
    text_field: str = "text",
    limit: Optional[int] = None
):
    """Processes a dataset split, concatenates text documents, and saves to a file.

    Args:
        dataset_split (Iterable[Dict[str, any]]): The Hugging Face dataset split to process.
        output_file (Path): The path to save the concatenated text file.
        delimiter (str): The delimiter string to insert between documents.
        text_field (str): The name of the field in the dataset dictionary that contains the text.
        limit (Optional[int]): If provided, limit processing to this many documents.
    """
    _ensure_data_dir_exists(output_file)
    count = 0
    with open(output_file, 'w', encoding='utf-8') as f:
        for example in tqdm.tqdm(dataset_split, desc=f"Processing {output_file.name}"):
            if limit is not None and count >= limit:
                break
            try:
                text_content = example[text_field]
                if text_content and isinstance(text_content, str):
                    f.write(text_content)
                    f.write(delimiter) # Add delimiter after each document
                    count += 1
                elif not text_content:
                    # print(f"Warning: Empty text content in an example. Skipping.") # Can be too verbose
                    pass
                else:
                    print(f"Warning: Text content is not a string ({type(text_content)}). Skipping: {str(text_content)[:100]}...")
            except KeyError:
                print(f"Warning: Text field '{text_field}' not found in an example. Skipping.")
            except Exception as e:
                print(f"Error processing an example: {e}. Skipping.")
    print(f"Finished processing. {count} documents written to {output_file}")

In [6]:
def prepare_tinystories_dataset(
    output_train_file: Path = TINYSTORIES_TRAIN_FILE,
    output_dev_file: Path = TINYSTORIES_DEV_FILE,
    delimiter: str = SPECIAL_TOKEN_DELIMITER
) -> None:
    """Stream TinyStories from the Hugging Face Hub into two raw text files.

    The 'train' split goes to ``output_train_file`` and the 'validation' split
    to ``output_dev_file``; inside each file the documents are concatenated
    with ``delimiter`` after every document. Any exception is caught and
    reported on stdout instead of being raised.

    Args:
        output_train_file (Path): Destination for the training text.
        output_dev_file (Path): Destination for the development/validation text.
        delimiter (str): String written after each document.
    """
    print("Preparing TinyStories dataset...")
    # (split name on the Hub, destination file), in processing order.
    split_targets = (
        ("train", output_train_file),
        ("validation", output_dev_file),
    )
    try:
        for split_name, destination in split_targets:
            print(f"Loading TinyStories {split_name} split...")
            split_stream = datasets.load_dataset("roneneldan/TinyStories", split=split_name, streaming=True)
            _process_and_save_dataset_split(split_stream, destination, delimiter)

        print("TinyStories dataset preparation complete.")
    except Exception as e:
        print(f"An error occurred during TinyStories dataset preparation: {e}")

In [7]:
def prepare_openwebtext_dataset(
    output_train_file: Path = OPENWEBTEXT_TRAIN_FILE,
    output_dev_file: Path = OPENWEBTEXT_DEV_FILE,
    delimiter: str = SPECIAL_TOKEN_DELIMITER,
    dev_split_size: int = 5000, # Number of documents for the dev split from train
    train_split_limit: Optional[int] = None # Optional limit for the full train split processing
) -> None:
    """Downloads and prepares the OpenWebText dataset.

    OpenWebText is read once as a stream of its 'train' split. The first
    ``dev_split_size`` documents are written to the dev file; the train file is
    then written from the documents that FOLLOW them, so the two files do not
    overlap (no dev documents leak into the training text). Documents are
    concatenated with ``delimiter`` after each one.

    Any exception is caught and reported with a traceback instead of being
    raised.

    Args:
        output_train_file (Path): Path to save the training data.
        output_dev_file (Path): Path to save the development data.
        delimiter (str): Delimiter to use between documents.
        dev_split_size (int): Number of documents taken from the start of the
            stream for the dev set.
        train_split_limit (Optional[int]): If provided, limit the train file to
            this many documents (counted after the dev documents).
    """
    print("Preparing OpenWebText dataset...")

    try:
        # A single shared iterator feeds both files: whatever the dev pass
        # consumes can no longer reach the train pass. The helper may pull one
        # extra boundary document when it hits its limit; that document then
        # lands in neither file, which is harmless.
        owt_stream = iter(datasets.load_dataset("Skylion007/openwebtext", split="train", streaming=True))

        print(f"Creating dev split from OpenWebText (first {dev_split_size} documents)...")
        _process_and_save_dataset_split(owt_stream, output_dev_file, delimiter, limit=dev_split_size)

        print("Processing full OpenWebText train split...")
        # Note: Processing the full OpenWebText can take a very long time and a lot of disk space.
        _process_and_save_dataset_split(owt_stream, output_train_file, delimiter, limit=train_split_limit)

        print("OpenWebText dataset preparation complete.")
    except Exception as e:
        print(f"An error occurred during OpenWebText dataset preparation: {e}")
        print("Please ensure you have enough disk space and a stable internet connection.")
        import traceback
        traceback.print_exc()

## 2. BPE Training Experiments

### Problem (train_bpe_tinystories): BPE Training on TinyStories

In [8]:
def run_train_bpe_tinystories_experiment():
    """Train a 10,000-entry BPE tokenizer on the TinyStories training file.

    The run is timed, the resulting vocabulary and merges are saved to
    TINYSTORIES_VOCAB_FILE / TINYSTORIES_MERGES_FILE, and the longest learned
    token is reported.

    Returns:
        A ``(vocab, merges)`` pair. ``(None, None)`` when the training file is
        missing or `train_bpe` raises. If saving fails, the trained pair is
        still returned but the longest-token report is skipped.
    """
    print("--- Starting BPE Training on TinyStories ---")

    # Make sure data/tokenizers exists before anything is written there.
    _ensure_data_dir_exists(TINYSTORIES_VOCAB_FILE)

    target_vocab_size = 10000
    special_tokens = [SPECIAL_TOKEN_DELIMITER]  # "<|endoftext|>"

    if not TINYSTORIES_TRAIN_FILE.exists():
        print(f"Error: TinyStories training file not found at {TINYSTORIES_TRAIN_FILE}.")
        print("Please run the dataset preparation step first.")
        return None, None

    print(f"Training BPE on: {TINYSTORIES_TRAIN_FILE}")
    print(f"Target vocab size: {target_vocab_size}")
    print(f"Special tokens: {special_tokens}")

    timer_start = time.perf_counter()
    try:
        trained_vocab, trained_merges = train_bpe(
            input_path=str(TINYSTORIES_TRAIN_FILE),
            vocab_size=target_vocab_size,
            special_tokens=special_tokens
        )
    except Exception as e:
        print(f"An error occurred during train_bpe for TinyStories: {e}")
        import traceback
        traceback.print_exc()
        return None, None
    training_time_seconds = time.perf_counter() - timer_start

    print(f"BPE training for TinyStories completed in {training_time_seconds:.2f} seconds.")

    # Persist the trained components.
    print(f"Serializing vocabulary to: {TINYSTORIES_VOCAB_FILE}")
    print(f"Serializing merges to: {TINYSTORIES_MERGES_FILE}")
    try:
        if trained_vocab is None or trained_merges is None:
            print("Serialization skipped as vocab_ts or merges_ts is None.")
        else:
            save_tokenizer_components(
                trained_vocab, trained_merges,
                str(TINYSTORIES_VOCAB_FILE), str(TINYSTORIES_MERGES_FILE)
            )
            print("Serialization complete.")
    except Exception as e:
        print(f"Error during serialization: {e}")
        # Hand back what was trained even though it could not be saved.
        return trained_vocab, trained_merges

    # Report the longest token (by byte length) in the vocabulary.
    if not trained_vocab:
        print("TinyStories vocabulary is empty or None, cannot determine longest token.")
    else:
        longest_token = max(trained_vocab.values(), key=len, default=b'')
        longest_len = len(longest_token)
        try:
            longest_text = longest_token.decode('utf-8', 'replace')
        except Exception as e:
            longest_text = f"(Error decoding: {e}) {longest_token!r}"
        print(f"Longest token found in TinyStories vocab: Length={longest_len} bytes, Content (decoded): '{longest_text}'")
        print("  Consider if this token represents a common, repetitive sequence in TinyStories.")

    print(f"TinyStories Training Time: {training_time_seconds:.2f} seconds")
    print("Peak Memory Estimation: Requires manual observation or a memory profiler.")
    print("Profiling `train_bpe` (Bottlenecks): Best done on a subset of data (e.g., TinyStories dev set). Use cProfile or Scalene.")
    print("--- TinyStories BPE Training Experiment Finished ---")
    return trained_vocab, trained_merges

### Problem (train_bpe_expts_owt): BPE Training on OpenWebText

In [9]:
def run_train_bpe_openwebtext_experiment():
    """Train a 32,000-entry BPE tokenizer on the OpenWebText training file.

    The run is timed, the resulting vocabulary and merges are saved to
    OPENWEBTEXT_VOCAB_FILE / OPENWEBTEXT_MERGES_FILE, and the longest learned
    token is reported.

    Returns:
        A ``(vocab, merges)`` pair. ``(None, None)`` when the training file is
        missing or `train_bpe` raises. If saving fails, the trained pair is
        still returned but the longest-token report is skipped.
    """
    print("--- Starting BPE Training on OpenWebText ---")

    # Make sure data/tokenizers exists before anything is written there.
    _ensure_data_dir_exists(OPENWEBTEXT_VOCAB_FILE)

    target_vocab_size = 32000
    special_tokens = [SPECIAL_TOKEN_DELIMITER]

    if not OPENWEBTEXT_TRAIN_FILE.exists():
        print(f"Error: OpenWebText training file not found at {OPENWEBTEXT_TRAIN_FILE}.")
        print("Please run the dataset preparation step first.")
        return None, None

    print(f"Training BPE on: {OPENWEBTEXT_TRAIN_FILE}")
    print(f"Target vocab size: {target_vocab_size}")
    print(f"Special tokens: {special_tokens}")

    timer_start = time.perf_counter()
    try:
        trained_vocab, trained_merges = train_bpe(
            input_path=str(OPENWEBTEXT_TRAIN_FILE),
            vocab_size=target_vocab_size,
            special_tokens=special_tokens
        )
    except Exception as e:
        print(f"An error occurred during train_bpe for OpenWebText: {e}")
        import traceback
        traceback.print_exc()
        return None, None
    training_time_seconds = time.perf_counter() - timer_start

    print(f"BPE training for OpenWebText completed in {training_time_seconds:.2f} seconds.")

    # Persist the trained components.
    print(f"Serializing vocabulary to: {OPENWEBTEXT_VOCAB_FILE}")
    print(f"Serializing merges to: {OPENWEBTEXT_MERGES_FILE}")
    try:
        if trained_vocab is None or trained_merges is None:
            print("Serialization skipped as vocab_owt or merges_owt is None.")
        else:
            save_tokenizer_components(
                trained_vocab, trained_merges,
                str(OPENWEBTEXT_VOCAB_FILE), str(OPENWEBTEXT_MERGES_FILE)
            )
            print("Serialization complete.")
    except Exception as e:
        print(f"Error during serialization for OpenWebText: {e}")
        # Hand back what was trained even though it could not be saved.
        return trained_vocab, trained_merges

    # Report the longest token (by byte length) in the vocabulary.
    if not trained_vocab:
        print("OpenWebText vocabulary is empty or None, cannot determine longest token.")
    else:
        longest_token = max(trained_vocab.values(), key=len, default=b'')
        longest_len = len(longest_token)
        try:
            longest_text = longest_token.decode('utf-8', 'replace')
        except Exception as e:
            longest_text = f"(Error decoding: {e}) {longest_token!r}"
        print(f"Longest token found in OpenWebText vocab: Length={longest_len} bytes, Content (decoded): '{longest_text}'")
        print("  Analyze if this token makes sense for OpenWebText (e.g., common HTML/markup, repetitive sequences).")

    print(f"OpenWebText Training Time: {training_time_seconds:.2f} seconds")
    print("--- OpenWebText BPE Training Experiment Finished ---")
    return trained_vocab, trained_merges

#### Compare/Contrast Tokenizers

**(Qualitative comparison after both tokenizers are trained)**

1.  **Common Long Tokens:** 
    *   *TinyStories:* (Fill in after observing results - likely story-related phrases, character names, simple sentence structures)
    *   *OpenWebText:* (Fill in after observing results - likely common web phrases, HTML tags, URLs, programming language keywords if code is present)
2.  **Differences in Types of Tokens Learned:**
    *   *TinyStories:* Expected to learn tokens corresponding to simpler words, common story elements (e.g., "Once upon a time", character names like "Lily", "Tom"), and repetitive sentence structures found in children's stories.
    *   *OpenWebText:* Expected to learn a more diverse set of tokens, including more complex vocabulary, technical terms, potentially HTML/CSS/JavaScript snippets, common URL components, and a wider range of linguistic styles reflective of general web content.
3.  **Vocabulary Composition (General Feel):**
    *   *TinyStories:* The vocabulary size is a fixed training target, so the vocabulary is not smaller; rather, because the text uses a limited set of simple words, BPE can spend most of its merges on whole words and very common word parts. The resulting tokens should therefore be easy to interpret and closely tied to narrative fiction for children.
    *   *OpenWebText:* Vocabulary will be more varied. Tokens might represent a broader range of subwords, including those from technical jargon, diverse proper nouns, and possibly artifacts of web scraping (e.g., parts of markup). The distribution of token lengths might also be different.

## 3. Tokenizer Experiments

### Problem (tokenizer_experiments): (a) Compression Ratio

In [10]:
def run_compression_ratio_experiment():
    """Measure bytes-per-token for each tokenizer on samples of its own corpus.

    For TinyStories (validation split) and OpenWebText (train split), the
    saved tokenizer is loaded, 10 documents are streamed from the Hub, and the
    ratio of total UTF-8 bytes to total tokens is printed. A corpus whose
    tokenizer files are missing is skipped; a failure while loading tokenizers
    aborts the whole experiment. Returns None.
    """
    print("--- Starting Compression Ratio Experiment ---")

    special_tokens_list = [SPECIAL_TOKEN_DELIMITER]
    num_samples = 10

    # One row per corpus:
    # (display name, short tag, tokenizer label, vocab file, merges file, Hub dataset id, split)
    corpora = [
        ("TinyStories", "TS", "tokenizer_ts",
         TINYSTORIES_VOCAB_FILE, TINYSTORIES_MERGES_FILE,
         "roneneldan/TinyStories", "validation"),  # validation split: smaller, still representative
        ("OpenWebText", "OWT", "tokenizer_owt",
         OPENWEBTEXT_VOCAB_FILE, OPENWEBTEXT_MERGES_FILE,
         "Skylion007/openwebtext", "train"),  # train split: the main source
    ]

    # Load Tokenizers (label -> tokenizer; corpora without files are left out).
    loaded_tokenizers = {}
    try:
        for corpus_name, tag, label, vocab_file, merges_file, _, _ in corpora:
            if vocab_file.exists() and merges_file.exists():
                loaded_tokenizers[label] = Tokenizer.from_files(str(vocab_file), str(merges_file), special_tokens_list)
                print(f"{corpus_name} Tokenizer loaded.")
            else:
                print(f"{corpus_name} tokenizer files not found. Skipping {tag} part. Searched: {vocab_file}, {merges_file}")
    except FileNotFoundError as e:
        print(f"Error loading tokenizers: {e}. Ensure training experiments (G.2, G.3) were run and files exist.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while loading tokenizers: {e}")
        import traceback
        traceback.print_exc()
        return

    for corpus_name, _, label, _, _, dataset_id, split_name in corpora:
        tokenizer = loaded_tokenizers.get(label)
        if not tokenizer:
            continue

        print(f"\n--- {corpus_name} Compression (using {label}) on {num_samples} samples ---")
        try:
            sample_stream = datasets.load_dataset(dataset_id, split=split_name, streaming=True).take(num_samples)
            samples = list(sample_stream)  # Materialize the samples
            if not samples:
                print(f"No samples loaded for {corpus_name}. Skipping compression calculation.")
                continue

            # Only documents with non-empty text contribute to either total.
            texts = [doc['text'] for doc in samples if doc.get('text')]
            total_input_bytes = sum(len(text.encode('utf-8')) for text in texts)
            total_tokens = sum(len(tokenizer.encode(text)) for text in texts)

            if total_tokens > 0:
                compression_ratio = total_input_bytes / total_tokens
                print(f"Total Input Bytes ({corpus_name}): {total_input_bytes}")
                print(f"Total Tokens ({corpus_name} with {label}): {total_tokens}")
                print(f"Compression Ratio ({corpus_name} with {label}): {compression_ratio:.2f} bytes/token")
            else:
                print(f"No tokens generated for {corpus_name} samples. Cannot calculate compression ratio.")
        except Exception as e:
            print(f"Error during {corpus_name} compression calculation: {e}")
            import traceback; traceback.print_exc()

    print("--- Compression Ratio Experiment Finished ---")

### Problem (tokenizer_experiments): (b) Cross-Tokenization

In [11]:
def run_cross_tokenization_experiment():
    """Tokenize OpenWebText samples with the TinyStories tokenizer.

    The saved TinyStories tokenizer is loaded, 10 documents are streamed from
    the OpenWebText train split, and the ratio of total UTF-8 bytes to total
    tokens is printed. Every failure (missing tokenizer files, download error,
    encoding error) is reported on stdout and ends the experiment early.
    Returns None.
    """
    print("--- Starting Cross-Tokenization Experiment ---")

    special_tokens_list = [SPECIAL_TOKEN_DELIMITER]
    num_samples = 10

    # Load TinyStories Tokenizer
    try:
        ts_files_present = TINYSTORIES_VOCAB_FILE.exists() and TINYSTORIES_MERGES_FILE.exists()
        if not ts_files_present:
            print(f"TinyStories tokenizer files not found. Cannot run cross-tokenization. Searched: {TINYSTORIES_VOCAB_FILE}, {TINYSTORIES_MERGES_FILE}")
            return
        tokenizer_ts = Tokenizer.from_files(str(TINYSTORIES_VOCAB_FILE), str(TINYSTORIES_MERGES_FILE), special_tokens_list)
        print("TinyStories Tokenizer loaded for cross-tokenization.")
    except FileNotFoundError as e:
        print(f"Error loading TinyStories tokenizer: {e}.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while loading TinyStories tokenizer: {e}")
        import traceback; traceback.print_exc()
        return

    # Get OpenWebText samples
    print(f"\n--- OpenWebText Cross-Tokenization (using tokenizer_ts) on {num_samples} samples ---")
    try:
        owt_stream = datasets.load_dataset("Skylion007/openwebtext", split="train", streaming=True)
        owt_samples = list(owt_stream.take(num_samples))
        if not owt_samples:
            print("No samples loaded for OpenWebText. Skipping cross-tokenization calculation.")
            return
    except Exception as e:
        print(f"Error loading OpenWebText samples: {e}")
        import traceback; traceback.print_exc()
        return

    # Calculate compression
    try:
        # Only documents with non-empty text contribute to either total.
        owt_texts = [doc['text'] for doc in owt_samples if doc.get('text')]
        total_input_bytes = sum(len(text.encode('utf-8')) for text in owt_texts)
        total_tokens = sum(len(tokenizer_ts.encode(text)) for text in owt_texts)

        if total_tokens > 0:
            compression_ratio = total_input_bytes / total_tokens
            print(f"Total Input Bytes (OpenWebText Samples): {total_input_bytes}")
            print(f"Total Tokens (OpenWebText with tokenizer_ts): {total_tokens}")
            print(f"Compression Ratio (OpenWebText with tokenizer_ts): {compression_ratio:.2f} bytes/token")
        else:
            print("No tokens generated for OpenWebText samples with TinyStories tokenizer. Cannot calculate compression ratio.")
    except Exception as e:
        print(f"Error during OpenWebText cross-tokenization calculation: {e}")
        import traceback; traceback.print_exc()

    print("--- Cross-Tokenization Experiment Finished ---")

#### Qualitative Description of Cross-Tokenization Differences:

*(To be filled in after running the experiment and observing the tokenization results)*

When tokenizing OpenWebText data with the TinyStories tokenizer (`tokenizer_ts`), we expect to observe:
1.  **Higher Number of Tokens / Lower Compression Ratio:** The `tokenizer_ts` vocabulary is optimized for the simpler language and themes of TinyStories. It will likely lack tokens for more complex words, technical jargon, or web-specific constructs (like HTML tags or parts of URLs) common in OpenWebText. As a result, these out-of-domain terms will be broken down into many smaller, more generic subword units or even individual bytes if they are completely unseen.
2.  **Shorter Average Token Length (for OWT-specific terms):** Complex words from OWT that would be single tokens (or few tokens) with `tokenizer_owt` will be represented by sequences of shorter tokens by `tokenizer_ts`.
3.  **No Unknown Tokens, but Sub-optimal Segmentation:** Because our BPE implementation can always fall back to individual bytes, every input can be tokenized and there are no true "unknown" tokens. The cost is efficiency: words or character sequences common in OWT but rare/absent in TinyStories will not have dedicated merged tokens in `tokenizer_ts`, so OWT text will be represented poorly.
4.  **Examples:**
    *   A technical term like "hyperparameter" might be tokenized as `['hyper', 'para', 'meter']` or even `['h', 'y', 'p', 'e', 'r', 'p', 'a', 'r', 'a', 'm', 'e', 't', 'e', 'r']` by `tokenizer_ts` if these sub-parts are not common in TinyStories, whereas `tokenizer_owt` might have `['hyperparameter']` or `['hyper', 'parameter']`.
    *   An HTML tag like `<div>` would likely be `['<', 'd', 'i', 'v', '>']` with `tokenizer_ts`, while `tokenizer_owt` might have learned `['<div>']` or `['<div', '>']` if HTML is common in its training data.

### Problem (tokenizer_experiments): (c) Throughput Estimation

In [12]:
def run_throughput_estimation_experiment():
    """Time the OpenWebText tokenizer on a large file and extrapolate to The Pile.

    The tokenizer is built from the OpenWebText vocab/merges files, a single
    ``encode`` call over the OpenWebText dev split is timed (the TinyStories dev
    split is used when that file is missing), and the resulting throughput plus
    the projected time to tokenize 825 GiB of text are printed.

    Nothing is returned. Errors raised while loading the tokenizer or during the
    timed run are caught and printed rather than propagated; the early exits do
    not print the closing "Finished" banner.
    """
    print("--- Starting Throughput Estimation Experiment ---")

    specials = [SPECIAL_TOKEN_DELIMITER]

    # Step 1: build the tokenizer. Any problem here ends the experiment early.
    try:
        have_tokenizer_files = OPENWEBTEXT_VOCAB_FILE.exists() and OPENWEBTEXT_MERGES_FILE.exists()
        if not have_tokenizer_files:
            print(f"OpenWebText tokenizer files not found. Cannot run throughput estimation. Searched: {OPENWEBTEXT_VOCAB_FILE}, {OPENWEBTEXT_MERGES_FILE}")
            return
        owt_tokenizer = Tokenizer.from_files(
            str(OPENWEBTEXT_VOCAB_FILE),
            str(OPENWEBTEXT_MERGES_FILE),
            specials,
        )
        print("OpenWebText Tokenizer loaded for throughput estimation.")
    except FileNotFoundError as missing:
        print(f"Error loading OpenWebText tokenizer: {missing}.")
        return
    except Exception as failure:
        print(f"An unexpected error occurred while loading OpenWebText tokenizer: {failure}")
        import traceback
        traceback.print_exc()
        return

    # Step 2: choose the benchmark text. The OpenWebText dev split is preferred;
    # a bigger file (roughly 10-100MB) gives a more trustworthy estimate.
    sample_path = OPENWEBTEXT_DEV_FILE
    if not sample_path.exists():
        print(f"Error: Large text file for throughput test not found at {sample_path}.")
        print("Please ensure OpenWebText dev split (or another large file) is prepared.")
        # Without the OWT dev split, TinyStories dev still lets the flow be exercised.
        if not TINYSTORIES_DEV_FILE.exists():
            print("No suitable large text file found. Aborting throughput estimation.")
            return
        print(f"Falling back to TinyStories dev file: {TINYSTORIES_DEV_FILE}")
        sample_path = TINYSTORIES_DEV_FILE

    # Step 3: read, time one encode pass, and report.
    bytes_per_mib = 1024 * 1024
    try:
        print(f"Reading large text file: {sample_path} (size: {sample_path.stat().st_size / bytes_per_mib:.2f} MB)")
        with open(sample_path, 'r', encoding='utf-8') as handle:
            corpus_text = handle.read()

        if not corpus_text:
            print("The large text file is empty. Cannot estimate throughput.")
            return

        print("Estimating throughput for tokenizer_owt.encode()...")
        started = time.perf_counter()
        # The result is kept bound so that freeing it cannot land inside the timed window.
        encoded_ids = owt_tokenizer.encode(corpus_text)
        elapsed = time.perf_counter() - started

        # Throughput is expressed against the UTF-8 size of the text that was encoded.
        utf8_size = len(corpus_text.encode('utf-8'))

        if elapsed <= 0:
            print("Encoding duration was zero. Cannot calculate throughput (file might be too small or timer resolution issue).")
        else:
            bytes_per_second = utf8_size / elapsed
            print(f"Input size: {utf8_size / bytes_per_mib:.2f} MB")
            print(f"Encoding time: {elapsed:.2f} seconds")
            print(f"Throughput (tokenizer_owt.encode): {bytes_per_second / bytes_per_mib:.2f} MB/second")

            # Extrapolate the measured rate to The Pile (825 GiB).
            pile_size_gib = 825
            pile_seconds = (pile_size_gib * (1024**3)) / bytes_per_second
            pile_days = pile_seconds / (3600 * 24)
            print(f"Estimated time to tokenize The Pile ({pile_size_gib} GiB): {pile_days:.2f} days")

    except Exception as failure:
        print(f"Error during throughput estimation: {failure}")
        import traceback
        traceback.print_exc()

    print("--- Throughput Estimation Experiment Finished ---")

### Problem (tokenizer_experiments): (d) Dataset Tokenization and Storage

In [13]:
def run_dataset_tokenization_and_storage_experiment():
    """Tokenize the TinyStories and OpenWebText train/dev splits and store them as ``.npy`` files.

    Each split is read fully into memory, encoded with the tokenizer loaded for the
    same corpus, and written under ``DATA_TOKENIZED_DATASETS_PATH`` as an
    ``np.uint16`` array. A corpus whose tokenizer files are missing is skipped. A
    split whose input file is missing, or whose encoding, validation or saving
    fails, is reported and skipped without aborting the remaining splits.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    print("--- Starting Dataset Tokenization and Storage Experiment ---")

    special_tokens_list = [SPECIAL_TOKEN_DELIMITER]

    # Ensure output directory exists
    DATA_TOKENIZED_DATASETS_PATH.mkdir(parents=True, exist_ok=True)

    # Define output .npy file paths
    ts_train_npy = DATA_TOKENIZED_DATASETS_PATH / "tinystories_train.npy"
    ts_dev_npy = DATA_TOKENIZED_DATASETS_PATH / "tinystories_dev.npy"
    owt_train_npy = DATA_TOKENIZED_DATASETS_PATH / "openwebtext_train.npy"
    owt_dev_npy = DATA_TOKENIZED_DATASETS_PATH / "openwebtext_dev.npy"

    # Load Tokenizers. A tokenizer left as None means "skip that corpus".
    tokenizer_ts, tokenizer_owt = None, None
    try:
        if not (TINYSTORIES_VOCAB_FILE.exists() and TINYSTORIES_MERGES_FILE.exists()):
            print(f"TinyStories tokenizer files not found. Skipping TS tokenization. Searched: {TINYSTORIES_VOCAB_FILE}, {TINYSTORIES_MERGES_FILE}")
        else:
            tokenizer_ts = Tokenizer.from_files(str(TINYSTORIES_VOCAB_FILE), str(TINYSTORIES_MERGES_FILE), special_tokens_list)
            print("TinyStories Tokenizer loaded.")

        if not (OPENWEBTEXT_VOCAB_FILE.exists() and OPENWEBTEXT_MERGES_FILE.exists()):
            print(f"OpenWebText tokenizer files not found. Skipping OWT tokenization. Searched: {OPENWEBTEXT_VOCAB_FILE}, {OPENWEBTEXT_MERGES_FILE}")
        else:
            tokenizer_owt = Tokenizer.from_files(str(OPENWEBTEXT_VOCAB_FILE), str(OPENWEBTEXT_MERGES_FILE), special_tokens_list)
            print("OpenWebText Tokenizer loaded.")
    except FileNotFoundError as e:
        print(f"Error loading tokenizers: {e}. Ensure training experiments were run.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while loading tokenizers: {e}")
        import traceback; traceback.print_exc()
        return

    # Largest value the on-disk dtype can hold (65535 for uint16).
    uint16_max = np.iinfo(np.uint16).max

    def _tokenize_and_save(input_text_file: Path, output_npy_file: Path, tokenizer: Tokenizer, dataset_name: str) -> None:
        """Read one text file, encode it, and save the token IDs as a ``np.uint16`` ``.npy`` file.

        Args:
            input_text_file: UTF-8 text file to tokenize.
            output_npy_file: Destination path handed to ``np.save``.
            tokenizer: Tokenizer whose ``encode`` turns the text into token IDs.
            dataset_name: Human-readable label used only in log messages.

        Any exception raised while reading, encoding, validating or saving is
        caught and printed, so a failure on one split does not stop the others.
        """
        if not tokenizer:
            print(f"Tokenizer for {dataset_name} not loaded. Skipping tokenization of {input_text_file.name}.")
            return

        print(f"\nTokenizing {dataset_name} split: {input_text_file.name} -> {output_npy_file.name}")
        if not input_text_file.exists():
            print(f"Error: Input text file {input_text_file} not found. Skipping.")
            return
        try:
            with open(input_text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            if not text_content:
                print(f"Warning: Text file {input_text_file} is empty. Saving empty array.")
                token_ids_array = np.array([], dtype=np.uint16)
            else:
                print(f"Encoding {input_text_file.name}...")
                start_time = time.perf_counter()
                token_ids = tokenizer.encode(text_content)
                duration = time.perf_counter() - start_time
                print(f"Encoded {len(token_ids)} tokens in {duration:.2f} seconds.")

                # Validate before casting: depending on the NumPy version, an ID outside
                # the uint16 range either wraps around silently (corrupting the dataset)
                # or fails with a generic overflow error. Raising here, before np.save,
                # guarantees no corrupted file is written and names the offending range.
                if len(token_ids) > 0:
                    lowest_id, highest_id = min(token_ids), max(token_ids)
                    if lowest_id < 0 or highest_id > uint16_max:
                        raise ValueError(
                            f"Token IDs for {dataset_name} span [{lowest_id}, {highest_id}], "
                            f"which does not fit in np.uint16 (0..{uint16_max}); refusing to save."
                        )
                token_ids_array = np.array(token_ids, dtype=np.uint16)

            np.save(str(output_npy_file), token_ids_array)
            print(f"Successfully saved tokenized data to {output_npy_file}")
            print(f"  Shape: {token_ids_array.shape}, Dtype: {token_ids_array.dtype}, Size: {token_ids_array.nbytes / (1024*1024):.2f} MB")

        except Exception as e:
            print(f"Error tokenizing or saving {input_text_file.name}: {e}")
            import traceback; traceback.print_exc()

    # Tokenize TinyStories splits
    if tokenizer_ts:
        _tokenize_and_save(TINYSTORIES_TRAIN_FILE, ts_train_npy, tokenizer_ts, "TinyStories Train")
        _tokenize_and_save(TINYSTORIES_DEV_FILE, ts_dev_npy, tokenizer_ts, "TinyStories Dev")
    else:
        print("TinyStories tokenizer not available, skipping its dataset tokenization.")

    # Tokenize OpenWebText splits
    if tokenizer_owt:
        _tokenize_and_save(OPENWEBTEXT_TRAIN_FILE, owt_train_npy, tokenizer_owt, "OpenWebText Train")
        _tokenize_and_save(OPENWEBTEXT_DEV_FILE, owt_dev_npy, tokenizer_owt, "OpenWebText Dev")
    else:
        print("OpenWebText tokenizer not available, skipping its dataset tokenization.")

    print("--- Dataset Tokenization and Storage Experiment Finished ---")

#### Justification for `np.uint16`

`np.uint16` is an appropriate data type for storing the token IDs for the following reasons:

1.  **Range Coverage:** `np.uint16` is an unsigned 16-bit integer, meaning it can represent values from 0 to 2<sup>16</sup> - 1, which is 0 to 65,535.
    *   For the TinyStories tokenizer, the target `vocab_size` is 10,000, so the largest token ID is at most 9,999.
    *   For the OpenWebText tokenizer, the target `vocab_size` is 32,000, so the largest token ID is at most 31,999.

    Both maximum IDs fall comfortably within the 0-65,535 range. This ensures that all possible token IDs generated by these tokenizers can be stored without overflow or data loss.

2.  **Memory Efficiency:** Using `np.uint16` (2 bytes per token ID) is more memory-efficient than using larger integer types like `np.int32` (4 bytes) or `np.int64` (8 bytes) when the maximum ID does not require the larger range. For large tokenized datasets, this difference in memory usage can be substantial.
    *   If we used `np.int32`, we would be using twice the necessary memory.

3.  **Non-Negativity:** Token IDs are typically non-negative integers starting from 0, so an unsigned integer type (`uint`) is suitable.

Therefore, `np.uint16` provides a good balance between correctly representing all possible token IDs and minimizing memory footprint for the specified vocabulary sizes.

### Example Usage and Main Execution Block

In [14]:
if __name__ == '__main__':
    # Every experiment below reads from or writes to these directories, so create them first.
    for _data_dir in (DATA_RAW_PATH, DATA_TOKENIZERS_PATH, DATA_TOKENIZED_DATASETS_PATH):
        _data_dir.mkdir(parents=True, exist_ok=True)
    print(f"Data directories '{DATA_RAW_PATH.resolve()}',\n'{DATA_TOKENIZERS_PATH.resolve()}',\nand '{DATA_TOKENIZED_DATASETS_PATH.resolve()}' ensured.")

    # ---- Section 0: quick encode/decode round-trip check (the only step enabled by default) ----
    print("\n--- Running Quick Encode/Decode Verification (Section 0) ---")
    run_quick_encode_decode_verification()

    # The remaining steps are opt-in: uncomment the calls you want to run.

    # ---- Section 1: dataset preparation ----
    # print("\n--- Preparing TinyStories (Section 1) ---")
    # prepare_tinystories_dataset()
    # print("\n--- Preparing OpenWebText (Section 1) ---")
    # For quicker testing, shrink the dev split and cap the train split, e.g.:
    # prepare_openwebtext_dataset(dev_split_size=100, train_split_limit=1000)
    # Full OpenWebText preparation:
    # prepare_openwebtext_dataset()

    # ---- Section 2: BPE training on TinyStories ----
    # print("\n--- Running TinyStories BPE Training Experiment (Section 2) ---")
    # vocab_ts, merges_ts = run_train_bpe_tinystories_experiment()

    # ---- Section 2: BPE training on OpenWebText ----
    # print("\n--- Running OpenWebText BPE Training Experiment (Section 2) ---")
    # vocab_owt, merges_owt = run_train_bpe_openwebtext_experiment()

    # ---- Section 3: tokenizer experiments ----
    # (a) Compression ratio
    # print("\n--- Running Compression Ratio Experiment (Section 3a) ---")
    # run_compression_ratio_experiment()

    # (b) Cross-tokenization
    # print("\n--- Running Cross-Tokenization Experiment (Section 3b) ---")
    # run_cross_tokenization_experiment()

    # (c) Throughput estimation
    # print("\n--- Running Throughput Estimation Experiment (Section 3c) ---")
    # run_throughput_estimation_experiment()

    # (d) Dataset tokenization and storage
    # print("\n--- Running Dataset Tokenization and Storage Experiment (Section 3d) ---")
    # run_dataset_tokenization_and_storage_experiment()

    print("\nExperiment script finished. Uncomment calls in __main__ to run specific parts.")

Data directories '/Users/abhisheksharma/Desktop/src/github/projects/llm_scratch/lm/data/raw',
'/Users/abhisheksharma/Desktop/src/github/projects/llm_scratch/lm/data/tokenizers',
and '/Users/abhisheksharma/Desktop/src/github/projects/llm_scratch/lm/data/tokenized_datasets' ensured.

--- Running Quick Encode/Decode Verification (Section 0) ---
--- Starting Quick Encode/Decode Verification ---
Small corpus written to temporary file: /var/folders/1_/__n8_ny14_g0tlyfd57xj1rw0000gn/T/tmp9epck826
Training BPE on small corpus with vocab_size=271...


Training complete. Vocab size: 271, Merges: 14
Tokenizer instantiated.

Test string 1: 'hello world.'
  Encoded IDs (12): [105, 102, 109, 109, 112, 33, 120, 112, 115, 109, 101, 47]
  Decoded text: 'hello world.'
  PASSED: Roundtrip successful.

Test string 2: 'this is a test.'
  Encoded IDs (15): [117, 105, 106, 116, 33, 106, 116, 33, 98, 33, 117, 102, 116, 117, 47]
  Decoded text: 'this is a test.'
  PASSED: Roundtrip successful.

Test string 3: 'hello again, world!'
  Encoded IDs (19): [105, 102, 109, 109, 112, 33, 98, 104, 98, 106, 111, 45, 33, 120, 112, 115, 109, 101, 34]
  Decoded text: 'hello again, world!'
  PASSED: Roundtrip successful.

Test string 4: '<|endoftext|> handled?'
  Encoded IDs (10): [0, 33, 105, 98, 111, 101, 109, 102, 101, 64]
  Decoded text: '<|endoftext|> handled?'
  PASSED: Roundtrip successful.

Test string 5: 'hello world.\nthis is a test.\nhello again, world!\nspecial tokens like <|endoftext|> should be handled...'
  Encoded IDs (147): [105, 102, 109, 109, 

---