# SASLM Data Preparation Pipeline

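This notebook implements the data preparation pipeline for the Sri Aurobindo Small Language Model (SASLM): it extracts text from the source PDFs and then trains a BPE tokenizer on the extracted text.
Text extraction uses Tesseract OCR with the `script/Latin` model to accurately capture Romanized Sanskrit diacritics.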

In [None]:
# Install dependencies (Run this if on Colab)
!sudo apt-get install tesseract-ocr tesseract-ocr-script-latn poppler-utils
!pip install pytesseract pdf2image tqdm

In [None]:
import glob
import logging
import os
from concurrent.futures import ThreadPoolExecutor

import pytesseract
from pdf2image import convert_from_path, pdfinfo_from_path
from tqdm.auto import tqdm

# Settings
RAW_DATA_DIR = "raw_data"  # Path to folder containing PDFs
OUTPUT_DIR = "processed_text"  # One .txt file per PDF is written here
DPI = 300  # Resolution used when rasterizing PDF pages for OCR
LANG = 'script/Latin'  # Tesseract model passed as `lang`
# Despite the "_PERCENT" suffix these are fractions of the page height (0.08 == 8%)
# cropped from the top and bottom of every page before OCR.
HEADER_CUTOFF_PERCENT = 0.08
FOOTER_CUTOFF_PERCENT = 0.08

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def clean_ocr_text(text):
    """Drop blank lines and standalone page numbers from OCR output.

    A line is discarded when, after stripping surrounding whitespace, it is
    empty or consists solely of digits and is shorter than four characters.
    Lines that are kept retain their original leading/trailing whitespace.

    Args:
        text: Raw OCR text; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The surviving lines joined with ``"\\n"``.
    """
    if not text:
        return ""

    def _is_content(raw_line):
        stripped = raw_line.strip()
        if not stripped:
            return False
        # Short all-digit lines are treated as page numbers.
        looks_like_page_number = stripped.isdigit() and len(stripped) < 4
        return not looks_like_page_number

    return "\n".join(filter(_is_content, text.split('\n')))

def process_pdf(pdf_path, chunk_size=10):
    """OCR a single PDF and write its text to ``OUTPUT_DIR/<pdf name>.txt``.

    Pages are rasterized ``chunk_size`` at a time so that only one chunk of
    page images is held in memory, instead of the whole document. Each page
    has its header and footer bands cropped off before being passed to
    Tesseract, and the recognized text is cleaned with ``clean_ocr_text``.

    The result is written to a temporary file and renamed into place only
    once complete, so an interrupted run never leaves a truncated ``.txt``
    that a later run would skip as "already processed".

    Args:
        pdf_path: Path to the PDF file to process.
        chunk_size: Number of pages to rasterize per batch (values below 1
            are treated as 1).

    Returns:
        None. Errors are reported with ``print`` rather than raised, so one
        bad PDF does not stop a batch run.
    """
    filename = os.path.basename(pdf_path)
    item_name = os.path.splitext(filename)[0]
    output_path = os.path.join(OUTPUT_DIR, f"{item_name}.txt")

    if os.path.exists(output_path):
        print(f"Skipping {filename}, already processed.")
        return

    print(f"Processing {filename}...")
    # The ".part" suffix keeps the incomplete file out of any "*.txt" glob.
    temp_path = output_path + ".part"
    try:
        page_count = int(pdfinfo_from_path(pdf_path)["Pages"])
        pages_per_chunk = max(1, int(chunk_size))
        full_text = []

        with tqdm(total=page_count, desc=f"Pages in {filename}") as progress:
            # pdf2image page numbers are 1-based and last_page is inclusive.
            for first_page in range(1, page_count + 1, pages_per_chunk):
                last_page = min(first_page + pages_per_chunk - 1, page_count)
                images = convert_from_path(
                    pdf_path,
                    dpi=DPI,
                    first_page=first_page,
                    last_page=last_page,
                )

                for img in images:
                    # Crop away the header/footer bands before OCR.
                    w, h = img.size
                    top = int(h * HEADER_CUTOFF_PERCENT)
                    bottom = int(h * (1 - FOOTER_CUTOFF_PERCENT))
                    cropped_img = img.crop((0, top, w, bottom))

                    text = pytesseract.image_to_string(cropped_img, lang=LANG)
                    full_text.append(clean_ocr_text(text))
                    progress.update(1)

        with open(temp_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(full_text))
        # Atomic rename: output_path either does not exist or is complete.
        os.replace(temp_path, output_path)

    except Exception as e:
        print(f"Error processing {filename}: {e}")
        # Best-effort cleanup of a partially written temp file.
        try:
            os.remove(temp_path)
        except OSError:
            pass

In [None]:
# Run
# glob returns paths in arbitrary order; sort so every run processes the PDFs
# in the same, predictable sequence.
pdf_files = sorted(glob.glob(os.path.join(RAW_DATA_DIR, "*.pdf")))
print(f"Found {len(pdf_files)} PDFs.")

# For notebook, sequential is better to see progress bars
for p in pdf_files:
    process_pdf(p)

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
import glob
import os
import logging

# Configure logging
# basicConfig does nothing when the root logger already has handlers, which is
# possible in hosted notebook kernels; force=True (Python 3.8+) replaces them
# so the INFO messages below are actually shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def train_tokenizer(vocab_size=30000, input_dir="processed_text", save_path="saslm_tokenizer.json"):
    """
    Trains a byte-level BPE tokenizer on the .txt files in ``input_dir``.

    Args:
        vocab_size: Target vocabulary size passed to the BPE trainer.
        input_dir: Directory whose ``*.txt`` files form the training corpus.
        save_path: File the trained tokenizer is saved to (JSON).

    Returns:
        The trained ``Tokenizer``, or ``None`` if ``input_dir`` contains no
        ``.txt`` files (an error is logged in that case).
    """
    # glob order is arbitrary; sort so the corpus is fed in the same order on every run.
    files = sorted(glob.glob(os.path.join(input_dir, "*.txt")))
    if not files:
        logging.error(f"No text files found in {input_dir}/. Run extraction pipeline first.")
        return None

    logging.info(f"Found {len(files)} files to train on.")

    # Initialize Tokenizer in BPE
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Pre-tokenizer: byte-level (GPT-2 style), without adding a leading space
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    # Decoder: Reconstruct text from the byte-level tokens
    tokenizer.decoder = decoders.ByteLevel()

    # Trainer: seed the alphabet with the byte-level symbols returned by ByteLevel.alphabet()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<|endoftext|>"],
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
    )

    # Train
    logging.info("Starting tokenizer training...")
    tokenizer.train(files, trainer)

    # No post-processor is configured: the tokenizer is saved without
    # BERT-style [CLS]/[SEP] wrapping.

    # Save
    tokenizer.save(save_path)
    logging.info(f"Tokenizer saved to {save_path}")

    # Smoke test: encode one sample sentence and log the result
    sample = "The nature of the Supermind is rta-chit."
    encoded = tokenizer.encode(sample)
    logging.info(f"Test Sentence: '{sample}'")
    logging.info(f"Tokens: {encoded.tokens}")
    logging.info(f"IDs: {encoded.ids}")

    return tokenizer