# Complete Deep Learning Pipeline
Complete Pipeline: Preprocess → Tokenizer → Train Models → Embeddings → Evaluation

This notebook orchestrates the entire pipeline with configurable parameters at the top.

In [None]:
# ============================================================================
# 🔧 CONFIGURATION - MODIFY THESE BEFORE RUNNING
# ============================================================================
# Every stage cell below reads these module-level names, so this cell must be
# run first and re-run after any change.

# ========== SKIP FLAGS - Set to True to skip a stage ==========
SKIP_PREPROCESS = False
SKIP_TOKENIZER = True
SKIP_LSTM = False
SKIP_TRANSFORMER = False
SKIP_EMBEDDINGS = False
SKIP_EVALUATION = False

# ========== PREPROCESSING PARAMETERS ==========
CORPUS_SIZE = 10  # Tiny corpus, intended for quick test runs

# ========== TOKENIZER PARAMETERS ==========
TOKENIZER_VOCAB_SIZE = 2000

# ========== LSTM TRAINING PARAMETERS ==========
LSTM_EPOCHS = 1  # Kept small for quick test runs
LSTM_BATCH_SIZE = 32
LSTM_SEQ_LENGTH = 128
LSTM_LEARNING_RATE = 0.001

# ========== TRANSFORMER TRAINING PARAMETERS ==========
TRANSFORMER_EPOCHS = 1  # Kept small for quick test runs
TRANSFORMER_BATCH_SIZE = 32
TRANSFORMER_SEQ_LENGTH = 128
TRANSFORMER_LEARNING_RATE = 0.001

# ========== EMBEDDINGS PARAMETERS ==========
# None = all models ['byt5', 'canine', 'bpe-lstm', 'bpe-transformer', 'bert']
EMBEDDINGS_MODELS = None
EMBEDDINGS_CLEAR_EXISTING = True

# ============================================================================
# Display current configuration
# ============================================================================
print("=" * 80)
print("PIPELINE CONFIGURATION")
print("=" * 80)
print("\n📍 SKIP FLAGS:")
print(f"  SKIP_PREPROCESS: {SKIP_PREPROCESS}")
print(f"  SKIP_TOKENIZER: {SKIP_TOKENIZER}")
print(f"  SKIP_LSTM: {SKIP_LSTM}")
print(f"  SKIP_TRANSFORMER: {SKIP_TRANSFORMER}")
print(f"  SKIP_EMBEDDINGS: {SKIP_EMBEDDINGS}")
print(f"  SKIP_EVALUATION: {SKIP_EVALUATION}")
print("\n⚙️ PARAMETERS:")
print(f"  Corpus size: {CORPUS_SIZE}")
print(f"  Tokenizer vocab size: {TOKENIZER_VOCAB_SIZE}")
# The learning rates are passed to the training stages, so show them here too.
print(
    f"  LSTM epochs: {LSTM_EPOCHS}, batch size: {LSTM_BATCH_SIZE}, "
    f"seq length: {LSTM_SEQ_LENGTH}, learning rate: {LSTM_LEARNING_RATE}"
)
print(
    f"  Transformer epochs: {TRANSFORMER_EPOCHS}, batch size: {TRANSFORMER_BATCH_SIZE}, "
    f"seq length: {TRANSFORMER_SEQ_LENGTH}, learning rate: {TRANSFORMER_LEARNING_RATE}"
)
print(f"  Embeddings models: {EMBEDDINGS_MODELS}")
print(f"  Embeddings clear existing: {EMBEDDINGS_CLEAR_EXISTING}")
print("=" * 80)

In [None]:
# ============================================================================
# SETUP: Import Libraries & Set Path
# ============================================================================
import os
import sys
import subprocess
from pathlib import Path

# Work out the repository root from the directory the notebook was started in:
# when that directory is named "pipeline" the root is its parent, otherwise the
# start directory itself is treated as the root.
notebook_dir = Path.cwd()
if notebook_dir.name == 'pipeline':
    repo_root = notebook_dir.parent
else:
    repo_root = notebook_dir

# Put the root at the front of the import path, but only if it is not already
# listed (keeps re-running this cell from adding duplicates).
if sys.path.count(str(repo_root)) == 0:
    sys.path.insert(0, str(repo_root))

# Later cells build paths from repo_root and import project packages, so make
# the root the working directory as well.
os.chdir(repo_root)

print("Notebook directory:", notebook_dir)
print("Repository root:", repo_root)
print("Current working directory:", os.getcwd())
print("Python path updated")

## Stage 1: Preprocessing
Preprocess NQ dataset: filter corpus and align queries

In [None]:
def stage_preprocess():
    """Run Stage 1 (preprocessing) unless SKIP_PREPROCESS is set.

    Calls ``preprocess_data`` with ``corpus_size=CORPUS_SIZE`` and prints the
    corpus and queries values it returns. The notebook describes this stage
    as filtering the NQ corpus and aligning queries.

    Returns:
        None. Progress is reported through ``print`` only.

    Raises:
        Exception: any error raised by ``preprocess_data`` is reported with an
            ``[ERROR]`` line and then re-raised unchanged.
    """
    if SKIP_PREPROCESS:
        print("\n[SKIP] Preprocessing")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 1: PREPROCESSING")
    print(rule)

    # Imported after the skip check so a skipped stage never loads the module.
    from data_processing.nq_preprocess import preprocess_data

    try:
        print("\nParameters:")
        print(f"  Corpus size: {CORPUS_SIZE}")

        outputs = preprocess_data(corpus_size=CORPUS_SIZE)
        corpus_file, queries_file = outputs
        print("\n[OK] Preprocessing complete")
        print(f"  Corpus: {corpus_file}")
        print(f"  Queries: {queries_file}")
    except Exception as exc:
        # Report the failure, then let it propagate to the caller.
        print(f"\n[ERROR] Preprocessing failed: {exc}")
        raise

# Run the preprocessing stage (it only prints a [SKIP] notice when SKIP_PREPROCESS is True)
stage_preprocess()

## Stage 2: Tokenizer Training
Train BPE tokenizer on dataset

In [None]:
def stage_tokenizer():
    """Run Stage 2 (BPE tokenizer training) unless SKIP_TOKENIZER is set.

    Launches ``train_tokenizer.py`` from ``tokenization/our_tokenizers`` under
    the repository root as a child process of the current interpreter, with
    that folder as its working directory. The child's output is not captured.

    Returns:
        None. Progress is reported through ``print`` only.

    Raises:
        subprocess.CalledProcessError: the script exited with a non-zero code.
        Exception: any other failure while launching the script; re-raised
            after an ``[ERROR]`` line is printed.
    """
    if SKIP_TOKENIZER:
        print("\n[SKIP] Tokenizer training")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 2: TOKENIZER TRAINING")
    print(rule)

    tokenizer_dir = repo_root / 'tokenization' / 'our_tokenizers'
    tokenizer_script = tokenizer_dir / 'train_tokenizer.py'

    try:
        print("\nParameters:")
        print(f"  Vocab size: {TOKENIZER_VOCAB_SIZE}")
        print("\nRunning tokenizer training...")
        # NOTE(review): TOKENIZER_VOCAB_SIZE is only displayed above; nothing
        # here passes it to the script, so the script presumably uses its own
        # setting - confirm against train_tokenizer.py.
        subprocess.run(
            [sys.executable, str(tokenizer_script)],
            cwd=tokenizer_dir,
            check=True,  # turn a non-zero exit code into CalledProcessError
            capture_output=False,
        )
        print("\n[OK] Tokenizer training complete")
    except subprocess.CalledProcessError as proc_err:
        print(f"\n[ERROR] Tokenizer training failed with exit code {proc_err.returncode}")
        raise
    except Exception as exc:
        print(f"\n[ERROR] Tokenizer training failed: {exc}")
        raise

# Run the tokenizer stage (it only prints a [SKIP] notice when SKIP_TOKENIZER is True)
stage_tokenizer()

## Stage 3A: Train LSTM Model
Train LSTM language model with BPE tokenization

In [None]:
def stage_train_lstm():
    """Run Stage 3A (LSTM language-model training) unless SKIP_LSTM is set.

    Prints the configured hyper-parameters, then calls the ``main`` function
    of ``models.LSTM.training.train_bpe_lstm`` with the batch size, sequence
    length, epoch count and learning rate from the configuration cell.

    Returns:
        None. Progress is reported through ``print`` only.

    Raises:
        Exception: any error raised by the training entry point is reported
            with an ``[ERROR]`` line and then re-raised unchanged.
    """
    if SKIP_LSTM:
        print("\n[SKIP] LSTM model training")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 3A: LSTM MODEL TRAINING")
    print(rule)

    # Imported after the skip check so a skipped stage never loads the module.
    from models.LSTM.training.train_bpe_lstm import main as train_lstm_main

    try:
        print("\nParameters:")
        for label, value in (
            ("Epochs", LSTM_EPOCHS),
            ("Batch size", LSTM_BATCH_SIZE),
            ("Sequence length", LSTM_SEQ_LENGTH),
            ("Learning rate", LSTM_LEARNING_RATE),
        ):
            print(f"  {label}: {value}")

        train_lstm_main(
            batch_size=LSTM_BATCH_SIZE,
            seq_length=LSTM_SEQ_LENGTH,
            num_epochs=LSTM_EPOCHS,
            learning_rate=LSTM_LEARNING_RATE,
        )
        print("\n[OK] LSTM training complete")
    except Exception as exc:
        # Report the failure, then let it propagate to the caller.
        print(f"\n[ERROR] LSTM training failed: {exc}")
        raise

# Run the LSTM training stage (it only prints a [SKIP] notice when SKIP_LSTM is True)
stage_train_lstm()

## Stage 3B: Train Transformer Model
Train Transformer language model with BPE tokenization

In [None]:
def stage_train_transformer():
    """Run Stage 3B (Transformer language-model training) unless SKIP_TRANSFORMER is set.

    Prints the configured hyper-parameters, then calls the ``main`` function
    of ``models.Transformer.training.train_bpe_transformer`` with the batch
    size, sequence length, epoch count and learning rate from the
    configuration cell.

    Returns:
        None. Progress is reported through ``print`` only.

    Raises:
        Exception: any error raised by the training entry point is reported
            with an ``[ERROR]`` line and then re-raised unchanged.
    """
    if SKIP_TRANSFORMER:
        print("\n[SKIP] Transformer model training")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 3B: TRANSFORMER MODEL TRAINING")
    print(rule)

    # Imported after the skip check so a skipped stage never loads the module.
    from models.Transformer.training.train_bpe_transformer import main as train_transformer_main

    try:
        print("\nParameters:")
        for label, value in (
            ("Epochs", TRANSFORMER_EPOCHS),
            ("Batch size", TRANSFORMER_BATCH_SIZE),
            ("Sequence length", TRANSFORMER_SEQ_LENGTH),
            ("Learning rate", TRANSFORMER_LEARNING_RATE),
        ):
            print(f"  {label}: {value}")

        train_transformer_main(
            batch_size=TRANSFORMER_BATCH_SIZE,
            seq_length=TRANSFORMER_SEQ_LENGTH,
            num_epochs=TRANSFORMER_EPOCHS,
            learning_rate=TRANSFORMER_LEARNING_RATE,
        )
        print("\n[OK] Transformer training complete")
    except Exception as exc:
        # Report the failure, then let it propagate to the caller.
        print(f"\n[ERROR] Transformer training failed: {exc}")
        raise

# Run the Transformer training stage (it only prints a [SKIP] notice when SKIP_TRANSFORMER is True)
stage_train_transformer()

## Stage 4: Embeddings Generation
Generate embeddings using all models and store in database

In [None]:
def stage_embeddings():
    """Run Stage 4 (embeddings generation) unless SKIP_EMBEDDINGS is set.

    Chooses the model list (the five built-in names when EMBEDDINGS_MODELS is
    None, otherwise EMBEDDINGS_MODELS as given) and passes it, together with
    EMBEDDINGS_CLEAR_EXISTING, to ``run_embeddings_pipeline``. The notebook
    describes this stage as storing the embeddings in a database.

    Returns:
        None. The pipeline's return value is not used.

    Raises:
        Exception: any error raised while preparing or running the pipeline
            is reported with an ``[ERROR]`` line and then re-raised unchanged.
    """
    if SKIP_EMBEDDINGS:
        print("\n[SKIP] Embeddings generation")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 4: EMBEDDINGS GENERATION")
    print(rule)

    # Imported after the skip check so a skipped stage never loads the module.
    from pipeline.run_all_embeddings import run_embeddings_pipeline

    try:
        # None means "run every known model".
        models = (
            ['byt5', 'canine', 'bpe-lstm', 'bpe-transformer', 'bert']
            if EMBEDDINGS_MODELS is None
            else EMBEDDINGS_MODELS
        )

        print("\nParameters:")
        print(f"  Models: {', '.join(models)}")
        print(f"  Clear tables: {EMBEDDINGS_CLEAR_EXISTING}")

        run_embeddings_pipeline(
            models=models,
            clear_existing=EMBEDDINGS_CLEAR_EXISTING,
        )
        print("\n[OK] Embeddings generation complete")
    except Exception as exc:
        # Report the failure, then let it propagate to the caller.
        print(f"\n[ERROR] Embeddings generation failed: {exc}")
        raise

# Run the embeddings stage (it only prints a [SKIP] notice when SKIP_EMBEDDINGS is True)
stage_embeddings()

## Stage 5: Evaluation
Evaluate all embedding models on retrieval task

In [None]:
def stage_evaluation():
    """Run Stage 5 (evaluation) unless SKIP_EVALUATION is set.

    Calls the ``main`` function of ``tokenization.evaluation.evaluation`` with
    no arguments. The notebook describes this stage as evaluating all
    embedding models on a retrieval task.

    Returns:
        None. Progress is reported through ``print`` only.

    Raises:
        Exception: any error raised by the evaluation entry point is reported
            with an ``[ERROR]`` line and then re-raised unchanged.
    """
    if SKIP_EVALUATION:
        print("\n[SKIP] Evaluation")
        return

    rule = "=" * 80
    print(f"\n{rule}")
    print("STAGE 5: EVALUATION")
    print(rule)

    # Imported after the skip check so a skipped stage never loads the module.
    from tokenization.evaluation.evaluation import main as evaluation_main

    try:
        evaluation_main()
        print("\n[OK] Evaluation complete")
    except Exception as exc:
        # Report the failure, then let it propagate to the caller.
        print(f"\n[ERROR] Evaluation failed: {exc}")
        raise

# Run the evaluation stage (it only prints a [SKIP] notice when SKIP_EVALUATION is True)
stage_evaluation()

## Pipeline Summary
Display the final status and summary

In [None]:
# Final banner: reached only when every preceding cell finished without
# raising. It repeats the skip flags so the output records which stages ran.
print("\n" + "=" * 80)
print("✅ PIPELINE EXECUTION COMPLETE")
print("=" * 80)
print("\nConfiguration Summary:")
print(f"  SKIP_PREPROCESS: {SKIP_PREPROCESS}")
print(f"  SKIP_TOKENIZER: {SKIP_TOKENIZER}")
print(f"  SKIP_LSTM: {SKIP_LSTM}")
print(f"  SKIP_TRANSFORMER: {SKIP_TRANSFORMER}")
print(f"  SKIP_EMBEDDINGS: {SKIP_EMBEDDINGS}")
print(f"  SKIP_EVALUATION: {SKIP_EVALUATION}")
print("=" * 80)