# PIPELINE v0.1

## Libraries

In [1]:
import csv
import torch
import os
from typing import List, Type
from tqdm import tqdm
import json

# Database Imports
from sqlmodel import SQLModel, Field, Session, create_engine, select
from sqlalchemy import Column, text
from pgvector.sqlalchemy import Vector

## Configuration

In [2]:
import os
from dotenv import load_dotenv

# Read a local .env file when one exists, so a .env written by docker can be
# reused for local runs.
load_dotenv()

# SQLAlchemy connection string. The DATABASE_URL environment variable is used
# when set; otherwise fall back to a database on localhost:5433.
# NOTE(review): the fallback embeds a username and password - presumably
# local-development only; confirm before sharing this notebook.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql+psycopg://nick:secret@localhost:5433/vectordb")

# Location of the JSONL corpus to embed. The DATASET_PATH environment variable
# is used when set.
DATASET_PATH = os.getenv("DATASET_PATH", "../data_filtered/corpus_filtered.jsonl")

In [3]:
# Setup - choose which embedding experiment the pipeline runs.
#
# Every runnable experiment is described once in EXPERIMENTS, so the model id,
# table name, vector dimension and embedder class always change together.
# Switch experiments by editing CURRENT_EXPERIMENT only.
import sys
import os

# Make the repo root importable. This assumes the notebook's working directory
# is one level below the repo root (e.g. the pipeline folder).
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from tokenization.our_tokenizers.ByT5.ByT5_embedding import ByT5Embedder
from tokenization.our_tokenizers.Canine.Canine_embedding import CanineEmbedder

EXPERIMENTS = {
    # Experiment B: ByT5 (noted as working).
    'byt5_small': {
        'model_id': 'google/byt5-small',
        'table_name': 'byt5_small',
        'dimension': 1472,
        'embedder': ByT5Embedder,
    },
    # Experiment C: Canine (noted as expected to work but not yet fully tested;
    # update this note once a complete run has been verified).
    'canine_s': {
        'model_id': 'google/canine-s',
        'table_name': 'canine_s',
        'dimension': 768,
        'embedder': CanineEmbedder,
    },
    # Not available yet - add an entry here once they are ready:
    #   Experiment A: BPE (table 'BPE'); embedder and vector dimension TBD.
    #   Experiment D: SentencePiece (table 'sentencepiece'); model id and
    #   vector dimension TBD.
}

# The one switch: must be a key of EXPERIMENTS.
CURRENT_EXPERIMENT = 'canine_s'

# Names consumed by the rest of the notebook, derived from the selected entry.
_selected = EXPERIMENTS[CURRENT_EXPERIMENT]
CURRENT_MODEL_ID = _selected['model_id']
CURRENT_TABLE_NAME = _selected['table_name']
VECTOR_DIMENSION = _selected['dimension']
CURRENT_EMBEDDER = _selected['embedder']


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Report the installed PyTorch build and whether CUDA can be used.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No CUDA: warn that embedding will be slow.
    print("⚠️  Running on CPU - embeddings will be slow")
else:
    # Describe device 0 and the CUDA version reported by torch.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.6.0.dev20241112+cu121
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA version: 12.1
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA version: 12.1


## Dynamic Database Creation

In [5]:
def create_table_class(table_name: str, dim: int) -> Type[SQLModel]:
    """
    Build a SQLModel table class bound to ``table_name``.

    This lets the same schema be stored under different table names
    (e.g. 'bert_v1', 'bert_v2') without writing a class per table.

    Args:
        table_name: Name of the database table the class maps to.
        dim: Dimension passed to the pgvector ``Vector`` column type.

    Returns:
        The newly defined ``DynamicTable`` class, with columns ``id``
        (primary key), ``title``, ``text`` and ``embedding``.
    """
    # The class is defined inside the function so that table_name and dim
    # can be captured from the arguments.
    class DynamicTable(SQLModel, table=True):
        __tablename__ = table_name
        __table_args__ = {'extend_existing': True} # Lets the table be redefined if one with this name is already registered (e.g. on cell re-run)

        # Document identifier; run_pipeline fills it from each record's '_id'
        id: str = Field(primary_key=True) 
        title: str
        text: str
        
        # Embedding stored in a pgvector column of dimension `dim`
        embedding: List[float] = Field(sa_column=Column(Vector(dim)))

    return DynamicTable

## Pipeline

In [6]:
def _embed_texts(embedder, texts):
    """Return one embedding per entry of ``texts``, in the same order.

    Uses ``embedder.generate_embeddings_batch`` when the embedder has it and
    falls back to one ``generate_embedding`` call per text otherwise.

    Raises:
        ValueError: if the embedder returns a different number of vectors
            than texts (zipping would silently drop or misalign documents).
    """
    if hasattr(embedder, 'generate_embeddings_batch'):
        vectors = embedder.generate_embeddings_batch(texts)
    else:
        vectors = [embedder.generate_embedding(item) for item in texts]

    vectors = list(vectors)
    if len(vectors) != len(texts):
        raise ValueError(
            f"embedder returned {len(vectors)} vectors for {len(texts)} texts"
        )
    return vectors


def _build_records(embedder, table_cls, docs):
    """Embed ``docs`` and wrap them in ``table_cls`` rows.

    Each doc is a dict with 'id', 'title' and 'text'. The embedded content is
    "<title>: <text>". On any failure the error is reported and an empty list
    is returned, so one bad batch cannot block the rest of the run.
    """
    try:
        contents = [f"{doc['title']}: {doc['text']}" for doc in docs]
        vectors = _embed_texts(embedder, contents)
        return [
            table_cls(
                id=doc['id'],
                title=doc['title'],
                text=doc['text'],
                embedding=vector,
            )
            for doc, vector in zip(docs, vectors)
        ]
    except Exception as e:
        print(f"Error processing doc: {e}")
        return []


def _commit_records(session, records):
    """Insert ``records`` in one transaction and return how many were stored.

    A failed commit is rolled back so the session stays usable for later
    batches; the failure is reported and 0 is returned.
    """
    if not records:
        return 0
    try:
        session.add_all(records)
        session.commit()
    except Exception as e:
        session.rollback()
        print(f"Error committing {len(records)} records: {e}")
        return 0
    return len(records)


def run_pipeline(batch_embedding_size=32, db_batch_size=100):
    """Embed every document of the JSONL dataset and store it in the database.

    Reads DATASET_PATH line by line, embeds "<title>: <text>" for each record
    that has an '_id', and inserts the rows into the table named by
    CURRENT_TABLE_NAME. Blank lines, invalid JSON and records without an id
    are skipped. A batch whose embedding or commit fails is reported and
    dropped; processing continues with the next batch.

    Args:
        batch_embedding_size: Number of documents embedded per embedder call.
            Lower it if the GPU runs out of memory.
        db_batch_size: Minimum number of embedded rows accumulated before a
            database commit.

    Returns:
        None. Progress and a final summary are printed.
    """
    # Fail fast: check the dataset before touching the database or loading
    # the model.
    if not os.path.exists(DATASET_PATH):
        print(f"Error: Dataset not found at {DATASET_PATH}")
        return

    # A. Setup Database
    engine = create_engine(DATABASE_URL)

    # Ensure pgvector extension exists
    with engine.connect() as conn:
        conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        conn.commit()

    # B. Define the Table Model based on configuration
    TableClass = create_table_class(CURRENT_TABLE_NAME, VECTOR_DIMENSION)
    SQLModel.metadata.create_all(engine)

    # C. Initialize ML Model
    embedder = CURRENT_EMBEDDER(CURRENT_MODEL_ID)

    print(f"--- Processing JSONL: {DATASET_PATH} ---")
    print(f"--- Target Table: {CURRENT_TABLE_NAME} ---")
    print(f"--- Batch embedding size: {batch_embedding_size} (GPU batching enabled) ---")

    pending_docs = []      # parsed documents waiting to be embedded
    pending_records = []   # embedded rows waiting to be committed
    stored = 0
    failed = 0
    skipped = 0

    # D. Process JSONL and Insert
    with Session(engine) as session:
        with open(DATASET_PATH, mode='r', encoding='utf-8') as f:
            # tqdm wraps the file object to show how many lines were read.
            for line in tqdm(f, desc="Embedding Docs"):
                if not line.strip():
                    continue  # Skip empty lines

                # 1. Parse JSON
                try:
                    row = json.loads(line)
                except json.JSONDecodeError:
                    print("Skipping invalid JSON line")
                    skipped += 1
                    continue
                if not isinstance(row, dict):
                    print(f"Error processing doc: expected a JSON object, got {type(row).__name__}")
                    skipped += 1
                    continue

                # 2. Extract data; a record without an id cannot be stored
                doc_id = row.get('_id')
                if not doc_id:
                    skipped += 1
                    continue
                pending_docs.append({
                    'id': doc_id,
                    'title': row.get('title', ''),
                    'text': row.get('text', ''),
                })

                # 3. Embed once enough documents are buffered. The buffer is
                # always cleared, so a failing batch is never retried.
                if len(pending_docs) >= batch_embedding_size:
                    records = _build_records(embedder, TableClass, pending_docs)
                    failed += len(pending_docs) - len(records)
                    pending_records.extend(records)
                    pending_docs = []

                # 4. Batch commit to DB
                if len(pending_records) >= db_batch_size:
                    committed = _commit_records(session, pending_records)
                    stored += committed
                    failed += len(pending_records) - committed
                    pending_records = []

        # 5. Embed whatever is left in the last, partial batch
        if pending_docs:
            records = _build_records(embedder, TableClass, pending_docs)
            failed += len(pending_docs) - len(records)
            pending_records.extend(records)

        # 6. Commit remaining records
        if pending_records:
            committed = _commit_records(session, pending_records)
            stored += committed
            failed += len(pending_records) - committed

    if failed:
        print("\n--- Pipeline Finished With Errors ---")
    else:
        print("\n--- Pipeline Finished Successfully ---")
    print(f"--- Stored: {stored} | Failed: {failed} | Skipped: {skipped} ---")

In [7]:
# Run the full process. batch_embedding_size is the number of documents
# embedded per call: use a smaller value if your GPU runs out of memory.
run_pipeline(batch_embedding_size=8)

--- Loading CANINE Model: google/canine-s ---
Using device: cuda
--- Processing JSONL: ../data_filtered/corpus_filtered.jsonl ---
--- Target Table: canine_s ---
--- Batch embedding size: 8 (GPU batching enabled) ---
--- Processing JSONL: ../data_filtered/corpus_filtered.jsonl ---
--- Target Table: canine_s ---
--- Batch embedding size: 8 (GPU batching enabled) ---


Embedding Docs: 79it [00:10,  7.23it/s]



KeyboardInterrupt: 