## Pipeline Overview

```
Parsed JSON Documents
    ↓
Load with TextChunker
    ↓
Create semantic chunks
    ↓
Embed with EmbeddingProvider
    ↓
Store in PostgreSQL
    ↓
Index for vector similarity search (pgvector)
```

## Pipeline Implementation

In [None]:
import json
import logging
import sys
from pathlib import Path
from typing import List, Optional

# The chunker, embedder and db modules built in the previous notebooks are
# assumed to be importable from the current Python environment.

# Minimal logging configuration: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# Module-level logger used by the pipeline code in this notebook.
logger = logging.getLogger(__name__)


class Pipeline:
    """Main ETL pipeline.

    Loads the parsed JSON documents found in ``<data_dir>/processed`` and
    iterates over them. The chunking, embedding and database stages are
    still commented out in this notebook, so ``process_documents`` currently
    reports zero stored chunks.
    """

    def __init__(
        self,
        data_dir: str = "../data",
        embedder_provider: str = "sentence-transformers",
        embedder_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        db_host: Optional[str] = None,
        db_port: Optional[int] = None,
        db_name: Optional[str] = None,
        db_user: Optional[str] = None,
        db_password: Optional[str] = None,
        max_chunk_tokens: int = 512,
        chunk_overlap_tokens: int = 50,
    ) -> None:
        """
        Initialize pipeline.

        Only ``data_dir`` is used while the component setup below remains
        commented out; the other arguments are accepted so the signature
        already matches the full pipeline.

        Args:
            data_dir: Base data directory path
            embedder_provider: "openai" or "sentence-transformers"
            embedder_model: Model name for embedder
            db_host: Database host
            db_port: Database port
            db_name: Database name
            db_user: Database user
            db_password: Database password
            max_chunk_tokens: Maximum tokens per chunk
            chunk_overlap_tokens: Token overlap between chunks
        """
        self.data_dir = Path(data_dir).resolve()
        self.processed_dir = self.data_dir / "processed"

        # Initialize components
        # from chunker import TextChunker
        # from embedder import create_embedder
        # from db import DatabaseConnector

        # self.chunker = TextChunker(
        #     max_chunk_tokens=max_chunk_tokens,
        #     overlap_tokens=chunk_overlap_tokens
        # )
        #
        # NOTE(review): the api_key expression below evaluates to None in both
        # branches - confirm how the OpenAI key is meant to be supplied.
        # self.embedder = create_embedder(
        #     provider=embedder_provider,
        #     model=embedder_model if embedder_provider == "sentence-transformers" else None,
        #     api_key=None if embedder_provider != "openai" else None
        # )
        #
        # self.db = DatabaseConnector(
        #     host=db_host,
        #     port=db_port,
        #     database=db_name,
        #     user=db_user,
        #     password=db_password
        # )

        logger.info(f"Pipeline initialized with data dir: {self.data_dir}")

    def load_parsed_documents(self) -> List[dict]:
        """
        Load all parsed JSON documents from the processed directory.

        Files are read in sorted path order so that repeated runs process
        documents in the same sequence regardless of the filesystem.

        Returns:
            The decoded content of every ``*.json`` file, one entry per file.

        Raises:
            FileNotFoundError: The processed directory does not exist.
            ValueError: The directory contains no JSON files, or a file is
                not valid JSON / UTF-8 (the decoder's own ValueError
                subclass is re-raised after the file name is logged).
        """
        if not self.processed_dir.exists():
            raise FileNotFoundError(f"Processed data directory not found: {self.processed_dir}")

        # glob() yields entries in arbitrary order and may match directories;
        # keep regular files only and sort for a deterministic run.
        json_files = sorted(
            path for path in self.processed_dir.glob("*.json") if path.is_file()
        )

        if not json_files:
            raise ValueError(f"No JSON files found in {self.processed_dir}")

        documents = []
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    documents.append(json.load(f))
            except (json.JSONDecodeError, UnicodeDecodeError):
                # The decoder's message does not name the file; log it
                # before propagating the original exception unchanged.
                logger.error(f"Failed to parse {json_file}")
                raise

        logger.info(f"Loaded {len(documents)} parsed documents")
        return documents

    def process_documents(
        self,
        documents: List[dict],
        skip_components: Optional[List[str]] = None,
    ) -> int:
        """
        Process documents through full pipeline.

        Args:
            documents: Parsed documents from generate_ast.ts
            skip_components: Component names to skip

        Returns:
            Number of chunks stored. Always 0 while the chunk/embed/store
            steps below are commented out.
        """
        skip_components = skip_components or []
        total_chunks_stored = 0

        for doc in documents:
            # A JSON file may decode to a list or scalar; such a value has no
            # "name" to look up, so skip it instead of crashing the whole run.
            if not isinstance(doc, dict):
                logger.warning(
                    f"Skipping document of type {type(doc).__name__} (expected a JSON object)"
                )
                continue

            component = doc.get("name", "unknown")

            if component in skip_components:
                logger.info(f"Skipping {component} (in skip list)")
                continue

            logger.info(f"Processing {component}")

            # Clear previous chunks for this component
            # deleted = self.db.delete_component_chunks(component)
            # if deleted > 0:
            #     logger.info(f"Deleted {deleted} previous chunks for {component}")

            # Chunk the document
            # chunks = self.chunker.chunk_documents([doc])
            # logger.info(f"Created {len(chunks)} chunks for {component}")

            # Embed and store each chunk
            # for chunk in chunks:
            #     try:
            #         embedding = self.embedder.embed_single(chunk.content)
            #         chunk_id = self.db.store_chunk(
            #             component=chunk.component,
            #             section_title=chunk.section_title,
            #             section_path=chunk.section_path,
            #             content=chunk.content,
            #             embedding=embedding,
            #             demo_files=chunk.demo_files,
            #             images=chunk.images,
            #             token_count=chunk.token_count
            #         )
            #         total_chunks_stored += 1
            #     except Exception as e:
            #         logger.error(
            #             f"Error storing chunk for {component} "
            #             f"(section: {chunk.section_title}): {e}"
            #         )

        logger.info(f"Stored {total_chunks_stored} chunks total")
        return total_chunks_stored

    def run(self, skip_components: Optional[List[str]] = None) -> dict:
        """
        Run complete pipeline.

        Args:
            skip_components: Component names to skip

        Returns:
            ``{"chunks_processed": <int>}`` with the count reported by
            ``process_documents``.

        Raises:
            Exception: Any error from loading or processing is logged with
                its traceback and re-raised unchanged.
        """
        try:
            logger.info("Starting ETL pipeline")

            # Connect to database
            # self.db.connect()

            # Create tables
            # logger.info("Creating database tables")
            # self.db.create_tables()

            # Load parsed documents
            documents = self.load_parsed_documents()

            # Process all documents
            total = self.process_documents(documents, skip_components)

            # Log statistics
            # stats = self.db.get_stats()
            # logger.info("Pipeline complete")
            # logger.info(f"Total chunks in database: {stats['total_chunks']}")
            # logger.info(f"Average chunk size: {stats['avg_token_count']:.0f} tokens")

            # return stats

            return {"chunks_processed": total}

        except Exception as e:
            # Top-level boundary: record the traceback, then let the caller
            # see the original exception.
            logger.exception(f"Pipeline failed: {e}")
            raise
        # finally:
        #     self.db.disconnect()


# Reaching this line means the cell above executed and Pipeline is defined.
print("Pipeline module loaded successfully")

## Running the Pipeline

In [None]:
# Example invocation (commented out; uncomment to run the pipeline).
# Note: run() raises FileNotFoundError when <data_dir>/processed is missing.
# Initialize pipeline
# pipeline = Pipeline(
#     data_dir="../data",
#     embedder_provider="sentence-transformers",
#     embedder_model="sentence-transformers/all-MiniLM-L6-v2"
# )

# # Run the pipeline, skipping the listed components
# stats = pipeline.run(skip_components=["Button", "Checkbox"])
# print(f"Pipeline results: {stats}")

print("Pipeline ready to execute")

## CLI Integration

In [None]:
import argparse

def _positive_int(value: str) -> int:
    """Convert a command-line string to an int, rejecting values below 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def create_cli_parser() -> argparse.ArgumentParser:
    """Create command-line argument parser.

    Options:
        --embedder: Embedding provider, "openai" or "sentence-transformers"
            (default "sentence-transformers").
        --model: Model name for the embedder
            (default "sentence-transformers/all-MiniLM-L6-v2").
        --data-dir: Base data directory (default "../data").
        --skip: One or more component names to skip (default: none).
        --chunk-size: Maximum tokens per chunk; must be >= 1 (default 512).

    Returns:
        The configured parser; call ``parse_args()`` on it to read options.
    """
    parser = argparse.ArgumentParser(
        description="Run Marigold RAG ETL pipeline"
    )
    parser.add_argument(
        "--embedder",
        choices=["openai", "sentence-transformers"],
        default="sentence-transformers",
        help="Embedding provider"
    )
    parser.add_argument(
        "--model",
        default="sentence-transformers/all-MiniLM-L6-v2",
        help="Model name for embedder"
    )
    parser.add_argument(
        "--data-dir",
        default="../data",
        help="Base data directory"
    )
    parser.add_argument(
        "--skip",
        nargs="+",
        default=[],
        help="Component names to skip"
    )
    parser.add_argument(
        "--chunk-size",
        # A zero or negative chunk size is meaningless; reject it at parse
        # time with a usage error instead of passing it downstream.
        type=_positive_int,
        default=512,
        help="Maximum tokens per chunk"
    )

    return parser

# Example usage: build the parser and print its help text to show the
# available options. No command-line arguments are parsed here.
parser = create_cli_parser()
print("CLI parser created with following options:")
parser.print_help()

## Configuration Summary

In [None]:
# One-line description of each pipeline stage, in execution order.
config_summary = {
    "Input": "Parsed JSON from TypeScript parser (etl/data/processed/)",
    "Chunking": "Semantic text chunking with 512 token limit, 50 token overlap",
    "Embedding": "Sentence-transformers (all-MiniLM-L6-v2) or OpenAI API",
    "Storage": "PostgreSQL with pgvector extension for vector indexing",
    "Output": "Vector-indexed chunks with metadata in database",
}

# Print an aligned table: stage name padded to 12 columns, then the description.
for stage, description in config_summary.items():
    print(stage.ljust(12), ":", description)