# Multi-Model Sentence Transformer Processing
**Processes multiple sentence-transformer models alongside a TF-IDF comparison and stores the similarity results in a Neo4j database for document hierarchy analysis.**


In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings for Multi-LLM testing
# Dependencies: os, gc, sys, pathlib, logging, dotenv, matplotlib, seaborn, datetime, json, torch, numpy, typing, tqdm, spacy, praxis_sentence_transformer
# Breadcrumbs: Setup -> Imports -> Environment Configuration

# Standard library imports
import ast
import gc
import json
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple, Optional

# Third-party imports
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import numpy as np
from tqdm.notebook import tqdm
import spacy
from spacy.cli import download

# Load environment variables from a .env file (if present) before anything
# below reads them via os.getenv.
load_dotenv()

# Import all required functionality from the main package (installed via pip)
try:
    from praxis_sentence_transformer import (
        # Core
        setup_logging, 
        handle_exception, 
        DebugTimer,
        Neo4jClient,
        
        # Data models
        Project,
        Document,
        Section,
        Requirement,
        DocumentHierarchyLoader,
        RequirementsLoader,
        RequirementsPreprocessor,
        
        # Neo4j
        RequirementsTraceabilityGraph,
        RequirementNode,
        
        # Utils
        initialize_cuda,
        cleanup_cuda,
        cleanup_resources,
        create_results_directory,
    )
    
    # Import SentenceTransformerAnalyzer directly from analyzers package
    from praxis_sentence_transformer.analyzers import SentenceTransformerAnalyzer
    
except ImportError as e:
    # setup_logging comes from the package that just failed to import, so fall
    # back to a plain stdlib logger here.
    logger = logging.getLogger(__name__)
    logger.error(f"Failed to import praxis_sentence_transformer: {str(e)}")
    # Logged at ERROR rather than INFO: unless logging was configured earlier,
    # the stdlib last-resort handler only emits WARNING and above, so an INFO
    # install hint would never be shown to the user.
    logger.error("Please install the package using pip install praxis-sentence-transformer")
    # Re-raise so the cell fails visibly instead of continuing without the package.
    raise

# Make sure the English and Italian spaCy pipelines are available locally.
def _load_or_download(model_id, language):
    """Return the spaCy pipeline ``model_id``, downloading it first if needed.

    Loading is attempted once; if that raises ``OSError`` the model is
    downloaded via ``spacy.cli.download`` and loaded again.

    Args:
        model_id: spaCy package name, e.g. ``'en_core_web_sm'``.
        language: Human-readable language name used in the progress message.

    Returns:
        The loaded spaCy pipeline object.
    """
    try:
        return spacy.load(model_id)
    except OSError:
        print(f"Downloading {language} language model...")
        download(model_id)
        return spacy.load(model_id)


nlp_en = _load_or_download('en_core_web_sm', 'English')
nlp_it = _load_or_download('it_core_news_sm', 'Italian')

# Configure the notebook-wide logger used by the later cells.
logger = setup_logging("sentence-transformer-notebook")
logger.info("Environment setup completed successfully")

In [None]:
# Cell [1] - Path Verification
# Purpose: Verify project paths are correctly configured for module imports (No longer needed with pip install)
# Dependencies: None (using pip installed package)
# Breadcrumbs: Environment Configuration -> Path Verification -> Module Access

# The package is installed via pip, so there are no project paths to verify;
# this cell only reports which installation is being used.
install_notice = "Using praxis-sentence-transformer installed via pip"
print(install_notice)

In [None]:
# Cell [2] - Document Loading and Model Processing Pipeline
# Purpose: Initialize database connections, load documents, and process multiple transformer models
# Dependencies: neo4j_client, document_loaders, analyzers, graph, cuda utilities
# Breadcrumbs: Path Verification -> Document Processing -> Multi-Model Analysis Pipeline

# Alpha value handed to every analyzer and to the graph; defined once so the
# TF-IDF pass and each per-model pass are guaranteed to use the same setting.
ALPHA = 0.3

# Used when MODEL_LIST is not set in the environment.
DEFAULT_MODEL_LIST = '["sentence-transformers/multi-qa-mpnet-base-cos-v1"]'


def _parse_model_list(raw: str) -> List[str]:
    """Parse the MODEL_LIST setting into a non-empty list of model names.

    Args:
        raw: Text of a list literal such as ``'["model-a", "model-b"]'``
            (single or double quotes). A single quoted string is accepted
            and treated as a one-element list.

    Returns:
        The model names, with surrounding whitespace removed and blank
        entries dropped.

    Raises:
        ValueError: If ``raw`` is not a literal, is not a list/tuple of
            strings, or names no models at all.
    """
    # SECURITY: this value comes from the environment / .env file. It used to
    # be passed to eval(), which would execute arbitrary code found there.
    # ast.literal_eval only accepts literals, so expressions such as
    # '["a"] + ["b"]' are now rejected on purpose.
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError) as err:
        raise ValueError(f"MODEL_LIST is not a valid list literal: {raw!r}") from err

    # A bare string would otherwise be iterated character by character.
    if isinstance(parsed, str):
        parsed = [parsed]

    if not isinstance(parsed, (list, tuple)) or not all(isinstance(name, str) for name in parsed):
        raise ValueError(f"MODEL_LIST must be a list of model-name strings, got: {raw!r}")

    models = [name.strip() for name in parsed if name.strip()]
    if not models:
        raise ValueError("MODEL_LIST does not name any models")
    return models


try:
    # Log project info
    project_name = os.getenv('PROJECT_NAME')
    logger.info(f"Processing project: {project_name}")

    # Parse and validate the model list BEFORE touching the database, so a bad
    # MODEL_LIST cannot fail the run after the project data has been cleaned.
    # The same parsed list also names the results directory below, so the
    # directory and the processed models can no longer disagree.
    model_list = _parse_model_list(os.getenv('MODEL_LIST', DEFAULT_MODEL_LIST))

    # Input files are used by both the requirements loader and the document loader.
    source_file = os.getenv('SOURCE_FILE')
    target_file = os.getenv('TARGET_FILE')

    # Initialize CUDA
    device, cuda_available = initialize_cuda()
    logger.info(f"Using device: {device} (CUDA available: {cuda_available})")

    # Initialize Neo4j client
    neo4j_client = Neo4jClient(
        uri=os.getenv('NEO4J_URI'),
        username=os.getenv('NEO4J_USER'),
        password=os.getenv('NEO4J_PASSWORD'),
        database='neo4j'
    )
    logger.info(f"Using Neo4j database: {neo4j_client.database}")

    # Set up fresh constraints first
    logger.info("Setting up fresh constraints...")
    neo4j_client.setup_constraints()

    # Initialize document loader
    doc_loader = DocumentHierarchyLoader(neo4j_client=neo4j_client)

    # Create project if it doesn't exist
    logger.info(f"Creating/verifying project: {project_name}")
    doc_loader.create_project()

    # Clean database before starting
    logger.info(f"Cleaning {project_name} database before analysis...")
    neo4j_client.cleanup_project(project_name)

    # Create results directory, named after the first model in the list
    results_dir = create_results_directory(
        model_name=model_list[0],
        dataset_name=project_name
    )

    # Load answer set first
    logger.info("Loading answer set...")
    requirements_loader = RequirementsLoader(neo4j_client=neo4j_client)
    answer_set = requirements_loader.parse_answer_set(os.getenv('ANSWER_FILE'))
    logger.info(f"Loaded {len(answer_set)} reference mappings from answer set")

    # Load and validate requirements from files
    source_reqs = requirements_loader.parse_requirements(source_file)
    target_reqs = requirements_loader.parse_requirements(target_file)
    requirements_loader.validate_requirements(source_reqs, target_reqs, answer_set)

    # Load and store documents
    logger.info("Loading document hierarchy...")
    source_doc, target_doc = doc_loader.load_and_store_documents(
        source_file=source_file,
        target_file=target_file
    )

    # Create ground truth links
    doc_loader.create_ground_truth_links(answer_set)

    logger.info(f"Processing {len(model_list)} models: {model_list}")

    # First compute TF-IDF similarities using first model's analyzer
    # NOTE(review): model_list[0] is initialized here and again in the first
    # loop iteration below; confirm whether the analyzer could be reused.
    logger.info("Computing TF-IDF similarities...")
    analyzer = SentenceTransformerAnalyzer(
        model_name=model_list[0],
        alpha=ALPHA,
        device=device
    )
    analyzer.initialize()

    # Initialize graph with analyzer
    logger.info("Initializing graph...")
    graph = RequirementsTraceabilityGraph(
        analyzer=analyzer,
        alpha=ALPHA,
        project_name=project_name
    )

    # Compute TF-IDF similarities first
    logger.info("Computing and storing TF-IDF similarities...")
    graph.compute_tfidf_similarities()

    # Process each sentence transformer model. A failure in one model is
    # logged and the loop moves on to the next model.
    for model_name in model_list:
        logger.info(f"\nProcessing model: {model_name}")
        try:
            # Initialize analyzer for current model
            analyzer = SentenceTransformerAnalyzer(
                model_name=model_name,
                alpha=ALPHA,
                device=device
            )
            analyzer.initialize()

            # Update graph with new analyzer
            graph.analyzer = analyzer

            # Compute sentence transformer similarities
            logger.info(f"Computing similarities for {model_name}...")
            graph.compute_sentence_transformer_similarities()

            # Log metrics for current model
            graph.log_database_metrics()

        except Exception as e:
            logger.error(f"Error processing model {model_name}: {str(e)}")
            logger.exception("Detailed error trace:")
        finally:
            # Runs after every model, whether it succeeded or failed.
            cleanup_cuda()

except Exception as e:
    # Top-level boundary for the cell: log with traceback, release resources,
    # then stop with a non-zero exit status.
    logger.exception(f"Error in processing: {str(e)}")
    try:
        cleanup_resources()
    finally:
        # Close the client even if cleanup_resources() itself raises.
        # At cell (module) level locals() is the global namespace, so this
        # also matches a client left over from an earlier run of the cell.
        if 'neo4j_client' in locals():
            neo4j_client.close()
    sys.exit(1)