In [1]:
import nest_asyncio
try:
    nest_asyncio.apply()
    print("nest_asyncio applied.")
except Exception as e:
    print(f"Failed to apply nest_asyncio: {e}")
from src.query_pipeline.vector_retriever.vector_retriever import all_vector_retrieve
from src.query_pipeline.graph_retriever.graph_retriever import classical_and_llm_run_graph_search
import os
from langchain_google_genai import ChatGoogleGenerativeAI
import logging
from typing import Any, Dict
from sentence_transformers import SentenceTransformer
from src.ingestion_pipeline.helper_functions import create_neo4j_driver
from src.query_pipeline.all_query_pipeline import generate_response_langchain, create_evaluation_object
from src.evaluation.generate_eveluation import (safety_settings,GoogleVertexAI, 
                                                calculate_answer_relevancy, 
                                                calculate_contextual_precision, 
                                                calculate_contextual_recall, 
                                                calculate_rouge_score,
                                                calculate_cosine_similarity
                                                )
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

nest_asyncio applied.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [2]:
embedding_model = SentenceTransformer(os.getenv('DENSE_MODEL_KEY'))
logging.info(f"loading embedding model {os.getenv('DENSE_MODEL_KEY')} done")

2025-06-10 19:28:28,628 - INFO - Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
2025-06-10 19:28:36,462 - INFO - Use pytorch device_name: cpu
2025-06-10 19:28:36,468 - INFO - loading embedding model BAAI/bge-large-en-v1.5 done


In [3]:
def run_evaluation_metric(
    file_name: str,
    response_obj: Dict[str, Any],
    embedding_model ,
    model: str,  
    fallback_model_name = "gemini-1.5-pro", 
    max_retries: int = 2,
    retry_delay_seconds: int = 3,
):
    """
    Calculates evaluation metrics (Relevancy, Precision, Recall, ROUGE-1)
    with retries and fallback model logic for LLM-based metrics,
    and appends the results to a JSON file.

    Args:
        file_name (str): Identifier for the search type, used for naming the output file.
        response_obj (dict): Dictionary containing evaluation data.
        model (str): Name of the primary evaluation LLM to use (e.g., "gemini-1.5-pro").
        max_retries (int): Max retries for failing LLM-based metric calculation with the primary model.
        retry_delay_seconds (int): Seconds to wait between retries.
    """
    print(f"\n--- Evaluating for search_type: {file_name} (Primary Model: {model}) ---")

    primary_googleai_gemini_model = None
    fallback_googleai_gemini_model = None

    # --- Initialize Primary Model ---
    try:
        if not os.getenv('GOOGLE_API_KEY'):
             print("Warning: GOOGLE_API_KEY not set, using dummy value for demo.")
             os.environ['GOOGLE_API_KEY'] = 'dummy-key-for-testing'
             # raise ValueError("GOOGLE_API_KEY environment variable not set.")

        primary_custom_model_gemini = ChatGoogleGenerativeAI(
            model=model, safety_settings=safety_settings,
            google_api_key=os.getenv('GOOGLE_API_KEY'), temperature=0.2
        )
        primary_googleai_gemini_model = GoogleVertexAI(model=primary_custom_model_gemini)
        print(f"Primary model '{model}' initialized successfully.")
    except Exception as e:
        print(f"  CRITICAL ERROR during primary model setup for '{file_name}' ({model}): {e}. Aborting evaluation.")
        return 

    # --- Extract data from response_obj ---
    query = response_obj.get('query')
    actual_output = response_obj.get('llm_response')
    expected_output = response_obj.get('expected_output')
    retrieval_context = response_obj.get('retrieval_context')
    if isinstance(retrieval_context, str):
        retrieval_context = [retrieval_context]

    # --- Calculate Metrics with Primary Model ---
    relevancy_score, relevancy_failed = calculate_answer_relevancy(
        query, actual_output, primary_googleai_gemini_model, max_retries, retry_delay_seconds
    )
    precision_score, precision_failed = calculate_contextual_precision(
        query, actual_output, expected_output, retrieval_context, primary_googleai_gemini_model, max_retries, retry_delay_seconds
    )
    recall_score, recall_failed = calculate_contextual_recall(
        query, actual_output, expected_output, retrieval_context, primary_googleai_gemini_model, max_retries, retry_delay_seconds
    )
    
    # Non-LLM metrics (no retries/fallback needed for these specific calculations)
    # ROUGE score calculation (no LLM, no retries needed here)
    rouge1_score = calculate_rouge_score(
        query, actual_output, expected_output, rouge_type="rouge1"
    )
    cosine_sim_score = calculate_cosine_similarity(
        actual_output, expected_output , embedding_model 

    )
    
    # --- Attempt Fallback Model if Necessary ---
    needs_fallback = relevancy_failed or precision_failed or recall_failed
    fallback_attempted = False
    fallback_succeeded_relevancy = False
    fallback_succeeded_precision = False
    fallback_succeeded_recall = False

    if needs_fallback and model != fallback_model_name:
        print(f"\n--- Primary model '{model}' failed for some LLM metrics. Attempting fallback with '{fallback_model_name}' ---")
        fallback_attempted = True
        # --- Initialize Fallback Model ---
        try:
            if not os.getenv('GOOGLE_API_KEY'): # Re-check just in case
                 raise ValueError("GOOGLE_API_KEY environment variable not set (checked before fallback).")

            fallback_custom_model_gemini = ChatGoogleGenerativeAI(
                model=fallback_model_name, safety_settings=safety_settings,
                google_api_key=os.getenv('GOOGLE_API_KEY'), temperature=0.2
            )
            fallback_googleai_gemini_model = GoogleVertexAI(model=fallback_custom_model_gemini)
            print(f"Fallback model '{fallback_model_name}' initialized successfully.")

        except Exception as e:
            print(f"  ERROR during fallback model setup ('{fallback_model_name}'): {e}. Skipping fallback attempts.")
            fallback_googleai_gemini_model = None # Ensure it's None if setup fails

        # --- Retry Failed Metrics with Fallback Model (Single Attempt) ---
        if fallback_googleai_gemini_model:
            # Note: We only retry metrics that failed the *primary* attempt.
            # We call the same calculation functions, passing the fallback model and 0 retries.
            if relevancy_failed:
                relevancy_score_fb, failed_fb = calculate_answer_relevancy(
                    query, actual_output, fallback_googleai_gemini_model, max_retries=3, retry_delay_seconds=7
                )
                if not failed_fb: # If fallback succeeded
                    relevancy_score = relevancy_score_fb # Update the score
                    relevancy_failed = False # Mark as no longer failed
                    fallback_succeeded_relevancy = True

            if precision_failed:
                 precision_score_fb, failed_fb = calculate_contextual_precision(
                    query, actual_output, expected_output, retrieval_context, fallback_googleai_gemini_model, max_retries=3, retry_delay_seconds=7
                 )
                 if not failed_fb:
                     precision_score = precision_score_fb
                     precision_failed = False
                     fallback_succeeded_precision = True

            if recall_failed:
                recall_score_fb, failed_fb = calculate_contextual_recall(
                    query, actual_output, expected_output, retrieval_context, fallback_googleai_gemini_model, max_retries=3, retry_delay_seconds=7
                )
                if not failed_fb:
                    recall_score = recall_score_fb
                    recall_failed = False
                    fallback_succeeded_recall = True
        else:
             print("  Fallback model could not be initialized. Scores for failed metrics will remain None/default.")

    elif needs_fallback and model == fallback_model_name:
         print(f"\n--- Primary model '{model}' failed for some LLM metrics, but it is already the fallback model. No further fallback attempted. ---")


    # --- Finalize Scores ---
    # Assign default 0.0 if score is still None (due to missing data or persistent failure)
    final_relevancy_score = 0.000 if relevancy_score is None else relevancy_score
    final_precision_score = 0.000 if precision_score is None else precision_score
    final_recall_score = 0.000 if recall_score is None else recall_score
    final_rouge1_score = 0.000 if rouge1_score is None else float(rouge1_score)
    final_cosine_sim_score = 0.000 if cosine_sim_score is None else float(cosine_sim_score)

    # --- Prepare Result Data ---
    num_input_token = response_obj.get('num_input_token')
    num_output_token = response_obj.get('num_output_token')
    time_taken = response_obj.get('time_taken')
    qa_level = response_obj.get('level')
    
    result_data = {
        "search_type": file_name,
        "precision": final_precision_score,
        "recall": final_recall_score,
        "relevancy": final_relevancy_score,
        "rouge1": final_rouge1_score, 
        "cosine_similarity": final_cosine_sim_score, 
        "time_taken": time_taken,
        "num_input_token": num_input_token,
        "num_output_token": num_output_token,
        "query" : query,
        "qa_level": qa_level,
        "actual_output" : actual_output,
        "expected_output" : expected_output,
        "retrieval_context" : retrieval_context[0], # Keep as string since the list always has 1 element
        "primary_llm_used": model,
        "fallback_llm_attempted": fallback_model_name if fallback_attempted else None,
        "fallback_used_and_succeeded_relevancy": fallback_succeeded_relevancy,
        "fallback_used_and_succeeded_precision": fallback_succeeded_precision,
        "fallback_used_and_succeeded_recall": fallback_succeeded_recall,
        # Record final failure state *after* potential fallback attempts
        "final_state_failed_relevancy": relevancy_failed,
        "final_state_failed_precision": precision_failed,
        "final_state_failed_recall": recall_failed,
    }

    return result_data

In [4]:
driver = create_neo4j_driver()

if not driver:
    logging.error("*********** Failed to establish Neo4j connection. Check your credentials and server status.**************")

2025-06-10 19:28:37,549 - INFO - Attempting to connect to Neo4j at neo4j://localhost:7687...
2025-06-10 19:28:37,706 - INFO - Successfully connected to Neo4j database 'neo4j' via neo4j://localhost:7687 after 0 retries.


In [5]:
def retriever_vector_context(search_query):    
    num_results = 4          
    vector_dense_context, vector_sparse_context, vector_hybrid_context = all_vector_retrieve(search_query, num_results)
    return vector_dense_context, vector_sparse_context, vector_hybrid_context



In [6]:
def get_classical_graph_context(driver, search_query):
    global_DL_context, local_DL_context, drift_DL_context = classical_and_llm_run_graph_search(driver, search_query, graph_prefix='DL', embedding_model = embedding_model)
    return global_DL_context, local_DL_context, drift_DL_context

    

In [7]:
def get_llm_graph_context(driver, search_query):
    global_LLM_context, local_LLM_context, drift_LLM_context = classical_and_llm_run_graph_search(driver, search_query, graph_prefix='LLM', embedding_model = embedding_model)
    return global_LLM_context, local_LLM_context, drift_LLM_context

    

In [18]:
  
def retriever_and_metrics_analysis_pipeline( search_query,
                                            expected_output,
                                            qa_pair_level,
                                            vector_dense_context,
                                            vector_sparse_context,
                                            vector_hybrid_context,
                                            global_DL_context, 
                                            local_DL_context,
                                            drift_DL_context,
                                            global_LLM_context,
                                            local_LLM_context,
                                            drift_LLM_context 
                                            ):    

          
    all_evaluation_objects = []
    
    all_final_result = []
    # Combine all retrievers into a list
    total_retriever_for_query = [
        ("vector_dense_search_metrics", vector_dense_context),# vector_dense_context
        ("vector_sparse_search_metrics", vector_sparse_context),# vector_sparse_context
        ("vector_hybrid_search_metrics", vector_hybrid_context),# vector_hybrid_context
        ("graph_classical_global_search_metrics", global_DL_context),#  global_DL_global_context
        ("graph_classical_local_search_metrics", local_DL_context), # graph_local_DL_context
        ("graph_classical_drift_search_metrics", drift_DL_context), #graph_drift_DL_context
        ("graph_llm_global_search_metrics", global_LLM_context), # graph_global_LLM_context
        ("graph_llm_local_search_metrics", local_LLM_context), # graph_local_LLM_context
        ("graph_llm_drift_search_metrics", drift_LLM_context) # graph_drift_LLM_context
    ]
    
    for context_name, context_value in total_retriever_for_query:
        # Generate response
        response, usage_data = generate_response_langchain(query= search_query,
                                                           context_ = context_value.get('context'),
                                                           context_name = context_name, 
                                                           model= "gemini-1.5-pro")
        
        # Create evaluation object
        response_obj_to_evaluate = create_evaluation_object(
            qa_level = qa_pair_level,
            query=search_query,
            expected_output=expected_output,
            retrieval_package=context_value,
            llm_response=response,
            llm_usage_data=usage_data
        )
        
        all_evaluation_objects.append(response_obj_to_evaluate)
                    
        # Run evaluation and save metrics
        result = run_evaluation_metric(
            file_name=context_name,  # Dynamically set the dedicated file name based on the context name
            response_obj=response_obj_to_evaluate,
            embedding_model = embedding_model,
            model = "gemini-1.5-pro", #"gemini-2.5-pro-preview-03-25", gemini-2.5-flash-preview-04-17
            fallback_model_name = "gemini-2.5-pro-preview-03-25",
            max_retries = 4,
            retry_delay_seconds = 5
        )
        all_final_result.append(result)    
    logging.info("Finished processing all QA pairs.")    
    return all_final_result   

## level 1 

In [34]:
test_case_level_1 = {
"question": ''' What is the composition of Earth's atmosphere near the surface? ''',
                
"answer": '''Near the surface, Earth's atmosphere consists of 78% nitrogen, 21% oxygen, and 1% other gases such as argon, carbon dioxide, and neon.
            This atmosphere affects Earth's climate and weather, shields from harmful solar radiation, and protects from most meteoroids."
           ''',
            
"level": 1    
}

In [35]:
vector_dense_context, vector_sparse_context, vector_hybrid_context =  retriever_vector_context(test_case_level_1['question'])



2025-06-10 20:14:47,059 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_dense/points/query "HTTP/1.1 200 OK"



Running vector retrieval for query: ' What is the composition of Earth's atmosphere near the surface? '

--- Retrieving top 4 chunks for query: ' What is the composition of Earth's atmosphere near the surface? ' ---
Searching in collection: Solar_System_dense
Executing search using client.query_points (dense)...


2025-06-10 20:14:47,091 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_sparse/points/query "HTTP/1.1 200 OK"


Found 4 results.

--- Retrieving top 4 chunks for query: ' What is the composition of Earth's atmosphere near the surface? ' ---
Searching in collection: Solar_System_sparse
Executing search using client.query_points (sparse)...
Found 4 results.

--- Retrieving top 4 chunks for query: ' What is the composition of Earth's atmosphere near the surface? ' ---
Searching in collection: Solar_System_hybrid
Executing search using client.query_points (hybrid)...


2025-06-10 20:14:47,336 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_hybrid/points/query "HTTP/1.1 200 OK"


Found 4 results.


In [11]:
global_DL_context, local_DL_context, drift_DL_context =  get_classical_graph_context(driver, test_case_level_1['question'])



Running graph searches for query: ' What is the composition of Earth's atmosphere near the surface? ' with prefix: 'DL'
the number of entities found by llm,  4
the extracted entities  ['earth', 'atmosphere', 'surface', 'composition']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' What is the composition of Earth's atmosphere near the surface? '


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 1009 communities by embedding similarity.
Aggregated 214 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 214 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Reran

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 4 LLM name candidates ---
--- local_search: Combined unique candidate entities: 11 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 44
the length of final chunks 46


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ['NASA and SOLAR SYSTEM and SUN']
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 70 communities.
DEBUG: Entity search found final_list 70 communities.
 top_relevant_communities ----> 30


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  11
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'BREATHE' - Score: 0.8998 - Tokens: 398
DEBUG build_drift_context: Adding Chunk 5/30 - Entity: 'ICE GIANT' - Score: 0.8585 - Tokens: 390
DEBUG build_drift_context: Adding Chunk 7/30 - Entity: 'OCEAN' - Score: 0.8553 - Tokens: 409
DEBUG build_drift_context: Adding Chunk 10/30 - Entity: 'BREATHE' - Score: 0.8504 - Tokens: 401
DEBUG build_drift_context: Adding Chunk 11/30 - Entity: 'WATER' - Score: 0.8464 - Tokens: 408
DEBUG build_drift_context: Adding Chunk 16/30 - Entity: 'CHEMICAL COMPOUND' - Score: 0.8439 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 18/30 - Entity: 'TESSERAE' - Score: 0.8390 - Tokens: 384
DEBUG build_drift_context: Adding Chunk 19/30 - Entity: 'PLUTO' - Score: 0.8371 - Tokens: 400
DEBUG build_drift_context: Adding Chunk 21/30 - Entity: 'PLANETARY ATMOSPHERE' - Score: 0.8357 - Tokens: 395
DEBUG build

In [None]:
print(global_DL_context['context'])

In [13]:
global_LLM_context, local_LLM_context, drift_LLM_context =  get_llm_graph_context(driver, test_case_level_1['question'])

    


Running graph searches for query: ' What is the composition of Earth's atmosphere near the surface? ' with prefix: 'LLM'
the number of entities found by llm,  4
the extracted entities  ['earth', 'atmosphere', 'surface', 'composition']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' What is the composition of Earth's atmosphere near the surface? '


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 810 communities by embedding similarity.
Aggregated 223 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 223 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Rerank

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 1 LLM name candidates ---
--- local_search: Combined unique candidate entities: 9 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 35
the length of final chunks 34


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ["NASA's Exploration of the Sola"]
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 19 communities.
DEBUG: Entity search found final_list 19 communities.
 top_relevant_communities ----> 19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  8
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'NASA' - Score: 0.8898 - Tokens: 398
DEBUG build_drift_context: Adding Chunk 2/30 - Entity: 'NASA' - Score: 0.8585 - Tokens: 390
DEBUG build_drift_context: Adding Chunk 4/30 - Entity: 'PLUTO'S ATMOSPHERE' - Score: 0.8471 - Tokens: 400
DEBUG build_drift_context: Adding Chunk 5/30 - Entity: 'SUN' - Score: 0.8464 - Tokens: 408
DEBUG build_drift_context: Adding Chunk 6/30 - Entity: 'DARK STREAKS' - Score: 0.8439 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 9/30 - Entity: 'ICE CRYSTALS' - Score: 0.8416 - Tokens: 386
DEBUG build_drift_context: Adding Chunk 11/30 - Entity: 'CASSINI SPACECRAFT' - Score: 0.8258 - Tokens: 395
DEBUG build_drift_context: Adding Chunk 12/30 - Entity: 'SATURN' - Score: 0.8257 - Tokens: 395
DEBUG build_drift_context: Adding Chunk 13/30 - Entity: 'ERIS' - Score: 0.8242 - Tokens: 404
DEBUG build_drift

In [36]:

result_level_1 = retriever_and_metrics_analysis_pipeline(
                                                         search_query=test_case_level_1['question'], 
                                                         expected_output=test_case_level_1['answer'],
                                                         qa_pair_level=test_case_level_1['level'],
                                                         vector_dense_context = vector_dense_context,
                                                         vector_sparse_context= vector_sparse_context,
                                                         vector_hybrid_context = vector_hybrid_context,
                                                         global_DL_context = global_DL_context,  
                                                         local_DL_context = local_DL_context, 
                                                         drift_DL_context = drift_DL_context, 
                                                         global_LLM_context = global_LLM_context,
                                                         local_LLM_context = local_LLM_context, 
                                                         drift_LLM_context = drift_LLM_context
                                                         )

2025-06-10 20:15:03,569 - INFO - LLM Token Usage recorded: Prompt=1975, Completion=32, Total=2007
2025-06-10 20:15:03,571 - INFO - RAG chain invocation complete.
2025-06-10 20:15:03,573 - INFO - vector_dense_search_metrics Usage Data (with response time): {'prompt_tokens': 1975, 'completion_tokens': 32, 'total_tokens': 2007, 'time_taken': 0.8499060000012832}
2025-06-10 20:15:03,575 - INFO - Created evaluation object. Retrieval: 0.1980s, LLM: 0.8499s, Total: 1.0479s


Output()


--- Evaluating for search_type: vector_dense_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:15:10,953 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.6333333333333334

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.8727


2025-06-10 20:15:13,134 - INFO - LLM Token Usage recorded: Prompt=1995, Completion=32, Total=2027
2025-06-10 20:15:13,145 - INFO - RAG chain invocation complete.
2025-06-10 20:15:13,147 - INFO - vector_sparse_search_metrics Usage Data (with response time): {'prompt_tokens': 1995, 'completion_tokens': 32, 'total_tokens': 2027, 'time_taken': 0.8503679999994347}
2025-06-10 20:15:13,147 - INFO - Created evaluation object. Retrieval: 0.0311s, LLM: 0.8504s, Total: 0.8815s


Output()


--- Evaluating for search_type: vector_sparse_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:15:19,760 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.5

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.6333333333333334

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.8727


2025-06-10 20:15:22,383 - INFO - LLM Token Usage recorded: Prompt=1979, Completion=32, Total=2011
2025-06-10 20:15:22,385 - INFO - RAG chain invocation complete.
2025-06-10 20:15:22,387 - INFO - vector_hybrid_search_metrics Usage Data (with response time): {'prompt_tokens': 1979, 'completion_tokens': 32, 'total_tokens': 2011, 'time_taken': 1.1417223999997077}
2025-06-10 20:15:22,387 - INFO - Created evaluation object. Retrieval: 0.2436s, LLM: 1.1417s, Total: 1.3853s


Output()


--- Evaluating for search_type: vector_hybrid_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:15:31,044 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.6333333333333334

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.8727


2025-06-10 20:15:33,394 - INFO - LLM Token Usage recorded: Prompt=7619, Completion=14, Total=7633
2025-06-10 20:15:33,396 - INFO - RAG chain invocation complete.
2025-06-10 20:15:33,397 - INFO - graph_classical_global_search_metrics Usage Data (with response time): {'prompt_tokens': 7619, 'completion_tokens': 14, 'total_tokens': 7633, 'time_taken': 0.9668542999988858}
2025-06-10 20:15:33,400 - INFO - Created evaluation object. Retrieval: 52.7593s, LLM: 0.9669s, Total: 53.7262s


Output()


--- Evaluating for search_type: graph_classical_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0


2025-06-10 20:15:40,226 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.4155


2025-06-10 20:15:42,728 - INFO - LLM Token Usage recorded: Prompt=6215, Completion=14, Total=6229
2025-06-10 20:15:42,729 - INFO - RAG chain invocation complete.
2025-06-10 20:15:42,731 - INFO - graph_classical_local_search_metrics Usage Data (with response time): {'prompt_tokens': 6215, 'completion_tokens': 14, 'total_tokens': 6229, 'time_taken': 1.2173431999999593}
2025-06-10 20:15:42,733 - INFO - Created evaluation object. Retrieval: 3.0465s, LLM: 1.2173s, Total: 4.2639s


Output()


--- Evaluating for search_type: graph_classical_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

2025-06-10 20:15:50,242 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.4155


2025-06-10 20:15:52,403 - INFO - LLM Token Usage recorded: Prompt=5213, Completion=14, Total=5227
2025-06-10 20:15:52,404 - INFO - RAG chain invocation complete.
2025-06-10 20:15:52,405 - INFO - graph_classical_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 5213, 'completion_tokens': 14, 'total_tokens': 5227, 'time_taken': 1.0420902999994723}
2025-06-10 20:15:52,406 - INFO - Created evaluation object. Retrieval: 2.1566s, LLM: 1.0421s, Total: 3.1987s


Output()


--- Evaluating for search_type: graph_classical_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

2025-06-10 20:15:58,951 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.5

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.4155


2025-06-10 20:16:01,113 - INFO - LLM Token Usage recorded: Prompt=7722, Completion=14, Total=7736
2025-06-10 20:16:01,114 - INFO - RAG chain invocation complete.
2025-06-10 20:16:01,116 - INFO - graph_llm_global_search_metrics Usage Data (with response time): {'prompt_tokens': 7722, 'completion_tokens': 14, 'total_tokens': 7736, 'time_taken': 0.9957063000001654}
2025-06-10 20:16:01,119 - INFO - Created evaluation object. Retrieval: 48.0849s, LLM: 0.9957s, Total: 49.0806s


Output()


--- Evaluating for search_type: graph_llm_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0


2025-06-10 20:16:08,377 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.4155


2025-06-10 20:16:11,014 - INFO - LLM Token Usage recorded: Prompt=4748, Completion=14, Total=4762
2025-06-10 20:16:11,016 - INFO - RAG chain invocation complete.
2025-06-10 20:16:11,017 - INFO - graph_llm_local_search_metrics Usage Data (with response time): {'prompt_tokens': 4748, 'completion_tokens': 14, 'total_tokens': 4762, 'time_taken': 1.4722230000006675}
2025-06-10 20:16:11,018 - INFO - Created evaluation object. Retrieval: 3.0205s, LLM: 1.4722s, Total: 4.4927s


Output()


--- Evaluating for search_type: graph_llm_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0


2025-06-10 20:16:17,543 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.4155


2025-06-10 20:16:19,702 - INFO - LLM Token Usage recorded: Prompt=5013, Completion=14, Total=5027
2025-06-10 20:16:19,705 - INFO - RAG chain invocation complete.
2025-06-10 20:16:19,706 - INFO - graph_llm_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 5013, 'completion_tokens': 14, 'total_tokens': 5027, 'time_taken': 1.0241671999992832}
2025-06-10 20:16:19,707 - INFO - Created evaluation object. Retrieval: 2.3442s, LLM: 1.0242s, Total: 3.3684s


Output()


--- Evaluating for search_type: graph_llm_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0


2025-06-10 20:16:26,611 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.07547169811320753

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-10 20:16:27,929 - INFO - Finished processing all QA pairs.


  Cosine Similarity calculated successfully. Score: 0.4155


In [37]:
result_level_1

[{'search_type': 'vector_dense_search_metrics',
  'precision': 1.0,
  'recall': 1.0,
  'relevancy': 1.0,
  'rouge1': 0.6333333333333334,
  'cosine_similarity': 0.8727470636367798,
  'time_taken': 1.0479338000004645,
  'num_input_token': 1975,
  'num_output_token': 32,
  'query': " What is the composition of Earth's atmosphere near the surface? ",
  'qa_level': 1,
  'actual_output': "Earth's atmosphere near the surface consists of 78% nitrogen, 21% oxygen, and 1% other gases such as argon.",
  'expected_output': 'Near the surface, Earth\'s atmosphere consists of 78% nitrogen, 21% oxygen, and 1% other gases such as argon, carbon dioxide, and neon.\n            This atmosphere affects Earth\'s climate and weather, shields from harmful solar radiation, and protects from most meteoroids."\n           ',
  'retrieval_context': 'title: An Overview of Earth and NASA\'s Role in Studying It\n text: he Easy Earth\'s atmosphere is 78% nitrogen, 21% oxygen and 1% other ingredients. It\'s the perfec

## level 2

In [22]:
test_case_level_2 = {
"question": ''' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.''',
                
"answer": '''For Venus, the potential for life is considered primarily in its upper atmosphere, specifically about 30 miles (50 km) up where
            temperatures (86-158 F / 30-70 C) and atmospheric pressure are similar to Earth's surface. Scientists speculate that extremophile microbes 
            could exist there, possibly protected by sulfur compounds, potentially explaining persistent dark streaks observed in the clouds absorbing 
            UV radiation. Evidence is currently inconclusive, but the possibility is considered due to these conditions and detected micron-sized 
            particles. For Mars, scientists are not expecting to find currently thriving life but are searching for signs of past life from billions of
            years ago when Mars was warmer, wetter, and had a thicker atmosphere. Evidence includes ancient river valleys, deltas, lakebeds, and
            minerals formed in liquid water. Current exploration focuses on finding biosignatures in rocks and soil from this earlier period,
            although water-ice and seasonal briny flows exist today.''',
            
"level": 2    
}

In [23]:
vector_dense_context, vector_sparse_context, vector_hybrid_context =  retriever_vector_context(test_case_level_2['question'])



Running vector retrieval for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.'

--- Retrieving top 4 chunks for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.' ---
Searching in collection: Solar_System_dense
Executing search using client.query_points (dense)...


2025-06-10 19:54:52,280 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_dense/points/query "HTTP/1.1 200 OK"
2025-06-10 19:54:52,306 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_sparse/points/query "HTTP/1.1 200 OK"


Found 4 results.

--- Retrieving top 4 chunks for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.' ---
Searching in collection: Solar_System_sparse
Executing search using client.query_points (sparse)...
Found 4 results.

--- Retrieving top 4 chunks for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.' ---
Searching in collection: Solar_System_hybrid
Executing search using client.query_points (hybrid)...


2025-06-10 19:54:52,598 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_hybrid/points/query "HTTP/1.1 200 OK"


Found 4 results.


In [24]:
global_DL_context, local_DL_context, drift_DL_context =  get_classical_graph_context(driver, test_case_level_2['question'])




Running graph searches for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.' with prefix: 'DL'
the number of entities found by llm,  6
the extracted entities  ['venus', 'mars', 'life', 'scientific thinking', 'environments', 'evidence']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 1009 communities by embedding similarity.
Aggregated 229 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 229 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Reran

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 3 LLM name candidates ---
--- local_search: Combined unique candidate entities: 10 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 44
the length of final chunks 47


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ['NASA and SOLAR SYSTEM and SUN']
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 70 communities.
DEBUG: Entity search found final_list 70 communities.
 top_relevant_communities ----> 30


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  10
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'POTENTIAL FOR LIFE' - Score: 0.8525 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 5/30 - Entity: 'TESSERAE' - Score: 0.8383 - Tokens: 384
DEBUG build_drift_context: Adding Chunk 6/30 - Entity: 'OXIDIZE' - Score: 0.8343 - Tokens: 402
DEBUG build_drift_context: Adding Chunk 7/30 - Entity: 'NAMED THE OBJECTS' - Score: 0.8326 - Tokens: 401
DEBUG build_drift_context: Adding Chunk 11/30 - Entity: 'ABUNDANT LIFE' - Score: 0.8281 - Tokens: 392
DEBUG build_drift_context: Adding Chunk 23/30 - Entity: 'POTENTIAL FOR LIFE' - Score: 0.8190 - Tokens: 402
DEBUG build_drift_context: Adding Chunk 25/30 - Entity: 'JUPITER' - Score: 0.8187 - Tokens: 394
DEBUG build_drift_context: Adding Chunk 27/30 - Entity: 'URANUS' - Score: 0.8186 - Tokens: 394
DEBUG build_drift_context: Adding Chunk 28/30 - Entity: 'NASA' - Score: 0.8184 - Tokens: 4

In [25]:
global_LLM_context, local_LLM_context, drift_LLM_context =  get_llm_graph_context(driver, test_case_level_2['question'])



Running graph searches for query: ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.' with prefix: 'LLM'
the number of entities found by llm,  6
the extracted entities  ['venus', 'mars', 'life', 'scientific thinking', 'environments', 'evidence']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific
                environments or evidence being considered.'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 810 communities by embedding similarity.
Aggregated 226 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 226 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Rerank

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 3 LLM name candidates ---
--- local_search: Combined unique candidate entities: 11 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 44
the length of final chunks 52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ["NASA's Exploration of the Sola"]
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 19 communities.
DEBUG: Entity search found final_list 19 communities.
 top_relevant_communities ----> 19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  10
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'DARK STREAKS' - Score: 0.8525 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 7/30 - Entity: 'LEWIS AND CLARK' - Score: 0.8383 - Tokens: 384
DEBUG build_drift_context: Adding Chunk 9/30 - Entity: 'LIZ LANDAU' - Score: 0.8281 - Tokens: 392
DEBUG build_drift_context: Adding Chunk 11/30 - Entity: 'ORION' - Score: 0.8243 - Tokens: 402
DEBUG build_drift_context: Adding Chunk 14/30 - Entity: 'NASA' - Score: 0.8226 - Tokens: 401
DEBUG build_drift_context: Adding Chunk 16/30 - Entity: 'JUNO MISSION' - Score: 0.8187 - Tokens: 394
DEBUG build_drift_context: Adding Chunk 22/30 - Entity: 'JAMES WEBB SPACE TELESCOPE' - Score: 0.8186 - Tokens: 394
DEBUG build_drift_context: Adding Chunk 24/30 - Entity: 'EXOPLANETS' - Score: 0.8152 - Tokens: 397
DEBUG build_drift_context: Adding Chunk 25/30 - Entity: 'EZIE' - Score: 0.8133 - Tokens: 

In [26]:

result_level_2 = retriever_and_metrics_analysis_pipeline(
                                                         search_query=test_case_level_2['question'], 
                                                         expected_output=test_case_level_2['answer'],
                                                         qa_pair_level=test_case_level_2['level'],
                                                         vector_dense_context = vector_dense_context,
                                                         vector_sparse_context= vector_sparse_context,
                                                         vector_hybrid_context = vector_hybrid_context,
                                                         global_DL_context = global_DL_context,  
                                                         local_DL_context = local_DL_context, 
                                                         drift_DL_context = drift_DL_context, 
                                                         global_LLM_context = global_LLM_context,
                                                         local_LLM_context = local_LLM_context, 
                                                         drift_LLM_context = drift_LLM_context
                                                         )

2025-06-10 19:57:05,064 - INFO - LLM Token Usage recorded: Prompt=1920, Completion=120, Total=2040
2025-06-10 19:57:05,066 - INFO - RAG chain invocation complete.
2025-06-10 19:57:05,067 - INFO - vector_dense_search_metrics Usage Data (with response time): {'prompt_tokens': 1920, 'completion_tokens': 120, 'total_tokens': 2040, 'time_taken': 3.219165200000134}
2025-06-10 19:57:05,069 - INFO - Created evaluation object. Retrieval: 0.2904s, LLM: 3.2192s, Total: 3.5096s


Output()


--- Evaluating for search_type: vector_dense_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.8888888888888888


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 19:57:19,618 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.46825396825396826

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9128


2025-06-10 19:57:26,168 - INFO - LLM Token Usage recorded: Prompt=1986, Completion=159, Total=2145
2025-06-10 19:57:26,170 - INFO - RAG chain invocation complete.
2025-06-10 19:57:26,171 - INFO - vector_sparse_search_metrics Usage Data (with response time): {'prompt_tokens': 1986, 'completion_tokens': 159, 'total_tokens': 2145, 'time_taken': 3.067897500000072}
2025-06-10 19:57:26,173 - INFO - Created evaluation object. Retrieval: 0.0259s, LLM: 3.0679s, Total: 3.0938s


Output()


--- Evaluating for search_type: vector_sparse_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 19:57:40,899 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5232974910394265

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9236


2025-06-10 19:57:48,832 - INFO - LLM Token Usage recorded: Prompt=1920, Completion=159, Total=2079
2025-06-10 19:57:48,834 - INFO - RAG chain invocation complete.
2025-06-10 19:57:48,835 - INFO - vector_hybrid_search_metrics Usage Data (with response time): {'prompt_tokens': 1920, 'completion_tokens': 159, 'total_tokens': 2079, 'time_taken': 3.607324999999946}
2025-06-10 19:57:48,837 - INFO - Created evaluation object. Retrieval: 0.2912s, LLM: 3.6073s, Total: 3.8985s


Output()


--- Evaluating for search_type: vector_hybrid_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.7857142857142857


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 19:58:05,144 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.49110320284697506

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9367


2025-06-10 19:58:12,670 - INFO - LLM Token Usage recorded: Prompt=4161, Completion=163, Total=4324
2025-06-10 19:58:12,673 - INFO - RAG chain invocation complete.
2025-06-10 19:58:12,674 - INFO - graph_classical_global_search_metrics Usage Data (with response time): {'prompt_tokens': 4161, 'completion_tokens': 163, 'total_tokens': 4324, 'time_taken': 3.5200818000002982}
2025-06-10 19:58:12,677 - INFO - Created evaluation object. Retrieval: 53.7152s, LLM: 3.5201s, Total: 57.2353s


Output()


--- Evaluating for search_type: graph_classical_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


  Error calculating Contextual Recall on attempt 1: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Retrying in 5 seconds...


Output()

  Error calculating Contextual Recall on attempt 2: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Retrying in 5 seconds...


Output()

  Error calculating Contextual Recall on attempt 3: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Retrying in 5 seconds...


Output()

  Error calculating Contextual Recall on attempt 4: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Retrying in 5 seconds...


Output()

2025-06-10 19:59:28,331 - INFO - Using default tokenizer.


  Error calculating Contextual Recall on attempt 5: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Max retries (5) reached for Contextual Recall

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5602836879432624

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Output()

  Cosine Similarity calculated successfully. Score: 0.9139

--- Primary model 'gemini-1.5-pro' failed for some LLM metrics. Attempting fallback with 'gemini-2.5-pro-preview-03-25' ---
Fallback model 'gemini-2.5-pro-preview-03-25' initialized successfully.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:00:41,160 - INFO - LLM Token Usage recorded: Prompt=6243, Completion=142, Total=6385
2025-06-10 20:00:41,161 - INFO - RAG chain invocation complete.
2025-06-10 20:00:41,162 - INFO - graph_classical_local_search_metrics Usage Data (with response time): {'prompt_tokens': 6243, 'completion_tokens': 142, 'total_tokens': 6385, 'time_taken': 3.8088066000000254}
2025-06-10 20:00:41,163 - INFO - Created evaluation object. Retrieval: 2.3136s, LLM: 3.8088s, Total: 6.1224s


Output()


--- Evaluating for search_type: graph_classical_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:01:00,536 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5627376425855514

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9303


2025-06-10 20:01:07,228 - INFO - LLM Token Usage recorded: Prompt=4359, Completion=114, Total=4473
2025-06-10 20:01:07,228 - INFO - RAG chain invocation complete.
2025-06-10 20:01:07,228 - INFO - graph_classical_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 4359, 'completion_tokens': 114, 'total_tokens': 4473, 'time_taken': 2.9461537999995926}
2025-06-10 20:01:07,228 - INFO - Created evaluation object. Retrieval: 1.4785s, LLM: 2.9462s, Total: 4.4246s


Output()


--- Evaluating for search_type: graph_classical_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:01:20,932 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5163934426229508

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9128


2025-06-10 20:01:28,129 - INFO - LLM Token Usage recorded: Prompt=4251, Completion=167, Total=4418
2025-06-10 20:01:28,131 - INFO - RAG chain invocation complete.
2025-06-10 20:01:28,132 - INFO - graph_llm_global_search_metrics Usage Data (with response time): {'prompt_tokens': 4251, 'completion_tokens': 167, 'total_tokens': 4418, 'time_taken': 3.618146199999501}
2025-06-10 20:01:28,133 - INFO - Created evaluation object. Retrieval: 48.2109s, LLM: 3.6181s, Total: 51.8290s


Output()


--- Evaluating for search_type: graph_llm_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:01:52,862 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5208333333333334

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9010


2025-06-10 20:02:00,330 - INFO - LLM Token Usage recorded: Prompt=4161, Completion=130, Total=4291
2025-06-10 20:02:00,330 - INFO - RAG chain invocation complete.
2025-06-10 20:02:00,330 - INFO - graph_llm_local_search_metrics Usage Data (with response time): {'prompt_tokens': 4161, 'completion_tokens': 130, 'total_tokens': 4291, 'time_taken': 3.1947964999999385}
2025-06-10 20:02:00,330 - INFO - Created evaluation object. Retrieval: 2.2003s, LLM: 3.1948s, Total: 5.3951s


Output()


--- Evaluating for search_type: graph_llm_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:02:14,469 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.525096525096525

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9200


2025-06-10 20:02:21,287 - INFO - LLM Token Usage recorded: Prompt=5037, Completion=115, Total=5152
2025-06-10 20:02:21,297 - INFO - RAG chain invocation complete.
2025-06-10 20:02:21,300 - INFO - graph_llm_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 5037, 'completion_tokens': 115, 'total_tokens': 5152, 'time_taken': 2.992064600000049}
2025-06-10 20:02:21,300 - INFO - Created evaluation object. Retrieval: 1.6004s, LLM: 2.9921s, Total: 4.5925s


Output()


--- Evaluating for search_type: graph_llm_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0.6666666666666666


2025-06-10 20:02:42,457 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.45267489711934156

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-10 20:02:46,112 - INFO - Finished processing all QA pairs.


  Cosine Similarity calculated successfully. Score: 0.8997


In [27]:
result_level_2

[{'search_type': 'vector_dense_search_metrics',
  'precision': 1.0,
  'recall': 1.0,
  'relevancy': 0.8888888888888888,
  'rouge1': 0.46825396825396826,
  'cosine_similarity': 0.912827730178833,
  'time_taken': 3.5095918999995774,
  'num_input_token': 1920,
  'num_output_token': 120,
  'query': ' Contrast the current scientific thinking described regarding the potential for life on Venus versus Mars, including the specific\n                environments or evidence being considered.',
  'qa_level': 2,
  'actual_output': "The provided context mentions potential for life on Venus in its cloud layers, where temperatures and pressures are more Earth-like.  Specifically, dark streaks absorbing UV radiation could potentially be microbial life protected by sulfur coatings.  The context also notes particles found in Venus' lower atmosphere similar in size to Earth bacteria. However, none of this is considered compelling evidence.  Regarding Mars, the context mentions evidence of a much wetter a

## level 3

In [28]:
test_case_level_3 = {   
"question": ''' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ''',
             
"answer": ''' Based on the provided texts, the five dwarf planets exhibit distinct characteristics and locations.
            Ceres is unique as the only one located in the inner solar system's asteroid belt between Mars and Jupiter; 
            it is the largest object in the belt, comprises 25% of its mass, was visited by the Dawn spacecraft,
            has a layered interior possibly including a water ice mantle and salt deposits on its rocky crust, but lacks moons and rings.
            The other four – Pluto, Haumea, Makemake, and Eris – reside in the Kuiper Belt beyond Neptune. Pluto, explored by New Horizons, 
            is about half the width of the U.S., possesses mountains, valleys, plains, and craters, has a nitrogen-methane-carbon monoxide atmosphere
            that varies with solar distance, and has five moons, with the largest, Charon, forming a notable double system. Haumea is distinguished
            by its oval shape caused by rapid rotation (a 4-hour day), is composed of rock with an ice coating, has two moons (Hi'iaka and Namaka),
            and is the only dwarf planet mentioned with confirmed rings. Makemake, slightly smaller than Pluto and reddish-brown, features frozen 
            methane and ethane on its surface, has one small provisional moon (MK 2), and its discovery contributed significantly to the planet
            redefinition debate. Eris is similar in size to Pluto but much more distant (average 68 AU), played a pivotal role alongside Makemake
            in the planet definition controversy, has one small moon (Dysnomia), a likely rocky surface, and an atmosphere that freezes and thaws 
            during its 557-year orbit. All four Kuiper Belt dwarf planets are described as miniature icy worlds formed early in the solar system's
            history.''',
"level": 3            
}

In [29]:
vector_dense_context, vector_sparse_context, vector_hybrid_context =  retriever_vector_context(test_case_level_3['question'])



Running vector retrieval for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             '

--- Retrieving top 4 chunks for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ' ---
Searching in collection: Solar_System_dense
Executing search using client.query_points (dense)...


2025-06-10 20:03:59,272 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_dense/points/query "HTTP/1.1 200 OK"
2025-06-10 20:03:59,300 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_sparse/points/query "HTTP/1.1 200 OK"


Found 4 results.

--- Retrieving top 4 chunks for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ' ---
Searching in collection: Solar_System_sparse
Executing search using client.query_points (sparse)...
Found 4 results.

--- Retrieving top 4 chunks for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ' ---
Searching in collection: Solar_System_hybrid
Executing search using client.query_points (hybrid)...


2025-06-10 20:03:59,764 - INFO - HTTP Request: POST http://localhost:6333/collections/Solar_System_hybrid/points/query "HTTP/1.1 200 OK"


Found 4 results.


In [30]:
global_DL_context, local_DL_context, drift_DL_context =  get_classical_graph_context(driver, test_case_level_3['question'])



Running graph searches for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ' with prefix: 'DL'
the number of entities found by llm,  6
the extracted entities  ['ceres', 'pluto', 'haumea', 'makemake', 'eris', 'dwarf planets']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             '


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 1009 communities by embedding similarity.
Aggregated 216 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 216 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Reran

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 4 LLM name candidates ---
--- local_search: Combined unique candidate entities: 10 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 65
the length of final chunks 47


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ['NASA and SOLAR SYSTEM and SUN']
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 70 communities.
DEBUG: Entity search found final_list 70 communities.
 top_relevant_communities ----> 30


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  11
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'HAUMEA' - Score: 0.8655 - Tokens: 402
DEBUG build_drift_context: Adding Chunk 3/30 - Entity: 'HAUMEA' - Score: 0.8617 - Tokens: 417
DEBUG build_drift_context: Adding Chunk 13/30 - Entity: 'CERES' - Score: 0.8420 - Tokens: 419
DEBUG build_drift_context: Adding Chunk 14/30 - Entity: 'SOLAR SYSTEM' - Score: 0.8391 - Tokens: 389
DEBUG build_drift_context: Adding Chunk 17/30 - Entity: 'KUIPER BELT' - Score: 0.8358 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 21/30 - Entity: 'EARTH' - Score: 0.8277 - Tokens: 412
DEBUG build_drift_context: Adding Chunk 22/30 - Entity: 'PLUTO FANS' - Score: 0.8275 - Tokens: 409
DEBUG build_drift_context: Adding Chunk 23/30 - Entity: 'ATMOSPHERE' - Score: 0.8260 - Tokens: 395
DEBUG build_drift_context: Adding Chunk 26/30 - Entity: 'ASTEROIDS, COMETS & METEORS' - Score: 0.8250 - Tokens: 385
D

In [31]:
global_LLM_context, local_LLM_context, drift_LLM_context =  get_llm_graph_context(driver, test_case_level_3['question'])




Running graph searches for query: ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             ' with prefix: 'LLM'
the number of entities found by llm,  6
the extracted entities  ['ceres', 'pluto', 'haumea', 'makemake', 'eris', 'dwarf planets']
--- Running Global Search (Params: {}) ---
GlobalSearchLLMFocus: Starting search for ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,
                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.
             '


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranked 810 communities by embedding similarity.
Aggregated 223 unique candidate chunks from top communities.
LLM Chunk Reranker (Async): Preparing to process 223 chunks in batches of 15.
LLM Chunk Reranker (Async): Processing Batch 1, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 2, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 3, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 4, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 5, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 6, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 7, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 8, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 9, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 10, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 11, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 12, Attempt 1/3.
LLM Chunk Reranker (Async): Processing Batch 13, Attempt 1/3.
LLM Chunk Rerank

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- local_search: Found 8 vector candidates ---
--- local_search: Found 5 LLM name candidates ---
--- local_search: Combined unique candidate entities: 9 ---
--- local_search: Selected final 8 seed entities (sorted by degree) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

local vector searching chunks length 14
local vector searching chunks length chunks_entities 71
the length of final chunks 51


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

===== LOCAL SEARCH COMPLETED =====
--- Running DRIFT Search (Params: {}) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG: Found 1 communities via entity names: ["NASA's Exploration of the Sola"]
DEBUG: Entity search found 1 communities.
DEBUG: Entity search found vector_results 19 communities.
DEBUG: Entity search found final_list 19 communities.
 top_relevant_communities ----> 19


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 direct_query_entities  12
DEBUG build_drift_context: Selected top 30 chunks based on max_context_chunks=30
DEBUG build_drift_context: Adding Chunk 1/30 - Entity: 'ORION' - Score: 0.8555 - Tokens: 402
DEBUG build_drift_context: Adding Chunk 7/30 - Entity: 'NASA' - Score: 0.8517 - Tokens: 417
DEBUG build_drift_context: Adding Chunk 10/30 - Entity: 'GEMINI OBSERVATORY' - Score: 0.8480 - Tokens: 417
DEBUG build_drift_context: Adding Chunk 11/30 - Entity: 'EXOPLANETS' - Score: 0.8405 - Tokens: 387
DEBUG build_drift_context: Adding Chunk 12/30 - Entity: 'ARTEMIS CAMPAIGN' - Score: 0.8391 - Tokens: 389
DEBUG build_drift_context: Adding Chunk 17/30 - Entity: 'PLUTOIDS' - Score: 0.8360 - Tokens: 395
DEBUG build_drift_context: Adding Chunk 18/30 - Entity: 'PLUTOIDS' - Score: 0.8350 - Tokens: 394
DEBUG build_drift_context: Adding Chunk 19/30 - Entity: 'EXOPLANETS' - Score: 0.8326 - Tokens: 388
DEBUG build_drift_context: Adding Chunk 20/30 - Entity: 'JAMES WEBB SPACE TELESCOPE' - Score: 0.8320 - 

In [32]:

result_level_3 = retriever_and_metrics_analysis_pipeline(
                                                         search_query=test_case_level_3['question'], 
                                                         expected_output=test_case_level_3['answer'],
                                                         qa_pair_level=test_case_level_3['level'],
                                                         vector_dense_context = vector_dense_context,
                                                         vector_sparse_context= vector_sparse_context,
                                                         vector_hybrid_context = vector_hybrid_context,
                                                         global_DL_context = global_DL_context,  
                                                         local_DL_context = local_DL_context, 
                                                         drift_DL_context = drift_DL_context, 
                                                         global_LLM_context = global_LLM_context,
                                                         local_LLM_context = local_LLM_context, 
                                                         drift_LLM_context = drift_LLM_context
                                                         )

2025-06-10 20:06:05,545 - INFO - LLM Token Usage recorded: Prompt=1935, Completion=203, Total=2138
2025-06-10 20:06:05,550 - INFO - RAG chain invocation complete.
2025-06-10 20:06:05,550 - INFO - vector_dense_search_metrics Usage Data (with response time): {'prompt_tokens': 1935, 'completion_tokens': 203, 'total_tokens': 2138, 'time_taken': 3.9736353999996936}
2025-06-10 20:06:05,550 - INFO - Created evaluation object. Retrieval: 0.5298s, LLM: 3.9736s, Total: 4.5034s


Output()


--- Evaluating for search_type: vector_dense_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.9333333333333333


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0.9166666666666666


2025-06-10 20:06:28,488 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.5

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.4376470588235294

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9031


2025-06-10 20:06:42,274 - INFO - LLM Token Usage recorded: Prompt=1956, Completion=368, Total=2324
2025-06-10 20:06:42,274 - INFO - RAG chain invocation complete.
2025-06-10 20:06:42,274 - INFO - vector_sparse_search_metrics Usage Data (with response time): {'prompt_tokens': 1956, 'completion_tokens': 368, 'total_tokens': 2324, 'time_taken': 6.814278299999387}
2025-06-10 20:06:42,274 - INFO - Created evaluation object. Retrieval: 0.0283s, LLM: 6.8143s, Total: 6.8425s


Output()


--- Evaluating for search_type: vector_sparse_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.8


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:07:05,479 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.9166666666666666

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.41422594142259417

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9184


2025-06-10 20:07:18,839 - INFO - LLM Token Usage recorded: Prompt=1984, Completion=155, Total=2139
2025-06-10 20:07:18,839 - INFO - RAG chain invocation complete.
2025-06-10 20:07:18,839 - INFO - vector_hybrid_search_metrics Usage Data (with response time): {'prompt_tokens': 1984, 'completion_tokens': 155, 'total_tokens': 2139, 'time_taken': 3.156644000000597}
2025-06-10 20:07:18,839 - INFO - Created evaluation object. Retrieval: 0.4625s, LLM: 3.1566s, Total: 3.6192s


Output()


--- Evaluating for search_type: vector_hybrid_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.8333333333333334


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:07:41,867 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.7142857142857143

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.39577836411609496

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.8983


2025-06-10 20:07:59,447 - INFO - LLM Token Usage recorded: Prompt=7658, Completion=693, Total=8351
2025-06-10 20:07:59,450 - INFO - RAG chain invocation complete.
2025-06-10 20:07:59,452 - INFO - graph_classical_global_search_metrics Usage Data (with response time): {'prompt_tokens': 7658, 'completion_tokens': 693, 'total_tokens': 8351, 'time_taken': 10.624616999999489}
2025-06-10 20:07:59,454 - INFO - Created evaluation object. Retrieval: 52.7593s, LLM: 10.6246s, Total: 63.3840s


Output()


--- Evaluating for search_type: graph_classical_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:08:41,075 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5134328358208956

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9194


2025-06-10 20:08:58,802 - INFO - LLM Token Usage recorded: Prompt=6254, Completion=387, Total=6641
2025-06-10 20:08:58,804 - INFO - RAG chain invocation complete.
2025-06-10 20:08:58,805 - INFO - graph_classical_local_search_metrics Usage Data (with response time): {'prompt_tokens': 6254, 'completion_tokens': 387, 'total_tokens': 6641, 'time_taken': 7.0759716000011394}
2025-06-10 20:08:58,807 - INFO - Created evaluation object. Retrieval: 3.0465s, LLM: 7.0760s, Total: 10.1225s


Output()


--- Evaluating for search_type: graph_classical_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:09:22,069 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.485207100591716

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.8993


2025-06-10 20:09:37,778 - INFO - LLM Token Usage recorded: Prompt=5252, Completion=338, Total=5590
2025-06-10 20:09:37,778 - INFO - RAG chain invocation complete.
2025-06-10 20:09:37,778 - INFO - graph_classical_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 5252, 'completion_tokens': 338, 'total_tokens': 5590, 'time_taken': 5.944847200000368}
2025-06-10 20:09:37,778 - INFO - Created evaluation object. Retrieval: 2.1566s, LLM: 5.9448s, Total: 8.1014s


Output()


--- Evaluating for search_type: graph_classical_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.9


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 0.746164021164021


2025-06-10 20:10:13,994 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5595238095238095

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9279


2025-06-10 20:10:32,591 - INFO - LLM Token Usage recorded: Prompt=7761, Completion=666, Total=8427
2025-06-10 20:10:32,593 - INFO - RAG chain invocation complete.
2025-06-10 20:10:32,594 - INFO - graph_llm_global_search_metrics Usage Data (with response time): {'prompt_tokens': 7761, 'completion_tokens': 666, 'total_tokens': 8427, 'time_taken': 10.07841080000071}
2025-06-10 20:10:32,595 - INFO - Created evaluation object. Retrieval: 48.0849s, LLM: 10.0784s, Total: 58.1633s


Output()


--- Evaluating for search_type: graph_llm_global_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 1.0


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:11:19,070 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.46029173419773095

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9101


2025-06-10 20:11:34,935 - INFO - LLM Token Usage recorded: Prompt=4787, Completion=289, Total=5076
2025-06-10 20:11:34,938 - INFO - RAG chain invocation complete.
2025-06-10 20:11:34,938 - INFO - graph_llm_local_search_metrics Usage Data (with response time): {'prompt_tokens': 4787, 'completion_tokens': 289, 'total_tokens': 5076, 'time_taken': 5.1601448000001255}
2025-06-10 20:11:34,941 - INFO - Created evaluation object. Retrieval: 3.0205s, LLM: 5.1601s, Total: 8.1806s


Output()


--- Evaluating for search_type: graph_llm_local_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.7857142857142857


Output()

  Contextual Precision calculated successfully on attempt 1. Score: 1.0


2025-06-10 20:12:02,056 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 0.9444444444444444

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.5031712473572939

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Cosine Similarity calculated successfully. Score: 0.9353


2025-06-10 20:12:16,459 - INFO - LLM Token Usage recorded: Prompt=5052, Completion=357, Total=5409
2025-06-10 20:12:16,466 - INFO - RAG chain invocation complete.
2025-06-10 20:12:16,468 - INFO - graph_llm_drift_search_metrics Usage Data (with response time): {'prompt_tokens': 5052, 'completion_tokens': 357, 'total_tokens': 5409, 'time_taken': 6.142203200000949}
2025-06-10 20:12:16,470 - INFO - Created evaluation object. Retrieval: 2.3442s, LLM: 6.1422s, Total: 8.4864s


Output()


--- Evaluating for search_type: graph_llm_drift_search_metrics (Primary Model: gemini-1.5-pro) ---
Primary model 'gemini-1.5-pro' initialized successfully.


Output()

  Answer Relevancy calculated successfully on attempt 1. Score: 0.9565217391304348


  Error calculating Contextual Precision on attempt 1: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
  Retrying in 5 seconds...


Output()

Output()

  Contextual Precision calculated successfully on attempt 2. Score: 0.8551587301587301


2025-06-10 20:13:02,744 - INFO - Using default tokenizer.


  Contextual Recall calculated successfully on attempt 1. Score: 1.0

Attempting ROUGE-1 calculation...
  ROUGE-1 calculated successfully. Score: 0.4934086629001883

Attempting Cosine Similarity calculation...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-10 20:13:11,599 - INFO - Finished processing all QA pairs.


  Cosine Similarity calculated successfully. Score: 0.9256


In [33]:
result_level_3

[{'search_type': 'vector_dense_search_metrics',
  'precision': 0.9166666666666666,
  'recall': 0.5,
  'relevancy': 0.9333333333333333,
  'rouge1': 0.4376470588235294,
  'cosine_similarity': 0.9030611515045166,
  'time_taken': 4.5034145999998145,
  'num_input_token': 1935,
  'num_output_token': 203,
  'query': ' Synthesize a comparative overview of the five dwarf planets (Ceres, Pluto, Haumea, Makemake, Eris) based on all provided text,\n                highlighting their locations, relative sizes, known moons, surface compositions, and defining characteristics.\n             ',
  'qa_level': 3,
  'actual_output': "The five officially recognized dwarf planets are Ceres, Pluto, Haumea, Makemake, and Eris.  Makemake, Pluto, Haumea, and Eris are located in the Kuiper Belt, a donut-shaped region of icy bodies beyond the orbit of Neptune. Makemake is slightly smaller than Pluto and is the second-brightest object in the Kuiper Belt as seen from Earth, while Pluto is the brightest. Eris is abo