In [2]:
import os
import pandas as pd
from ragas.metrics import answer_relevancy, answer_correctness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.chat_models import ChatOpenAI
from langchain_localai import LocalAIEmbeddings
from datasets import Dataset
from dotenv import load_dotenv
import json

import json
import pandas as pd
import os
from datasets import Dataset
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()
llm = ChatOpenAI(
    openai_api_base=os.getenv("OPENAI_API_BASE"), 
    openai_api_key=os.getenv("OPENAI_API_KEY"),              
    model_name=os.getenv("LLM_MODEL_NAME")          
)
llm = LangchainLLMWrapper(llm)

embedding = LocalAIEmbeddings(
    openai_api_base=os.getenv("EMBED_URL"), 
    openai_api_key=os.getenv("EMBED_TOKEN"), 
    model=os.getenv("EMBED_MODEL")
)
embedding = LangchainEmbeddingsWrapper(embedding)

  llm = ChatOpenAI(


# First Evaluation

In [None]:
def start_evaluate(test_set_file, output_file):
    with open(test_set_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
    else:
        existing_df = pd.DataFrame(columns=[
            'user_input', 'response', 'reference',
            'answer_correctness', 'answer_relevancy'
        ])

    existing_df.fillna('', inplace=True)

    for idx, (q, a, g) in enumerate(zip(data['question'], data['answer'], data['ground_truth']), start=1):
        short_q = q[:50].replace('\n', ' ')
        print(f"[{idx}] Processing question: \"{short_q}...\"")
        if a == "x":
            print(f"[{idx}] Skipped (x in ans)")
            continue
        match = existing_df[
            (existing_df['user_input'] == q) &
            (existing_df['response'] == a) &
            (existing_df['reference'] == g)
        ]
        if not match.empty:
            existing_row = match.iloc[0]
            correctness = pd.to_numeric(existing_row['answer_correctness'], errors='coerce')
            relevancy = pd.to_numeric(existing_row['answer_relevancy'], errors='coerce')
            if (
                not pd.isna(correctness) and not pd.isna(relevancy) and
                correctness > 0.0 and relevancy > 0.0
            ):
                print(f"[{idx}] Skipped (already evaluated with valid metrics)")
                continue

        print(f"[{idx}] Running evaluation...")
        single_data = {
            "question": [q],
            "answer": [a],
            "ground_truth": [g]
        }
        single_dataset = Dataset.from_dict(single_data)

        try:
            results = evaluate(
                dataset=single_dataset,
                metrics=[answer_correctness, answer_relevancy],
                llm=llm,
                embeddings=embedding
            )
            row_df = results.to_pandas()
            print(f"[{idx}] Evaluation completed successfully.")
        except Exception as e:
            print(f"[{idx}] Evaluation error: {e}")
            row_df = pd.DataFrame([{
                'user_input': q,
                'response': a,
                'reference': g,
                'answer_correctness': 0.0,
                'answer_relevancy': 0.0
            }])

        existing_df = pd.concat([existing_df, row_df], ignore_index=True)
        existing_df.drop_duplicates(
            subset=['user_input', 'response', 'reference'], keep='last', inplace=True
        )
        existing_df.to_csv(output_file, index=False)

    print(f"All done — results saved to: {output_file}")


## Evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [13]:
llm = ChatOpenAI(
    openai_api_base=os.getenv("OPENAI_API_BASE"), 
    openai_api_key=os.getenv("OPENAI_API_KEY"),              
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"         
)
llm = LangchainLLMWrapper(llm)

embedding = LocalAIEmbeddings(
    openai_api_base=os.getenv("EMBED_URL"), 
    openai_api_key=os.getenv("EMBED_TOKEN"), 
    model=os.getenv("EMBED_MODEL")
)
embedding = LangchainEmbeddingsWrapper(embedding)

In [14]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid.csv'

#results = start_evaluate(test_set_file, output_file)

df_lr_hybrid = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid.csv
Average answer correctness: 0.5122069836697539
Average answer relevancy: 0.6982478951966491


In [15]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local.csv'

#results = start_evaluate(test_set_file, output_file)

df_lr_local = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local.csv
Average answer correctness: 0.4249733737262503
Average answer relevancy: 0.6681967775090379


In [16]:
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules/method2_graphrag_drift.csv'

#results = start_evaluate(test_set_file, output_file)

df_gr_drift = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift['answer_relevancy'].mean())

FILE: ./results/study_rules/method2_graphrag_drift.csv
Average answer correctness: 0.3500237400348909
Average answer relevancy: 0.7774822101039724


In [17]:
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules/naive_rag.csv'

#results = start_evaluate(test_set_file, output_file)

df_naive = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive['answer_correctness'].mean())
print("Average answer relevancy:", df_naive['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag.csv
Average answer correctness: 0.5506225438324175
Average answer relevancy: 0.7527904156645951


In [None]:
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules/naive_rag_small_top_k.csv'

#results = start_evaluate(test_set_file, output_file)

df_naive_small_k = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_small_top_k.csv
Average answer correctness: 0.5652947016792146
Average answer relevancy: 0.6714817258125219


## Evaluation with deepseek-r1-distill-llama-70b

In [24]:
llm = ChatOpenAI(
    openai_api_base=os.getenv("OPENAI_API_BASE"), 
    openai_api_key=os.getenv("OPENAI_API_KEY"),              
    model_name="deepseek-r1-distill-llama-70b"         
)
llm = LangchainLLMWrapper(llm)

embedding = LocalAIEmbeddings(
    openai_api_base=os.getenv("EMBED_URL"), 
    openai_api_key=os.getenv("EMBED_TOKEN"), 
    model=os.getenv("EMBED_MODEL")
)
embedding = LangchainEmbeddingsWrapper(embedding)

In [20]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid_deepseek.csv'

#results = start_evaluate(test_set_file, output_file)

df_lr_hybrid_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid_deepseek.csv
Average answer correctness: 0.49797992824280307
Average answer relevancy: 0.7349126593819586


In [21]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local_deepseek.csv'

#results = start_evaluate(test_set_file, output_file)

df_lr_local_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local_deepseek.csv
Average answer correctness: 0.4016634365973334
Average answer relevancy: 0.5804095482320186


In [22]:
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules/method2_graphrag_drift_deepseek.csv'

#results = start_evaluate(test_set_file, output_file)

df_gr_drift_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method2_graphrag_drift_deepseek.csv
Average answer correctness: 0.39543015771151035
Average answer relevancy: 0.7339491973490562


In [27]:
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules/naive_rag_deepseek.csv'

#results = start_evaluate(test_set_file, output_file)

df_naive_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_deepseek.csv
Average answer correctness: 0.5585839181481368
Average answer relevancy: 0.7402449547731885


In [30]:
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules/naive_rag_small_top_k_deepseek.csv'

#results = start_evaluate(test_set_file, output_file)

df_naive_small_k_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_small_top_k_deepseek.csv
Average answer correctness: 0.5452149662099343
Average answer relevancy: 0.6653694964764589


# Second Evaluation

In [4]:
def start_evaluate_2(test_set_file, output_file):
    with open(test_set_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
    else:
        existing_df = pd.DataFrame(columns=[
            'user_input', 'response', 'reference',
            'answer_correctness', 'answer_relevancy', 'factual_correctness(mode=f1)',
            'factual_correctness(mode=recall)', 'semantic_similarity'
        ])

    existing_df.fillna('', inplace=True)

    for idx, (q, a, g) in enumerate(zip(data['question'], data['answer'], data['ground_truth']), start=1):
        short_q = q[:50].replace('\n', ' ')
        print(f"[{idx}] Processing question: \"{short_q}...\"")
        if a == "x":
            print(f"[{idx}] Skipped (x in ans)")
            continue
        match = existing_df[
            (existing_df['user_input'] == q) &
            (existing_df['response'] == a) &
            (existing_df['reference'] == g)
        ]
        if not match.empty:
            existing_row = match.iloc[0]
            correctness = pd.to_numeric(existing_row['answer_correctness'], errors='coerce')
            relevancy = pd.to_numeric(existing_row['answer_relevancy'], errors='coerce')
            factual_correctness = pd.to_numeric(existing_row['factual_correctness(mode=f1)'], errors='coerce')
            factual_correctness_recall = pd.to_numeric(existing_row['factual_correctness(mode=recall)'], errors='coerce')
            semantic_similarity = pd.to_numeric(existing_row['semantic_similarity'], errors='coerce')
            if (
                not pd.isna(correctness) and not pd.isna(relevancy) and not pd.isna(factual_correctness) 
                and not pd.isna(factual_correctness_recall) and not pd.isna(semantic_similarity) and 
                correctness > 0.0 and relevancy > 0.0 and factual_correctness > 0.0  and factual_correctness_recall > 0.0
                and semantic_similarity > 0.0
            ):
                print(f"[{idx}] Skipped (already evaluated with valid metrics)")
                continue

        print(f"[{idx}] Running evaluation...")
        single_data = {
            "question": [q],
            "answer": [a],
            "ground_truth": [g]
        }
        single_dataset = Dataset.from_dict(single_data)

        try:
            results = evaluate(
                dataset=single_dataset,
                metrics=[answer_correctness, answer_relevancy, FactualCorrectness(),
                         FactualCorrectness(mode="recall"), SemanticSimilarity()],
                llm=llm,
                embeddings=embedding
            )
            # results = evaluate(
            #     dataset=single_dataset,
            #     metrics=[answer_correctness, answer_relevancy, FactualCorrectness(atomicity="high", coverage="high"),
            #              FactualCorrectness(mode="recall",atomicity="high", coverage="high"), SemanticSimilarity()],
            #     llm=llm,
            #     embeddings=embedding
            # )
            print(results)
            row_df = results.to_pandas()
            print(f"[{idx}] Evaluation completed successfully.")
        except Exception as e:
            print(f"[{idx}] Evaluation error: {e}")
            row_df = pd.DataFrame([{
                'user_input': q,
                'response': a,
                'reference': g,
                'answer_correctness': 0.0,
                'answer_relevancy': 0.0,
                'factual_correctness(mode=f1)': 0.0,
                'factual_correctness(mode=recall)': 0.0,
                'semantic_similarity': 0.0
            }])

        existing_df = pd.concat([existing_df, row_df], ignore_index=True)
        existing_df.drop_duplicates(
            subset=['user_input', 'response', 'reference'], keep='last', inplace=True
        )
        existing_df.to_csv(output_file, index=False)

    print(f"All done — results saved to: {output_file}")

## Study Rules Evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [5]:
llm = ChatOpenAI(
    openai_api_base=os.getenv("OPENAI_API_BASE"), 
    openai_api_key=os.getenv("OPENAI_API_KEY"),              
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"         
)
llm = LangchainLLMWrapper(llm)

embedding = LocalAIEmbeddings(
    openai_api_base=os.getenv("EMBED_URL"), 
    openai_api_key=os.getenv("EMBED_TOKEN"), 
    model=os.getenv("EMBED_MODEL")
)
embedding = LangchainEmbeddingsWrapper(embedding)

In [None]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules_2/method1_lightrag_hybrid.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_lr_hybrid2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid2['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_lr_hybrid2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_lr_hybrid2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_lr_hybrid2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method1_lightrag_hybrid.csv
Average answer correctness: 0.5093708492159725
Average answer relevancy: 0.6982478951966493
Average factual correctness(mode=f1): 0.36714285714285716
Average factual correctness(mode=recall): 0.3614285714285715
Average semantic similarity: 0.7553770316263085


In [14]:
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules_2/method1_lightrag_local.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_lr_local2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local2['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_lr_local2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_lr_local2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_lr_local2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method1_lightrag_local.csv
Average answer correctness: 0.4249733737262503
Average answer relevancy: 0.6681967775090379
Average factual correctness(mode=f1): 0.2865
Average factual correctness(mode=recall): 0.2575
Average semantic similarity: 0.7379938908387216


In [15]:
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules_2/method2_graphrag_drift.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_gr_drift2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift2['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_gr_drift2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_gr_drift2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_gr_drift2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method2_graphrag_drift.csv
Average answer correctness: 0.3500237400348909
Average answer relevancy: 0.7774822101039724
Average factual correctness(mode=f1): 0.2935
Average factual correctness(mode=recall): 0.34400000000000003
Average semantic similarity: 0.7109516323324775


In [6]:
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules_2/naive_rag.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_naive2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive2['answer_correctness'].mean())
print("Average answer relevancy:", df_naive2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_naive2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_naive2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_naive2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/naive_rag.csv
Average answer correctness: 0.5502497388212984
Average answer relevancy: 0.7530999940893139
Average factual correctness(mode=f1): 0.4615
Average factual correctness(mode=recall): 0.43050000000000005
Average semantic similarity: 0.7995977826213055


In [8]:
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules_2/naive_rag_small_top_k.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_naive_small_k2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k2['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_naive_small_k2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_naive_small_k2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_naive_small_k2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/naive_rag_small_top_k.csv
Average answer correctness: 0.5536140471861982
Average answer relevancy: 0.6617379602286304
Average factual correctness(mode=f1): 0.45050000000000007
Average factual correctness(mode=recall): 0.4425
Average semantic similarity: 0.7969714609933498


## Genetics Evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [10]:
test_set_file = './test_data/genetics/test_set_method1_lightrag_hybrid.json'
output_file = './results/genetics/method1_lightrag_hybrid.csv'

results = start_evaluate_2(test_set_file, output_file)

df_lr_hybrid2_genetics = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid2_genetics['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid2_genetics['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_lr_hybrid2_genetics['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_lr_hybrid2_genetics['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_lr_hybrid2_genetics['semantic_similarity'].mean())

  existing_df.fillna('', inplace=True)


[1] Processing question: "Explain Griffith's transformation experiments. Wha..."
[1] Skipped (already evaluated with valid metrics)
[2] Processing question: "Why were radioactive sulfur and phosphorous used t..."
[2] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500907, Requested 678. Please try again in 4m34.0008s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:05<01:38, 32.82s/it]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500844, Requested 671. Please try again in 4m21.891199999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billin

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.5869}
[2] Evaluation completed successfully.
[3] Processing question: "Provide a brief summary of the Sanger sequencing m..."
[3] Skipped (already evaluated with valid metrics)
[4] Processing question: "Describe the structure and complementary base pair..."
[4] Skipped (already evaluated with valid metrics)
[5] Processing question: "How did the scientific community learn that DNA re..."
[5] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499911, Requested 1030. Please try again in 2m42.527399999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [00:58<01:28, 29.39s/it]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499639, Requested 1036. Please try again in 1m56.5642s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/bill

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.4424}
[5] Evaluation completed successfully.
[6] Processing question: "DNA replication is bidirectional and discontinuous..."
[6] Skipped (already evaluated with valid metrics)
[7] Processing question: "What are Okazaki fragments and how they are formed..."
[7] Skipped (already evaluated with valid metrics)
[8] Processing question: "If the rate of replication in a particular prokary..."
[8] Skipped (already evaluated with valid metrics)
[9] Processing question: "Explain the events taking place at the replication..."
[9] Skipped (already evaluated with valid metrics)
[10] Processing question: "What is the role of a primer in DNA replication? W..."
[10] Skipped (already evaluated with valid metrics)
[11] Processing question: "How do the linear chromosomes in eukaryotes ensure..."
[11] Skipped (already evaluated with valid metrics)
[12

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499292, Requested 1081. Please try again in 1m4.4532s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [00:26<00:39, 13.08s/it]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500931, Requested 1088. Please try again in 5m49.0482s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7078}
[14] Evaluation completed successfully.
[15] Processing question: "If mRNA is complementary to the DNA template stran..."
[15] Skipped (already evaluated with valid metrics)
[16] Processing question: "In your own words, describe the difference between..."
[16] Skipped (already evaluated with valid metrics)
[17] Processing question: "Transcribe and translate the following DNA sequenc..."
[17] Skipped (already evaluated with valid metrics)
[18] Processing question: "Explain how single nucleotide changes can have vas..."
[18] Skipped (already evaluated with valid metrics)
[19] Processing question: "Name two differences between prokaryotic and eukar..."
[19] Skipped (already evaluated with valid metrics)
[20] Processing question: "Describe how controlling gene expression will alte..."
[20] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499958, Requested 1058. Please try again in 2m55.4752s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:37<02:26, 48.71s/it]Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499773, Requested 1159. Please try again in 2m41.028999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/bill

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7265}
[20] Evaluation completed successfully.
[21] Processing question: "Describe how transcription in prokaryotic cells ca..."
[21] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499391, Requested 1093. Please try again in 1m23.4652s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [00:51<01:17, 25.95s/it]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500780, Requested 983. Please try again in 5m4.7328s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', '

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7626}
[21] Evaluation completed successfully.
[22] Processing question: "What is the difference between a repressible and a..."
[22] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500097, Requested 919. Please try again in 2m55.671599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:34<02:21, 47.16s/it]Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500044, Requested 926. Please try again in 2m47.6402s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billin

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7918}
[22] Evaluation completed successfully.
[23] Processing question: "In cancer cells, alteration to epigenetic modifica..."
[23] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500853, Requested 844. Please try again in 4m53.4122s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:58<02:57, 59.22s/it]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500615, Requested 844. Please try again in 4m12.1312s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', '

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.6882}
[23] Evaluation completed successfully.
[24] Processing question: "A mutation within the promoter region can alter tr..."
[24] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499929, Requested 826. Please try again in 2m10.353799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:31<02:17, 45.73s/it]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499879, Requested 826. Please try again in 2m1.8128s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.8163}
[24] Evaluation completed successfully.
[25] Processing question: "What could happen if a cell had too much of an act..."
[25] Running evaluation...


Evaluating:  40%|████      | 2/5 [01:11<01:46, 35.66s/it]Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 503786, Requested 928. Please try again in 13m34.656s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  60%|██████    | 3/5 [02:46<02:00, 60.30s/it]Exception raised in Job[2]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Evaluating: 100%|██████████| 5/5 [03:00<00:00, 36.00s/it]


{'answer_correctness': nan, 'answer_relevancy': 0.9648, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7035}
[25] Evaluation completed successfully.
[26] Processing question: "Protein modification can alter gene expression in ..."
[26] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 503298, Requested 1058. Please try again in 12m32.734999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:06<01:39, 33.19s/it]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 503026, Requested 1065. Please try again in 11m47.0226s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/bi

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.6606}
[26] Evaluation completed successfully.
[27] Processing question: "Alternative forms of a protein can be beneficial o..."
[27] Running evaluation...


Evaluating:  20%|██        | 1/5 [00:00<00:00,  9.04it/s]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 502221, Requested 788. Please try again in 8m40.118999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:08<02:00, 40.20s/it]Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 501911, Requested 915. Please try again in 8m8.4396s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.6408}
[27] Evaluation completed successfully.
[28] Processing question: "Changes in epigenetic modifications alter the acce..."
[28] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 501192, Requested 1001. Please try again in 6m19.0334s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:05<01:38, 32.79s/it]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500940, Requested 994. Please try again in 5m34.303799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billi

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7476}
[28] Evaluation completed successfully.
[29] Processing question: "New drugs are being developed that decrease DNA me..."
[29] Running evaluation...


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500230, Requested 1549. Please try again in 5m7.419799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  40%|████      | 2/5 [01:24<02:06, 42.05s/it]Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500153, Requested 1543. Please try again in 4m53.069s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billin

{'answer_correctness': nan, 'answer_relevancy': nan, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.5331}
[29] Evaluation completed successfully.
[30] Processing question: "How can understanding the gene expression pattern ..."
[30] Running evaluation...


Evaluating:  40%|████      | 2/5 [02:06<03:10, 63.48s/it]Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 503738, Requested 1037. Please try again in 13m45.2302s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Evaluating:  60%|██████    | 3/5 [02:34<01:36, 48.33s/it]Exception raised in Job[2]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Evaluating: 100%|██████████| 5/5 [03:00<00:00, 36.01s/it]


{'answer_correctness': nan, 'answer_relevancy': 0.9425, 'factual_correctness(mode=f1)': nan, 'factual_correctness(mode=recall)': nan, 'semantic_similarity': 0.7586}
[30] Evaluation completed successfully.
[31] Processing question: "Describe the process of Southern blotting...."
[31] Running evaluation...


Evaluating:  20%|██        | 1/5 [00:30<02:02, 30.62s/it]


KeyboardInterrupt: 

Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 503124, Requested 995. Please try again in 11m51.8626s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Exception raised in Job[0]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jtgz94g3fdsbcwjdvkgqmjb3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 502956, Requested 1083. Please try again in 11m38.062s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
Exception raised in Job[2]: RateLimitError(Error code:

In [None]:
test_set_file = './test_data/genetics/test_set_method2_graphrag_drift.json'
output_file = './results/genetics/method2_graphrag_drift.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_gr_drift2_genetics = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift2_genetics['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift2_genetics['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_gr_drift2_genetics['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_gr_drift2_genetics['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_gr_drift2_genetics['semantic_similarity'].mean())

In [None]:
test_set_file = './test_data/genetics/test_set_naive_rag_5_top_k.json'
output_file = './results/genetics/naive_rag_5_top_k.csv'

#results = start_evaluate_2(test_set_file, output_file)

df_naive_small_k2_genetics = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k2_genetics['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k2_genetics['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_naive_small_k2_genetics['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_naive_small_k2_genetics['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_naive_small_k2_genetics['semantic_similarity'].mean())