In [1]:
import os
import pandas as pd
from ragas.metrics import answer_relevancy, answer_correctness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.chat_models import ChatOpenAI
from langchain_localai import LocalAIEmbeddings
from datasets import Dataset
from dotenv import load_dotenv
import json

import json
import pandas as pd
import os
from datasets import Dataset
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load endpoint URLs, credentials and model names from the local .env file
# into the process environment before they are read below.
load_dotenv()

# Judge LLM: a chat model configured from OPENAI_API_BASE / OPENAI_API_KEY /
# LLM_MODEL_NAME, wrapped in the adapter that ragas is handed as `llm`.
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name=os.getenv("LLM_MODEL_NAME"),
    )
)

# Embedding model configured from EMBED_URL / EMBED_TOKEN / EMBED_MODEL,
# wrapped in the adapter that ragas is handed as `embeddings`.
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

  llm = ChatOpenAI(


# First Evaluation

In [None]:
def start_evaluate(test_set_file, output_file):
    """Score each answer of a JSON test set with ragas, checkpointing to a CSV.

    The test set is a JSON object holding three parallel lists under the keys
    ``question``, ``answer`` and ``ground_truth``. Every triple is evaluated on
    its own with ``answer_correctness`` and ``answer_relevancy`` (using the
    module-level ``llm`` and ``embedding``) and the accumulated results are
    written to ``output_file`` after each triple, so an interrupted run can be
    resumed by calling the function again with the same arguments.

    A triple is skipped when its answer is the placeholder ``"x"`` or when
    ``output_file`` already holds a row for it whose two metrics are numeric
    and strictly positive. Rows with missing or ``0.0`` metrics are evaluated
    again. When an evaluation raises, the row is stored with empty (NaN)
    metrics so that it is retried on the next run and does not pull down
    column means computed from the CSV.

    Args:
        test_set_file: Path to the JSON test set.
        output_file: Path of the CSV that previous results are read from and
            all results are written to. Missing parent directories are created.

    Returns:
        pandas.DataFrame: every accumulated row (previous and new), as last
        written to ``output_file``.
    """
    key_columns = ['user_input', 'response', 'reference']
    metric_columns = ['answer_correctness', 'answer_relevancy']

    with open(test_set_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Make sure the checkpoint can be written *before* spending any LLM calls.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
    else:
        existing_df = pd.DataFrame(columns=key_columns + metric_columns)

    # Empty text cells come back from CSV as NaN; blank them so the equality
    # match below still finds the row. Metric columns are left untouched.
    existing_df[key_columns] = existing_df[key_columns].fillna('')

    triples = zip(data['question'], data['answer'], data['ground_truth'])
    for idx, (q, a, g) in enumerate(triples, start=1):
        short_q = q[:50].replace('\n', ' ')
        print(f"[{idx}] Processing question: \"{short_q}...\"")
        if a == "x":
            print(f"[{idx}] Skipped (x in ans)")
            continue

        match = existing_df[
            (existing_df['user_input'] == q) &
            (existing_df['response'] == a) &
            (existing_df['reference'] == g)
        ]
        if not match.empty:
            existing_row = match.iloc[0]
            scores = [
                pd.to_numeric(existing_row[column], errors='coerce')
                for column in metric_columns
            ]
            # 0.0 is still treated as "not evaluated" so that CSVs written by
            # earlier runs (which stored failures as 0.0) are retried.
            if all(not pd.isna(score) and score > 0.0 for score in scores):
                print(f"[{idx}] Skipped (already evaluated with valid metrics)")
                continue

        print(f"[{idx}] Running evaluation...")
        single_dataset = Dataset.from_dict({
            "question": [q],
            "answer": [a],
            "ground_truth": [g]
        })

        try:
            results = evaluate(
                dataset=single_dataset,
                metrics=[answer_correctness, answer_relevancy],
                llm=llm,
                embeddings=embedding
            )
            row_df = results.to_pandas()
            print(f"[{idx}] Evaluation completed successfully.")
        except Exception as e:
            # Best effort: keep going, but record "no score" rather than a
            # fake 0.0 that would be averaged in as a real result.
            print(f"[{idx}] Evaluation error: {e}")
            row_df = pd.DataFrame([{
                'user_input': q,
                'response': a,
                'reference': g,
                'answer_correctness': float('nan'),
                'answer_relevancy': float('nan')
            }])

        existing_df = pd.concat([existing_df, row_df], ignore_index=True)
        # A re-evaluated triple replaces its older row.
        existing_df = existing_df.drop_duplicates(
            subset=key_columns, keep='last', ignore_index=True
        )
        # Checkpoint after every triple so a crash loses at most one result.
        existing_df.to_csv(output_file, index=False)

    print(f"All done — results saved to: {output_file}")
    return existing_df


## Evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [13]:
# Rebind the judge LLM used by the evaluation functions to the
# llama-4-scout model; endpoint and key still come from the environment.
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    )
)

# Embedding model configured from EMBED_URL / EMBED_TOKEN / EMBED_MODEL.
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

In [14]:
# Study-rules test set and score file for "method1 / lightrag hybrid" (per the file names).
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_hybrid = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid.csv
Average answer correctness: 0.5122069836697539
Average answer relevancy: 0.6982478951966491


In [15]:
# Study-rules test set and score file for "method1 / lightrag local" (per the file names).
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_local = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local.csv
Average answer correctness: 0.4249733737262503
Average answer relevancy: 0.6681967775090379


In [16]:
# Study-rules test set and score file for "method2 / graphrag drift" (per the file names).
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules/method2_graphrag_drift.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_gr_drift = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift['answer_relevancy'].mean())

FILE: ./results/study_rules/method2_graphrag_drift.csv
Average answer correctness: 0.3500237400348909
Average answer relevancy: 0.7774822101039724


In [17]:
# Study-rules test set and score file for "naive rag" (per the file names).
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules/naive_rag.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive['answer_correctness'].mean())
print("Average answer relevancy:", df_naive['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag.csv
Average answer correctness: 0.5506225438324175
Average answer relevancy: 0.7527904156645951


In [None]:
# Study-rules test set and score file for "naive rag, small top_k" (per the file names).
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules/naive_rag_small_top_k.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive_small_k = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_small_top_k.csv
Average answer correctness: 0.5652947016792146
Average answer relevancy: 0.6714817258125219


## Evaluation with deepseek-r1-distill-llama-70b

In [24]:
# Rebind the judge LLM used by the evaluation functions to the
# deepseek-r1-distill model; endpoint and key still come from the environment.
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="deepseek-r1-distill-llama-70b",
    )
)

# Embedding model configured from EMBED_URL / EMBED_TOKEN / EMBED_MODEL.
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

In [20]:
# Same "method1 / lightrag hybrid" test set; scores are kept in a separate `_deepseek` CSV.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid_deepseek.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_hybrid_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid_deepseek.csv
Average answer correctness: 0.49797992824280307
Average answer relevancy: 0.7349126593819586


In [21]:
# Same "method1 / lightrag local" test set; scores are kept in a separate `_deepseek` CSV.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local_deepseek.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_local_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local_deepseek.csv
Average answer correctness: 0.4016634365973334
Average answer relevancy: 0.5804095482320186


In [22]:
# Same "method2 / graphrag drift" test set; scores are kept in a separate `_deepseek` CSV.
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules/method2_graphrag_drift_deepseek.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_gr_drift_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/method2_graphrag_drift_deepseek.csv
Average answer correctness: 0.39543015771151035
Average answer relevancy: 0.7339491973490562


In [27]:
# Same "naive rag" test set; scores are kept in a separate `_deepseek` CSV.
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules/naive_rag_deepseek.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_deepseek.csv
Average answer correctness: 0.5585839181481368
Average answer relevancy: 0.7402449547731885


In [30]:
# Same "naive rag, small top_k" test set; scores are kept in a separate `_deepseek` CSV.
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules/naive_rag_small_top_k_deepseek.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive_small_k_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k_deepseek['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k_deepseek['answer_relevancy'].mean())

FILE: ./results/study_rules/naive_rag_small_top_k_deepseek.csv
Average answer correctness: 0.5452149662099343
Average answer relevancy: 0.6653694964764589


# Second Evaluation

In [3]:
def start_evaluate_2(test_set_file, output_file):
    """Score each answer of a JSON test set with five ragas metrics, checkpointing to a CSV.

    The test set is a JSON object holding three parallel lists under the keys
    ``question``, ``answer`` and ``ground_truth``. Every triple is evaluated on
    its own with ``answer_correctness``, ``answer_relevancy``,
    ``FactualCorrectness()`` (stored as ``factual_correctness(mode=f1)``),
    ``FactualCorrectness(mode="recall")`` and ``SemanticSimilarity()``, using
    the module-level ``llm`` and ``embedding``. The accumulated results are
    written to ``output_file`` after each triple, so an interrupted run can be
    resumed by calling the function again with the same arguments.

    A triple is skipped when its answer is the placeholder ``"x"`` or when
    ``output_file`` already holds a row for it whose five metrics are all
    numeric and strictly positive. Rows with any missing or ``0.0`` metric are
    evaluated again. When an evaluation raises, the row is stored with empty
    (NaN) metrics so that it is retried on the next run and does not pull down
    column means computed from the CSV.

    Args:
        test_set_file: Path to the JSON test set.
        output_file: Path of the CSV that previous results are read from and
            all results are written to. Missing parent directories are created.

    Returns:
        pandas.DataFrame: every accumulated row (previous and new), as last
        written to ``output_file``.
    """
    key_columns = ['user_input', 'response', 'reference']
    metric_columns = [
        'answer_correctness', 'answer_relevancy', 'factual_correctness(mode=f1)',
        'factual_correctness(mode=recall)', 'semantic_similarity'
    ]

    with open(test_set_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Make sure the checkpoint can be written *before* spending any LLM calls.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
    else:
        existing_df = pd.DataFrame(columns=key_columns + metric_columns)

    # Empty text cells come back from CSV as NaN; blank them so the equality
    # match below still finds the row. Metric columns are left untouched.
    existing_df[key_columns] = existing_df[key_columns].fillna('')

    triples = zip(data['question'], data['answer'], data['ground_truth'])
    for idx, (q, a, g) in enumerate(triples, start=1):
        short_q = q[:50].replace('\n', ' ')
        print(f"[{idx}] Processing question: \"{short_q}...\"")
        if a == "x":
            print(f"[{idx}] Skipped (x in ans)")
            continue

        match = existing_df[
            (existing_df['user_input'] == q) &
            (existing_df['response'] == a) &
            (existing_df['reference'] == g)
        ]
        if not match.empty:
            existing_row = match.iloc[0]
            scores = [
                pd.to_numeric(existing_row[column], errors='coerce')
                for column in metric_columns
            ]
            # 0.0 is still treated as "not evaluated" so that CSVs written by
            # earlier runs (which stored failures as 0.0) are retried.
            if all(not pd.isna(score) and score > 0.0 for score in scores):
                print(f"[{idx}] Skipped (already evaluated with valid metrics)")
                continue

        print(f"[{idx}] Running evaluation...")
        single_dataset = Dataset.from_dict({
            "question": [q],
            "answer": [a],
            "ground_truth": [g]
        })

        try:
            # Variant tried earlier: pass atomicity="high", coverage="high" to
            # both FactualCorrectness instances.
            results = evaluate(
                dataset=single_dataset,
                metrics=[answer_correctness, answer_relevancy, FactualCorrectness(),
                         FactualCorrectness(mode="recall"), SemanticSimilarity()],
                llm=llm,
                embeddings=embedding
            )
            print(results)
            row_df = results.to_pandas()
            print(f"[{idx}] Evaluation completed successfully.")
        except Exception as e:
            # Best effort: keep going, but record "no score" rather than a
            # fake 0.0 that would be averaged in as a real result.
            print(f"[{idx}] Evaluation error: {e}")
            row_df = pd.DataFrame([{
                'user_input': q,
                'response': a,
                'reference': g,
                'answer_correctness': float('nan'),
                'answer_relevancy': float('nan'),
                'factual_correctness(mode=f1)': float('nan'),
                'factual_correctness(mode=recall)': float('nan'),
                'semantic_similarity': float('nan')
            }])

        existing_df = pd.concat([existing_df, row_df], ignore_index=True)
        # A re-evaluated triple replaces its older row.
        existing_df = existing_df.drop_duplicates(
            subset=key_columns, keep='last', ignore_index=True
        )
        # Checkpoint after every triple so a crash loses at most one result.
        existing_df.to_csv(output_file, index=False)

    print(f"All done — results saved to: {output_file}")
    return existing_df

## Study Rules Evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [4]:
# Rebind the judge LLM used by the evaluation functions to the
# llama-4-scout model; endpoint and key still come from the environment.
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    )
)

# Embedding model configured from EMBED_URL / EMBED_TOKEN / EMBED_MODEL.
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

In [None]:
# "method1 / lightrag hybrid" test set; five-metric scores live under results/study_rules_2.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules_2/method1_lightrag_hybrid.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate_2(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_hybrid2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid2['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_lr_hybrid2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_lr_hybrid2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_lr_hybrid2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method1_lightrag_hybrid.csv
Average answer correctness: 0.5093708492159725
Average answer relevancy: 0.6982478951966493
Average factual correctness(mode=f1): 0.36714285714285716
Average factual correctness(mode=recall): 0.3614285714285715
Average semantic similarity: 0.7553770316263085


In [14]:
# "method1 / lightrag local" test set; five-metric scores live under results/study_rules_2.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules_2/method1_lightrag_local.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate_2(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_lr_local2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local2['answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_lr_local2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_lr_local2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_lr_local2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method1_lightrag_local.csv
Average answer correctness: 0.4249733737262503
Average answer relevancy: 0.6681967775090379
Average factual correctness(mode=f1): 0.2865
Average factual correctness(mode=recall): 0.2575
Average semantic similarity: 0.7379938908387216


In [15]:
# "method2 / graphrag drift" test set; five-metric scores live under results/study_rules_2.
test_set_file = './test_data/study_rules/test_set_method2_graphrag_drift.json'
output_file = './results/study_rules_2/method2_graphrag_drift.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate_2(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_gr_drift2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_gr_drift2['answer_correctness'].mean())
print("Average answer relevancy:", df_gr_drift2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_gr_drift2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_gr_drift2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_gr_drift2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/method2_graphrag_drift.csv
Average answer correctness: 0.3500237400348909
Average answer relevancy: 0.7774822101039724
Average factual correctness(mode=f1): 0.2935
Average factual correctness(mode=recall): 0.34400000000000003
Average semantic similarity: 0.7109516323324775


In [6]:
# "naive rag" test set; five-metric scores live under results/study_rules_2.
test_set_file = './test_data/study_rules/test_set_naive_rag.json'
output_file = './results/study_rules_2/naive_rag.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate_2(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive2['answer_correctness'].mean())
print("Average answer relevancy:", df_naive2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_naive2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_naive2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_naive2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/naive_rag.csv
Average answer correctness: 0.5502497388212984
Average answer relevancy: 0.7530999940893139
Average factual correctness(mode=f1): 0.4615
Average factual correctness(mode=recall): 0.43050000000000005
Average semantic similarity: 0.7995977826213055


In [None]:
# TODO: run again
# "naive rag, small top_k" test set; five-metric scores live under results/study_rules_2.
test_set_file = './test_data/study_rules/test_set_naive_rag_small_top_k.json'
output_file = './results/study_rules_2/naive_rag_small_top_k.csv'

# Scoring call is disabled: this cell only reads a CSV that must already exist.
# Uncomment to (re)score with whichever `llm` / `embedding` are currently bound.
#results = start_evaluate_2(test_set_file, output_file)

# Per-metric mean over the stored rows.
df_naive_small_k2 = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_naive_small_k2['answer_correctness'].mean())
print("Average answer relevancy:", df_naive_small_k2['answer_relevancy'].mean())
print("Average factual correctness(mode=f1):", df_naive_small_k2['factual_correctness(mode=f1)'].mean())
print("Average factual correctness(mode=recall):", df_naive_small_k2['factual_correctness(mode=recall)'].mean())
print("Average semantic similarity:", df_naive_small_k2['semantic_similarity'].mean())

FILE: ./results/study_rules_2/naive_rag_small_top_k.csv
Average answer correctness: 0.6082791145290342
Average answer relevancy: 0.6331640165853277
Average factual correctness(mode=f1): 0.5135714285714286
Average factual correctness(mode=recall): 0.5121428571428571
Average semantic similarity: 0.7969714609933498


## Genetics Evaluation with meta-llama/llama-4-scout-17b-16e-instruct