In [9]:
import os
import pandas as pd
from ragas.metrics import answer_relevancy, answer_correctness
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.chat_models import ChatOpenAI
from langchain_localai import LocalAIEmbeddings
from datasets import Dataset
from dotenv import load_dotenv
import json

import json
import pandas as pd
import os
from datasets import Dataset
from tqdm import tqdm
import numpy as np

In [24]:
# Pull settings into the process environment via python-dotenv, then echo the
# configured model name so the cell output records which LLM is in use.
load_dotenv()
print(os.getenv("LLM_MODEL_NAME"))

# Chat model reached through an OpenAI-compatible endpoint, wrapped for ragas.
# `llm` is read as a global by start_evaluate().
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name=os.getenv("LLM_MODEL_NAME"),
    )
)

# Embedding model, wrapped for ragas.
# `embedding` is read as a global by start_evaluate().
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

meta-llama/llama-4-scout-17b-16e-instruct


In [27]:
def start_evaluate(test_set_file, output_file):
    """Score a test set with ragas one sample at a time, resuming from a CSV.

    The test set is a JSON object with three parallel lists: ``question``,
    ``answer`` and ``ground_truth``. Each sample is evaluated on its own with
    the ``answer_correctness`` and ``answer_relevancy`` metrics, using the
    module-level ``llm`` and ``embedding`` objects, and the accumulated results
    are written to ``output_file`` after every sample so an interrupted run can
    be resumed.

    A sample is skipped when ``output_file`` already holds a row with the same
    question/answer/ground-truth triple whose two metrics are both numeric and
    greater than zero. If evaluation raises, the sample is stored with both
    metrics set to 0; such rows are retried on the next run, but until then
    they are part of the CSV and therefore of any average computed from it.

    Args:
        test_set_file: Path to the JSON test set.
        output_file: Path to the results CSV (read if present, then updated).

    Returns:
        pandas.DataFrame: all result rows, i.e. what was last written to (or
        read from) ``output_file``.
    """
    key_columns = ['user_input', 'response', 'reference']
    metric_columns = ['answer_correctness', 'answer_relevancy']

    with open(test_set_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    questions = data['question']
    answers = data['answer']
    references = data['ground_truth']
    if not (len(questions) == len(answers) == len(references)):
        # zip() below stops at the shortest list; make the data loss visible.
        print(
            "Warning: test set lists differ in length "
            f"({len(questions)} questions, {len(answers)} answers, "
            f"{len(references)} ground truths); extra entries are ignored."
        )

    # Create the results directory up front so the first to_csv() cannot fail
    # after an evaluation has already been paid for.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_file):
        # Read the key columns as text: with dtype inference an answer such as
        # "42" would come back as an int and never match its original string.
        existing_df = pd.read_csv(
            output_file, dtype={column: str for column in key_columns}
        )
    else:
        existing_df = pd.DataFrame(columns=key_columns + metric_columns)

    # Empty text cells are read back from CSV as NaN; normalise them to '' so
    # they compare equal to empty strings in the test set. Metric columns are
    # left alone so they stay numeric.
    existing_df[key_columns] = existing_df[key_columns].fillna('')

    for idx, (q, a, g) in enumerate(zip(questions, answers, references), start=1):
        short_q = q[:50].replace('\n', ' ')
        print(f"[{idx}] Processing question: \"{short_q}...\"")

        match = existing_df[
            (existing_df['user_input'] == q) &
            (existing_df['response'] == a) &
            (existing_df['reference'] == g)
        ]
        if not match.empty:
            existing_row = match.iloc[0]
            correctness = pd.to_numeric(existing_row['answer_correctness'], errors='coerce')
            relevancy = pd.to_numeric(existing_row['answer_relevancy'], errors='coerce')
            if (
                not pd.isna(correctness) and not pd.isna(relevancy) and
                correctness > 0 and relevancy > 0
            ):
                print(f"[{idx}] Skipped (already evaluated with valid metrics)")
                continue

        print(f"[{idx}] Running evaluation...")
        single_data = {
            "question": [q],
            "answer": [a],
            "ground_truth": [g]
        }
        single_dataset = Dataset.from_dict(single_data)

        try:
            results = evaluate(
                dataset=single_dataset,
                metrics=[answer_correctness, answer_relevancy],
                llm=llm,
                embeddings=embedding
            )
            row_df = results.to_pandas()
            print(f"[{idx}] Evaluation completed successfully.")
        except Exception as e:
            # Best effort: record the failure with zero scores (the skip check
            # above treats zeros as "not yet evaluated") and keep going.
            print(f"[{idx}] Evaluation error: {e}")
            row_df = pd.DataFrame([{
                'user_input': q,
                'response': a,
                'reference': g,
                'answer_correctness': 0,
                'answer_relevancy': 0
            }])

        # Newest result wins over any stale row for the same sample.
        existing_df = pd.concat([existing_df, row_df], ignore_index=True)
        existing_df = existing_df.drop_duplicates(subset=key_columns, keep='last')
        # Checkpoint after every sample so progress survives an interruption.
        existing_df.to_csv(output_file, index=False)

    print(f"All done — results saved to: {output_file}")
    return existing_df


### LightRAG evaluation with meta-llama/llama-4-scout-17b-16e-instruct

In [33]:
# LightRAG "hybrid" run: test-set input and the CSV holding its scores.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid.csv'

# Evaluation call is disabled; uncomment to (re)generate the CSV read below.
#results = start_evaluate(test_set_file, output_file)

# Report the mean of each stored metric column.
df_lr_hybrid = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid.loc[:, 'answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid.loc[:, 'answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid.csv
Average answer correctness: 0.5122069836697539
Average answer relevancy: 0.6982478951966491


In [34]:
# LightRAG "local" run: test-set input and the CSV holding its scores.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local.csv'

# Evaluation call is disabled; uncomment to (re)generate the CSV read below.
#results = start_evaluate(test_set_file, output_file)

# Report the mean of each stored metric column.
df_lr_local = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local.loc[:, 'answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local.loc[:, 'answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local.csv
Average answer correctness: 0.4249733737262503
Average answer relevancy: 0.6681967775090379


### LightRAG evaluation with deepseek-r1-distill-llama-70b

In [26]:
# Rebind the global `llm` (read by start_evaluate()) to the DeepSeek distill
# model, reached through the same OpenAI-compatible endpoint and wrapped for
# ragas.
llm = LangchainLLMWrapper(
    ChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="deepseek-r1-distill-llama-70b",
    )
)

# Embedding settings are read from the same environment variables as before;
# the wrapper is rebuilt here so this cell can be run on its own.
embedding = LangchainEmbeddingsWrapper(
    LocalAIEmbeddings(
        openai_api_base=os.getenv("EMBED_URL"),
        openai_api_key=os.getenv("EMBED_TOKEN"),
        model=os.getenv("EMBED_MODEL"),
    )
)

In [32]:
# LightRAG "hybrid" test set; scores for the DeepSeek run go to their own CSV.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_hybrid.json'
output_file = './results/study_rules/method1_lightrag_hybrid_deepseek.csv'

# Evaluation call is disabled; uncomment to (re)generate the CSV read below.
#results = start_evaluate(test_set_file, output_file)

# Report the mean of each stored metric column.
df_lr_hybrid_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_hybrid_deepseek.loc[:, 'answer_correctness'].mean())
print("Average answer relevancy:", df_lr_hybrid_deepseek.loc[:, 'answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_hybrid_deepseek.csv
Average answer correctness: 0.49797992824280307
Average answer relevancy: 0.7349126593819586


In [37]:
# LightRAG "local" test set; scores for the DeepSeek run go to their own CSV.
test_set_file = './test_data/study_rules/test_set_method1_lightrag_local.json'
output_file = './results/study_rules/method1_lightrag_local_deepseek.csv'

# Evaluation call is disabled; uncomment to (re)generate the CSV read below.
#results = start_evaluate(test_set_file, output_file)

# Report the mean of each stored metric column.
df_lr_local_deepseek = pd.read_csv(output_file)
print(f"FILE: {output_file}")
print("Average answer correctness:", df_lr_local_deepseek.loc[:, 'answer_correctness'].mean())
print("Average answer relevancy:", df_lr_local_deepseek.loc[:, 'answer_relevancy'].mean())

FILE: ./results/study_rules/method1_lightrag_local_deepseek.csv
Average answer correctness: 0.4016634365973334
Average answer relevancy: 0.5804095482320186
