In [None]:
import os
# Project-local helper; presumably populates os.environ with the settings
# read by the later cells — TODO confirm against config.py.
from config import set_environment 
set_environment()

import logging
import sys
# Uncomment the next two lines to stream INFO-level log records to stdout while debugging.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook: nest_asyncio patches asyncio so that nested event-loop
# use is allowed inside the already-running notebook loop.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import Settings
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding
import pandas as pd
import random
import json
from datetime import datetime

In [None]:
# --- Run configuration: every value below comes from an environment variable ---
eval_name = os.environ["EVAL_NAME"]
eval_questions = os.environ["EVAL_QUESTIONS"]        # workbook with the reference questions/answers
eval_results_dir = os.environ["EVAL_RESULTS_DIR"]    # directory the result workbook is written to

bb_output_file = os.environ["BB_OUTPUT_FILE"]        # file holding the generated answers to score

rag_strategy = os.environ["RAG_STRATEGY"]

generation_llm_family = os.environ["GENERATION_LLM_FAMILY"]
generation_llm_model = os.environ["GENERATION_LLM_MODEL"]
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
# NOTE(review): the variable name is spelled "DIMESIONS" (sic). It is kept
# unchanged because it has to match whatever populates the environment.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

# Strip the known provider prefixes so the model ids can be embedded in a file name.
# NOTE(review): ids with any other "org/" prefix keep their "/" and would
# therefore end up as a sub-directory inside output_file.
generation_string = generation_llm_model
for provider_prefix in ("meta-llama/", "Qwen/", "models/", "mistralai/"):
    generation_string = generation_string.replace(provider_prefix, "")
embed_string = embedding_llm_model.replace("models/", "")

# Human-readable label for each known strategy code.
RAG_STRATEGY_DESCRIPTIONS = {
    "S008_00": "Fusion_AI_PDF",
    "S008_01": "Fusion_AI_TXT",
    "S008_02": "OCI_AI_PDF",
    "S008_03": "OCI_AI_TXT",
    "S008_04": "COH_P01",
    "S008_05": "COH_P02",
}
# Fall back to the raw code so rag_strategy_desc is always defined; without a
# fallback an unmapped strategy left the name undefined (or stale from a
# previous run in the same kernel) and the responses cell failed.
rag_strategy_desc = RAG_STRATEGY_DESCRIPTIONS.get(rag_strategy, rag_strategy)

# The random 3-digit suffix distinguishes repeated runs of the same configuration.
batch_id = f"{eval_name}_{rag_strategy}_GM_{generation_string}_EM_{embed_string}_{random.randint(0, 999):03}"

output_file = f"{eval_results_dir}/{batch_id}.xlsx"

In [None]:
# Display the path of the workbook this run will write.
output_file

In [None]:
# Select the LLM that judges the answers; family and model come from the environment.
evaluation_llm_family = os.environ["EVALUATION_LLM_FAMILY"]
evaluation_llm_model = os.environ["EVALUATION_LLM_MODEL"]

# temperature=0 is passed to both providers.
if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(temperature=0, model=evaluation_llm_model)
elif evaluation_llm_family == "COHERE":
    Settings.eval_llm = Cohere(api_key=os.environ["COHERE_API_KEY"], model=evaluation_llm_model, temperature=0)
else:
    # Fail here instead of leaving Settings.eval_llm unset and breaking in a later cell.
    raise ValueError(
        f"Unsupported EVALUATION_LLM_FAMILY {evaluation_llm_family!r}; expected 'OPENAI' or 'COHERE'"
    )

In [None]:
# Evaluator backed by the judge LLM configured in Settings.eval_llm.
# CorrectnessEvaluator is already imported in the imports cell at the top of
# the notebook, so it is not imported again here.
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

In [None]:
def _read_json_records(path):
    """Return the JSON records stored in ``path`` as a list.

    The file is first parsed as one JSON document: a top-level list is
    returned as-is, any other value is wrapped in a one-element list.
    If that fails, the file is treated as line-delimited JSON; blank lines
    are ignored and lines that are not valid JSON are skipped, with a single
    warning reporting how many were dropped.
    """
    with open(path, 'r') as file:
        lines = file.readlines()

    try:
        document = json.loads("".join(lines))
    except json.JSONDecodeError:
        pass  # not a single document; fall through to line-by-line parsing
    else:
        return document if isinstance(document, list) else [document]

    records = []
    skipped = 0
    for line in lines:
        if not line.strip():
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            skipped += 1
    if skipped:
        logging.warning("%s: skipped %d line(s) that are not valid JSON", path, skipped)
    return records


# Load the generated answers; the loader is chosen from the file extension.
file_extension = os.path.splitext(bb_output_file)[1].lower()

if file_extension == '.xlsx':
    correctness_df = pd.read_excel(bb_output_file)
elif file_extension == '.jsonl':
    correctness_df = pd.read_json(bb_output_file, lines=True)
elif file_extension == ".json":
    data_list = _read_json_records(bb_output_file)
    # Keep only the required fields, renamed to the column names used below.
    correctness_df = pd.DataFrame([{
        'query_num': item['query_num'],
        'query': item['question'],
        'generated_answer': item['answer']
    } for item in data_list])
else:
    # Without this branch correctness_df stayed undefined and the merge below
    # failed with a NameError.
    raise ValueError(
        f"Unsupported BB_OUTPUT_FILE extension {file_extension!r}; expected .xlsx, .jsonl or .json"
    )

# Passing the path straight to read_excel avoids leaving an ExcelFile handle open.
eval_questions_df = pd.read_excel(eval_questions)

# Inner join on query_num: the left-hand (generated) columns keep their
# names, clashing reference columns get the "_drop" suffix.
rows_before_merge = len(correctness_df)
correctness_df = correctness_df.merge(eval_questions_df, on='query_num', suffixes=('', '_drop'))
if len(correctness_df) != rows_before_merge:
    logging.warning(
        "Merge with %s changed the row count from %d to %d (unmatched or duplicated query_num)",
        eval_questions, rows_before_merge, len(correctness_df),
    )

# The column selection discards every "_drop" column, so no separate drop is
# needed (it raised KeyError whenever only one side carried 'query').
correctness_df = correctness_df[['query_num', 'query', 'expected_answer', 'generated_answer']]


In [None]:
def evaluate_response(query, expected_answer, generated_answer):
    """Score one generated answer against its reference answer.

    Delegates to the module-level ``eval_lidx_c`` evaluator, passing the
    generated answer as ``response`` and the expected answer as ``reference``.

    Args:
        query: The question that was asked.
        expected_answer: The reference answer for the question.
        generated_answer: The answer to be judged.

    Returns:
        tuple: ``(score, feedback)`` taken from the evaluation result.
    """
    evaluation = eval_lidx_c.evaluate(
        query=query,
        response=generated_answer,
        reference=expected_answer,
    )
    return evaluation.score, evaluation.feedback

# Run the judge once per row; each returned (score, feedback) pair is
# expanded into two columns.
llm_verdicts = correctness_df.apply(
    lambda row: evaluate_response(row['query'], row['expected_answer'], row['generated_answer']),
    axis=1,
    result_type='expand',
)
correctness_df[['correctness_llm', 'feedback_llm']] = llm_verdicts

# Human-review columns: the score starts as a copy of the LLM score, the
# feedback starts empty. Every row is tagged with this run's batch id.
correctness_df['correctness_human'] = correctness_df['correctness_llm']
correctness_df['feedback_human'] = ""
correctness_df['batch_id'] = batch_id



In [None]:
# Build the "Responses" table: the scored answers plus run metadata.
# .assign() returns a new, independent frame, so no column is ever set on a
# slice of correctness_df (the source of SettingWithCopyWarning). Keyword
# order determines the column order in the exported sheet.
responses_df = correctness_df[
    ['query_num', 'query', 'expected_answer', 'generated_answer', 'correctness_llm']
].assign(
    correctness_human=lambda frame: frame['correctness_llm'],
    # Empty placeholder columns.
    faithfulness_llm="",
    faithfulness_human="",
    rag_strategy=rag_strategy,
    rag_strategy_desc=rag_strategy_desc,
    parameter_1="",
    parameter_2="",
    parameter_3="",
    parameter_4="",
    parameter_5="",
    model=generation_llm_model,
    embed_model=embedding_llm_model,
    eval_model=evaluation_llm_model,
    embed_dimensions=embedding_dimensions,
    reranker="",
    run_date=datetime.today().strftime('%Y-%m-%d'),
    eval_name=eval_name,
    batch_id=batch_id,
)

In [None]:
# Aggregate the LLM correctness scores over the whole batch.
llm_scores = correctness_df['correctness_llm']
correctness_sum = llm_scores.sum()
correctness_mean = llm_scores.mean()

# Two-row table (one row per metric) for the "Summary" sheet.
summary_df = pd.DataFrame(
    {
        'Metric': ['Sum', 'Mean'],
        'Value': [correctness_sum, correctness_mean],
    }
)

In [None]:
# Create the results directory if needed so the export cannot fail after all
# the judge calls have already been made.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

# Sheet name -> frame, in the order the sheets appear in the workbook.
sheets = {
    "Responses": responses_df,
    "Summary": summary_df,
    "Correctness": correctness_df,
}
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

