# Load Scraped Dataset

In [1]:
import csv

# (question, reference answer) pairs read from the dataset, in file order.
questions_answers = []
language_dataset = '../trivia_qa_polish.csv'

# Open the CSV file with the appropriate encoding.
# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines to be parsed correctly.
with open(language_dataset, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; there is
            # nothing to evaluate, so skip it instead of failing to unpack.
            continue
        if len(row) != 2:
            # Fail loudly, but say where, so the offending record can be fixed.
            raise ValueError(
                f"{language_dataset}, line {reader.line_num}: expected 2 columns "
                f"(question, answer), got {len(row)}"
            )
        question, pre_configured_answer = row
        questions_answers.append((question, pre_configured_answer))

# Model Comparison

In [None]:
import json
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel, pipeline
import torch
from rouge_score import rouge_scorer
from transformers import logging
import os
from dotenv import load_dotenv


# Silence warning messages from the transformers library: its logger is set to
# error level, so only errors are still reported.
logging.set_verbosity_error()

class Evaluator:
    """Score a generated answer against a reference answer.

    Three metrics are produced:

    * Conciseness  - BERT embedding cosine similarity minus a length penalty.
    * Completeness - ROUGE-L recall.
    * Faithfulness - NLI verdict (entailment / neutral / contradiction).

    NOTE(review): all three underlying models ('bert-base-uncased',
    'roberta-large-mnli', and the ROUGE stemmer) look English-oriented while
    the dataset file name suggests Polish text - confirm the scores are
    meaningful for the target language.
    """

    # Inputs longer than this are truncated before being fed to BERT.
    MAX_BERT_TOKENS = 512
    # Amount subtracted from the similarity per character of length difference.
    LENGTH_PENALTY_PER_CHAR = 0.01
    # NLI label -> faithfulness score; any other label (e.g. NEUTRAL) maps to 0.5.
    NLI_LABEL_SCORES = {'ENTAILMENT': 1.0, 'CONTRADICTION': 0.0}
    NLI_DEFAULT_SCORE = 0.5

    def __init__(self):
        # Load models that will be used across evaluation functions to avoid reloading multiple times
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.nli_model = pipeline("text-classification", model="roberta-large-mnli")
        # The scorer holds no per-call state, so build it once as well.
        self.rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def calculate_completeness(self, reference, generated):
        """Return the ROUGE-L recall of ``generated`` with respect to ``reference``.

        Args:
            reference: The expected (pre-configured) answer text.
            generated: The answer text produced by the model.

        Returns:
            The ``recall`` field of the ROUGE-L score.
        """
        score = self.rouge.score(reference, generated)
        return score['rougeL'].recall

    def calculate_faithfulness(self, reference, generated):
        """Return an NLI-based faithfulness score for ``generated``.

        The reference is used as the premise and the generated answer as the
        hypothesis.

        Args:
            reference: The expected (pre-configured) answer text.
            generated: The answer text produced by the model.

        Returns:
            1.0 for ENTAILMENT, 0.0 for CONTRADICTION, 0.5 for anything else.
        """
        # Pass premise and hypothesis as a real sentence pair so the tokenizer
        # inserts the model's own separator tokens. A literal " [SEP] " is not
        # a special token for RoBERTa and would be read as ordinary text.
        # truncation=True keeps over-long inputs within the model's limit.
        prediction = self.nli_model(
            {"text": reference, "text_pair": generated},
            truncation=True,
        )
        # Depending on the transformers version, a single dict input yields
        # either a dict or a (possibly nested) single-element list.
        while isinstance(prediction, list):
            prediction = prediction[0]

        label = str(prediction['label']).upper()
        return self.NLI_LABEL_SCORES.get(label, self.NLI_DEFAULT_SCORE)

    def calculate_conciseness(self, reference, generated):
        """Return embedding similarity penalised by the difference in length.

        Both texts are embedded by mean-pooling BERT's last hidden state; the
        cosine similarity of the two embeddings is then reduced by
        ``LENGTH_PENALTY_PER_CHAR`` for every character by which the two
        strings differ in length.

        Args:
            reference: The expected (pre-configured) answer text.
            generated: The answer text produced by the model.

        Returns:
            The array returned by ``cosine_similarity`` for one reference and
            one generated embedding, with the penalty subtracted.
        """
        # Tokenize and get embeddings (truncate so long texts cannot exceed
        # the model's maximum sequence length).
        ref_tokens = self.tokenizer(
            reference, return_tensors='pt',
            truncation=True, max_length=self.MAX_BERT_TOKENS,
        )
        gen_tokens = self.tokenizer(
            generated, return_tensors='pt',
            truncation=True, max_length=self.MAX_BERT_TOKENS,
        )

        with torch.no_grad():
            ref_emb = self.model(**ref_tokens).last_hidden_state.mean(dim=1)
            gen_emb = self.model(**gen_tokens).last_hidden_state.mean(dim=1)

        # Hand plain NumPy arrays to scikit-learn rather than torch tensors.
        similarity = cosine_similarity(ref_emb.numpy(), gen_emb.numpy())
        # Penalty is based on character counts, not token counts.
        length_penalty = self.LENGTH_PENALTY_PER_CHAR * abs(len(generated) - len(reference))

        return similarity - length_penalty

    def evaluate_response(self, reference, generated):
        """Compute all three metrics for one reference/generated pair.

        Args:
            reference: The expected (pre-configured) answer text.
            generated: The answer text produced by the model.

        Returns:
            A dict with the keys "Conciseness", "Completeness" and
            "Faithfulness".
        """
        return {
            "Conciseness": self.calculate_conciseness(reference, generated),
            "Completeness": self.calculate_completeness(reference, generated),
            "Faithfulness": self.calculate_faithfulness(reference, generated),
        }

class LLM_Client:
    """Small wrapper that asks a chat-completions endpoint for short answers."""

    DEFAULT_MODEL = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
    # Hard cap on the completion length requested from the API.
    MAX_TOKENS = 100
    SYSTEM_PROMPT = "Force each answer to be less than 100 tokens no matter what. Do not accept answers longer than 100 tokens. You are a helpful assistant helping answer questions as briefly, concisely and accurately as possible."

    # Class-level fallback so the model name is defined on every instance.
    model = DEFAULT_MODEL

    def __init__(self, api_key, base_url, model=DEFAULT_MODEL):
        """Create the underlying OpenAI client.

        Args:
            api_key: API key passed to the OpenAI client.
            base_url: Base URL of the OpenAI-compatible service.
            model: Model identifier sent with every request; defaults to the
                model this class has always used.
        """
        self.LLM_Client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def generate_response(self, question):
        """Send ``question`` to the model and return its stripped answer.

        Args:
            question: The user question to answer.

        Returns:
            The answer text with surrounding whitespace removed, or an empty
            string when the response carries no text.
        """
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": question}
        ]

        response = self.LLM_Client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=self.MAX_TOKENS
        )

        # A response may contain no choices, and a message's content may be
        # None; treat both as an empty answer rather than crashing.
        if not response.choices:
            return ""
        content = response.choices[0].message.content
        return (content or "").strip()

class ResponseManager:
    """Collect, pretty-print and persist per-question evaluation results."""

    def __init__(self):
        # One dict per evaluated question, in insertion order.
        self.all_results = []

    def format_scores(self, scores):
        """Return a copy of ``scores`` with numeric values rounded to 3 places.

        Args:
            scores: Mapping of metric name to value. Values may be plain
                numbers, objects exposing ``tolist()`` (array-likes), or
                anything else.

        Returns:
            A new dict. Plain numbers are rounded; array-likes are reduced to
            their first scalar element and rounded; other values are passed
            through unchanged.
        """
        prettified_scores = {}
        for k, v in scores.items():
            if isinstance(v, (float, int)):
                prettified_scores[k] = round(v, 3)
            elif hasattr(v, 'tolist'):
                value = v.tolist()
                # Descend to the first scalar whatever the nesting depth, so
                # 0-d, 1-d and 2-d inputs are all handled; an empty list is
                # left as it is.
                while isinstance(value, list) and value:
                    value = value[0]
                if isinstance(value, (float, int)):
                    value = round(value, 3)
                prettified_scores[k] = value
            else:
                prettified_scores[k] = v
        return prettified_scores

    def add_result(self, question, pre_configured_answer, llm_answer, scores):
        """Append one evaluated question to ``all_results``.

        Args:
            question: The question text.
            pre_configured_answer: The reference answer.
            llm_answer: The answer produced by the model.
            scores: Raw metric mapping; stored after ``format_scores``.
        """
        self.all_results.append({
            "Question": question,
            "Pre-configured Answer": pre_configured_answer,
            "LLM Answer": llm_answer,
            "Evaluation Scores": self.format_scores(scores)
        })

    def save_results(self, output_filename="evaluation_results.json"):
        """Write all collected results to ``output_filename`` as JSON.

        Args:
            output_filename: Destination path; an existing file is overwritten.
        """
        with open(output_filename, 'w', encoding='utf-8') as outfile:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(self.all_results, outfile, indent=4, ensure_ascii=False)
        print(f"All results have been saved to {output_filename}")

    def visualise_results(self, scores, question, pre_configured_answer, llm_answer):
        """Print the scores and texts for one question to standard output.

        Args:
            scores: Raw metric mapping; printed after ``format_scores``.
            question: The question text.
            pre_configured_answer: The reference answer.
            llm_answer: The answer produced by the model.
        """
        print("=" * 60)
        print("Evaluation Scores:")
        print(self.format_scores(scores))
        print("-" * 60)
        print(f"Question:\n{question}\n")
        print(f"Pre-configured Answer:\n{pre_configured_answer}\n")
        print(f"LLM Answer:\n{llm_answer}\n")
        print("=" * 60, "\n")



if __name__ == "__main__":

    # Read the API key from the environment, after loading any .env file.
    load_dotenv()
    api_key = os.getenv('KEY') # follow https://stackoverflow.com/questions/40216311/reading-in-environment-variables-from-an-environment-file for key setup
    base_url = "https://api.aimlapi.com"

    # Use a distinct lower-case name for the instance so the LLM_Client class
    # itself is not rebound to an object.
    llm_client = LLM_Client(api_key=api_key, base_url=base_url)
    evaluator = Evaluator()
    response_manager = ResponseManager()

    # Loop through the questions and evaluate the responses
    for question, pre_configured_answer in questions_answers:
        llm_answer = llm_client.generate_response(question)
        scores = evaluator.evaluate_response(pre_configured_answer, llm_answer)
        response_manager.add_result(question, pre_configured_answer, llm_answer, scores)
        # Prints only; there is no return value worth keeping.
        response_manager.visualise_results(scores, question, pre_configured_answer, llm_answer)

    # Save the results to a JSON file
    response_manager.save_results()