In [None]:
# Install the notebook's third-party dependencies into the running kernel's environment.
# `%pip` (rather than `!pip`) guarantees the packages land in the interpreter this notebook uses.
# NOTE(review): versions are unpinned; the code below relies on the legacy `langchain.llms` /
# `langchain.embeddings` import paths — pin known-good versions once confirmed.
%pip install -q langchain openai

In [None]:
from google.colab import userdata

In [None]:
import os
# Copy the 'OPENAI_API_KEY' secret from Colab's `userdata` store into the process environment
# under the same name. Presumably this is where the OpenAI/LangChain clients look for the key
# — TODO confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI

In [None]:
def generate_embedding_with_ada(text: str) -> list:
    """Embed ``text`` with the ``text-embedding-ada-002`` model.

    A new ``OpenAIEmbeddings`` client is built on every call and its
    ``embed_query`` result is returned unchanged.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``OpenAIEmbeddings.embed_query`` returns for ``text``
        (annotated here as a list).
    """
    return OpenAIEmbeddings(model="text-embedding-ada-002").embed_query(text)

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

def qualitative_evaluation_with_model(model, answer: str, question: str, reference_answer: str, aspect: str) -> str:
    """Ask ``model`` to judge one aspect of an answer against a reference answer.

    The prompt is rendered from a ``PromptTemplate`` and executed through an
    ``LLMChain`` built around ``model``.

    Args:
        model: The LLM object handed to ``LLMChain(llm=...)``.
        answer: The answer being evaluated.
        question: The question the answer responds to.
        reference_answer: The reference answer used for comparison.
        aspect: Name of the quality being judged; inserted verbatim into the prompt.

    Returns:
        The value returned by ``LLMChain.run`` for the rendered prompt.
    """
    # `aspect` is a regular template variable instead of being spliced into the
    # template text: that way an aspect containing '{' or '}' is treated as plain
    # data rather than being parsed as a (missing) placeholder.
    prompt_template = PromptTemplate(
        input_variables=["aspect", "answer", "question", "reference_answer"],
        template=(
            "Avalie a {aspect} da resposta '{answer}' para a pergunta '{question}', "
            "em comparação com a resposta de referência '{reference_answer}'."
        ),
    )

    llm_chain = LLMChain(llm=model, prompt=prompt_template)

    return llm_chain.run(
        aspect=aspect,
        answer=answer,
        question=question,
        reference_answer=reference_answer,
    )

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from scipy.spatial.distance import cosine
from typing import List, Dict

In [None]:
class DatasetLoader:
    """Loads a dataset from a CSV file whose path is fixed at construction."""

    def __init__(self, data_path: str):
        # Path handed to pandas.read_csv by load_data().
        self.data_path = data_path

    def load_data(self):
        """Read the CSV at ``self.data_path`` and return the resulting DataFrame."""
        return pd.read_csv(self.data_path)

In [None]:
class DataPreprocessor:
    """Normalizes the text columns of the evaluation dataset."""

    def preprocess(self, data):
        """Return a copy of ``data`` with normalized 'question' and 'answer' columns.

        Both columns are lower-cased and stripped of leading/trailing whitespace
        through the pandas ``.str`` accessor. The input frame is left untouched,
        so re-running the cell (or reusing the raw frame) is safe.

        Args:
            data: DataFrame containing 'question' and 'answer' columns.

        Returns:
            A new DataFrame with the two columns normalized; all other columns
            and the column order are carried over as-is.
        """
        # assign() builds a new frame instead of writing into the caller's object.
        return data.assign(
            question=data['question'].str.lower().str.strip(),
            answer=data['answer'].str.lower().str.strip(),
        )

In [None]:

class ModelRunner:
    """Wraps a model object to obtain answers and qualitative evaluations from it."""

    def __init__(self, model):
        # Used both via `.predict(image, question)` and as a callable taking a prompt string.
        self.model = model

    def get_answer(self, image, question):
        """Return ``self.model.predict(image, question)``.

        The answer may alternatively come straight from the dataset.
        """
        return self.model.predict(image, question)

    def evaluate_answer(self, answer: str, question: str, reference_answer: str) -> Dict[str, str]:
        """Evaluate the answer on several qualitative metrics, one prompt per metric.

        All three prompts are built first; the model is then called once per
        prompt, in the order relevance, coherence, clarity.

        Returns:
            A dict with keys "relevance", "coherence" and "clarity" mapping to
            whatever the model returned for the corresponding prompt.
        """
        prompts = {
            "relevance": (
                f"Avalie a relevância da resposta '{answer}' para a pergunta '{question}', "
                f"considerando a resposta de referência '{reference_answer}'."
            ),
            "coherence": (
                f"Avalie a coerência da resposta '{answer}' em relação à pergunta '{question}', "
                f"com base na resposta de referência '{reference_answer}'."
            ),
            "clarity": (
                f"Avalie a clareza da resposta '{answer}' dada para a pergunta '{question}', "
                f"comparando com a resposta de referência '{reference_answer}'."
            ),
        }

        # Dicts preserve insertion order, so the model is queried in the order above.
        return {metric: self.model(prompt) for metric, prompt in prompts.items()}


In [None]:
class MetricsEvaluator:
    """Scores model answers against reference answers with several metrics.

    The three input lists are expected to be parallel (same order); the
    pairwise metrics iterate them with ``zip``, which stops at the shortest list.

    Args:
        reference_answers: Reference answers.
        model_answers: Answers produced by the model under evaluation.
        questions: Questions the answers respond to.
        evaluation_model: Optional LLM used as judge for the qualitative
            evaluation; it is passed to ``qualitative_evaluation_with_model``.
            When omitted, the qualitative evaluation is skipped.
        aspect: Aspect name inserted into the judge prompt.
    """

    def __init__(self, reference_answers: List[str], model_answers: List[str], questions: List[str],
                 evaluation_model=None, aspect: str = "qualidade"):
        self.reference_answers = reference_answers
        self.model_answers = model_answers
        self.questions = questions
        self.evaluation_model = evaluation_model
        self.aspect = aspect

    def accuracy(self):
        """Return ``accuracy_score`` of the model answers against the references."""
        return accuracy_score(self.reference_answers, self.model_answers)

    def f1_score(self):
        """Return the weighted-average ``f1_score`` of the model answers against the references."""
        # Inside a method body the bare name resolves to the module-level
        # f1_score function, not to this method.
        return f1_score(self.reference_answers, self.model_answers, average='weighted')

    def semantic_relevance(self):
        """Return the mean cosine similarity between reference and model answer embeddings."""
        similarities = [
            # scipy's `cosine` is a distance, so similarity is its complement.
            1 - cosine(generate_embedding_with_ada(ref), generate_embedding_with_ada(model_ans))
            for ref, model_ans in zip(self.reference_answers, self.model_answers)
        ]
        return np.mean(similarities)

    def qualitative_evaluation(self):
        """Return one judge evaluation per (question, reference, model answer) triple.

        Returns an empty list (and emits a warning) when no ``evaluation_model``
        was configured, so ``evaluate()`` can still report the other metrics.
        """
        if self.evaluation_model is None:
            import warnings

            warnings.warn(
                "No evaluation_model configured; skipping qualitative evaluation.",
                stacklevel=2,
            )
            return []

        return [
            qualitative_evaluation_with_model(self.evaluation_model, model_ans, question, ref, self.aspect)
            for question, ref, model_ans in zip(self.questions, self.reference_answers, self.model_answers)
        ]

    def evaluate(self):
        """Compute every metric and return them in a dict keyed by metric name."""
        return {
            "accuracy": self.accuracy(),
            "f1_score": self.f1_score(),
            "semantic_relevance": self.semantic_relevance(),
            "qualitative_evaluation": self.qualitative_evaluation(),
        }

In [None]:
class ResultsReporter:
    """Prints a metrics dict as a plain-text report."""

    def __init__(self, metrics: Dict[str, float]):
        # Mapping of metric name -> value; the "qualitative_evaluation" entry is
        # iterated as a collection of texts instead of being formatted as a number.
        self.metrics = metrics

    def report(self):
        """Print the report header followed by every metric, in the dict's own order."""
        print("Relatório de Resultados:")
        for name in self.metrics:
            entry = self.metrics[name]
            if name == "qualitative_evaluation":
                print("\nAvaliações Qualitativas das Respostas:")
                for text in entry:
                    print(f"- {text}")
                continue
            # Every other metric is formatted with two decimal places.
            print(f"{name}: {entry:.2f}")

In [None]:
def _normalize_answer(answer):
    """Lower-case and strip a string answer; return non-string values unchanged."""
    return answer.lower().strip() if isinstance(answer, str) else answer


def main(data_path, model):
    """Run the evaluation pipeline end to end and print the report.

    Steps: load the CSV at ``data_path``, preprocess it, ask the model for an
    answer to every (image, question) row, compute the metrics against the
    'answer' column and print them.

    Args:
        data_path: Path of the CSV dataset; it must provide 'image',
            'question' and 'answer' columns.
        model: Model object handed to ``ModelRunner``.
    """
    data = DataPreprocessor().preprocess(DatasetLoader(data_path).load_data())

    model_runner = ModelRunner(model)
    # DataPreprocessor lower-cases and strips the reference answers; the model's
    # answers get the same normalization so the exact-match metrics (accuracy, F1)
    # do not count pure case/whitespace differences as errors.
    model_answers = [
        _normalize_answer(model_runner.get_answer(image, question))
        for image, question in zip(data['image'], data['question'])
    ]

    evaluator = MetricsEvaluator(data['answer'].tolist(), model_answers, data['question'].tolist())
    ResultsReporter(evaluator.evaluate()).report()

In [None]:
# Run the pipeline on "dataset.csv".
# NOTE(review): main() hands `model` to ModelRunner, whose get_answer() calls
# `model.predict(image, question)`. A plain string has no `predict` method, so this
# call is expected to fail there — pass a model object instead (TODO confirm which one).
main("dataset.csv", "gpt-4o-mini")