In [1]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


import pandas as pd
# from datasets import Dataset
import json


import os
# import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


## Carregar Vector Store e configurar funções de retrieval e geração de resposta

In [2]:
#sentence_transformers
#rank-bm25
#langchain_text_splitters
#chromadb

In [3]:
from src.vectorstore.hybrid_vector_store import HybridVectorStore
from src.config.settings import settings

# Chroma vector store, opened from the location configured in settings.DATA_DB.
# To build it from scratch, see Vector_Store.ipynb.
store = HybridVectorStore(persist_path=settings.DATA_DB, embedding_model=settings.EMBEDDING_MODEL)

In [4]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Prompt template for the generation step of the RAG pipeline.
# It exposes two placeholders, {context} and {question}, and tells the model to
# answer in Brazilian Portuguese from the supplied context only, falling back to
# a fixed "no information available" sentence when the context has no answer.
prompt_rag = ChatPromptTemplate.from_template(
    """
    Você é um ACP - um Agente Conversacional Pedagógico especializado em ajudar estudantes com dúvidas relacionadas a conteúdos acadêmicos.
    Sua função é fornecer respostas claras e informativas com base no material de estudo fornecido pelo contexto.
    Inclua todas as informações relevantes do contexto em suas respostas, evitando suposições ou informações externas.
    Suas respostas devem ser sempre em português brasileiro e devem usar um tom leve.

    Se não encontrar a resposta no contexto, diga:
    "Nenhuma informação disponível no contexto."
                
    Contexto:
    {context}

    Pergunta:
    {question}
    """
)


def rag_pipeline(question: str, prompt: "ChatPromptTemplate", model: str) -> str:
    """Answer `question` with retrieval-augmented generation.

    Runs a hybrid search (top 5) on the module-level `store`, joins element 1
    of every hit into a single context string separated by blank lines, and
    invokes the chain `prompt | llm | StrOutputParser()`.

    Args:
        question: User question; used both for retrieval and to fill the
            `{question}` placeholder.
        prompt: Prompt template with `{context}` and `{question}` placeholders
            (the notebook passes `prompt_rag`). It is piped into the LLM, so a
            prompt object is expected here, not a plain string.
        model: Ollama model tag, e.g. "llama3.1:latest".

    Returns:
        The generated answer as produced by `StrOutputParser`.
    """
    hits = store.hybrid_search(question, top_k=5)
    # Element 1 of each hit is what gets fed to the model as context text.
    context = "\n\n".join(hit[1] for hit in hits)

    llm = OllamaLLM(model=model, temperature=0)
    chain = prompt | llm | StrOutputParser()

    return chain.invoke({"context": context, "question": question})

# Teste
# response = rag_pipeline(question = "Qual a diferença entre Data Scientist e ML Engineer?",
#                         prompt = prompt_rag,
#                         model="llama3.1:latest"
#                         )
# print(response)

In [5]:
# Função apenas de recuperação de documentos

def real_retrieval(question: str):
    """Retrieval-only counterpart of the RAG pipeline (no generation step).

    Runs the hybrid search (top 5) for `question` on the module-level `store`
    and returns a list with element 1 of every hit, in the order the search
    returned them.
    """
    chunks = []
    for hit in store.hybrid_search(question, top_k=5):
        chunks.append(hit[1])
    return chunks

# Smoke test: run the retrieval for one sample question and print the raw list of chunks
retrieve = real_retrieval("Qual a diferença entre Data Scientist e ML Engineer?")
print(retrieve)

['the machine learning ecosystem.\nWhat is a Data Scientist ?\nA Data Scientist is an expert in extracting valuable insights from large\nvolumes of data. With a strong background in mathematics, statistics, and\nprogramming, Data Scientists analyze and interpret data to solve complex\nproblems and support better decision-making within an organization. They\nwork closely with business stakeholders to understand their objectives and\ndevelop machine learning models to help achieve those goals.\nKey responsibilities of a Data Scientist include:\nGathering, cleaning, and preprocessing data\nDeveloping machine learning models and algorithms\nEvaluating model performance and optimizing as needed\nCommunicating findings and insights to business stakeholders\nWhat is an ML Engineer?\nAn ML Engineer, or Machine Learning Engineer, is a professional who\ndesigns, develops, and implements machine learning models. They work\nclosely with data scientists to translate prototypes into efficient and', 

In [6]:
def generate_dataset_for_evaluation(ground_truth_csv_path: str, json_output_path: str, model: str) -> None:
    """Build the evaluation dataset for one generation model and save it as JSON.

    For every row of the ground-truth CSV the RAG pipeline is run with `model`
    and the retrieved chunks are collected. The resulting list of test cases is
    written to `<json_output_path>/<model with ':' replaced by '_'>/test_cases_dataset.json`.

    Args:
        ground_truth_csv_path: CSV file (read as latin-1) with the columns
            "question" and "answer".
        json_output_path: Base output directory. It may be given with or
            without a trailing path separator.
        model: Ollama model tag used for generation, e.g. "llama3.1:latest".
    """
    # Compute the per-model directory once, so the directory that is created is
    # guaranteed to be the one the file is written into.
    model_dir = os.path.join(json_output_path, model.replace(":", "_"))
    os.makedirs(model_dir, exist_ok=True)

    df = pd.read_csv(ground_truth_csv_path, encoding="latin-1")

    test_cases = []
    for _, row in df.iterrows():
        question = row["question"]
        test_cases.append({
            "input": question,
            "expected_output": row["answer"],
            "actual_output": rag_pipeline(question=question,
                                          prompt=prompt_rag,
                                          model=model),
            "retrieval_context": real_retrieval(question)
        })

    dataset_file = os.path.join(model_dir, "test_cases_dataset.json")
    with open(dataset_file, "w", encoding="utf-8") as f:
        json.dump(test_cases, f, ensure_ascii=False, indent=4)

In [7]:
import shutil
from pathlib import Path

def copy_folder(src: str, dst: str) -> None:
    """Recursively copy the folder `src` into `dst`.

    Args:
        src: Path of the folder to copy.
        dst: Destination path; it is allowed to exist already.

    Raises:
        FileNotFoundError: If `src` does not exist.
    """
    source_dir, target_dir = Path(src), Path(dst)

    if source_dir.exists():
        # dirs_exist_ok=True (Python 3.8+) lets copytree write into an existing destination
        shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
        return

    raise FileNotFoundError(f"Source folder does not exist: {src}")

## Métricas de avaliação de LLMs usando Deepeval

### Criando dataset de conteúdo recuperado (retrieval) e geração de resposta

In [8]:
#### Descomentar para gerar os datasets de avaliação

# ground_truth_csv_path = "data/ground_truth/ground_truth_mlops.csv"
# json_output_path = "artifacts/eval_metrics/datasets/"
# models = ["llama3.1:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]

# for model in models:
#     generate_dataset_for_evaluation(ground_truth_csv_path=ground_truth_csv_path, 
#                                     json_output_path=json_output_path, 
#                                     model=model)

In [9]:
##### Copiando os datasets para incremento das métricas de ragas posteriormente. Descomentar para usar
# PS: Dessa forma, todas as informações ficam salvas em um único JSON por modelo

# copy_folder(src="artifacts/eval_metrics/datasets/", dst="artifacts/eval_metrics/ragas/context/")

### Métricas de Retrieval de Contexto

**Context Precision:** Context Precision is a metric that evaluates the retriever's ability to rank relevant chunks higher than irrelevant ones for a given query in the retrieved context. Specifically, it assesses the degree to which relevant chunks in the retrieved context are placed at the top of the ranking.

It is calculated as the mean of the precision@k for each chunk in the context. Precision@k is the ratio of the number of relevant chunks at rank k to the total number of chunks at rank k.

Datasets completos e as avaliações estão dentro da pasta artifacts.

In [10]:
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file) before deepeval is imported
load_dotenv()

from deepeval.test_case import LLMTestCase
import json
import pandas as pd
# NOTE(review): this import rebinds the name `settings`. Earlier in the notebook
# `settings` is the project object from src.config.settings, which the
# vector-store cell reads (settings.DATA_DB / settings.EMBEDDING_MODEL).
# Re-running that cell after this one would look those attributes up on
# deepeval's object instead. Consider importing under an alias.
from deepeval import settings

# Set deepeval's per-attempt timeout override to 600 (seconds, going by the setting's name)
settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE = 600

In [11]:
from deepeval.models import OllamaModel

def create_evaluator_llm(model: str):
    """Build the deepeval `OllamaModel` used as the evaluator (judge) LLM.

    Args:
        model: Ollama model tag of the evaluator, e.g. "llama3.1:latest".

    Returns:
        An `OllamaModel` configured with base URL http://localhost:11434 and
        temperature 0.
    """
    evaluator_options = {
        "model": model,
        "base_url": "http://localhost:11434",
        "temperature": 0,
    }
    return OllamaModel(**evaluator_options)

In [12]:
from deepeval.metrics import (
  ContextualRelevancyMetric,
  ContextualPrecisionMetric,
  AnswerRelevancyMetric,
  FaithfulnessMetric
)


def _measure_metric(metric_cls, case: "LLMTestCase", evaluator_model: str):
    """Instantiate `metric_cls` with a fresh evaluator LLM and measure `case`.

    Shared implementation of the four `calculate_*` functions, which differ
    only in the deepeval metric class they use.
    """
    evaluator_llm = create_evaluator_llm(model=evaluator_model)
    metric = metric_cls(model=evaluator_llm, include_reason=False)
    return metric.measure(case)


def calculate_context_precision(case: "LLMTestCase", evaluator_model: str):
    """Measure `ContextualPrecisionMetric` for `case`.

    Args:
        case: The deepeval test case to score (an `LLMTestCase`, not a dict).
        evaluator_model: Ollama model tag of the judge LLM.

    Returns:
        Whatever the metric's `measure` call returns (stored by the caller as the score).
    """
    return _measure_metric(ContextualPrecisionMetric, case, evaluator_model)


def calculate_context_relevancy(case: "LLMTestCase", evaluator_model: str):
    """Measure `ContextualRelevancyMetric` for `case`.

    Args:
        case: The deepeval test case to score (an `LLMTestCase`, not a dict).
        evaluator_model: Ollama model tag of the judge LLM.

    Returns:
        Whatever the metric's `measure` call returns (stored by the caller as the score).
    """
    return _measure_metric(ContextualRelevancyMetric, case, evaluator_model)


def calculate_answer_relevancy(case: "LLMTestCase", evaluator_model: str):
    """Measure `AnswerRelevancyMetric` for `case`.

    Args:
        case: The deepeval test case to score (an `LLMTestCase`, not a dict).
        evaluator_model: Ollama model tag of the judge LLM.

    Returns:
        Whatever the metric's `measure` call returns (stored by the caller as the score).
    """
    return _measure_metric(AnswerRelevancyMetric, case, evaluator_model)


def calculate_faithfulness(case: "LLMTestCase", evaluator_model: str):
    """Measure `FaithfulnessMetric` for `case`.

    Args:
        case: The deepeval test case to score (an `LLMTestCase`, not a dict).
        evaluator_model: Ollama model tag of the judge LLM.

    Returns:
        Whatever the metric's `measure` call returns (stored by the caller as the score).
    """
    return _measure_metric(FaithfulnessMetric, case, evaluator_model)

In [13]:
def run_evaluation_on_datasets(dataset_path: str, evaluator_model: str):
    """Score every test case of a dataset file with one evaluator model, in place.

    The JSON list at `dataset_path` is loaded, each case gets four score lists
    (created when missing) and one new entry per list for this run, and the
    whole list is written back to the same file at the end.

    A successful measurement is stored as `{evaluator_model: score}`. A failed
    one is reported on stdout and stored as `{evaluator_model: None, 'logs': <error text>}`,
    so one failing metric does not stop the run.

    Args:
        dataset_path: Path of the JSON file holding the list of test cases.
        evaluator_model: Ollama model tag of the judge LLM; also used as the
            key of each stored score entry.
    """

    def _record(case, test_case_obj, score_key, label, scorer):
        # Append the outcome of one metric to case[score_key]; failures are logged, not raised.
        try:
            score = scorer(test_case_obj, evaluator_model=evaluator_model)
            case[score_key].append({evaluator_model: score})
        except Exception as e:
            print(f"Error calculating {label} for case {case['input']}: {e}")
            case[score_key].append({evaluator_model: None, 'logs': str(e)})

    with open(dataset_path, "r", encoding="utf-8") as f:
        test_cases = json.load(f)

    for case in test_cases:

        # One list per metric; entries from earlier evaluator runs are kept
        for score_key in (
            'context_precision_score',
            'context_relevancy_score',
            'answer_relevancy_score',
            'faithfulness_score',
        ):
            case.setdefault(score_key, [])

        test_case_obj = LLMTestCase(
            input=case["input"],
            expected_output=case["expected_output"],
            actual_output=case["actual_output"],
            retrieval_context=case["retrieval_context"]
        )

        _record(case, test_case_obj, 'context_precision_score', "context precision", calculate_context_precision)
        _record(case, test_case_obj, 'context_relevancy_score', "context relevancy", calculate_context_relevancy)
        _record(case, test_case_obj, 'answer_relevancy_score', "answer relevancy", calculate_answer_relevancy)
        _record(case, test_case_obj, 'faithfulness_score', "faithfulness", calculate_faithfulness)

    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(test_cases, f, ensure_ascii=False, indent=4)

In [14]:
# Full list: ["llama3.1:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]
generation_models = ["mistral:latest", "gpt-oss:latest"]
# "gpt-oss:20b-cloud" is run separately because it has hourly and weekly token limits
eval_models = ["llama3.1:latest", "llama3.2:latest"]

for generation_model in generation_models:
    for eval_model in eval_models:
        print(f"Running evaluation for generation model: {generation_model} with evaluator model: {eval_model}")
        run_evaluation_on_datasets(
            # Double quotes inside the single-quoted f-string: reusing the outer
            # quote character in the expression only parses on Python 3.12+.
            dataset_path=f'artifacts/eval_metrics/deepeval/{generation_model.replace(":", "_")}/test_cases_dataset.json',
            evaluator_model=eval_model
        )

Running evaluation for generation model: mistral:latest with evaluator model: llama3.1:latest


Running evaluation for generation model: mistral:latest with evaluator model: llama3.2:latest


Running evaluation for generation model: gpt-oss:latest with evaluator model: llama3.1:latest


Running evaluation for generation model: gpt-oss:latest with evaluator model: llama3.2:latest


In [15]:
import time

# Full list: ["llama3.1:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]
generation_models = ["deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]
eval_models = ["gpt-oss:20b-cloud"]

# Pause between two runs; the cloud evaluator has an hourly token limit.
COOLDOWN_SECONDS = 3600

is_first_run = True
for generation_model in generation_models:
    for eval_model in eval_models:
        # Wait only BETWEEN runs, so no hour is wasted after the last evaluation.
        if not is_first_run:
            time.sleep(COOLDOWN_SECONDS)
        is_first_run = False

        print(f"Running evaluation for generation model: {generation_model} with evaluator model: {eval_model}")
        run_evaluation_on_datasets(
            # Double quotes inside the single-quoted f-string: reusing the outer
            # quote character in the expression only parses on Python 3.12+.
            dataset_path=f'artifacts/eval_metrics/deepeval/{generation_model.replace(":", "_")}/test_cases_dataset.json',
            evaluator_model=eval_model
        )

Running evaluation for generation model: deepseek-r1:latest with evaluator model: gpt-oss:20b-cloud


Error calculating context precision for case O que é drift em modelos de machine learning?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é CI/CD aplicado a ML?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...no"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case Por que monitorar um modelo em produção?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 745 [type=json_invalid, input_value='{"claims":["Monitorar um...erformance otimizada."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Para que serve o MLflow?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"..."."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating answer relevancy for case Para que serve o MLflow?: 1 validation error for Statements
  Invalid JSON: expected `,` or `]` at line 1 column 844 [type=json_invalid, input_value='{"statements":["MLflow ...sonalizadas de MLOps."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é canary deployment em ML?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...no"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context relevancy for case O que é um modelo baseline?: 1 validation error for ContextualRelevancyVerdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...l."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é model drift detection?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...n."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Qual a importância de um lineage tracker em MLOps?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...y."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é um modelo champion vs challenger?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...n."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Running evaluation for generation model: mistral:latest with evaluator model: gpt-oss:20b-cloud


Error calculating context precision for case O que é MLOps?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Qual a diferença entre MlOps e Data Scientist?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating answer relevancy for case Qual a diferença entre MlOps e Data Scientist?: 1 validation error for Statements
  Invalid JSON: expected `,` or `]` at line 1 column 844 [type=json_invalid, input_value='{"statements":["A difere...(Ops) de sistemas ML."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que significa servir um modelo em produção?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...r."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context relevancy for case O que significa servir um modelo em produção?: 1 validation error for ContextualRelevancyVerdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...n."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Por que versionar modelos é importante?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Diferença entre batch inference e online inference?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...no"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é canary deployment em ML?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...no"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é um modelo baseline?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...\""\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case O que significa monitorar latência de inferência?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 964 [type=json_invalid, input_value='{"claims":["Monitorar a ...endizado de máquina."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é validação cruzada?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...t."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é continuous training (CT)?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...a."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Running evaluation for generation model: gpt-oss:latest with evaluator model: gpt-oss:20b-cloud


Error calculating context precision for case O que é drift em modelos de machine learning?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"..."."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context relevancy for case O que é feature store?: 1 validation error for ContextualRelevancyVerdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...e."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context relevancy for case O que significa servir um modelo em produção?: 1 validation error for ContextualRelevancyVerdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...n."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Por que versionar modelos é importante?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é CI/CD aplicado a ML?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating answer relevancy for case O que é um model registry?: 1 validation error for Statements
  Invalid JSON: expected `,` or `]` at line 1 column 1049 [type=json_invalid, input_value='{"statements":["Um model... segura e controlada."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é concept drift?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...r."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case O que é um dataset?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 826 [type=json_invalid, input_value='{"claims":["Um dataset ...amento e inferência."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que são métricas de avaliação?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...r."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é validação cruzada?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é model drift detection?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...es"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é continuous training (CT)?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...g."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid
