In [1]:
import pandas as pd
import json
import os


## Carregar Vector Store

In [2]:
from src.vectorstore.hybrid_vector_store import HybridVectorStore
from src.config.settings import settings

# Chroma vector store.
# To build it from scratch, see Vector_Store.ipynb.
# The persistence path and the embedding model both come from the project settings.
store = HybridVectorStore(persist_path=settings.DATA_DB, embedding_model=settings.EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


## Métricas de avaliação de LLMs usando Deepeval

### Criando dataset de conteúdo recuperado (retrieval) e geração de resposta.

In [None]:
# Build the evaluation datasets: one call to generate_dataset_for_evaluation
# per generation model listed in `models`.
# NOTE(review): the original header said "uncomment to generate the evaluation
# datasets", but the code below is live and runs whenever this cell is executed.

# Imported through the `src.` package prefix, like every other project import
# in this notebook (src.vectorstore, src.config, src.pipelines).
from src.evaluation.deepeval_dataset import (
    generate_dataset_for_evaluation,
    get_list_of_docs,
)

ground_truth_csv_path = "data/ground_truth/ground_truth_mlops.csv"
# NOTE(review): the evaluation cells further down read their datasets from
# artifacts/eval_metrics/deepeval/<model>/test_cases_dataset.json -- confirm
# that generate_dataset_for_evaluation writes there given this output path.
json_output_path = "artifacts/eval_metrics/datasets/"
# models = ["llama3.1:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest", "llama3.2:latest"]
models = ["qwen3:latest"]

for model in models:
    generate_dataset_for_evaluation(
        store=store,
        ground_truth_csv_path=ground_truth_csv_path,
        json_output_path=json_output_path,
        model=model,
    )

### Cálculo de métricas

**Context Precision:** The contextual precision metric uses LLM-as-a-judge to measure your RAG pipeline's retriever by evaluating whether nodes in your retrieval_context that are relevant to the given input are ranked higher than irrelevant ones. deepeval's contextual precision metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

It is calculated as the mean of precision@k over the ranks k that hold a relevant chunk, where precision@k is the fraction of the top-k retrieved chunks that are relevant.
<br>
Source: [Context Precision](https://deepeval.com/docs/metrics-contextual-precision)
<br><br>

**Context Relevancy:** The contextual relevancy metric uses LLM-as-a-judge to measure the quality of your RAG pipeline's retriever by evaluating the overall relevance of the information presented in your retrieval_context for a given input. deepeval's contextual relevancy metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

The ContextualRelevancyMetric score is calculated according to the following equation:
Contextual Relevancy = (Number of Relevant Statements)/(Total Number of Statements)
<br>
Source: [Contextual Relevancy](https://deepeval.com/docs/metrics-contextual-relevancy)
<br><br>

**Answer Relevancy:** The answer relevancy metric uses LLM-as-a-judge to measure the quality of your RAG pipeline's generator by evaluating how relevant the actual_output of your LLM application is compared to the provided input. deepeval's answer relevancy metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

The AnswerRelevancyMetric score is calculated according to the following equation:
Answer Relevancy = (Number of Relevant Statements)/(Total Number of Statements)
<br>
Source: [Answer Relevancy](https://deepeval.com/docs/metrics-answer-relevancy)
<br><br>


**Faithfulness:** The faithfulness metric uses LLM-as-a-judge to measure the quality of your RAG pipeline's generator by evaluating whether the actual_output factually aligns with the contents of your retrieval_context. deepeval's faithfulness metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

The FaithfulnessMetric score is calculated according to the following equation:
Faithfulness = (Number of Truthful Claims)/(Total Number of Claims)
<br>
Source: [Faithfulness](https://deepeval.com/docs/metrics-faithfulness)

In [17]:
# Load variables from a local .env file into the process environment
# (python-dotenv). The recorded output of this cell shows the call returned True.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
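# NOTE: this whole cell is commented out and never executed; it sketches a helper that builds a deepeval OllamaModel evaluator.
# from deepeval.models import OllamaModel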

# def create_evaluator_llm(model: str):
#     return OllamaModel(
#     model=model,
#     base_url="http://localhost:11434",
#     temperature=0,
# )

In [None]:
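# NOTE: this whole cell is commented out and never executed; it holds inline versions of the four calculate_* metric helpers.
# from deepeval.metrics import (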
#   ContextualRelevancyMetric,
#   ContextualPrecisionMetric,
#   AnswerRelevancyMetric,
#   FaithfulnessMetric
# )


# def calculate_context_precision(case: dict, evaluator_model: str):

#     evaluator_llm = create_evaluator_llm(model=evaluator_model)
#     contextual_precision = ContextualPrecisionMetric(model=evaluator_llm, include_reason=False)

#     return contextual_precision.measure(case)

# def calculate_context_relevancy(case: dict, evaluator_model: str):

#     evaluator_llm = create_evaluator_llm(model=evaluator_model)
#     contextual_relevancy = ContextualRelevancyMetric(model=evaluator_llm, include_reason=False)

#     return contextual_relevancy.measure(case)

# def calculate_answer_relevancy(case: dict, evaluator_model: str):

#     evaluator_llm = create_evaluator_llm(model=evaluator_model)
#     answer_relevancy = AnswerRelevancyMetric(model=evaluator_llm, include_reason=False)

#     return answer_relevancy.measure(case)

# def calculate_faithfulness(case: dict, evaluator_model: str):

#     evaluator_llm = create_evaluator_llm(model=evaluator_model)
#     faithfulness = FaithfulnessMetric(model=evaluator_llm, include_reason=False)

#     return faithfulness.measure(case)

In [None]:
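# NOTE: this whole cell is commented out and never executed; the executed cell further down imports run_evaluation_on_datasets from src.pipelines.evaluation_pipeline instead.
# from src.evaluation.deepeval_evaluation import (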
#   calculate_context_precision,
#   calculate_context_relevancy,
#   calculate_answer_relevancy,
#   calculate_faithfulness
# )

# from src.evaluation.deepeval_dataset import json_to_llmtestcase


# def run_evaluation_on_datasets(dataset_path: str, evaluator_model: str):

#     with open(dataset_path, "r", encoding="utf-8") as f:
#         test_cases = json.load(f)
# # 
#     for case in test_cases:

#         # Criando listas para as métricas de contexto por modelo de evaluation
#         if 'context_precision_score' not in case:
#             case['context_precision_score'] = []
#         if 'context_relevancy_score' not in case:
#             case['context_relevancy_score'] = []
#         if 'answer_relevancy_score' not in case:
#             case['answer_relevancy_score'] = []
#         if 'faithfulness_score' not in case:
#             case['faithfulness_score'] = []

#         test_case_obj = json_to_llmtestcase(case)

#         try:
#             precision = calculate_context_precision(test_case_obj, evaluator_model=evaluator_model)

#             case['context_precision_score'].append(
#                 {
#                 evaluator_model: precision
#                 }
#             )
#         except Exception as e:
#             print(f"Error calculating context precision for case {case['input']}: {e}")
#             case['context_precision_score'].append(
#                 {
#                 evaluator_model: None,
#                 'logs': str(e)
#                 }
#             )

#         try:
#             recall = calculate_context_relevancy(test_case_obj, evaluator_model=evaluator_model)

#             case['context_relevancy_score'].append(
#                 {
#                 evaluator_model: recall
#                 }
#             )
#         except Exception as e:
#             print(f"Error calculating context relevancy for case {case['input']}: {e}")
#             case['context_relevancy_score'].append(
#                 {
#                 evaluator_model: None,
#                 'logs': str(e)
#                 }
#             )

#         try:
#             answer_relevancy = calculate_answer_relevancy(test_case_obj, evaluator_model=evaluator_model)

#             case['answer_relevancy_score'].append(
#                 {
#                 evaluator_model: answer_relevancy
#                 }
#             )
#         except Exception as e:
#             print(f"Error calculating answer relevancy for case {case['input']}: {e}")
#             case['answer_relevancy_score'].append(
#                 {
#                 evaluator_model: None,
#                 'logs': str(e)
#                 }
#             )

#         try:
#             faithfullness = calculate_faithfulness(test_case_obj, evaluator_model=evaluator_model)

#             case['faithfulness_score'].append(
#                 {
#                 evaluator_model: faithfullness
#                 }
#             )
#         except Exception as e:
#             print(f"Error calculating faithfulness for case {case['input']}: {e}")
#             case['faithfulness_score'].append(
#                 {
#                 evaluator_model: None,
#                 'logs': str(e)
#                 }
#             )

#     with open(dataset_path, "w", encoding="utf-8") as f:
#         json.dump(test_cases, f, ensure_ascii=False, indent=4)

In [19]:
# Score each generation model's dataset with each local evaluator model.
from src.pipelines.evaluation_pipeline import run_evaluation_on_datasets
# Aliased on purpose: a plain `from deepeval import settings` would rebind the
# `settings` name that the vector-store cell imports from src.config.settings.
from deepeval import settings as deepeval_settings

# Per-attempt timeout override; seconds, going by the setting's name.
deepeval_settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE = 600

# generation_models = ["llama3.1:latest", "llama3.2:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]
generation_models = ["qwen3:latest"]
eval_models = ["llama3.1:latest", "llama3.2:latest"]

for generation_model in generation_models:
    # The dataset directory is the model tag with ':' replaced by '_'. It is
    # built outside the f-string because reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    model_dir = generation_model.replace(":", "_")
    dataset_path = f"artifacts/eval_metrics/deepeval/{model_dir}/test_cases_dataset.json"

    for eval_model in eval_models:
        print(f"Running evaluation for generation model: {generation_model} with evaluator model: {eval_model}")
        run_evaluation_on_datasets(
            dataset_path=dataset_path,
            evaluator_model=eval_model,
        )

Running evaluation for generation model: qwen3:latest with evaluator model: llama3.1:latest


Running evaluation for generation model: qwen3:latest with evaluator model: llama3.2:latest


In [21]:
# Run "gpt-oss:20b-cloud" separately because it has hourly and weekly token limits.
# Relies on run_evaluation_on_datasets imported by the previous evaluation cell.
# NOTE(review): the recorded output of this cell shows many pydantic
# "Invalid JSON" validation errors with this evaluator (several responses are
# wrapped in ```json fences), so a number of metrics were not computed.

import time

# generation_models = ["llama3.1:latest", "deepseek-r1:latest", "mistral:latest", "gpt-oss:latest"]
generation_models = ["llama3.2:latest"]
eval_models = ["gpt-oss:20b-cloud"]

for generation_model in generation_models:
    # The dataset directory is the model tag with ':' replaced by '_'. It is
    # built outside the f-string because reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    model_dir = generation_model.replace(":", "_")
    dataset_path = f"artifacts/eval_metrics/deepeval/{model_dir}/test_cases_dataset.json"

    for eval_model in eval_models:
        print(f"Running evaluation for generation model: {generation_model} with evaluator model: {eval_model}")
        run_evaluation_on_datasets(
            dataset_path=dataset_path,
            evaluator_model=eval_model,
        )

        # time.sleep(3600)

Running evaluation for generation model: llama3.2:latest with evaluator model: gpt-oss:20b-cloud


Error calculating context relevancy for case O que é MLOps?: 1 validation error for ContextualRelevancyVerdicts
  Invalid JSON: expected `,` or `}` at line 52 column 5 [type=json_invalid, input_value='{\n  "verdicts": [\n    ... MLOps."\n    ]\n  ]\n}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating answer relevancy for case Qual a diferença entre MLOps e DevOps?: 1 validation error for Statements
  Invalid JSON: expected `,` or `]` at line 1 column 867 [type=json_invalid, input_value='{"statements":["Uma perg...de software em geral."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que significa servir um modelo em produção?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...es"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é um model registry?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...r."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Diferença entre batch inference e online inference?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...no"\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case O que é canary deployment em ML?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 707 [type=json_invalid, input_value='{"claims":["Acanção (C...-lo em escala maior."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é um modelo de machine learning?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...l."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que são métricas de avaliação?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...r."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é validação cruzada?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...n."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é model drift detection?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...\""\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case Qual a importância de um lineage tracker em MLOps?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 1017 [type=json_invalid, input_value='{"claims":["Um lineage t...essos de ML em MLOps."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é continuous training (CT)?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...T."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case Como funciona um sistema de AB testing para modelos?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...\""\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating faithfulness for case Como funciona um sistema de AB testing para modelos?: 1 validation error for Claims
  Invalid JSON: expected `,` or `]` at line 1 column 1906 [type=json_invalid, input_value='{"claims":["Um sistema d...bre qual modelo usar."}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid


Error calculating context precision for case O que é um modelo champion vs challenger?: 1 validation error for Verdicts
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "verdicts"...s."\n    }\n  ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid
