In [1]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


import pandas as pd
# from datasets import Dataset
import json


import os
# import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


## Carregar Vector Store e configurar funções de retrieval e geração de resposta

In [2]:
#sentence_transformers
#rank-bm25
#langchain_text_splitters
#chromadb

In [3]:
from src.vectorstore.hybrid_vector_store import HybridVectorStore
from src.config.settings import settings

# Chroma-backed hybrid vector store.
# To build the store from scratch, see Vector_Store.ipynb.
store = HybridVectorStore(persist_path=settings.DATA_DB, embedding_model=settings.EMBEDDING_MODEL)

In [28]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Generation model: deepseek-r1 served through Ollama, sampling temperature 0.
llm = OllamaLLM(model="deepseek-r1:latest", temperature=0)

# Instruction prompt for the pedagogical assistant. It is intentionally written
# in Brazilian Portuguese, the language the answers must use. `{context}` is
# filled with the retrieved text and `{question}` with the user's question.
# The accented characters were previously mis-encoded (UTF-8 bytes shown as
# Mac Roman), which garbled the text sent to the model.
prompt = ChatPromptTemplate.from_template(
"""
Você é um ACP - um Agente Conversacional Pedagógico especializado em ajudar estudantes com dúvidas relacionadas a conteúdos acadêmicos.
Sua função é fornecer respostas claras e informativas com base no material de estudo fornecido pelo contexto.
Inclua todas as informações relevantes do contexto em suas respostas, evitando suposições ou informações externas.
Suas respostas devem ser sempre em português brasileiro e devem usar um tom leve.

Se não encontrar a resposta no contexto, diga:
"Nenhuma informação disponível no contexto."
            
Contexto:
{context}

Pergunta:
{question}
""")

# Prompt -> model -> plain string output.
chain = prompt | llm | StrOutputParser()


In [None]:
# Retrieval + answer generation in a single call.

def rag_pipeline(question: str, top_k: int = 5) -> str:
    """Answer `question` using context fetched from the vector store.

    Runs `store.hybrid_search`, joins element 1 of every hit with a blank
    line between them, and sends that context together with the question
    through the module-level `chain`.

    Args:
        question: Text used both as the search query and as the prompt's
            `{question}` value.
        top_k: Number of hits requested from `store.hybrid_search`.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The value returned by `chain.invoke` (the parsed model answer).
    """
    docs = store.hybrid_search(question, top_k=top_k)
    context = "\n\n".join(doc[1] for doc in docs)

    # Reuse the chain assembled next to the prompt instead of rebuilding the
    # same prompt | llm | parser pipeline on every call.
    return chain.invoke({"context": context, "question": question})

# Smoke test: run one question end to end and show the generated answer.
# (The question text was previously mis-encoded; accents restored.)
response = rag_pipeline("Qual a diferença entre Data Scientist e ML Engineer?")
print(response)

Ótimo questionamento! Vamos esclarecer a diferença entre esses dois papéis importantes no ecossistema de Machine Learning.

De acordo com o contexto:

1.  **Foco:** O Data Scientist se concentra principalmente em extrair insights e desenvolver/refinar modelos de Machine Learning. O ML Engineer se concentra na parte de engenharia, responsável por construir, implementar e manter esses modelos.

2.  **Expertise:** O Data Scientist tem uma forte base em análise, matemática e estatística. O ML Engineer tem uma compreensão mais profunda de princípios de engenharia de software, infraestrutura e serviços em nuvem.

3.  **Colaboração:** Ambos precisam de habilidades de comunicação, mas o ML Engineer tende a trabalhar mais com engenheiros de software, DevOps e profissionais de TI para integrar os modelos de Machine Learning aos sistemas existentes.

Em resumo, embora ambos contribuam para o ecossistema de Machine Learning, o Data Scientist é mais focado na análise e modelagem est

In [30]:
# Retrieval only: returns the retrieved texts without calling the LLM.

def real_retrieval(question: str, top_k: int = 5) -> list:
    """Return element 1 of every hit found by the hybrid search.

    Args:
        question: Query passed to `store.hybrid_search`.
        top_k: Number of hits requested. Defaults to 5, the value that used
            to be hard-coded.

    Returns:
        A list with element 1 of each hit, in the order the store returned
        them. An empty list when the search returns nothing.
    """
    docs = store.hybrid_search(question, top_k=top_k)
    # Iterate the hits directly rather than indexing through range(len(...)),
    # matching how rag_pipeline reads the same structure.
    return [doc[1] for doc in docs]

# Smoke test: show what the retriever returns for a sample question.
# (The question text was previously mis-encoded; accents restored.)
retrieve = real_retrieval("Qual a diferença entre Data Scientist e ML Engineer?")
print(retrieve)

['the machine learning ecosystem.\nWhat is a Data Scientist ?\nA Data Scientist is an expert in extracting valuable insights from large\nvolumes of data. With a strong background in mathematics, statistics, and\nprogramming, Data Scientists analyze and interpret data to solve complex\nproblems and support better decision-making within an organization. They\nwork closely with business stakeholders to understand their objectives and\ndevelop machine learning models to help achieve those goals.\nKey responsibilities of a Data Scientist include:\nGathering, cleaning, and preprocessing data\nDeveloping machine learning models and algorithms\nEvaluating model performance and optimizing as needed\nCommunicating findings and insights to business stakeholders\nWhat is an ML Engineer?\nAn ML Engineer, or Machine Learning Engineer, is a professional who\ndesigns, develops, and implements machine learning models. They work\nclosely with data scientists to translate prototypes into efficient and', 

## Usando Deep Eval para métricas das gerações de respostas contidas no ground_truth

Datasets completos e as avaliações estão dentro da pasta artifacts.

In [34]:
from dotenv import load_dotenv
load_dotenv()

from deepeval.test_case import LLMTestCase
import json
import pandas as pd

In [35]:
# Build one DeepEval test case per ground-truth row (question + reference
# answer), then save the cases as JSON so the generation step does not have
# to be repeated.

test_cases_path = "artifacts/eval_metrics/deepseek-r1/test_cases_mapeados.json"
# Create the output folder before generating anything: otherwise a missing
# directory only surfaces at open() time, after every answer was generated.
os.makedirs(os.path.dirname(test_cases_path), exist_ok=True)

df = pd.read_csv("data/ground_truth/ground_truth_mlops.csv", encoding="latin-1")

test_cases = []
for question, expected_answer in zip(df["question"], df["answer"]):
    # NOTE(review): rag_pipeline and real_retrieval each run their own search,
    # so retrieval happens twice per question; the stored retrieval_context
    # matches what the generator saw only if the search is deterministic.
    test_cases.append(
        LLMTestCase(
            input=question,
            expected_output=expected_answer,
            actual_output=rag_pipeline(question),
            retrieval_context=real_retrieval(question),
        )
    )

# Plain-dict view of the test cases, ready for JSON serialisation.
test_cases_dicts = [
    {
        "input": tc.input,
        "expected_output": tc.expected_output,
        "actual_output": tc.actual_output,
        "retrieval_context": tc.retrieval_context
    }
    for tc in test_cases
]

# ensure_ascii=False keeps accented characters readable in the saved file.
with open(test_cases_path, "w", encoding="utf-8") as f:
    json.dump(test_cases_dicts, f, ensure_ascii=False, indent=2)

In [36]:
# ####### Carrega o JSON salvo anteriormente, caso necessário. Descomentar para usar

# with open("artifacts/eval_metrics/gpt-oss/test_cases_mapeados.json", "r", encoding="utf-8") as f:
#     test_cases_dicts = json.load(f)

# # Recria a lista de LLMTestCase
# test_cases = [
#     LLMTestCase(
#         input=tc["input"],
#         expected_output=tc["expected_output"],
#         actual_output=tc["actual_output"],
#         retrieval_context=tc["retrieval_context"]
#     )
#     for tc in test_cases_dicts
# ]

In [37]:
from deepeval.models import OllamaModel

# Judge model for the metrics that need an LLM to be computed.
# Note: only llama3 worked here; gpt-oss and deepseek-r1 repeatedly failed
# with timeout errors.

model = OllamaModel(model="llama3:latest", base_url="http://localhost:11434", temperature=0)

In [38]:
from deepeval.metrics import (
  ContextualRelevancyMetric,
  ContextualRecallMetric,
  ContextualPrecisionMetric,
  AnswerRelevancyMetric,
  FaithfulnessMetric
)
from deepeval import evaluate
from deepeval.evaluate import AsyncConfig


# One metric object per evaluated dimension, all judged by `model`.
# Only answer relevancy sets an explicit threshold (0.8).
contextual_precision = ContextualPrecisionMetric(model=model)
contextual_recall = ContextualRecallMetric(model=model)
contextual_relevancy = ContextualRelevancyMetric(model=model)
answer_relevancy = AnswerRelevancyMetric(threshold=0.8, model=model)
faithfulness = FaithfulnessMetric(model=model)

# NOTE(review): presumably these values keep requests to the local Ollama
# server serialized and spaced out — confirm against the DeepEval docs.
async_config = AsyncConfig(run_async=True, throttle_value=5, max_concurrent=1)

results = []

# Each test case goes through its own evaluate() call.
for tc in test_cases:
    tc_results = evaluate(
        [tc],
        metrics=[
            contextual_precision,
            contextual_recall,
            contextual_relevancy,
            answer_relevancy,
            faithfulness,
        ],
        async_config=async_config,
    )
    # A single case was submitted, so only the first entry is kept.
    results.append(tc_results.model_dump()["test_results"][0])

# Quick check on a single case:
# results = evaluate([test_cases[0]], metrics=[contextual_precision, contextual_recall, contextual_relevancy, answer_relevancy, faithfulness])

results



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6396524110809825, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.64 because irrelevant nodes (nodes 2, 4, 6, and 8) are correctly ranked lower than relevant nodes, as they lack direct connections to the expected output of defining MLOps. The first node's relevance is evident from its mention of 'MLOps', while subsequent 'yes' verdicts provide more specific explanations aligning with the expected output., error: None)
  - ‚úÖ Contextual Recall (score: 0.5384615384615384, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.54 because the contextual recall score indicates that most sentences can be attributed to nodes in the retrieval context, but not all, suggesting a good match between expected output and retrieval context, with some minor discrepancies., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7948717948717948, threshold: 0.5



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes (nodes 2 and 4) are correctly ranked lower than relevant nodes (nodes 1 and 3), as they don't provide information about the differences between a Data Scientist and an ML Engineer, which is crucial to understanding the question., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that the expected output can be partially attributed to nodes in retrieval context, but not entirely, suggesting a moderate level of relevance between the two., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.8, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.80 because the input question about the difference between ML



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes are correctly ranked lower than relevant nodes (nodes 2 and 4) as they don't provide direct answers to the question about the differences between MLOps and DevOps, while nodes 1 and 3 offer valuable insights into the topic., error: None)
  - ‚úÖ Contextual Recall (score: 0.5555555555555556, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.56 because the contextual recall score indicates that most sentences in the expected output can be attributed to nodes in the retrieval context, but some sentences remain unconnected, suggesting that there are still areas where the information does not align with the provided context., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.9354838709677419, threshold: 0.5, strict: False, evaluation model: llama3



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7095238095238094, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.71 because irrelevant nodes, such as the fairy tale about Goldilocks (ranked 4), are correctly ranked lower than relevant nodes that provide insights into concept drift and its impact on machine learning models (ranks 1, 2, and 5)., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that half of the expected output can be attributed to nodes in the retrieval context, while the other half does not have a clear connection to any node., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6086956521739131, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.61 because the input question about drift in machine learning models doesn't s



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes, such as those discussing MLOps and data versioning, are correctly ranked lower than the relevant nodes that provide definitions and explanations about feature stores, like the first and third nodes in retrieval contexts., error: None)
  - ‚úÖ Contextual Recall (score: 0.6, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.60 because the contextual recall score indicates that the expected output is partially attributed to nodes in retrieval context, suggesting a moderate level of relevance between the two., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.8125, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.81 because the retrieval context provides detailed information about the purpose and fu



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.8541666666666666, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.85 because irrelevant nodes, such as those discussing manual deployment or verifying predictive performance targets, are correctly ranked lower than relevant nodes that discuss model development, evaluation, and deployment. The first 'no' verdict at rank 3 is a clear indication of this, as it's not directly related to serving a model in production., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall struggles to attribute the expected output sentences to relevant nodes in the retrieval context, with most sentences lacking clear connections to specific nodes., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7222222222222222, threshold: 0.5, strict: False, evaluation model: llama3:latest



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6565175565175564, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.66 because the relevant nodes (nodes 1, 3, 5, and 7) are ranked higher than irrelevant nodes (nodes 2, 4, 6, and 8-10), with a clear distinction between the two groups. The reasons for the 'yes' verdicts highlight direct connections to the expected output's mentions of 'versionar modelos', 'rastrear artefatos', 'garantir rollback seguro', 'producir resultados', and 'identificar os melhores modelos'. In contrast, the 'no' verdicts are justified by the lack of information about specific topics, such as reproducing experiment results, ensuring rollback security, or testing model predictions., error: None)
  - ‚úÖ Contextual Recall (score: 0.5217391304347826, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.52 because the contextual recall score indicates that most of the se



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.8541666666666666, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.85 because although there are some irrelevant nodes (nodes 3 and 6) that should be ranked lower than the relevant ones, the retrieval contexts effectively distinguish between them by providing clear reasons for their relevance or irrelevance to the input question 'O que √© um pipeline de ML?', error: None)
  - ‚úÖ Contextual Recall (score: 0.9, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.90 because the contextual recall score indicates a strong match between the expected output and the nodes in the retrieval context, with most sentences aligning well with specific nodes, demonstrating a high degree of relevance., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.9166666666666666, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The 



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because irrelevant nodes (nodes 1 and 3) are ranked lower than relevant nodes (nodes 2 and 4), indicating that contextual precision is moderate, as some 'no' verdicts are correctly placed below the 'yes' verdicts., error: None)
  - ‚úÖ Contextual Recall (score: 0.5454545454545454, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.55 because the contextual recall successfully captures some of the key concepts related to ML Ops, such as experimentation and artifact tracking, but still lacks a comprehensive understanding of the topic, failing to fully attribute sentences like the one about reproducing results using the same code, data, and parameters., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5121951219512195, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ol



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6787301587301586, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.68 because irrelevant nodes, such as those discussing effort recognition and role distinctions in AI projects, are ranked lower than relevant nodes that directly address CI/CD concepts, like automation of tests and validation, packaging, and deployment of models., error: None)
  - ‚úÖ Contextual Recall (score: 0.5111111111111111, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.51 because the contextual recall score indicates that there are some relevant sentences in the expected output that can be attributed to nodes in the retrieval context, but not all of them, suggesting a moderate level of alignment between the two., error: None)
  - ‚ùå Contextual Relevancy (score: 0.3, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the relevant nodes (ranked 2nd and 4th) are correctly ranked higher than irrelevant nodes (ranked 1st and 3rd), but not all 'no' verdicts are consistently ranked lower, as some are still relatively close to the top-ranked 'yes' verdict., error: None)
  - ‚úÖ Contextual Recall (score: 0.5357142857142857, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.54 because the contextual recall score indicates that there are some mismatches between the expected output and the nodes in the retrieval context, but overall, the model is still able to capture most of the relevant information., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.875, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.88 because despite the retrieval context contai



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes (nodes 2 and 4) are correctly ranked lower than relevant nodes (nodes 1, 3), as they don't provide information about what a model registry is, only describing its capabilities., error: None)
  - ‚úÖ Contextual Recall (score: 0.6875, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.69 because the contextual recall score indicates that most of the expected output sentences are supported by relevant information in the retrieval context, with some minor discrepancies., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6486486486486487, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.65 because the retrieval context provides statements about various aspects of machine learning, such as training, eva



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6565175565175564, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.66 because the relevant nodes (nodes 1, 3, 5, and 7) are ranked higher than irrelevant nodes (nodes 2, 4, 6, and 8-10), with a good balance between 'yes' verdicts and 'no' verdicts. The irrelevant nodes mostly provide information about ML Ops phases or techniques that don't directly relate to data validation., error: None)
  - ‚úÖ Contextual Recall (score: 0.5384615384615384, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.54 because the contextual recall score indicates that most of the expected output can be attributed to the nodes in the retrieval context, with some minor inconsistencies., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7435897435897436, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.74 because the



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because irrelevant nodes, such as those discussing model explainability and logging of prediction serving requests (ranked 3), or model monitoring and abstractions for machine learning systems (ranked 5), are correctly ranked lower than the relevant nodes that discuss batch inference and online inference (ranks 1 and 2)., error: None)
  - ‚úÖ Contextual Recall (score: 0.5454545454545454, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.55 because the contextual recall score indicates a moderate level of accuracy in attributing sentences from the expected output to corresponding nodes in the retrieval context, with some sentences matching well and others not fitting as closely., error: None)
  - ‚ùå Contextual Relevancy (score: 0.2857142857142857, threshold: 0.5, strict: False, evalu



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.625, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.62 because irrelevant nodes, such as those discussing legacy features and configuration systems, are correctly ranked lower than relevant nodes that mention 'feature drift' or provide specific examples, like Covid-19 and fairy tales., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that there are some supportive reasons for attributing sentences to nodes in the retrieval context, but also some unsupportive reasons where sentences cannot be attributed, suggesting a moderate level of alignment between the expected output and the retrieval context., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5641025641025641, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollam



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the irrelevant nodes (nodes 1, 3, and 5) are correctly ranked lower than the relevant nodes (nodes 2 and 4), but there's still room for improvement as some relevant nodes (node 6) are not ranked higher than others., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that there are some supportive reasons for attributing sentences in the expected output to nodes in the retrieval context, but also some unsupportive reasons where sentences cannot be attributed to any parts of the retrieval context., error: None)
  - ‚ùå Contextual Relevancy (score: 0.43243243243243246, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.43 because the re



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7095238095238094, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.71 because irrelevant nodes, such as those discussing data quality issues (node 3) and minimizing bias in ML models (node 7), should be ranked lower than relevant nodes like those mentioning MLflow's orchestration capabilities (node 1) and experimentation importance (nodes 2 and 5)., error: None)
  - ‚úÖ Contextual Recall (score: 0.5333333333333333, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.53 because the contextual recall score indicates that most of the expected output sentences can be attributed to specific nodes in the retrieval context, but there are some sentences that do not have a clear connection to any node., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5806451612903226, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reas



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes like 'MLOps level 0' and 'seamless integration of ML into existing processes' are correctly ranked lower than relevant nodes that discuss canary deployment strategies, such as the first node mentioning 'canary deployment', and the third node highlighting the importance of monitoring model performance., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that half of the expected output can be attributed to nodes in the retrieval context, while the other half appears to be original statements that do not match any part of the provided context., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5, threshold: 0.5, strict: False, evaluation model: l



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes, such as those discussing serving trained models (ranked 3) or not providing information about training models (ranked 2), are correctly ranked lower than the relevant nodes that explain what a pipeline is and how it relates to machine learning (ranked 1 and 4)., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the expected output seems to be related to the development and experimentation stage (1st node) and model continuous delivery stage (5th node), but lacks clear connections to other nodes in the retrieval context, resulting in a moderate recall score., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6333333333333333, threshold: 0.5, strict: False, evaluation model: llama3



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.8541666666666666, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.85 because the relevant nodes (nodes 1, 2, 4 and 6) that discuss 'MLOps', versioning, pipelines, monitoring, managing data science projects, and maintaining ML models are ranked higher than irrelevant nodes (nodes 3 and 5) that don't provide information about logging, data validation, reproducible processes, continuous training pipelines, or data acquisition/preprocessing., error: None)
  - ‚ùå Contextual Recall (score: 0.4444444444444444, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.44 because the contextual recall score indicates that some sentences in the expected output can be attributed to specific nodes in the retrieval context, while others do not have a clear connection to these nodes., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7586206896551724, thr



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.9166666666666666, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.92 because irrelevant nodes (nodes 3) are correctly ranked lower than relevant nodes, with the first two nodes providing direct explanations of what a machine learning model is, while node 3 focuses on the process of developing and managing models without defining what they are., error: None)
  - ‚úÖ Contextual Recall (score: 0.6, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.60 because the model's ability to partially match the expected output with relevant concepts in the retrieval context, such as machine learning model lifecycle and model development, indicates a decent recall but still leaves some room for improvement., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.8787878787878788, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), re



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the relevant nodes (2nd and 4th) are ranked higher than irrelevant nodes (1st and 3rd), which do not provide definitions for 'treinamento de modelo'. The first node's reason is that it doesn't mention the term at all, while the third node talks about machine learning models but doesn't define 'treinamento de modelo'., error: None)
  - ‚úÖ Contextual Recall (score: 0.5614035087719298, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.56 because the contextual recall score indicates that the expected output sentence about Treinamento being the process of adjusting model parameters using labeled or unlabeled data can be attributed to the node(s) in retrieval context, but not all sentences can be attributed., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.9354838709677419, t



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7095238095238094, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.71 because irrelevant nodes (nodes 2 and 5) were correctly ranked lower than relevant nodes (nodes 1, 3, 4, and 6), with reasons such as 'overfitting' and 'prototype smells or configuration debt' not being directly related to the topic of separating training and testing., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the original expected output sentence 'A separa√ß√£o evita overfitting e garante avalia√ß√£o imparcial da performance.' cannot be attributed to any node in the retrieval context, making it an unsupportive reason., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7368421052631579, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.74 because the r



Metrics Summary

  - ‚ùå Contextual Precision (score: 0.2, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.20 because irrelevant nodes (nodes 1-4) are correctly ranked lower than the relevant node (node 5), with a clear distinction between topics such as software engineering, ML system smells, data management, and ML Ops, which do not relate to inference, allowing the correct context mentioning 'infer√™ncia √© o uso de um modelo treinado para gerar previs√µes em novos dados' to rank higher., error: None)
  - ‚ùå Contextual Recall (score: 0.48936170212765956, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.49 because the original expected output does not match well with the nodes in the retrieval context, indicating a moderate level of contextual recall., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6136363636363636, threshold: 0.5, strict: False, evaluation model: llama3:latest (Oll



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6787301587301586, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score of 0.68 is because, although irrelevant nodes (nodes 2, 4, 6, and 8) are ranked lower than relevant nodes (nodes 1, 3, 5, and 9), there's still room for improvement in ranking the most relevant nodes higher up the list., error: None)
  - ‚úÖ Contextual Recall (score: 0.5217391304347826, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.52 because the contextual recall score indicates that the expected output is partially supported by the nodes in the retrieval context, with some sentences having clear connections to specific nodes and others not being directly attributed to any node., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5813953488372093, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.58 because the retrieval cont



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6396524110809825, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.64 because irrelevant nodes like 'institutional knowledge', 'research' and 'engineering' roles, 'classroom material', 'textbooks', and 'data scientists' are ranked lower than relevant nodes that discuss pipelines, reproducibility, scalability, and their importance in ML tasks., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that half of the expected output can be attributed to the nodes in the retrieval context, with some sentences strongly relating to specific nodes (e.g., sentence 2 relates to node 2) and others not containing any relevant information (unsupportive reasons)., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5135135135135135, threshold: 0.5, strict:



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the relevant nodes (ranked 2 and 4) are correctly ranked higher than irrelevant nodes, but there's still room for improvement as some 'no' verdicts are not properly distinguished from the top-ranked 'yes' verdicts., error: None)
  - ‚úÖ Contextual Recall (score: 0.5185185185185185, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.52 because most sentences in the expected output can be attributed to specific nodes in the retrieval context, indicating a decent level of relevance and recall., error: None)
  - ✅ Contextual Relevancy (score: 0.5405405405405406, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.54 because the retrieval context statements are not directly related to the input 'O que são métricas de avaliação?' whi



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes (nodes 2 and 4) are correctly ranked lower than relevant nodes (nodes 1, 3, and 5), with the first node being a strong match ('model development' is mentioned)., error: None)
  - ‚úÖ Contextual Recall (score: 0.9411764705882353, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.94 because the contextual recall score indicates a high degree of accuracy, suggesting that most sentences in the expected output are correctly attributed to nodes in the retrieval context., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6551724137931034, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.66 because most of the statements in the retrieval context are about model training, evaluation, and development, whic



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7555555555555555, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because irrelevant nodes are ranked lower than relevant ones, as seen from the first 'no' verdict (node 2) being placed after the second 'yes' verdict (node 3), indicating that the system correctly prioritized the context mentioning monitoring and testing in real-time over the node discussing data testing debt., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates that half of the expected output sentences can be attributed to specific nodes in the retrieval context, while the other half do not have a clear connection to these nodes., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5714285714285714, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because irrelevant nodes (nodes 1 and 4) are ranked lower than relevant nodes (nodes 2 and 3), as they do not provide information about model validation, whereas nodes 2 and 3 explicitly mention 'validation datasets' and 'offline validation phase', making them more relevant to the topic., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates a moderate level of matching between the expected output and the nodes in the retrieval context, but there are some sentences in the expected output that do not contain any parts that can be attributed to these nodes., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5357142857142857, threshold: 0.5, strict: False, evaluation model: llama3:latest (



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5666666666666667, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.57 because irrelevant nodes (nodes 1 and 3) are correctly ranked lower than the relevant nodes (nodes 2, 4, and 5), with the first 'yes' verdict at node 2, indicating that model drift detection is a topic of interest in natural data drift and unnatural data drift., error: None)
  - ‚úÖ Contextual Recall (score: 0.5454545454545454, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.55 because the contextual recall score indicates that some sentences from the expected output are correctly attributed to nodes in the retrieval context, while others do not have a clear match., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.6428571428571429, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.64 because the retrieval context conta



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.6396524110809825, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.64 because irrelevant nodes (nodes ranked 2-11) are correctly ranked lower than relevant nodes (nodes ranked 1 and 3-12), as they do not provide information about the importance of a lineage tracker in MLOps, whereas the top-ranked nodes and most nodes after that do., error: None)
  - ‚úÖ Contextual Recall (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the contextual recall score indicates a moderate level of accuracy in attributing sentences from the expected output to corresponding nodes in the retrieval context, with some sentences having clear connections and others not being directly linked., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7073170731707317, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason:



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.7095238095238094, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.71 because the relevant nodes (nodes 1, 3, and 5) are ranked higher than irrelevant nodes (nodes 2, 4, and 6-8), with the reasons being that these irrelevant nodes only provide superficial information about CT without explaining what it is or how it works., error: None)
  - ‚úÖ Contextual Recall (score: 0.6, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.60 because the expected output partially matches with nodes in retrieval context, particularly with sentences related to machine learning development phases and processes, such as experimentation, model training, and continuous integration., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because the retrieval cont



Metrics Summary

  - ‚ùå Contextual Precision (score: 0.25, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.25 because the first four nodes are irrelevant to understanding how a system of AB testing works for models, as they discuss unrelated topics such as MLOps capabilities and version control. The fifth node, however, mentions A/B testing and its relevance to introducing new models, indicating that it should be ranked higher than the other 'no' verdicts., error: None)
  - ‚úÖ Contextual Recall (score: 0.52, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.52 because the contextual recall score indicates that the expected output sentence partially matches with nodes in the retrieval context, but not perfectly, suggesting a moderate level of relevance between the two., error: None)
  - ‚ùå Contextual Relevancy (score: 0.3793103448275862, threshold: 0.5, strict: False, evaluation model: lla



Metrics Summary

  - ‚úÖ Contextual Precision (score: 0.5, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.50 because irrelevant nodes (nodes 1 and 3) are ranked lower than relevant nodes (nodes 2 and 4), indicating that the model correctly prioritizes contexts with information about 'champion' and 'challenger' models., error: None)
  - ‚úÖ Contextual Recall (score: 0.8888888888888888, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.89 because the contextual recall accurately captures the relationships between sentences in the expected output, with most sentences being attributed to nodes in the retrieval context, except for one standalone statement that doesn't require specific context., error: None)
  - ‚úÖ Contextual Relevancy (score: 0.7567567567567568, threshold: 0.5, strict: False, evaluation model: llama3:latest (Ollama), reason: The score is 0.76 because the retrieval context conta

[{'name': 'test_case_0',
  'success': True,
  'metrics_data': [{'name': 'Contextual Precision',
    'threshold': 0.5,
    'success': True,
    'score': 0.6396524110809825,
    'reason': "The score is 0.64 because irrelevant nodes (nodes 2, 4, 6, and 8) are correctly ranked lower than relevant nodes, as they lack direct connections to the expected output of defining MLOps. The first node's relevance is evident from its mention of 'MLOps', while subsequent 'yes' verdicts provide more specific explanations aligning with the expected output.",
    'strict_mode': False,
    'evaluation_model': 'llama3:latest (Ollama)',
    'error': None,
    'evaluation_cost': 0.0,
    'verbose_logs': 'Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context mentions \'MLOps\' which is directly related to the expected output."\n    },\n    {\n        "verdict": "no",\n        "reason": "The text does not provide any information about what MLOps is, it only provides a definition and exp

In [39]:
# Save the evaluation results above as JSON.
# The output directory is created first so this cell also works on a fresh
# checkout; open(..., "w") does not create missing parent directories and
# would otherwise raise FileNotFoundError.
results_path = "artifacts/eval_metrics/deepseek-r1/results_deepeval.json"
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# ensure_ascii=False keeps accented characters readable in the file instead of
# emitting \uXXXX escapes; the file is written as UTF-8 to match.
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [41]:
# # Load the previously saved JSON
# # NOTE(review): this reads "results_deepeval.json" from the working directory,
# # which is not the path the save cell writes to
# # ("artifacts/eval_metrics/deepseek-r1/results_deepeval.json") — confirm the
# # path before re-enabling this cell.
# with open("results_deepeval.json", "r", encoding="utf-8") as f:
#     results = json.load(f)