In [1]:
!pip install -U langchain langchain-community langchain-ollama mlflow chromadb pandas




In [2]:
import pandas as pd
import mlflow
from langchain.chains import RetrievalQA
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
llm = OllamaLLM(model="llama3.2", temperature=0)
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [4]:
loader = WebBaseLoader("https://mlflow.org/docs/latest/index.html")
documents = loader.load()

In [5]:
# split text into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

In [6]:
# build local vector db 
docsearch = Chroma.from_documents(texts, embeddings)

In [7]:
# create RAG 
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

In [8]:
def model(input_df: pd.DataFrame):
    results = []
    for _, row in input_df.iterrows():
        query = row["questions"]
        response = qa.invoke(query)
        results.append(response["result"])
    return results

eval_df = pd.DataFrame(
    {
        "questions": [
            "what is mlflow?",
            "how to run mlflow.evaluate()?",
            "how to log_table()?",
            "how to load_table()?",
        ],
    }
)

In [11]:
# since mlflow.genai metrics use openai endpoints, lets define our own,
from mlflow.models import make_metric

def _avg_length(eval_series):
    return {"avg_length": eval_series.str.len().mean()}


response_length_metric = make_metric(
    eval_fn=_avg_length,
    greater_is_better=True,
)

results = mlflow.evaluate(
    model,
    eval_df,
    model_type="question-answering",
    evaluators="default",
    extra_metrics=[response_length_metric],
    predictions="predictions"
)

print("=== METRICS ===")
print(results.metrics)


## not worked as expected...


2025/10/14 18:24:49 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.


MlflowException: Error: Metric calculation failed for the following metrics:
Metric '_avg_length' requires the following:
- missing columns ['eval_series'] need to be defined or mapped

Below are the existing column names for the input/output data:
Input Columns: ['questions']
Output Columns: ['predictions']

To resolve this issue, you may need to:
- specify any required parameters
- if you are missing columns, check that there are no circular dependencies among your
metrics, and you may want to map them to an existing column using the following
configuration:
evaluator_config={'col_mapping': {<missing column name>: <existing column name>}}