In [1]:

import pandas as pd
import mlflow
import dagshub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from mlflow.pyfunc import PythonModel
import numpy as np

from mlflow.genai import scorer
from mlflow.genai.scorers import Correctness, Guidelines
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:

llm = ChatGroq(model="moonshotai/kimi-k2-instruct-0905") #or llm=init_chat_model(groq:llama3-8b-8192)

In [3]:
dagshub.init(repo_owner='paruldiwakar',
             repo_name='mlflow-genai',
             mlflow=True)


mlflow.set_tracking_uri("https://dagshub.com/paruldiwakar/mlflow-genai.mlflow")


In [4]:
eval_data = pd.DataFrame(
    {
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
            "lifecycle. It was developed by Databricks, a company that specializes in big data and "
            "machine learning solutions. MLflow is designed to address the challenges that data "
            "scientists and machine learning engineers face when developing, training, and deploying "
            "machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data "
            "processing and analytics. It was developed in response to limitations of the Hadoop "
            "MapReduce computing model, offering improvements in speed and ease of use. Spark "
            "provides libraries for various tasks such as data ingestion, processing, and analysis "
            "through its components like Spark SQL for structured data, Spark Streaming for "
            "real-time data processing, and MLlib for machine learning tasks",
        ],
    }
)

In [5]:
eval_data

Unnamed: 0,inputs,ground_truth
0,What is MLflow?,MLflow is an open-source platform for managing...
1,What is Spark?,"Apache Spark is an open-source, distributed co..."


### Evalution Metric Functions
(i dont want to buy the open ai token lol)

In [6]:
from rouge_score import rouge_scorer

def rouge_l(eval_df, _):
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = [
        scorer.score(gt, pred)["rougeL"].fmeasure
        for gt, pred in zip(eval_df["ground_truth"], eval_df["prediction"])
    ]
    return float(np.mean(scores))


In [7]:
@scorer
def is_concise(outputs: str) -> bool:
    """Evaluate if the answer is concise (less than 5 words)"""
    return len(outputs.split()) <= 5


scorers = [
    Correctness(),
    Guidelines(name="is_english", guidelines="The answer must be in English"),
    is_concise,
]

In [8]:
from google import genai

# The client gets the API key from the environment variable `GEMINI_API_KEY`.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Explain how AI works in a few words"
)
print(response.text)

AI learns patterns from data to make smart decisions or predictions.


In [9]:
mlflow.set_experiment("LLM Evaluation")

<Experiment: artifact_location='mlflow-artifacts:/c5b37244483849fa8ad2ef9bd266b78c', creation_time=1767117540978, experiment_id='2', last_update_time=1767117540978, lifecycle_stage='active', name='LLM Evaluation', tags={}>

In [28]:
with mlflow.start_run() as run:
    # Wrap llm as an MLflow model.
    logged_model_info =  mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=GroqQAWrapper(),
        input_example=pd.DataFrame({"question": ["What is agentic AI?"]})
    )

    model_uri = f"runs:/{run.info.run_id}/model"

    # Use predefined question-answering metrics to evaluate our model.
    results = mlflow.evaluate(
        model_uri,
        eval_data,  # DataFrame with 'question' and 'ground_truth'
        targets="ground_truth",
        model_type="question-answering",
        extra_metrics=[
            Correctness(model="gemini/gemini-2.5-flash"),
            mlflow.metrics.toxicity(model="gemini/gemini-2.5-flash"),
            mlflow.metrics.genai.answer_similarity(model="gemini/gemini-2.5-flash"),
            mlflow.metrics.latency()
        ],
    )
    print(f"See aggregated evaluation results below: \n{results.metrics}")

    eval_table = results.tables["eval_results_table"]
    df = pd.DataFrame(eval_table)
    df.to_csv("eval.csv", index=False)

    print("Aggregated metrics:")
    print(eval_table)

2025/12/31 01:25:57 INFO mlflow.pyfunc: Inferring model signature from input example


üèÉ View run fortunate-bird-751 at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/0/runs/6bcf6fc5fb5a4d30ab334d826ffacbed
üß™ View experiment at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/0


MlflowException: Failed to serialize Python model. Please save the model into a python file and use code-based logging method instead. Seehttps://mlflow.org/docs/latest/models.html#models-from-code for more information.

In [29]:
import mlflow.pyfunc
import pandas as pd
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
from langchain.prompts.chat import ChatPromptTemplate
from langchain.chains.llm import LLMChain
load_dotenv()

True

In [32]:
# groq_model.py
from langchain.chains.llm import LLMChain

class GroqWrapper:
    def __init__(self, llm, prompt):
        self.chain = LLMChain(prompt=prompt, llm=llm)

    def predict(self, context):
        # context is a pandas DataFrame
        questions = context["inputs"].tolist()
        return [self.chain.run({"question": q}) for q in questions]

IndexError: list index out of range

In [33]:
class GroqQAWrapper(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        system_prompt = "Answer the following question in two sentences"

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("user", "{question}")
        ])

        llm = ChatGroq(
            model="moonshotai/kimi-k2-instruct-0905",
            temperature=0
        )

        self.chain = prompt | llm | StrOutputParser()

    def predict(self, context, model_input):
        # MLflow GenAI evaluator passes everything as `inputs`
        if "inputs" in model_input.columns:
            questions = model_input["inputs"]
        elif "question" in model_input.columns:
            questions = model_input["question"]
        elif "prompt" in model_input.columns:
            questions = model_input["prompt"]
        else:
            raise ValueError(
                f"No valid input column found. Columns: {model_input.columns}"
            )

        return questions.apply(
            lambda q: self.chain.invoke({"question": q})
        )




In [79]:
eval_data = pd.DataFrame({
    "inputs": [{"question": "What is MLflow?"}],
    "ground_truth": [
        "MLflow is an open-source platform for managing the end-to-end ML lifecycle..."
    ],
    "expected_response": [
        "MLflow is an open-source platform for managing the end-to-end ML lifecycle..."
    ]
})







In [25]:
import litellm

resp = litellm.completion(
    model="gemini/gemini-2.5-flash",
    messages=[{"role": "user", "content": "Say hello in one sentence"}],
)

print(resp.choices[0].message.content)


Hello there!


In [49]:
import mlflow

model_path = "groq_model.py"

with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        python_model=model_path,   # path to file
        name="groq_qa_model",
    )

print(model_info.model_uri)




üèÉ View run crawling-slug-532 at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/2/runs/1ad56a305b4c459dbe6bf5a829b18b35
üß™ View experiment at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/2
models:/m-7831511d626b42c89ae98184fa736321


In [70]:
import mlflow.pyfunc
import pandas as pd

model = mlflow.pyfunc.load_model(model_info.model_uri)

test_df = pd.DataFrame({
    "inputs": ["What is MLflow?"]
})

print(model.predict(test_df))


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:01<00:00,  4.39it/s]


['MLflow is an open-source platform that manages the end-to-end machine-learning lifecycle, including experiment tracking, model packaging, and deployment. It provides standardized APIs and a unified UI so teams can reproduce runs, share models, and push them to production regardless of the library or cloud they use.']


In [90]:
def predict_fn_wrapped(**kwargs):
    key = list(kwargs.keys())[0]
    value = kwargs[key]
    return model.predict(pd.DataFrame({"inputs": [value]}))[0]




In [91]:
from mlflow.genai import scorer
from mlflow.genai.scorers import Correctness, Guidelines

@scorer
def is_concise(outputs: str) -> bool:
    return len(outputs.split()) <= 40

scorers = [
    Correctness(),
    Guidelines(name="is_english", guidelines="The answer must be in English"),
    is_concise,
]

results = mlflow.genai.evaluate(
    data=eval_data,
    predict_fn=predict_fn_wrapped,
    scorers=scorers,
)



2025/12/31 03:43:43 INFO mlflow.genai.scorers.validation: The input data is missing following columns that are required by the specified scorers. The results will be null for those scorers.
 - `expected_response or expected_facts` field in `expectations` column is required by [correctness].


2025/12/31 03:43:43 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [Elapsed: 00:06, Remaining: 00:00] 


In [92]:
results.tables['eval_results']


Unnamed: 0,trace_id,correctness/value,is_english/value,is_concise/value,trace,client_request_id,state,request_time,execution_duration,request,response,trace_metadata,tags,spans,assessments
0,tr-a7a8d380b5a00500d206cd8be80dd5cd,,,False,"{""info"": {""trace_id"": ""tr-a7a8d380b5a00500d206...",,OK,1767132826475,3691,{'question': 'What is MLflow?'},MLflow is an open-source platform that manages...,{'mlflow.sourceRun': '4dbf624bba96463dba25acb0...,{'mlflow.artifactLocation': 'mlflow-artifacts:...,"[{'trace_id': 'p6jTgLWgBQDSBs2L6A3VzQ==', 'spa...",[{'assessment_id': 'a-d69b04f53a6240bdb2970343...


In [88]:
type(results)




mlflow.genai.evaluation.entities.EvaluationResult

In [83]:
from mlflow.models.evaluation import evaluate

In [87]:
from mlflow.metrics.genai import answer_similarity

results = mlflow.evaluate(
    model_info.model_uri,
    eval_data,                       # your DataFrame
    targets="ground_truth",
    model_type="question-answering",
    extra_metrics=[
        Correctness(model="gemini/gemini-2.5-flash"),
        #answer_similarity(model="gemini/gemini-2.5-flash"),
    ],
)

print(results.metrics)
results.tables["eval_results_table"]



 - For traditional ML or deep learning models: Use `mlflow.models.evaluate`, which maintains full compatibility with the original `mlflow.evaluate` API.

 - For LLMs or GenAI applications: Use the new `mlflow.genai.evaluate` API, which offers enhanced features specifically designed for evaluating LLMs and GenAI applications.

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:01<00:00,  2.55it/s]
2025/12/31 03:43:04 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-7831511d626b42c89ae98184fa736321
2025/12/31 03:43:04 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.


üèÉ View run illustrious-bat-673 at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/2/runs/a0697c53fe8f45e1adfd450993218b4d
üß™ View experiment at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/2


MlflowException: In the 'extra_metrics' parameter, the following metrics have the wrong type:
- Metric 'name='correctness' aggregations=None description="Check whether the expected facts (from expected_response or expected_facts) are supported by the model's response." required_columns={'inputs', 'outputs'} model='gemini/gemini-2.5-flash'' has type 'Correctness'
Please ensure that all extra metrics are instances of mlflow.metrics.EvaluationMetric.

In [None]:
with mlflow.start_run() as run: # Wrap llm as an MLflow model. 
    logged_model_info = mlflow.pyfunc.log_model( 
        artifact_path="model", 
        python_model=GroqQAWrapper(), 
        input_example=pd.DataFrame( {"question": ["What is agentic AI?"]} ),
        )
    model_uri = f"runs:/{run.info.run_id}/model" 
    # Use predefined question-answering metrics to evaluate our model. 
    results = mlflow.evaluate( 
        model_uri, eval_data, targets="ground_truth", 
        model_type="question-answering", 
        extra_metrics=[ mlflow.metrics.toxicity(model="gemini/gemini-2.5-flash"), 
                       mlflow.metrics.latency(model="gemini/gemini-2.5-flash"), 
                       mlflow.metrics.genai.answer_similarity(model="gemini/gemini-2.5-flash"), ], 
                       ) 
    print(f"See aggregated evaluation results below: \n{results.metrics}") 
    eval_table = results.tables["eval_results_table"] 
    df = pd.DataFrame(eval_table) 
    df.to_csv("eval.csv", index=False) 
    print("Aggregated metrics:") 
    print(eval_table)

2025/12/31 02:09:14 INFO mlflow.pyfunc: Inferring model signature from input example


üèÉ View run youthful-ant-658 at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/0/runs/416ad26576c34774b928e8a63c8e0f54
üß™ View experiment at: https://dagshub.com/paruldiwakar/mlflow-genai.mlflow/#/experiments/0


MlflowException: Failed to serialize Python model. Please save the model into a python file and use code-based logging method instead. Seehttps://mlflow.org/docs/latest/models.html#models-from-code for more information.