In [17]:
import pandas as pd
import mlflow
import dagshub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from mlflow.pyfunc import PythonModel
import numpy as np
import litellm
from mlflow.genai import scorer
from mlflow.genai.scorers import Correctness, Guidelines
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
os.environ["GOOGLE_API_KEY"]

'AIzaSyDTeTM4TmhEKgUqECw1c_IsUPAkxKwD3Uc'

In [19]:
dagshub.init(repo_owner='paruldiwakar',
             repo_name='mlflow-genai',
             mlflow=True)


mlflow.set_tracking_uri("https://dagshub.com/paruldiwakar/mlflow-genai.mlflow")

### Define your mock agent's prediction function

In [20]:
llm = ChatGroq(
    model="moonshotai/kimi-k2-instruct-0905",
    temperature=0.1,
)

In [21]:
from langchain.schema import SystemMessage, HumanMessage

def my_agent(question: str) -> str:
    response = llm.invoke([
        SystemMessage(
            content="You are a helpful assistant. Answer the following question in one sentences."
        ),
        HumanMessage(content=question),
    ])
    return response.content

# Wrapper function for evaluation
def qa_predict_fn(question: str) -> str:
    return my_agent(question)

In [22]:
type(qa_predict_fn("wazzaup"))

str

### Prepare an evaluation dataset

### Define evaluation criteria using Scorers

In [27]:
@scorer
def is_concise(outputs: str) -> bool:
    """Evaluate if the answer is concise (less than 5 words)"""
    return len(outputs.split()) <= 20

In [10]:
from mlflow.genai.scorers import Correctness

assessment = Correctness(
    name="my_correctness",
    model="gemini:/gemini-2.5-flash"
)(
    inputs={
        "question": "What is the difference between reduceByKey and groupByKey in Spark?"
    },
    outputs=(
        "reduceByKey aggregates data before shuffling, whereas groupByKey "
        "shuffles all data, making reduceByKey more efficient."
    ),
    expectations={
        "expected_facts": [
            "reduceByKey aggregates data before shuffling",
            "groupByKey shuffles all data"
        ]
    },
)

print(assessment)

Feedback(name='my_correctness', source=AssessmentSource(source_type='LLM_JUDGE', source_id='gemini:/gemini-2.5-flash'), trace_id=None, run_id=None, rationale='The response is correct. The document states "reduceByKey aggregates data before shuffling" which supports the first part of the claim. The document also states "groupByKey shuffles all data" which supports the second part of the claim.', metadata={'mlflow.assessment.judgeCost': 0.0006940000000000001}, span_id=None, create_time_ms=1767169975404, last_update_time_ms=1767169975404, assessment_id=None, error=None, expectation=None, feedback=FeedbackValue(value=<CategoricalRating.YES: 'yes'>, error=None), overrides=None, valid=True)


In [17]:
print(assessment.rationale)

The claim states "reduceByKey aggregates data before shuffling" and the document states "reduceByKey aggregates data before shuffling". The claim also states "groupByKey shuffles all data" and the document states "groupByKey shuffles all data". Both parts of the claim are directly supported by the document in the context of the question. The response is correct


In [9]:
import mlflow
from mlflow.genai.scorers import Correctness

# Fix tracking
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("test")

# Two samples
data = [
    {
        "inputs": {"question": "What is Spark?"},
        "outputs": "Spark is a distributed data processing framework.",
        "expectations": {"expected_response": "Spark is a distributed data processing engine."}
    },
    {
        "inputs": {"question": "What is reduceByKey?"},
        "outputs": "reduceByKey aggregates data before shuffling.",
        "expectations": {"expected_response": "reduceByKey combines values before shuffle."}
    }
]

# Evaluate with Gemini
result = mlflow.genai.evaluate(
    data=data,
    scorers=[
        Correctness(model="gemini:/gemini-2.5-flash"),
        Guidelines(name="is_english", guidelines="The answer must be in English",model="gemini:/gemini-2.5-flash"),
        is_concise,
        
        ]
)

print(result.metrics)

Evaluating: 100%|██████████| 2/2 [Elapsed: 00:03, Remaining: 00:00] 


✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mangry-goose-933[0m
  Run ID: [94m6ae16ad0084a47eb9e2f542e277b399c[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

{'is_concise/mean': 0.5, 'is_english/mean': 1.0, 'correctness/mean': 0.5}





In [25]:
result.tables['eval_results']

Unnamed: 0,trace_id,expected_response/value,is_english/value,is_concise/value,correctness/value,trace,client_request_id,state,request_time,execution_duration,request,response,trace_metadata,tags,spans,assessments
0,tr-b3df5f9a9f9b4c8a99f31aa7b667df13,Spark is a distributed data processing engine.,yes,False,no,"{""info"": {""trace_id"": ""tr-b3df5f9a9f9b4c8a99f3...",,OK,1767170727653,0,{'question': 'What is Spark?'},Spark is a distributed data processing framework.,"{'mlflow.trace_schema.version': '3', 'mlflow.u...",{'mlflow.eval.requestId': '4ed20b0e698c4cbd703...,"[{'trace_id': 's99fmp+bTIqZ8xqntmffEw==', 'spa...",[{'assessment_id': 'a-a51b0f5b6e2c495d87c709be...
1,tr-e130d31281fdff1328531a699ab0deef,reduceByKey combines values before shuffle.,yes,True,yes,"{""info"": {""trace_id"": ""tr-e130d31281fdff132853...",,OK,1767170727655,0,{'question': 'What is reduceByKey?'},reduceByKey aggregates data before shuffling.,"{'mlflow.trace_schema.version': '3', 'mlflow.u...",{'mlflow.eval.requestId': 'c89e669e79a19ec4e16...,"[{'trace_id': '4TDTEoH9/xMoUxppmrDe7w==', 'spa...",[{'assessment_id': 'a-523ece5df90f4358a01785ff...


## Now on mlflow and storing it on Dagshub

In [28]:
# Prepare evaluation data
eval_data = [
    {
        "inputs": {
            "question": "What is MLflow?"
        },
        "expectations": {
            "expected_response": (
                "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle."
            )
        }
    },
    {
        "inputs": {
            "question": "What is Spark?"
        },
        "expectations": {
            "expected_response": (
                "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics."
            )
        }
    }
]


# Set experiment
mlflow.set_experiment("LLM Evaluation - Kimi with Gemini Judge")

# Run evaluation
with mlflow.start_run(run_name="kimi-qa-evaluation") as run:
    print(f"Run ID: {run.info.run_id}")
    print("Starting evaluation...\n")
    
    # Evaluate with Gemini as judge
    results = mlflow.genai.evaluate(
        data=eval_data,
        predict_fn=qa_predict_fn,
        scorers=[
            Correctness(model="gemini:/gemini-2.5-flash"),
            Guidelines(
                name="is_english", 
                guidelines="The answer must be in English",
                model="gemini:/gemini-2.5-flash"
            ),
            is_concise
        ]
    )
    
    print("\n" + "="*60)
    print("AGGREGATED EVALUATION RESULTS")
    print("="*60)
    for metric, value in results.metrics.items():
        print(f"{metric}: {value}")
    
    # Save evaluation results to CSV
    eval_table = result.tables['eval_results']
    df = pd.DataFrame(eval_table)
    df.to_csv('eval_kimi_gemini.csv', index=False)
    
    print("\n" + "="*60)
    print("EVALUATION TABLE")
    print("="*60)
    print(df.to_string())
    print(f"\n✅ Evaluation results saved to 'eval_kimi_gemini.csv'")
    print(f"✅ View full results in MLflow UI: {mlflow.get_tracking_uri()}")

2025/12/31 14:47:46 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.


Run ID: 1944a87d352d4778ae9293a97d7a586f
Starting evaluation...



Evaluating: 100%|██████████| 2/2 [Elapsed: 00:08, Remaining: 00:00] 



AGGREGATED EVALUATION RESULTS
is_concise/mean: 1.0
is_english/mean: 1.0
correctness/mean: 1.0

EVALUATION TABLE
                              trace_id                         expected_response/value is_english/value  is_concise/value correctness/value                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            