# Evaluate the refactored version of Naive Langchain agent

In [1]:
import mlflow
from answerers.langchain_answerer.langchain_naive_answerer import LangchainNaiveAnswerer

In [2]:
from dotenv import dotenv_values

config = {**dotenv_values("../configs/local.env")}

In [11]:
import pandas as pd

eval = pd.read_csv("../data/evaluation_dataset.csv")[["question", "answer_detailed"]]
eval.head()

Unnamed: 0,question,answer_detailed
0,How many tracks are there in the database?,"There are 3,503 tracks in the database."
1,What is the total revenue from all invoices?,Total revenue from all invoices is approximate...
2,What is the name of the most popular genre by ...,"The most popular genre is Rock with 1,297 tracks."
3,What is the highest amount ever billed to a si...,The highest invoice total is $25.86 for invoic...
4,What is the name of the track that has generat...,The track 'The Woman King' generated the most ...


In [13]:
unanswerable_set = pd.read_csv("../data/unanswerable_dataset.csv")

In [15]:
mixed_set = pd.concat([eval, unanswerable_set])

In [18]:
import mlflow
from mlflow.metrics.genai import answer_correctness, answer_relevance
from datetime import datetime

from sql_table_qa.evaluators.llm_evaluators import openai_correctness_evaluator, openai_relevance_evaluator

# Initialize MLflow client and set the experiment
mlflow.set_tracking_uri(config["MLFLOW_TRACKING_URI"])  # Set this to your MLflow tracking server URI
experiment_name = "Naive Langchain Prototype"
mlflow.set_experiment(experiment_name)
run_prefix = "refactored-langchain-test-w-unanswerable"

with mlflow.start_run(run_name=f"{run_prefix}-{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    model = LangchainNaiveAnswerer()
    answers = []
    for _, example in mixed_set.iterrows():
        _, _, ans = model.call(example["question"])
        answers.append(ans)
    mixed_set_w_ans = mixed_set.drop(columns=[c for c in eval.columns if c not in ("question", "answer_detailed")]).assign(model_answer=answers)
    
    mlflow.log_table(data=mixed_set_w_ans, artifact_file="answers.json")
    results = mlflow.evaluate(
        data = mixed_set_w_ans,
        targets = "answer_detailed",
        predictions = "model_answer",
        evaluators=None,
        extra_metrics=[openai_correctness_evaluator, openai_relevance_evaluator, mlflow.metrics.latency()],
        evaluator_config={'col_mapping': {"inputs": "question"}}
    )
    print(f"See aggregated evaluation results below:")
    display(results.metrics)

    # Evaluation result for each data record is available in `results.tables`.
    eval_table = results.tables["eval_results_table"]
    print(f"See evaluation table below:")
    display(eval_table)

  sample_rows_result = connection.execute(command)  # type: ignore


invalid syntax (<unknown>, line 1)
invalid syntax (<unknown>, line 1)
invalid syntax (<unknown>, line 1)


  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
  data = data.applymap(_hash_array_like_element_as_bytes)
2024/04/27 00:33:25 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/04/27 00:33:25 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

2024/04/27 00:33:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: answer_correctness


  0%|          | 0/28 [00:00<?, ?it/s]

2024/04/27 00:33:31 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: answer_relevance


  0%|          | 0/28 [00:00<?, ?it/s]



See aggregated evaluation results below:


{'latency/mean': 0.0,
 'latency/variance': 0.0,
 'latency/p90': 0.0,
 'answer_correctness/v1/mean': 3.607142857142857,
 'answer_correctness/v1/variance': 2.5242346938775513,
 'answer_correctness/v1/p90': 5.0,
 'answer_relevance/v1/mean': 4.321428571428571,
 'answer_relevance/v1/variance': 1.4323979591836735,
 'answer_relevance/v1/p90': 5.0}

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

See evaluation table below:


Unnamed: 0,question,answer_detailed,model_answer,latency,answer_correctness/v1/score,answer_correctness/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification
0,How many tracks are there in the database?,"There are 3,503 tracks in the database.","There are 3,503 tracks in the database.",0,5,The output provided by the model is correct an...,5,The output directly answers the question by pr...
1,What is the total revenue from all invoices?,Total revenue from all invoices is approximate...,The total revenue from all invoices is $2328.60.,0,5,The output provided by the model is correct an...,5,The output directly provides the total revenue...
2,What is the name of the most popular genre by ...,"The most popular genre is Rock with 1,297 tracks.",The name of the most popular genre by number o...,0,5,The output provided by the model is correct an...,5,The output provides the exact name of the most...
3,What is the highest amount ever billed to a si...,The highest invoice total is $25.86 for invoic...,The highest amount ever billed to a single cus...,0,5,The output provided by the model is correct an...,5,The output directly addresses the question by ...
4,What is the name of the track that has generat...,The track 'The Woman King' generated the most ...,The name of the track that has generated the m...,0,5,The output provided by the model is correct an...,5,The output directly provides the name of the t...
5,Which customer has spent the most money in total?,"Helena Holý spent the most money, totaling app...",The customer who has spent the most money in t...,0,5,The output provided by the model is correct an...,5,The output directly addresses the input questi...
6,Which artist's tracks are the most purchased?,Iron Maiden's tracks were purchased 140 times.,The artist whose tracks are the most purchased...,0,5,The output is correct and demonstrates a high ...,5,The output directly addresses the input questi...
7,Which employee has generated the most revenue ...,Fynn Zimmermann generated the most revenue at ...,The employee who has generated the most revenu...,0,5,The output provided by the model is correct an...,5,The output directly addresses the input questi...
8,What is the name of the most popular playlist ...,"The 'Music' playlist contains 3,290 tracks.",The name of the most popular playlist by numbe...,0,4,The output correctly identifies the name of th...,5,The output provides the exact name of the most...
9,Which genre has generated the highest total re...,Rock generated the highest total revenue at ap...,The genre that has generated the highest total...,0,5,The output is correct and demonstrates a high ...,5,The output directly answers the question by st...
