# Development: ML Flow

File for developing the mlflow code for the project.

In [1]:
# IMPORTS --------------------------------------------------------------------------------------------------------------

# Use the below lines if any dependencies are missing.
# ! python -m pip install uv
# ! python -m uv pip install langchain_openai mlflow load_dotenv langchain pandas langchain_community

import os
import sys

sys.path.append(os.path.abspath('\\'.join(os.getcwd().split('\\')[:-1])))

import mlflow
import pandas as pd
from dotenv import load_dotenv
from ml_flow import mlflow_server
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from mlflow.metrics.genai import make_genai_metric, EvaluationExample

_ = load_dotenv()

To begin with, we run the ML-Flow server:

In [2]:
server_process = mlflow_server()

INFO:root:Successfully running ML-Flow server. The server will terminate at the end of runtime.


Let's instantiate a dummy LLM which will answer simple queries:

In [50]:
# SETUP EXAMPLE LLM ----------------------------------------------------------------------------------------------------

example_llm = ChatOpenAI(model_name='gpt-3.5-turbo-0125', temperature=0)

example_prompt = PromptTemplate(input_variables=['inputs', 'context'],
                                template=("You're a investment manager. Using the context provided, "
                                          + "reply to the question below to the best of your ability:\n"
                                          + "Question:\n{inputs}\nContext:\n{context}"))

from langchain_core.runnables import RunnableLambda

def _get_content(model_return):
    return model_return.content

get_content = RunnableLambda(_get_content)

example_model = example_prompt | example_llm | get_content

We'll then create an evaluation set:

In [51]:
data_folder_path = '\\'.join(os.getcwd().split('\\')[:-1]) +'/data/'
file_name = 'Evaluation Dataset.csv'

file_path = data_folder_path + file_name
eval_set = pd.read_csv(file_path)

display(eval_set)

Unnamed: 0,inputs,context,targets
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%..."


In [52]:
eval_set.columns

Index(['inputs', 'context', 'targets'], dtype='object')

In [53]:
example_model.invoke({'inputs': eval_set['inputs'][0], 'context': eval_set['context'][0]})#.content

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'Client 1 has £14,000 worth of NVDA shares (70% of 20 shares at £1000 per share) and £1,140 worth of APPL shares (30% of 20 shares at £190 per share). Therefore, in total, Client 1 has £15,140 in shares.'

We then need to define an LLM-as-a-judge metric, and give it an example:

In [54]:
example = EvaluationExample(input="What is the best stock that client 2 currently owns?",
                            output=(r"The best performing stock owned by client 2 is NVDA, which has seen a 400% "
                                    + "increase in value in the last 10 months."),
                            score=80,
                            justification=("The best performing stock has been identified, and a reason is given for "
                                           + "its choosing."))

grading_prompt = ("Answer Quality: If the answer given does not relate to the question, or if the question is not "
                  + "answered, we will give a low score. If the question is answered comprehensively we will give a "
                  + "higher score.\nScore 0: The question is not answered.\nScore 20: The question is barely "
                  + "answered, and the answer is not useful.\nScore 40: The question is barely answered in basic "
                  + "terms.\nScore 80: The question is barely answered correctly and accurately.\nScore 100: The "
                  + "question is answer perfectly, and the choices are well reasoned.")

# Make a metric from a Gen AI model.
answer_quality = make_genai_metric(name="Answer_Quality",
                                   definition=("Answer Quality is a measure of the accuracy of the answer."),
                                   model="openai:/gpt-3.5-turbo",
                                   examples=[example],
                                   parameters={"temperature": 0.0},
                                   aggregations=["mean", "variance"],
                                   greater_is_better=True,
                                   grading_prompt=(grading_prompt))

print('The grading prompt is:')
print('')
print(grading_prompt)
print('')
print(answer_quality)

The grading prompt is:

Answer Quality: If the answer given does not relate to the question, or if the question is not answered, we will give a low score. If the question is answered comprehensively we will give a higher score.
Score 0: The question is not answered.
Score 20: The question is barely answered, and the answer is not useful.
Score 40: The question is barely answered in basic terms.
Score 80: The question is barely answered correctly and accurately.
Score 100: The question is answer perfectly, and the choices are well reasoned.

EvaluationMetric(name=Answer_Quality, greater_is_better=True, long_name=Answer_Quality, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's Answer_Quality based on the rubric
justification: Your reasoning about the model's Answer_Quality score

You are an impartial judge. You will be given an input that was sent to a machine
learning mo

We then connect to ML-Flow:

In [72]:
from mlflow.metrics import ari_grade_level, toxicity
from mlflow.metrics.genai import answer_similarity, faithfulness, relevance, answer_correctness, answer_relevance

judge_model = "openai:/gpt-3.5-turbo"

answer_similarity_metric = answer_similarity(model=judge_model)
faithfulness_metric = faithfulness(model=judge_model)
relevance_metric = relevance(model=judge_model)
answer_correctness_metric = answer_correctness(model=judge_model)
answer_relevance_metric = answer_relevance(model=judge_model)

semi working chat gpt Code

In [74]:
mlflow.set_experiment("mlflow_development")

with mlflow.start_run() as run: 
    _logged_model = mlflow.langchain.log_model(example_model, artifact_path="model")

    mlflow.log_param("model", example_model)
    results = mlflow.evaluate(_logged_model.model_uri,
                              eval_set,
                              model_type="question-answering",
                              #evaluators="default",
                              targets="targets",
                              extra_metrics=[relevance_metric, faithfulness_metric, answer_similarity_metric, answer_correctness_metric, answer_relevance_metric],
                              #extra_metrics=[answer_quality], # Include our custom metric!
                              evaluator_config={'col_mapping': {"inputs": "predictions"}}
                              )

    mlflow.log_metrics(results.metrics)

Let's see what the model produced:

In [71]:
pd.DataFrame(results.tables['eval_results_table'])

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 37.32it/s]


Unnamed: 0,inputs,context,targets,outputs,token_count,relevance/v1/score,relevance/v1/justification,faithfulness/v1/score,faithfulness/v1/justification,answer_similarity/v1/score,answer_similarity/v1/justification
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%...","Client 1 has £14,000 worth of NVDA shares (70%...",66,5,The output accurately reflects the input and c...,5,The output correctly states the distribution o...,3,The output has moderate semantic similarity to...


In [63]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
#output_df['answer'] = [d['content'] for d in output_df['outputs']]

# desired_columns = ['inputs', 'context', 'targets'] + [col for col in output_df.columns
#                                             if ('score' in col) or ('justification') in col]
# output_df = output_df[desired_columns]
display(output_df)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 328.91it/s]


Unnamed: 0,inputs,context,targets,outputs,token_count,relevance/v1/score,relevance/v1/justification,faithfulness/v1/score,faithfulness/v1/justification,answer_similarity/v1/score,answer_similarity/v1/justification
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%...","Client 1 has £14,000 worth of NVDA shares (70%...",66,5,The output provides a comprehensive answer to ...,5,The output correctly states the distribution o...,3,The output has moderate semantic similarity to...


In [65]:
output_df['outputs'][0]

'Client 1 has £14,000 worth of NVDA shares (70% of 20 shares at £1000 per share) and £5,700 worth of AAPL shares (30% of 20 shares at £190 per share). Therefore, in total, Client 1 has £19,700 in shares.'

In [69]:
from mlflow.tracking import MlflowClient

# Specify the tracking URI (replace with your MLflow server URI)
mlflow.set_tracking_uri("http://localhost:8080/")

# Initialize MLflow client
client = MlflowClient()

experiment = client.get_experiment_by_name('mlflow_development')

if experiment is None:
    raise ValueError(f"Experiment '{experiment_name}' not found.")

# Fetch all runs for the experiment
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="",  # Add filter criteria if needed
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY  # You can choose ACTIVE_ONLY, DELETED_ONLY, or ALL
)

# Display information about each run
for run in runs:
    print(f"Run ID: {run.info.run_id}")
    print(f"Parameters: {run.data.params}")
    print(f"Metrics: {run.data.metrics}")
    print(f"Tags: {run.data.tags}")
    print(f"Artifacts: {client.list_artifacts(run.info.run_id)}")
    print("-" * 40)

NameError: name 'experiment_name' is not defined

In [68]:
client.delete_experiment(experiment.experiment_id)