# Development: ML Flow

File for developing the mlflow code for the project.

In [1]:
# IMPORTS --------------------------------------------------------------------------------------------------------------

# Use the below lines if any dependencies are missing.
# ! python -m pip install uv
# ! python -m uv pip install langchain_openai mlflow load_dotenv langchain pandas langchain_community

import os
import sys

sys.path.append(os.path.abspath('\\'.join(os.getcwd().split('\\')[:-1])))

import mlflow
import pandas as pd
from dotenv import load_dotenv
from ml_flow import mlflow_server, create_example_llm, evaluate_llm
from mlflow.metrics.genai import make_genai_metric, EvaluationExample

_ = load_dotenv()

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

To begin with, we run the ML-Flow server:

In [2]:
server_process = mlflow_server()

Let's instantiate a dummy LLM which will answer simple queries:

In [3]:
example_model = create_example_llm()

We'll then read in an evaluation set:

In [4]:
data_folder_path = '\\'.join(os.getcwd().split('\\')[:-1]) +'/data/'
file_name = 'Evaluation Dataset.csv'

file_path = data_folder_path + file_name
eval_set = pd.read_csv(file_path)

display(eval_set)

Unnamed: 0,inputs,context,targets
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%..."
1,How much money does client 2 have in shares?,Client 2 has 10 shares. 30% of their shares ar...,"Client 2 has £3,000 worth of NVDA shares (30% ..."


Let's demonstrate that the model works:

In [5]:
question = eval_set['inputs'][0]
context = eval_set['context'][0]

print(f"Question: {question}")
print('')
print('Answer: ' + example_model.invoke({'inputs': question, 'context': context}))

Question: How much money does client 1 have in shares?

Answer: Client 1 has £14,000 worth of NVDA shares (70% of 20 shares at £1000 per share) and £5,700 worth of AAPL shares (30% of 20 shares at £190 per share). Therefore, in total, Client 1 has £19,700 in shares.


We then connect to ML-Flow:

In [6]:
results = evaluate_llm(example_model, eval_set, "openai:/gpt-3.5-turbo", "mlflow_development")

And we can then take a look at the results:

In [None]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
display(output_df)

In [None]:
output_df['outputs'][1]

We can also access the saved models using the client:

In [None]:
from mlflow.tracking import MlflowClient

# Specify the tracking URI (replace with your MLflow server URI)
mlflow.set_tracking_uri("http://localhost:8080/")

# Initialize MLflow client
client = MlflowClient()

experiment = client.get_experiment_by_name('mlflow_development')

if experiment is None:
    raise ValueError(f"Experiment 'mlflow_development' not found.")

# Fetch all runs for the experiment
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="",  # Add filter criteria if needed
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY  # You can choose ACTIVE_ONLY, DELETED_ONLY, or ALL
)

# Display information about each run
for run in runs:
    print(f"Run ID: {run.info.run_id}")
    print(f"Parameters: {run.data.params}")
    print(f"Metrics: {run.data.metrics}")
    print(f"Tags: {run.data.tags}")
    print(f"Artifacts: {client.list_artifacts(run.info.run_id)}")
    print("-" * 40)

In [None]:
# client.delete_experiment(experiment.experiment_id)

In [None]:
# from mlflow.metrics.genai import answer_similarity, faithfulness, answer_correctness, answer_relevance
# judge_model = "openai:/gpt-3.5-turbo"

# faithfulness_metric = faithfulness(model=judge_model)
# answer_relevance_metric = answer_relevance(model=judge_model)
# answer_similarity_metric = answer_similarity(model=judge_model)
# answer_correctness_metric = answer_correctness(model=judge_model)

# extra_metrics = [faithfulness_metric, answer_similarity_metric, answer_correctness_metric, answer_relevance_metric]

# mlflow.set_experiment('mlflow_development')

# with mlflow.start_run() as run: 
#     _logged_model = mlflow.langchain.log_model(example_model, artifact_path="model")

#     mlflow.log_param("model", example_model)
#     results = mlflow.evaluate(_logged_model.model_uri, eval_set, model_type="question-answering",
#                                 targets="targets", extra_metrics=extra_metrics,
#                                 evaluator_config={'col_mapping': {"inputs": "predictions"}})

#     mlflow.log_metrics(results.metrics)


2024/05/29 15:54:46 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/05/29 15:54:49 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
100%|██████████| 2/2 [00:03<00:00,  1.74s/it]
100%|██████████| 2/2 [00:01<00:00,  1.22it/s]
100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


semi working chat gpt Code

In [None]:
type(results)

mlflow.models.evaluation.base.EvaluationResult

Let's see what the model produced:

In [None]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
#output_df['answer'] = [d['content'] for d in output_df['outputs']]

# desired_columns = ['inputs', 'context', 'targets'] + [col for col in output_df.columns
#                                             if ('score' in col) or ('justification') in col]
# output_df = output_df[desired_columns]
display(output_df)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 283.90it/s]


Unnamed: 0,inputs,context,targets,outputs,token_count,faithfulness/v1/score,faithfulness/v1/justification,answer_similarity/v1/score,answer_similarity/v1/justification,answer_correctness/v1/score,answer_correctness/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%...","Client 1 has £14,000 worth of NVDA shares (70%...",66,5,The output correctly states the distribution o...,3,The output has moderate semantic similarity to...,4,The output provided by the model is mostly cor...,5,"The output directly mirrors the input, providi..."
1,How much money does client 2 have in shares?,Client 2 has 10 shares. 30% of their shares ar...,"Client 2 has £3,000 worth of NVDA shares (30% ...","Client 2 has £7,300 in shares. This is calcula...",76,5,The output correctly calculates the amount of ...,4,The output aligns with the provided targets in...,5,The output provided by the model is correct an...,5,The output directly addresses all aspects of t...


'Client 2 has £7,300 in shares. This is calculated by taking 30% of their shares in NVDA (3 shares x £1000 = £3000) and 70% of their shares in AAPL (7 shares x £190 = £1330), then adding these two amounts together (£3000 + £1330 = £4330).'

In [None]:
# create score output func
output_df['faithfulness/v1/score'].mean()

5.0

Run ID: 9f33508bd7124b8c9a8e154f79c2f707
Parameters: {'model': 'first=PromptTemplate(input_variables=[\'context\', \'inputs\'], template="You\'re a investment manager. Using the context provided, reply to the question below to the best of your ability:\\nQuestion:\\n{inputs}\\nContext:\\n{context}") middle=[ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000023F44913B10>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000023F6643A810>, model_name=\'gpt-3.5-turbo-0125\', temperature=0.0, openai_api_key=SecretStr(\'**********\'), openai_proxy=\'\')] last=RunnableLambda(_get_content)'}
Metrics: {'answer_correctness/v1/mean': 4.5, 'answer_correctness/v1/p90': 4.9, 'answer_correctness/v1/variance': 0.25, 'answer_relevance/v1/mean': 5.0, 'answer_relevance/v1/p90': 5.0, 'answer_relevance/v1/variance': 0.0, 'answer_similarity/v1/mean': 3.5, 'answer_similarity/v1/p90': 3.9, 'answer_similarity/v1/variance': 0.25, 'exact_match/v1': 0.0,

In [None]:
example = EvaluationExample(input="What is the best stock that client 2 currently owns?",
                            output=(r"The best performing stock owned by client 2 is NVDA, which has seen a 400% "
                                    + "increase in value in the last 10 months."),
                            score=80,
                            justification=("The best performing stock has been identified, and a reason is given for "
                                           + "its choosing."))

grading_prompt = ("Answer Quality: If the answer given does not relate to the question, or if the question is not "
                  + "answered, we will give a low score. If the question is answered comprehensively we will give a "
                  + "higher score.\nScore 0: The question is not answered.\nScore 20: The question is barely "
                  + "answered, and the answer is not useful.\nScore 40: The question is barely answered in basic "
                  + "terms.\nScore 80: The question is barely answered correctly and accurately.\nScore 100: The "
                  + "question is answer perfectly, and the choices are well reasoned.")

# Make a metric from a Gen AI model.
answer_quality = make_genai_metric(name="Answer_Quality",
                                   definition=("Answer Quality is a measure of the accuracy of the answer."),
                                   model="openai:/gpt-3.5-turbo",
                                   examples=[example],
                                   parameters={"temperature": 0.0},
                                   aggregations=["mean", "variance"],
                                   greater_is_better=True,
                                   grading_prompt=(grading_prompt))

print('The grading prompt is:')
print('')
print(grading_prompt)
print('')
print(answer_quality)

The grading prompt is:

Answer Quality: If the answer given does not relate to the question, or if the question is not answered, we will give a low score. If the question is answered comprehensively we will give a higher score.
Score 0: The question is not answered.
Score 20: The question is barely answered, and the answer is not useful.
Score 40: The question is barely answered in basic terms.
Score 80: The question is barely answered correctly and accurately.
Score 100: The question is answer perfectly, and the choices are well reasoned.

EvaluationMetric(name=Answer_Quality, greater_is_better=True, long_name=Answer_Quality, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's Answer_Quality based on the rubric
justification: Your reasoning about the model's Answer_Quality score

You are an impartial judge. You will be given an input that was sent to a machine
learning mo