# Development: ML Flow

File for developing the mlflow code for the project.

In [1]:
# IMPORTS --------------------------------------------------------------------------------------------------------------

# Use the below lines if any dependencies are missing.
# ! python -m pip install uv
# ! python -m uv pip install langchain_openai mlflow load_dotenv langchain pandas langchain_community

import os
import sys

sys.path.append(os.path.abspath('\\'.join(os.getcwd().split('\\')[:-1])))

import mlflow
import pandas as pd
from dotenv import load_dotenv
from mlflow.metrics.genai import make_genai_metric, EvaluationExample
from ml_flow import mlflow_server, create_example_llm, evaluate_llm, delete_runs

_ = load_dotenv()

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

To begin with, we run the ML-Flow server:

In [2]:
server_process = mlflow_server()

Let's instantiate a dummy LLM which will answer simple queries:

In [14]:
from langchain import hub
from langchain_community.llms import OpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain.agents import create_react_agent, AgentExecutor

In [18]:
! python -m pip install langchainhub

Collecting langchainhub
  Downloading langchainhub-0.1.17-py3-none-any.whl (4.8 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2
  Downloading types_requests-2.32.0.20240523-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, langchainhub
Successfully installed langchainhub-0.1.17 types-requests-2.32.0.20240523



[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from ionic_langchain.tool import Ionic, IonicTool
from langchain import hub
from langchain.agents import AgentExecutor, Tool, create_react_agent
from langchain_openai import OpenAI

# Based on ReAct Agent
# https://python.langchain.com/docs/modules/agents/agent_types/react
# Please reach out to support@ionicapi.com for help with add'l agent types.


model = "gpt-3.5-turbo-instruct"
temperature = 0.6

llm = OpenAI(model_name=model, temperature=temperature)


ionic_tool = IonicTool().tool()


# The tool comes with its own prompt,
# but you may also update it directly via the description attribute:

ionic_tool.description = str(
    """
Ionic is an e-commerce shopping tool. Assistant uses the Ionic Commerce Shopping Tool to find, discover, and compare products from thousands of online retailers. Assistant should use the tool when the user is looking for a product recommendation or trying to find a specific product.

The user may specify the number of results, minimum price, and maximum price for which they want to see results.
Ionic Tool input is a comma-separated string of values:
  - query string (required, must not include commas)
  - number of results (default to 4, no more than 10)
  - minimum price in cents ($5 becomes 500)
  - maximum price in cents
For example, if looking for coffee beans between 5 and 10 dollars, the tool input would be `coffee beans, 5, 500, 1000`.

Return them as a markdown formatted list with each recommendation from tool results, being sure to include the full PDP URL. For example:

1. Product 1: [Price] -- link
2. Product 2: [Price] -- link
3. Product 3: [Price] -- link
4. Product 4: [Price] -- link
"""
)

tools = [ionic_tool]

# default prompt for create_react_agent
prompt = hub.pull("hwchase17/react")

agent = create_react_agent(
    llm,
    tools,
    prompt=prompt,
)

agent_executor = AgentExecutor(
    agent=agent, tools=tools, handle_parsing_errors=True, verbose=True, max_iterations=5, return_intermediate_steps=True
)

input = ("I'm looking for a new 4k monitor can you find me some options for less than $1000")
result = agent_executor.invoke({"input": input})

print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m We should use the Ionic Commerce Shopping Tool to find 4k monitors.
Action: ionic_commerce_shopping_tool
Action Input: 4k monitors, 4, 0, 100000[0m[36;1m[1;3m[{'products': [{'links': [{'text': 'Details', 'type': 'pdp', 'url': 'https://l.ioniccommerce.com/aeegg7'}], 'merchant_name': 'Best Buy', 'merchant_product_id': '6386391', 'name': '28” ViewFinity UHD IPS AMD FreeSync with HDR Monitor', 'price': '$199.99', 'status': 'available', 'thumbnail': 'https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6386/6386391_sd.jpg', 'brand_name': 'Samsung', 'upc': '887276374888'}, {'links': [{'text': 'Details', 'type': 'pdp', 'url': 'https://www.amazon.com/dp/B098HQ8YRX?tag=ioniccommer00-20&linkCode=osi&th=1&psc=1'}], 'merchant_name': 'Amazon', 'merchant_product_id': 'B098HQ8YRX', 'name': 'Sceptre 4K IPS 27" 3840 x 2160 UHD Monitor up to 70Hz DisplayPort HDMI 99% sRGB Build-in Speakers, Black 2021 (U275W-UPT)', 'price': '$179

In [68]:
context = []

for step in result['intermediate_steps']:
    context += [f'Tool: {step[0].tool}']
    context += [f'Log: {step[0].log}']
    context += '\n'

print('\n'.join(context))

Tool: ionic_commerce_shopping_tool
Log:  We should use the Ionic Commerce Shopping Tool to find 4k monitors.
Action: ionic_commerce_shopping_tool
Action Input: 4k monitors, 4, 0, 100000


Tool: Format as markdown list
Log:  We should format the result as a markdown list with each recommendation being a numbered list item.
Action: Format as markdown list
Action Input: [{'products': [{'links': [{'text': 'Details', 'type': 'pdp', 'url': 'https://l.ioniccommerce.com/aeegg7'}], 'merchant_name': 'Best Buy', 'merchant_product_id': '6386391', 'name': '28” ViewFinity UHD IPS AMD FreeSync with HDR Monitor', 'price': '$199.99', 'status': 'available', 'thumbnail': 'https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6386/6386391_sd.jpg', 'brand_name': 'Samsung', 'upc': '887276374888'}, {'links': [{'text': 'Details', 'type': 'pdp', 'url': 'https://www.amazon.com/dp/B098HQ8YRX?tag=ioniccommer00-20&linkCode=osi&th=1&psc=1'}], 'merchant_name': 'Amazon', 'merchant_product_id': 'B098HQ8YRX', 

In [64]:
result['intermediate_steps'][4][0].tool

'Check if the input includes all required values'

In [44]:
result['intermediate_steps'][0][0].log

' We should use the Ionic Commerce Shopping Tool to find 4k monitors.\nAction: ionic_commerce_shopping_tool\nAction Input: 4k monitors, 4, 0, 100000'

In [43]:
result['intermediate_steps'][0][1][0]['query']

{'query': '4k monitors', 'max_price': 100000, 'min_price': 0, 'num_results': 4}

In [58]:
result['intermediate_steps'][2][0].log

' We can use the observed result as the input for the markdown list.\nAction: Use observed result as input for markdown list\nAction Input: [{\'products\': [{\'links\': [{\'text\': \'Details\', \'type\': \'pdp\', \'url\': \'https://l.ioniccommerce.com/aeegg7\'}], \'merchant_name\': \'Best Buy\', \'merchant_product_id\': \'6386391\', \'name\': \'28” ViewFinity UHD IPS AMD FreeSync with HDR Monitor\', \'price\': \'$199.99\', \'status\': \'available\', \'thumbnail\': \'https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6386/6386391_sd.jpg\', \'brand_name\': \'Samsung\', \'upc\': \'887276374888\'}, {\'links\': [{\'text\': \'Details\', \'type\': \'pdp\', \'url\': \'https://www.amazon.com/dp/B098HQ8YRX?tag=ioniccommer00-20&linkCode=osi&th=1&psc=1\'}], \'merchant_name\': \'Amazon\', \'merchant_product_id\': \'B098HQ8YRX\', \'name\': \'Sceptre 4K IPS 27" 3840'

In [60]:
result['intermediate_steps'][3][0].tool

'Check if the input is a comma-separated string of values'

In [None]:
trim_intermediate_steps

In [3]:
example_model = create_example_agent()

We'll then read in an evaluation set:

In [4]:
data_folder_path = '\\'.join(os.getcwd().split('\\')[:-1]) +'/data/'
file_name = 'Evaluation Dataset.csv'

file_path = data_folder_path + file_name
eval_set = pd.read_csv(file_path)

display(eval_set)

Unnamed: 0,inputs,context,targets
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%..."
1,How much money does client 2 have in shares?,Client 2 has 10 shares. 30% of their shares ar...,"Client 2 has £3,000 worth of NVDA shares (30% ..."


Let's demonstrate that the model works:

In [5]:
question = eval_set['inputs'][0]
context = eval_set['context'][0]

print(f"Question: {question}")
print('')
print('Answer: ' + example_model.invoke({'inputs': question, 'context': context}))

Question: How much money does client 1 have in shares?

Answer: Client 1 has £14,000 worth of NVDA shares (70% of 20 shares at £1000 per share) and £5,700 worth of APPL shares (30% of 20 shares at £190 per share). Therefore, in total, Client 1 has £19,700 in shares.


We then connect to ML-Flow:

In [6]:
results = evaluate_llm(example_model, eval_set, "openai:/gpt-3.5-turbo", "mlflow_development")

2024/05/30 09:25:36 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/05/30 09:25:38 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
100%|██████████| 2/2 [00:01<00:00,  1.82it/s]
100%|██████████| 2/2 [00:01<00:00,  1.38it/s]
100%|██████████| 2/2 [00:01<00:00,  1.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.71it/s]


And we can then take a look at the results:

In [7]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
display(output_df)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 742.22it/s]


Unnamed: 0,inputs,context,targets,outputs,token_count,faithfulness/v1/score,faithfulness/v1/justification,answer_similarity/v1/score,answer_similarity/v1/justification,answer_correctness/v1/score,answer_correctness/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification
0,How much money does client 1 have in shares?,Client 1 has 20 shares. 70% of their shares ar...,"Client 1 has £14,000 worth of NVDA shares (70%...","Client 1 has £14,000 worth of NVDA shares (70%...",66,5,The output correctly states the distribution o...,3,The output has moderate semantic similarity to...,5,The output is correct and demonstrates a high ...,5,The output provided directly addresses all asp...
1,How much money does client 2 have in shares?,Client 2 has 10 shares. 30% of their shares ar...,"Client 2 has £3,000 worth of NVDA shares (30% ...","Client 2 has £7,300 in shares. This is calcula...",76,5,The output correctly calculates the amount of ...,4,The output aligns with the provided targets in...,5,The output is correct and demonstrates a high ...,5,The output directly addresses all aspects of t...


In [8]:
output_df['outputs'][1]

'Client 2 has £7,300 in shares. This is calculated by taking 30% of their shares in NVDA (3 shares x £1000 = £3000) and 70% of their shares in AAPL (7 shares x £190 = £1330), then adding these two amounts together (£3000 + £1330 = £4330).'

We can also access the saved models using the client:

In [9]:
from mlflow.tracking import MlflowClient

mlflow.set_tracking_uri("http://localhost:8080/")
client = MlflowClient()

experiment = client.get_experiment_by_name('mlflow_development')

if experiment is None:
    raise ValueError(f"Experiment 'mlflow_development' not found.")

runs = client.search_runs(experiment_ids=[experiment.experiment_id],
                          filter_string="",  # Add filter criteria if needed
                          run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY)

for run in runs:
    print(f"Run ID: {run.info.run_id}")
    print(f"Parameters: {run.data.params}")
    print(f"Metrics: {run.data.metrics}")
    print(f"Tags: {run.data.tags}")
    print(f"Artifacts: {client.list_artifacts(run.info.run_id)}")
    print("-" * 40)

Run ID: 58a8e7cbb25e43faa27d4a17738836e5
Parameters: {'model': 'first=PromptTemplate(input_variables=[\'context\', \'inputs\'], template="You\'re a investment manager. Using the context provided, reply to the question below to the best of your ability:\\nQuestion:\\n{inputs}\\nContext:\\n{context}") middle=[ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000002C69B59DB50>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000002C6F9CFA390>, model_name=\'gpt-3.5-turbo-0125\', temperature=0.0, openai_api_key=SecretStr(\'**********\'), openai_proxy=\'\')] last=RunnableLambda(_get_content)'}
Metrics: {'answer_correctness/v1/mean': 5.0, 'answer_correctness/v1/p90': 5.0, 'answer_correctness/v1/variance': 0.0, 'answer_relevance/v1/mean': 5.0, 'answer_relevance/v1/p90': 5.0, 'answer_relevance/v1/variance': 0.0, 'answer_similarity/v1/mean': 3.5, 'answer_similarity/v1/p90': 3.9, 'answer_similarity/v1/variance': 0.25, 'exact_match/v1': 0.0, 

In [10]:
delete_runs()

# Other Code

Other code used in the development of this file:

In [11]:
# from mlflow.metrics.genai import answer_similarity, faithfulness, answer_correctness, answer_relevance
# judge_model = "openai:/gpt-3.5-turbo"

# faithfulness_metric = faithfulness(model=judge_model)
# answer_relevance_metric = answer_relevance(model=judge_model)
# answer_similarity_metric = answer_similarity(model=judge_model)
# answer_correctness_metric = answer_correctness(model=judge_model)

# extra_metrics = [faithfulness_metric, answer_similarity_metric, answer_correctness_metric, answer_relevance_metric]

# mlflow.set_experiment('mlflow_development')

# with mlflow.start_run() as run: 
#     _logged_model = mlflow.langchain.log_model(example_model, artifact_path="model")

#     mlflow.log_param("model", example_model)
#     results = mlflow.evaluate(_logged_model.model_uri, eval_set, model_type="question-answering",
#                                 targets="targets", extra_metrics=extra_metrics,
#                                 evaluator_config={'col_mapping': {"inputs": "predictions"}})

#     mlflow.log_metrics(results.metrics)


semi working chat gpt Code

In [12]:
type(results)

mlflow.models.evaluation.base.EvaluationResult

Let's see what the model produced:

In [13]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
#output_df['answer'] = [d['content'] for d in output_df['outputs']]

# desired_columns = ['inputs', 'context', 'targets'] + [col for col in output_df.columns
#                                             if ('score' in col) or ('justification') in col]
# output_df = output_df[desired_columns]
display(output_df)



KeyError: 'eval_results_table'

In [None]:
# create score output func
output_df['faithfulness/v1/score'].mean()

5.0

In [None]:
example = EvaluationExample(input="What is the best stock that client 2 currently owns?",
                            output=(r"The best performing stock owned by client 2 is NVDA, which has seen a 400% "
                                    + "increase in value in the last 10 months."),
                            score=80,
                            justification=("The best performing stock has been identified, and a reason is given for "
                                           + "its choosing."))

grading_prompt = ("Answer Quality: If the answer given does not relate to the question, or if the question is not "
                  + "answered, we will give a low score. If the question is answered comprehensively we will give a "
                  + "higher score.\nScore 0: The question is not answered.\nScore 20: The question is barely "
                  + "answered, and the answer is not useful.\nScore 40: The question is barely answered in basic "
                  + "terms.\nScore 80: The question is barely answered correctly and accurately.\nScore 100: The "
                  + "question is answer perfectly, and the choices are well reasoned.")

# Make a metric from a Gen AI model.
answer_quality = make_genai_metric(name="Answer_Quality",
                                   definition=("Answer Quality is a measure of the accuracy of the answer."),
                                   model="openai:/gpt-3.5-turbo",
                                   examples=[example],
                                   parameters={"temperature": 0.0},
                                   aggregations=["mean", "variance"],
                                   greater_is_better=True,
                                   grading_prompt=(grading_prompt))

print('The grading prompt is:')
print('')
print(grading_prompt)
print('')
print(answer_quality)

The grading prompt is:

Answer Quality: If the answer given does not relate to the question, or if the question is not answered, we will give a low score. If the question is answered comprehensively we will give a higher score.
Score 0: The question is not answered.
Score 20: The question is barely answered, and the answer is not useful.
Score 40: The question is barely answered in basic terms.
Score 80: The question is barely answered correctly and accurately.
Score 100: The question is answer perfectly, and the choices are well reasoned.

EvaluationMetric(name=Answer_Quality, greater_is_better=True, long_name=Answer_Quality, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's Answer_Quality based on the rubric
justification: Your reasoning about the model's Answer_Quality score

You are an impartial judge. You will be given an input that was sent to a machine
learning mo