# Development: ML Flow

File for developing the mlflow code for the project.

In [33]:
# IMPORTS --------------------------------------------------------------------------------------------------------------

# Use the below lines if any dependencies are missing.
# ! python -m pip install uv
# ! python -m uv pip install langchain_openai mlflow load_dotenv langchain pandas langchain_community

import os
import sys

sys.path.append(os.path.abspath('\\'.join(os.getcwd().split('\\')[:-1])))

import mlflow
import pandas as pd
from dotenv import load_dotenv
from ml_flow import mlflow_server
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from mlflow.metrics.genai import make_genai_metric, EvaluationExample

_ = load_dotenv()

To begin with, we run the ML-Flow server:

In [35]:
server_process = mlflow_server()

INFO:root:Successfully running ML-Flow server. The server will terminate at the end of runtime.


Let's instantiate a dummy LLM which will answer simple queries:

In [36]:
# SETUP EXAMPLE LLM ----------------------------------------------------------------------------------------------------

example_llm = ChatOpenAI(model_name='gpt-3.5-turbo-0125', temperature=0)

example_prompt = PromptTemplate(input_variables=['question'],
                                template=("You're a investment manager. Using your knowledge of investment management, "
                                          + "reply to the question below to the best of your ability:\n"
                                          + "Question:\n{question}"))

example_model = example_prompt | example_llm

We'll then create an evaluation set:

In [37]:
eval_set = pd.DataFrame({"question": ["What is the best stock to buy right now?"]})

We then need to define an LLM-as-a-judge metric, and give it an example:

In [38]:
example = EvaluationExample(input="What is the best stock that client 2 currently owns?",
                            output=(r"The best performing stock owned by client 2 is NVDA, which has seen a 400% "
                                    + "increase in value in the last 10 months."),
                            score=80,
                            justification=("The best performing stock has been identified, and a reason is given for "
                                           + "its choosing."))

grading_prompt = ("Answer Quality: If the answer given does not relate to the question, or if the question is not "
                  + "answered, we will give a low score. If the question is answered comprehensively we will give a "
                  + "higher score.\nScore 0: The question is not answered.\nScore 20: The question is barely "
                  + "answered, and the answer is not useful.\nScore 40: The question is barely answered in basic "
                  + "terms.\nScore 80: The question is barely answered correctly and accurately.\nScore 100: The "
                  + "question is answer perfectly, and the choices are well reasoned.")

# Make a metric from a Gen AI model.
answer_quality = make_genai_metric(name="Answer_Quality",
                                   definition=("Answer Quality is a measure of the accuracy of the answer."),
                                   model="openai:/gpt-3.5-turbo",
                                   examples=[example],
                                   parameters={"temperature": 0.0},
                                   aggregations=["mean", "variance"],
                                   greater_is_better=True,
                                   grading_prompt=(grading_prompt),
)

print('The grading prompt is:')
print('')
print(grading_prompt)
print('')
print(answer_quality)

The grading prompt is:

Answer Quality: If the answer given does not relate to the question, or if the question is not answered, we will give a low score. If the question is answered comprehensively we will give a higher score.
Score 0: The question is not answered.
Score 20: The question is barely answered, and the answer is not useful.
Score 40: The question is barely answered in basic terms.
Score 80: The question is barely answered correctly and accurately.
Score 100: The question is answer perfectly, and the choices are well reasoned.

EvaluationMetric(name=Answer_Quality, greater_is_better=True, long_name=Answer_Quality, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's Answer_Quality based on the rubric
justification: Your reasoning about the model's Answer_Quality score

You are an impartial judge. You will be given an input that was sent to a machine
learning mo

We then connect to ML-Flow:

In [39]:
mlflow.set_experiment("mlflow_development")

with mlflow.start_run() as run: 
    _logged_model = mlflow.langchain.log_model(example_model, artifact_path="model")

    mlflow.log_param("model", example_model)
    results = mlflow.evaluate(_logged_model.model_uri,
                              eval_set,
                              model_type="question-answering",
                              extra_metrics=[answer_quality], # Include our custom metric!
                              evaluator_config={'col_mapping': {"inputs": "predictions"}})

    mlflow.log_metrics(results.metrics)

                    stream was transferred to model_kwargs.
                    Please confirm that stream is what you intended.
                    stream was transferred to model_kwargs.
                    Please confirm that stream is what you intended.
2024/05/28 17:55:17 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024/05/28 17:55:20 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


Let's see what the model produced:

In [40]:
output_df = pd.DataFrame(results.tables['eval_results_table'])
output_df['answer'] = [d['content'] for d in output_df['outputs']]

desired_columns = ['question', 'answer'] + [col for col in output_df.columns
                                            if ('score' in col) or ('justification') in col]
output_df = output_df[desired_columns]
display(output_df)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 490.05it/s]


Unnamed: 0,question,answer,Answer_Quality/v1/score,Answer_Quality/v1/justification
0,What is the best stock to buy right now?,"As an investment manager, I cannot provide spe...",100,The model's response directly addresses the in...


In [41]:
output_df['answer'][0]

'As an investment manager, I cannot provide specific stock recommendations without knowing more about your individual financial situation, risk tolerance, investment goals, and time horizon. It is important to conduct thorough research and analysis before making any investment decisions. I recommend diversifying your portfolio and considering factors such as company fundamentals, industry trends, and market conditions when selecting stocks to buy. It may be beneficial to consult with a financial advisor for personalized investment advice.'