<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/summarization/T5_large_Evaluation_multi_news_summarization_mlflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# LLM Evaluation Metrics

https://mlflow.org/docs/latest/llms/llm-evaluate/index.html


There are two types of LLM evaluation metrics in MLflow:

- Heuristic-based metrics: These metrics calculate a score for each data record (row in terms of Pandas/Spark dataframe), based on certain functions, such as: Rouge (rougeL()), Flesch Kincaid (flesch_kincaid_grade_level()) or Bilingual Evaluation Understudy (BLEU) (bleu()). These metrics are similar to traditional continuous value metrics. For the list of built-in heuristic metrics and how to define a custom metric with your own function definition, see the Heuristic-based Metrics section.

- LLM-as-a-Judge metrics: LLM-as-a-Judge is a new type of metric that uses LLMs to score the quality of model outputs. It overcomes the limitations of heuristic-based metrics, which often miss nuances like context and semantic accuracy. LLM-as-a-Judge metrics provide a more human-like evaluation for complex language tasks while being more scalable and cost-effective than human evaluation. MLflow provides various built-in LLM-as-a-Judge metrics and supports creating custom metrics with your own prompt, grading criteria, and reference examples. See the LLM-as-a-Judge Metrics section for more details.



### MLFLOW Metrics
The mlflow.metrics module helps you quantitatively and qualitatively measure your models.

https://mlflow.org/docs/latest/python_api/mlflow.metrics.html


Create a test set of inputs that will be passed into the model, together with the ground truth that the model's generated output will be compared against.

#### TASK: text-summarization: model_type="text-summarization":
- ROUGE

- toxicity

- ari_grade_level

- flesch_kincaid_grade_level

#### Descriptions

- https://huggingface.co/spaces/evaluate-measurement/toxicity
- https://en.wikipedia.org/wiki/Automated_readability_index
- https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level

### Toxicity
https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target

### Textstat
Textstat is an easy to use library to calculate statistics from text. It helps determine readability, complexity, and grade level.

https://pypi.org/project/textstat/

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install mlflow   --quiet
! pip install  evaluate  textstat tiktoken -q
! pip install psutil pynvml
! pip install bert_score -q

In [None]:
# Transformers installation
! pip install -q --disable-pip-version-check py7zr sentencepiece loralib peft trl
! pip install -q    bitsandbytes
! pip install datasets evaluate rouge_score -q
! pip install transformers[torch] -q
! pip install accelerate -U -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
! pip install onnxruntime optimum -q
! pip install optimum[onnxruntime] -q

In [None]:

import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from torch import cuda, bfloat16
import transformers
import openai
import torch
import torch.nn as nn
from google.colab import userdata

In [None]:

from google.colab import output
output.enable_custom_widget_manager()

from transformers.utils import logging


In [None]:
logging.set_verbosity_warning()

os.environ["TRANSFORMERS_VERBOSITY"] = "warning"

In [None]:


device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device


# Load multi_news dataset
https://huggingface.co/datasets/multi_news

In [None]:
from datasets import load_dataset

dataset  = load_dataset("multi_news", trust_remote_code=True)

In [None]:
dataset

In [None]:

print(f"Train dataset size: {len(dataset['train'])}")
print(f"test dataset size: {len(dataset['test'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")

In [None]:
dataset['train'][100]['document']

In [None]:
dataset['train'][100]['summary']

In [None]:

len(dataset['train'][100]['document'])

In [None]:

len(dataset['train'][100]['summary'])

In [None]:
import transformers
from mlflow.models import infer_signature
from mlflow.transformers import generate_signature_output
import locale
import mlflow
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale says.

    ``do_setlocale`` is accepted only so the signature lines up with
    ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"


# Swap the stdlib lookup for the stub above so later callers always get UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
model_uri = "runs:/490668a70c06448d83903669efde0a8b/text_summarizer"

In [None]:
# Connection settings for the MLflow tracking backend used by the evaluation
# runs below (presumably a Databricks-hosted tracking server, given the
# "databricks" URI and the workspace host — confirm against the MLflow docs).
MLFLOW_TRACKING_URI="databricks"
# Specify the workspace hostname and token
DATABRICKS_HOST="https://adb-2467347032368999.19.azuredatabricks.net/"
# The token is read from the Colab secret store instead of being hard-coded.
# NOTE(review): the secret name 'DATABRCKS_TTOKEN' looks misspelled — confirm
# it matches the name registered in the Colab secrets panel before changing it.
DATABRICKS_TOKEN=userdata.get('DATABRCKS_TTOKEN')

In [None]:


# Export the connection settings, but never overwrite a value that is already
# present in the environment. Written as a loop (a statement) so the cell does
# not echo any of the values back as its output.
for _env_name, _env_value in (
    ("MLFLOW_TRACKING_URI", MLFLOW_TRACKING_URI),
    ("DATABRICKS_HOST", DATABRICKS_HOST),
    ("DATABRICKS_TOKEN", DATABRICKS_TOKEN),
):
    os.environ.setdefault(_env_name, _env_value)

In [None]:
os.environ["OPENAI_API_KEY"]=userdata.get('KEY_OPENAI')

In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:

mlflow.set_experiment("/Users/pepe@kk.com/summarization_evaluation")


In [None]:
mlflow.end_run()

In [None]:
# summarization_components = mlflow.transformers.load_model(
#     model_uri, return_type="components"
# )

In [None]:
# summarization_components.keys()

In [None]:
import torch
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# reconstructed_pipeline = transformers.pipeline(**summarization_components)

In [None]:
# test1= dataset['test'][100]['document']

In [None]:
# reconstructed_pipeline(test1)

In [None]:
df_test = dataset['validation'].to_pandas()

In [None]:
df_test.columns = ['inputs', 'summary']

In [None]:
df_test.head()

In [None]:
import gc
import torch
import datetime
torch.cuda.empty_cache()
gc.collect()

# Evaluate MLFLOW default metrics

https://mlflow.org/docs/latest/llms/llm-evaluate/index.html


In [None]:
# Timestamp used to give every evaluation run a distinct name.
now = datetime.datetime.now()

description = f"""Evaluation Fine Tuned T5-Large Model on Multi_News Dataset
model_uri: {model_uri}
"""

with mlflow.start_run(
    run_name=f"Evaluation_{now.strftime('%Y-%m-%d_%H:%M:%S')}",
    description=description,
) as run:
    # Score the logged model on the first 10 rows with the default evaluator.
    results = mlflow.evaluate(
        model=model_uri,
        data=df_test.head(10),
        targets="summary",  # column holding the expected output
        model_type="text-summarization",  # selects the metrics relevant to this task
        evaluators="default",
    )


# Custom Metrics

https://github.com/mlflow/mlflow/blob/master/examples/evaluation/evaluate_with_custom_metrics.py

https://huggingface.co/spaces/evaluate-metric/bertscore

In [None]:
from mlflow.metrics import latency
from mlflow.metrics.genai import answer_correctness
from mlflow.models import infer_signature, make_metric

In [None]:
mlflow.enable_system_metrics_logging()


In [None]:
mlflow.metrics.__all__

In [None]:
mlflow.metrics.genai.__all__

In [None]:
from evaluate import load
import pandas as pd
from typing import List
bertscore = load("bertscore")
predictions = ["hello there"]
references = ["hello there"]
results = bertscore.compute(predictions=predictions, references=references, lang="en")

In [None]:
results

In [None]:
def _bertscore_mean(eval_df, score_key):
    """Average one BERTScore component over every row of ``eval_df``.

    Args:
        eval_df: Evaluation table with a ``"prediction"`` and a ``"target"``
            column, one row per evaluated example.
        score_key: Which per-row score list to aggregate: ``"f1"``,
            ``"recall"`` or ``"precision"``.

    Returns:
        The mean of the per-row scores as a float, or ``nan`` when
        ``eval_df`` has no rows.
    """
    candidates = list(eval_df["prediction"])
    references = list(eval_df["target"])
    if not candidates:
        # Nothing to score; avoid calling the scorer (and dividing by zero).
        return float("nan")

    per_row = bertscore.compute(
        predictions=candidates, references=references, lang="en"
    )[score_key]
    # The scorer returns one value per row; the metric reported for the run
    # must summarise all of them, not just the first example.
    return float(sum(per_row) / len(per_row))


def calculate_bert_f1(eval_df, _builtin_metrics):
    """Return the mean BERTScore F1 across all evaluated rows.

    ``_builtin_metrics`` is part of the custom-metric signature and is unused.
    """
    return _bertscore_mean(eval_df, "f1")


def calculate_bert_recall(eval_df, _builtin_metrics):
    """Return the mean BERTScore recall across all evaluated rows.

    ``_builtin_metrics`` is part of the custom-metric signature and is unused.
    """
    return _bertscore_mean(eval_df, "recall")


def calculate_bert_precision(eval_df, _builtin_metrics):
    """Return the mean BERTScore precision across all evaluated rows.

    ``_builtin_metrics`` is part of the custom-metric signature and is unused.
    """
    return _bertscore_mean(eval_df, "precision")

In [None]:

torch.cuda.empty_cache()
gc.collect()

In [None]:
# Timestamp used to give every evaluation run a distinct name.
now = datetime.datetime.now()

description = f"""Evaluation Fine Tuned T5-Large Model on Multi_News Dataset
model_uri: {model_uri}

custom metric BertScore and latency
"""

with mlflow.start_run(
    run_name=f"Evaluation_{now.strftime('%Y-%m-%d_%H:%M:%S')}",
    description=description,
) as run:
    results = mlflow.evaluate(
        model=model_uri,
        data=df_test.head(10),
        targets="summary",  # column holding the expected output
        model_type="text-summarization",  # selects the metrics relevant to this task
        evaluators="default",
        # Latency plus one custom metric per BERTScore component.
        extra_metrics=[
            latency(),
            *(
                make_metric(eval_fn=score_fn, greater_is_better=True)
                for score_fn in (
                    calculate_bert_f1,
                    calculate_bert_recall,
                    calculate_bert_precision,
                )
            ),
        ],
    )


# Evaluate with LLM-as-a-Judge metrics


In [None]:
from mlflow.metrics.genai import EvaluationExample, make_genai_metric

# LLM-as-a-Judge metric that grades how professional the generated text sounds
# on a 1-5 scale, using GPT-4 as the judge.
professionalism_metric = make_genai_metric(
    name="professionalism",
    definition=(
        "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is tailored to the context and audience. It often involves avoiding overly casual language, slang, or colloquialisms, and instead using clear, concise, and respectful language"
    ),
    # The fragments below are joined by implicit string concatenation, so each
    # one must end with a separator; otherwise consecutive score descriptions
    # run together in the prompt sent to the judge.
    grading_prompt=(
        "Professionalism: If the answer is written using a professional tone, below "
        "are the details for different scores: "
        "- Score 1: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for professional contexts. "
        "- Score 2: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in some informal professional settings. "
        "- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. "
        "- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for business or academic settings. "
        "- Score 5: Language is excessively formal, respectful, and avoids casual elements. Appropriate for the most formal settings such as textbooks. "
    ),
    # One graded reference example to anchor the judge's scoring.
    examples=[
        EvaluationExample(
            input="What is MLflow?",
            output=(
                "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps you track experiments, package your code and models, and collaborate with your team, making the whole ML workflow smoother. It's like your Swiss Army knife for machine learning!"
            ),
            score=2,
            justification=(
                "The response is written in a casual tone. It uses contractions, filler words such as 'like', and exclamation points, which make it sound less professional. "
            ),
        )
    ],
    version="v1",
    model="openai:/gpt-4",
    parameters={"temperature": 0.0},
    grading_context_columns=[],
    aggregations=["mean", "variance", "p90"],
    greater_is_better=True,
)

print(professionalism_metric)

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Timestamp used to give every evaluation run a distinct name.
now = datetime.datetime.now()

description = f"""Evaluation Fine Tuned T5-Large Model on Multi_News Dataset
model_uri: {model_uri}

custom metric BertScore , latency and professionalism
"""

with mlflow.start_run(
    run_name=f"Evaluation_{now.strftime('%Y-%m-%d_%H:%M:%S')}",
    description=description,
) as run:
    results = mlflow.evaluate(
        model=model_uri,
        data=df_test.head(10),
        targets="summary",  # column holding the expected output
        model_type="text-summarization",  # selects the metrics relevant to this task
        evaluators="default",
        # Latency, one custom metric per BERTScore component, then the
        # LLM-judged professionalism metric.
        extra_metrics=[
            latency(),
            *(
                make_metric(eval_fn=score_fn, greater_is_better=True)
                for score_fn in (
                    calculate_bert_f1,
                    calculate_bert_recall,
                    calculate_bert_precision,
                )
            ),
            professionalism_metric,
        ],
    )
results.metrics

In [None]:
torch.cuda.empty_cache()
gc.collect()

# Evaluate ONNX models in Custom PythonModel

```
class ONNXModelForSeq2SeqLM(PythonModel):
    """Custom pyfunc model that serves an ONNX seq2seq checkpoint through an Optimum pipeline."""

    def load_context(self, context):
        """
        Initialize the tokenizer and the ONNX Runtime model from the
        directory registered under the ``"snapshot"`` artifact.
        """
        from transformers import AutoTokenizer
        from optimum.onnxruntime import ORTModelForSeq2SeqLM

        snapshot_dir = context.artifacts["snapshot"]
        self.model = ORTModelForSeq2SeqLM.from_pretrained(snapshot_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(snapshot_dir)

    def predict(self, context, model_input, params=None):
        """
        Generate a prediction for the first prompt in ``model_input``.

        Args:
            context: Unused here; the model and tokenizer are set by
                ``load_context``.
            model_input: Table-like object with a ``"prompt"`` column; only
                its first entry is processed.
            params: Optional dict. Only ``"task"`` (default
                ``"summarization"``) is honoured. ``"temperature"`` and
                ``"max_tokens"`` may be supplied but are not forwarded to the
                pipeline.

        Returns:
            ``{"candidates": [<summary text>]}`` for the first prompt.
        """
        # Imported here: a function-level import inside ``load_context`` is
        # local to that method and would not be visible in this one.
        from optimum.pipelines import pipeline

        prompt = model_input["prompt"][0]
        task = params.get("task", "summarization") if params else "summarization"

        pipe = pipeline(task, model=self.model, tokenizer=self.tokenizer)
        result = pipe(prompt)
        return {"candidates": [result[0]["summary_text"]]}

  ```

In [None]:
model_uri_onnx = "runs:/79c1dcaabd214f0cae2c55797175b16a/t5-summarization-onnx"

In [None]:
loaded_model = mlflow.pyfunc.load_model(model_uri_onnx)

In [None]:
from typing import List
def onnx_summ(inputs: pd.DataFrame) -> List[str]:
    """Summarize each row of ``inputs`` with the loaded ONNX pyfunc model.

    Every value of the ``"inputs"`` column is sent to ``loaded_model`` as a
    one-row ``"prompt"`` frame; the first candidate of each response is kept.

    Returns:
        One summary string per input row, in row order.
    """
    return [
        loaded_model.predict(
            pd.DataFrame({"prompt": [document]}),
            params={"temperature": 0.8, "max_tokens": 128},
        )["candidates"][0]
        for document in inputs["inputs"]
    ]

In [None]:
df_val = dataset['validation'].to_pandas()

In [None]:
df_val.columns = ['inputs', 'summary']

In [None]:
df_val.head()

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Timestamp used to give every evaluation run a distinct name.
now = datetime.datetime.now()

description = f"""Evaluation  Tuned T5-Large Model converted to ONNX with optimum-cli
model_uri: {model_uri_onnx}
"""

with mlflow.start_run(
    run_name=f"evaluation_to_onnx_{now.strftime('%Y-%m-%d_%H:%M:%S')}",
    description=description,
) as run:
    # The model is passed as a plain function that maps a frame to summaries.
    results = mlflow.evaluate(
        model=onnx_summ,
        data=df_val.head(10),
        targets="summary",  # column holding the expected output
        model_type="text-summarization",  # selects the metrics relevant to this task
        evaluators="default",
    )


In [None]:
import pprint
pprint.pprint(results.metrics)

In [None]:
# c22d0ac9d7e54c659bc9c1206471dfc7
model_uri_onnx = "runs:/c22d0ac9d7e54c659bc9c1206471dfc7/t5-summarization-onnx"
loaded_model_q = mlflow.pyfunc.load_model(model_uri_onnx)
def onnx_summ(inputs: pd.DataFrame) -> List[str]:
    """Summarize each row of ``inputs`` with the quantized ONNX pyfunc model.

    Every value of the ``"inputs"`` column is sent to ``loaded_model_q`` as a
    one-row ``"prompt"`` frame; the first candidate of each response is kept.

    Returns:
        One summary string per input row, in row order.
    """
    return [
        loaded_model_q.predict(
            pd.DataFrame({"prompt": [document]}),
            params={"temperature": 0.8, "max_tokens": 128},
        )["candidates"][0]
        for document in inputs["inputs"]
    ]

In [None]:
# Timestamp used to give every evaluation run a distinct name.
now = datetime.datetime.now()

description = f"""Evaluation  Tuned T5-Large Model converted to ONNX with optimum-cli
quantized with INT8
model_uri: {model_uri_onnx}
"""

with mlflow.start_run(
    run_name=f"evaluation_to_onnx_{now.strftime('%Y-%m-%d_%H:%M:%S')}",
    description=description,
) as run:
    # The model is passed as a plain function that maps a frame to summaries.
    results = mlflow.evaluate(
        model=onnx_summ,
        data=df_val.head(10),
        targets="summary",  # column holding the expected output
        model_type="text-summarization",  # selects the metrics relevant to this task
        evaluators="default",
    )

In [None]:
pprint.pprint(results.metrics)