<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/transformers/MLFlow_Transformers_flavor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLflow
https://mlflow.org/docs/latest/introduction/index.html


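MLflow is an open-source platform that addresses many of the challenges of managing machine-learning projects (experiment tracking, reproducibility, model packaging and deployment), offering tools and simplifying processes to streamline the ML lifecycle and foster collaboration among ML practitioners.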

# Flan T5
google/flan-t5-large

https://huggingface.co/google/flan-t5-large


# MLflow Transformers Flavor Guide (Experimental)
https://mlflow.org/docs/latest/models.html#transformers-transformers-experimental

# ngrok
Connect localhost to the internet for testing applications and APIs
Bring secure connectivity to apps and APIs in localhost and dev/test environments with just one command or function call.
- Webhook testing
- Developer Previews
- Mobile backend testing

https://ngrok.com/


In [None]:
# Mount Google Drive at /content/drive (only works inside the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install mlflow pyngrok evaluate transformers==4.39.3 --quiet
get_ipython().system_raw("mlflow ui --port 5000 &")

In [None]:

from pyngrok import ngrok
from getpass import getpass

# Terminate any ngrok process (and its open tunnels) that may still be running
# before a new tunnel is configured.
ngrok.kill()

In [None]:
# Read the ngrok auth token from the Colab secret named 'NGROK' so it is never
# hard-coded in the notebook.
from google.colab import userdata
NGROK_AUTH_TOKEN  = userdata.get('NGROK')

# Register the token with pyngrok before opening a tunnel.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPS tunnel on port 5000 for http://localhost:5000 (the port the MLflow UI listens on)
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# Print the public URL under which the MLflow UI can be reached.
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

In [None]:
# Disable tokenizers parallelism to silence the related warnings when constructing pipelines
%env TOKENIZERS_PARALLELISM=false

import warnings

# Silence UserWarnings for the rest of the session.
# NOTE: the filter is category-wide, so it hides every UserWarning, not only the
# setuptools and pydantic ones that motivated it.
warnings.filterwarnings("ignore", category=UserWarning)

# Text2Text Generation

In [None]:
import transformers

import mlflow
from typing import List, Dict

# Define the task that we want to use (required for proper pipeline construction)
task = "text2text-generation"

# Define the pipeline, using the task and a model checkpoint that is applicable for our task.
# NOTE: force_download=True asks the loader to fetch the weights again instead of reusing
# a cached copy; drop it if repeated downloads are not wanted.
generation_pipeline = transformers.pipeline(
    task=task,
    model="google/flan-t5-large",
    model_kwargs={"force_download": True},
)

# Define a simple input example that will be recorded with the model in MLflow, giving
# users of the model an indication of the expected input format.
# A list literal is used (the example was previously built from a set, whose iteration
# order is arbitrary), and the annotation now matches the actual element type: plain strings.
input_example: List[str] = ["prompt 1", "prompt 2", "prompt 3"]

# Define the parameters (and their defaults) for optional overrides at inference time.
parameters = {"max_new_tokens": 512, "do_sample": True, "temperature": 0.4}


In [None]:
# Display the input example that will be recorded with the model.
input_example

In [None]:
# Display the Python type of the input example.
type(input_example)

In [None]:
# Run the pipeline once on the example prompts to obtain a representative output,
# then let MLflow derive the signature (inputs, outputs and inference parameters) from it.
example_output = mlflow.transformers.generate_signature_output(generation_pipeline, input_example)
signature = mlflow.models.infer_signature(
    model_input=input_example,
    model_output=example_output,
    params=parameters,
)

# Visualize the signature
signature

In [None]:
# Point the MLflow client at the local tracking server on port 5000.
mlflow.set_tracking_uri("http://localhost:5000")


experiment_name = "transformers"

# Create the experiment only when it does not exist yet. An explicit lookup replaces the
# former bare `except:`, which reported *any* failure (for example an unreachable tracking
# server) as "experiment exists" and let the real error surface later in a confusing place.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
else:
    print("experiment exists")

# Make the experiment the active one for the runs started below.
mlflow.set_experiment(experiment_name)

## Save Model into MLflow

In [None]:
from datetime import datetime
import pandas as pd
# Build a timestamped run name so repeated executions get distinct names.
name = "transformers_" +datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
# Log the pipeline under a new run; `model_info` holds the metadata of the logged model
# (its `model_uri` is used by later cells to load the model back).
with mlflow.start_run(run_name = name) as run:
    model_info = mlflow.transformers.log_model(
        transformers_model=generation_pipeline,
        artifact_path="text_generator",
        # NOTE(review): the example is wrapped in a DataFrame here, while the signature
        # was inferred from the plain list - confirm this is intended.
        input_example=pd.DataFrame(input_example),
        signature=signature,
        # Uncomment the following line to save the model in 'reference-only' mode:
        # save_pretrained=False,
    )

# Load our pipeline as a generic python function

In [None]:
# Display the URI of the model that was just logged.
model_info.model_uri

In [None]:

# Load the logged model through MLflow's generic pyfunc interface.
sentence_generator = mlflow.pyfunc.load_model(model_info.model_uri)


In [None]:
# Display the type of the object returned by mlflow.pyfunc.load_model.
type(sentence_generator)

In [None]:
def format_predictions(predictions):
    """
    Format model outputs for readability in a Jupyter Notebook.

    Each prediction is split on the literal separator ". " (a naive split: it does
    not recognise abbreviations or initials), every fragment is trimmed, empty
    fragments are discarded, and the remaining sentences are joined with newlines.
    A trailing period is added only to fragments that do not already end with
    sentence-final punctuation (".", "!" or "?").

    Args:
        predictions: Iterable of prediction strings.

    Returns:
        A list with one formatted string per prediction, in the same order.
    """
    terminal_punctuation = (".", "!", "?")
    formatted_predictions = []

    for prediction in predictions:
        # Trim before filtering so whitespace-only fragments are dropped rather than
        # turned into a line containing a lone period.
        fragments = (fragment.strip() for fragment in prediction.split(". "))

        # Restore the period removed by the split, but never stack it onto an
        # existing ".", "!" or "?".
        sentences = [
            fragment if fragment.endswith(terminal_punctuation) else fragment + "."
            for fragment in fragments
            if fragment
        ]

        # One sentence per line.
        formatted_predictions.append("\n".join(sentences))

    return formatted_predictions

In [None]:
# Smoke-test the pyfunc-loaded pipeline with two prompts, overriding the default temperature.
prompts = [
    "I can't decide whether to go running or kayaking this weekend. Can you help me decide?",
    "Please tell me a joke about chess.",
]
predictions = sentence_generator.predict(data=prompts, params={"temperature": 0.9})

# Put each sentence on its own line for notebook readability
formatted_predictions = format_predictions(predictions)

# Number the responses starting from 1 to match the prompt order
for position, formatted_text in enumerate(formatted_predictions, start=1):
    print(f"Response to prompt {position}:\n{formatted_text}\n")


# Translation with Transformers and MLflow

Construct a translation pipeline using flan-t5-large from the Transformers library.

Log the translation model and its configurations using MLflow.

Determine the input and output signature of the translation model automatically.

Retrieve a logged translation model from MLflow for direct interaction.

Emulate the deployment of the translation model using MLflow’s pyfunc model flavor for language translation tasks.


In [None]:
# Checkpoint used for both the model weights and the tokenizer.
model_architecture = "google/flan-t5-large"

# Build an English -> French translation pipeline from explicitly loaded components
# (model and tokenizer) instead of passing only a model name.
# NOTE(review): force_download=True presumably bypasses the local cache and downloads the
# weights again on every run - confirm this is intended.
translation_pipeline = transformers.pipeline(
    task="translation_en_to_fr",
    model=transformers.T5ForConditionalGeneration.from_pretrained(
        model_architecture, max_length=1000, force_download=True
    ),
    tokenizer=transformers.T5TokenizerFast.from_pretrained(model_architecture, return_tensors="pt"),
)

In [None]:
# Try the pipeline on a sample sentence prior to logging.
# The input carries an explicit "translate English to French:" instruction prefix.
translation_pipeline(
    "translate English to French: I liked my slow walking along the Champs-Élysées."
)

In [None]:
# Define the parameters that we are permitting to be used at inference time, along with their default values if not overridden
model_params = {"max_length": 1000}

# Generate the model signature by providing an input, the expected output, and (optionally), parameters available for overriding at inference time.
# The sample input and the sentence used to produce the sample output are different strings.
# NOTE(review): presumably only their types matter for the inferred schema - confirm.
signature = mlflow.models.infer_signature(
    "This is a sample input sentence.",
    mlflow.transformers.generate_signature_output(translation_pipeline, "This is another sample."),
    params=model_params,
)

In [None]:
# Display the signature inferred for the translation model.
signature

In [None]:
# Make "Translation" the active experiment for the runs that follow.
mlflow.set_experiment("Translation")

In [None]:
from datetime import datetime

# Build a timestamped run name so repeated executions get distinct names.
name = "translator_" + datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# Log the translation pipeline under a new run; `model_info.model_uri` is what later
# cells use to load the model back.
with mlflow.start_run(run_name=name) as run:
    model_info = mlflow.transformers.log_model(
        transformers_model=translation_pipeline,
        # The pipeline task is English -> French, so the artifact is labelled accordingly
        # (it was previously mislabelled "spanish_translator").
        artifact_path="french_translator",
        signature=signature,
        model_params=model_params,
    )

In [None]:
# Display the URI of the logged translation model.
model_info.model_uri

In [None]:
# Load the logged model as a dictionary of its individual components (such as the model
# and the tokenizer) rather than as an assembled pipeline.
translation_components = mlflow.transformers.load_model(
    model_info.model_uri,
    return_type="components",
)

# List every saved component together with the name of its Python type.
for component_name, component in translation_components.items():
    print(f"{component_name} -> {type(component).__name__}")

In [None]:
# Display the container type returned when loading with return_type="components".
type(translation_components)

In [None]:
# Display the names of the loaded components.
translation_components.keys()

In [None]:
# Show the flavor metadata recorded with the logged model, to gain an understanding of
# what is stored when saving a transformers pipeline
model_info.flavors

In [None]:
# Load our saved model as a transformers pipeline.
# Note that this rebinds `translation_pipeline` to the reloaded copy; the cells below
# exercise it on a simple translation task.
translation_pipeline = mlflow.transformers.load_model(model_info.model_uri)


In [None]:
# Display the type of the reloaded pipeline object.
type(translation_pipeline)

In [None]:
# Run the reloaded pipeline on a sentence (no instruction prefix this time) and print the raw output.
response = translation_pipeline("I have heard that Nice is nice this time of year.")

print(response)

In [None]:
# Verify that the components that we loaded can be constructed into a pipeline manually:
# the dictionary is unpacked directly into keyword arguments of transformers.pipeline.
reconstructed_pipeline = transformers.pipeline(**translation_components)

# Run the rebuilt pipeline on a sample sentence.
reconstructed_response = reconstructed_pipeline(
    "transformers makes using Deep Learning models easy and fun!"
)

print(reconstructed_response)

In [None]:
# Access the individual components from the components dictionary
tokenizer = translation_components["tokenizer"]
model = translation_components["model"]

query = "Translate to French: Liberty, equality, fraternity, or death."

# Tokenize the prompt and place the tensor on whichever device the model lives on, so the
# cell runs unchanged on CPU, CUDA or Apple "mps" instead of relying on a hard-coded device.
inputs = tokenizer.encode(query, return_tensors="pt").to(model.device)

# Generate the translation and bring the token ids back to the CPU for decoding.
outputs = model.generate(inputs).to("cpu")

# Since we're not using a pipeline here, the special tokens (such as <pad> and </s>) have to
# be removed ourselves; skip_special_tokens does that without manual string replacement.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(result)

In [None]:
# Same manual tokenizer/model round trip as for French, this time asking for Spanish.
query = "Translate to Spanish: Liberty, equality, fraternity, or death."

# Tokenize the prompt and place the tensor on whichever device the model lives on, so the
# cell runs unchanged on CPU, CUDA or Apple "mps" instead of relying on a hard-coded device.
inputs = tokenizer.encode(query, return_tensors="pt").to(model.device)

# Generate the translation and bring the token ids back to the CPU for decoding.
outputs = model.generate(inputs).to("cpu")

# Since we're not using a pipeline here, the special tokens (such as <pad> and </s>) have to
# be removed ourselves; skip_special_tokens does that without manual string replacement.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(result)

# Dialog GPT

# Conversational pipeline with microsoft/DialoGPT-large

DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multi-turn conversations. The human evaluation results indicate that the responses generated by DialoGPT are comparable to human response quality under a single-turn conversation Turing test. The model is trained on 147M multi-turn dialogues from Reddit discussion threads.


https://huggingface.co/microsoft/DialoGPT-large

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import mlflow

# Define our pipeline from the DialoGPT-large checkpoint. No task is passed explicitly,
# so transformers has to resolve it from the model.
conversational_pipeline = transformers.pipeline(model="microsoft/DialoGPT-large")

# Infer the signature by providing a representative input and the output from the pipeline inference abstraction in the transformers flavor in MLflow.
# The params dictionary lists the inference-time parameters and their default values.
signature = mlflow.models.infer_signature(
    "Hi there, chatbot!",
    mlflow.transformers.generate_signature_output(conversational_pipeline, "Hi there, chatbot!"),
    params={"temperature": 0.5, "max_length": 1000, "do_sample": True,}
)

In [None]:
# Display the signature inferred for the conversational model.
signature

In [None]:
# Make "Conversational_chatbot" the active experiment for the runs that follow.
mlflow.set_experiment("Conversational_chatbot")

In [None]:
from datetime import datetime

# Build a timestamped run name so repeated executions get distinct names.
name = "Conversational_" +datetime.now().strftime("%Y-%m-%d_%H:%M:%S")


# Log the conversational pipeline under a new run. The task is stated explicitly here,
# and a plain string is recorded as the input example.
with mlflow.start_run(run_name = name) as run:
    model_info = mlflow.transformers.log_model(
        transformers_model=conversational_pipeline,
        artifact_path="chatbot",
        task="conversational",
        signature=signature,
        input_example="A clever and witty question",
    )

In [None]:
# Display the run object captured by the `with` block as a dictionary.
run.to_dictionary()

In [None]:
# Display the URI of the logged chatbot model.
model_info.model_uri

In [None]:
# Load the model as a generic python function in order to leverage the integrated Conversational Context.
# Note that loading a conversational model with the native flavor (i.e., `mlflow.transformers.load_model()`) will not include anything apart from the
# pipeline itself; if choosing to load in this way, you will need to manage your own Conversational Context instance to maintain state on the
# conversation history.
chatbot = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)


In [None]:
# Validate that the model is capable of responding to a question.
# The params override the defaults recorded in the signature for this call only.
# NOTE(review): with do_sample=False the temperature value is presumably ignored - confirm.
first = chatbot.predict("Does money buy happiness?", params={"temperature": 1.0, "max_length": 512, "do_sample": False})

In [None]:
# Show the reply to the first question.
print(f"Response: {first}")

In [None]:
# Send a follow-up message to the same chatbot object, this time with sampling enabled and a larger max_length.
second = chatbot.predict("why not?", params={"temperature": 1.0, "max_length": 2000, "do_sample": True})

In [None]:
# Show the reply to the follow-up message.
print(f"second: {second}")

In [None]:
# Shut down the ngrok process now that the notebook is done with the tunnel.
ngrok.kill()