In [1]:
import warnings

# Disable a few less-than-useful UserWarnings from setuptools and pydantic
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import os
import requests

from bs4 import BeautifulSoup
import numpy as np
import openai

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema, TensorSpec

assert "OPENAI_API_KEY" in os.environ, " OPENAI_API_KEY environment variable must be set"

In [3]:
with mlflow.start_run():
    model_info = mlflow.openai.log_model(
        model="text-embedding-ada-002",
        task=openai.Embedding,
        artifact_path="model",
        signature=ModelSignature(
            inputs=Schema([ColSpec(type="string", name=None)]),
            outputs=Schema([TensorSpec(type=np.dtype("float64"), shape=(-1,))]),
            params=ParamSchema([ParamSpec(name="batch_size", dtype="long", default=1024)]),
        ),
    )

model = mlflow.pyfunc.load_model(model_info.model_uri)


In [9]:
def insert_space_after_tags(soup, tags):
    """
    Insert a space after each tag specified in the provided BeautifulSoup object.

    :param soup: BeautifulSoup object representing the parsed HTML.
    :param tags: List of tag names (as strings) after which space should be inserted.
    """
    for tag_name in tags:
        for tag in soup.find_all(tag_name):
            tag.insert_after(" ")

def extract_text_from_url(url, id):
    """
    Extract and return text content from a specific section of a webpage.

    The function targets a div with class 'section' and id 'llms', then extracts
    text from <h>, <li>, and <p> tags, excluding <p> tags within <ul> and any 
    <li> tags that contain <p> with <a> having class 'reference external'. It also
    excludes any standalone 'Note' entries.

    :param url: URL of the webpage from which to extract text.
    :param id: The target id for the div containing the main text content of the page
    :return: A string containing the extracted text or an error message.
    """

    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the div with class 'section' and id 'llms'
        target_div = soup.find('div', {'class': 'section', 'id': id})

        if target_div:
            # Make href and strong tags more readable
            insert_space_after_tags(target_div, ['strong', 'a'])

            content_tags = []
            for tag in target_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'p']):
                if tag.name == 'li' and tag.find('p') and tag.find('a', class_='reference external'):
                    continue  # Skip this specific <li>
                content_tags.append(tag)

            return '\n'.join(tag.get_text(separator=' ', strip=True) 
                             for tag in content_tags 
                             if (tag.name != 'p' or tag.find_parent('ul') is None) and tag.get_text(strip=True).lower() != "note")
        else:
            return "Target element not found."
    else:
        return f"Failed to retrieve content: Status code {response.status_code}"

# Example usage
llm_landing_url = "https://www.mlflow.org/docs/2.8.1/llms/index.html"
llms_landing_page_content = extract_text_from_url(llm_landing_url, "llms")

In [10]:
print(llms_landing_page_content)

LLMs
LLMs, or Large Language Models, have rapidly become a cornerstone in the machine learning domain, offering
immense capabilities ranging from natural language understanding to code generation and more.
However, harnessing the full potential of LLMs often involves intricate processes, from interfacing with
multiple providers to fine-tuning specific models to achieve desired outcomes.
Such complexities can easily become a bottleneck for developers and data scientists aiming to integrate LLM
capabilities into their applications.
MLflow’s Support for LLMs aims to alleviate these challenges by introducing a suite of features and tools designed with the end-user in mind:
MLflow AI Gateway
Serving as a unified interface, the MLflow AI Gateway simplifies interactions with multiple LLM providers, such as OpenAI , MosaicML , Cohere , Anthropic , PaLM 2 , AWS Bedrock , and AI21 Labs .
In addition to supporting the most popular SaaS LLM providers, the AI Gateway provides an integration to MLfl

In [14]:
llms_landing_embeddings = model.predict(llms_landing_page_content)

llms_landing_embeddings

[[-0.02396763674914837,
  0.01575297862291336,
  0.009518012404441833,
  -0.021205933764576912,
  -0.009461650624871254,
  0.010039353743195534,
  -0.008038528263568878,
  0.017162011936306953,
  -0.02437625639140606,
  -0.05867209658026695,
  0.0198532622307539,
  0.015513443388044834,
  -0.012371301651000977,
  0.006192696280777454,
  -0.015302089042961597,
  0.005019676871597767,
  0.01812015287578106,
  -0.005703057628124952,
  0.001988496631383896,
  -0.011561108753085136,
  0.004537083208560944,
  -0.0232208501547575,
  -0.015597985126078129,
  -0.029082423076033592,
  -0.021389108151197433,
  0.018261056393384933,
  0.025461209937930107,
  -0.05021790415048599,
  -0.023657649755477905,
  0.005896799266338348,
  0.020529597997665405,
  0.004248231649398804,
  -0.023136306554079056,
  0.015090733766555786,
  0.010285934433341026,
  -0.004896386526525021,
  0.027236590161919594,
  0.005322618875652552,
  0.029195144772529602,
  -0.0070275478065013885,
  0.0318441241979599,
  0.0231

In [12]:
llms_evaluate_url = "https://www.mlflow.org/docs/2.8.1/llms/llm-evaluate/index.html"
llms_evaluate_page_content = extract_text_from_url(llms_evaluate_url, "mlflow-llm-evaluate")

print(llms_evaluate_page_content)

MLflow LLM Evaluate
With the emerging of ChatGPT, LLMs have shown its power of text generation in various fields, such as
question answering, translating and text summarization. Evaluating LLMs’ performance is slightly different
from traditional ML models, as very often there is no single ground truth to compare against.
MLflow provides an API mlflow.evaluate() to help evaluate your LLMs.
MLflow’s LLM evaluation functionality consists of 3 main components:
A model to evaluate : it can be an MLflow pyfunc model, a URI pointing to one registered
MLflow model, or any python callable that represents your model, e.g, a HuggingFace text summarization pipeline.
A model to evaluate : it can be an MLflow pyfunc model, a URI pointing to one registered
MLflow model, or any python callable that represents your model, e.g, a HuggingFace text summarization pipeline.
Metrics : the metrics to compute, LLM evaluate will use LLM metrics.
Metrics : the metrics to compute, LLM evaluate will use LLM metric

In [13]:
embeddings_evaluate_page = model.predict(llms_evaluate_page_content)

embeddings_evaluate_page

[[-0.022500766441226006,
  0.019651325419545174,
  0.010113414376974106,
  -0.009565983898937702,
  0.003982902504503727,
  -5.642079486278817e-05,
  0.005119871813803911,
  0.0005807668203487992,
  -0.027483781799674034,
  -0.03551275283098221,
  0.013468176126480103,
  0.005084780510514975,
  -0.03851659595966339,
  -0.002216037828475237,
  -0.005948035046458244,
  0.0022581478115171194,
  0.009229104034602642,
  -0.011446896940469742,
  0.021672604605555534,
  -0.012064510025084019,
  -0.004607534036040306,
  -0.019019674509763718,
  -0.013826110400259495,
  -0.03683219850063324,
  -0.027722405269742012,
  0.006972711067646742,
  0.015791242942214012,
  -0.03725329786539078,
  -0.0038741184398531914,
  0.008485161699354649,
  0.004151342436671257,
  0.0016449211398139596,
  -0.025588832795619965,
  -0.005326912738382816,
  -0.010885430499911308,
  0.0005022492841817439,
  0.014991153962910175,
  0.031189460307359695,
  0.03618651255965233,
  -0.0001607636222615838,
  0.0129488194361

In [21]:
import numpy as np

def cosine_similarity(embedding1, embedding2):
    """
    Calculate the cosine similarity between two document embeddings.

    :param embedding1: First embedding vector, either a list or a numpy array.
    :param embedding2: Second embedding vector, either a list or a numpy array.
    :return: Cosine similarity between the two embeddings.
    """
    # Convert embeddings to numpy arrays if they are lists
    if isinstance(embedding1, list):
        embedding1 = np.array(embedding1)
    if isinstance(embedding2, list):
        embedding2 = np.array(embedding2)

    # Flatten the embeddings to 1D arrays if they are 2D
    embedding1 = embedding1.flatten()
    embedding2 = embedding2.flatten()

    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

def euclidean_distance(embedding1, embedding2):
    """
    Calculate the Euclidean distance between two document embeddings.

    :param embedding1: First embedding vector, either a list or a numpy array.
    :param embedding2: Second embedding vector, either a list or a numpy array.
    :return: Euclidean distance between the two embeddings.
    """
    # Convert embeddings to numpy arrays if they are lists
    if isinstance(embedding1, list):
        embedding1 = np.array(embedding1)
    if isinstance(embedding2, list):
        embedding2 = np.array(embedding2)

    # Compute Euclidean distance
    return np.linalg.norm(embedding1 - embedding2)


In [22]:
similarity_cos = cosine_similarity(llms_landing_embeddings, embeddings_evaluate_page)
similarity_euclid = euclidean_distance(llms_landing_embeddings, embeddings_evaluate_page)
print(similarity_cos)
print(similarity_euclid)

0.8792430026458967
0.4914407261221534
