Add docs and tutorials for Sentence Transformers flavor (#10476)

Signed-off-by: Ben Wilson <benjamin.wilson@databricks.com>
mlflow · Dec 1, 2023 · 1b487c9 · 1b487c9
1 parent 9e45df7
commit 1b487c9
Show file tree

Hide file tree

Showing 11 changed files with 2,787 additions and 7 deletions.
diff --git a/docs/source/_static/images/tutorials/llms/semantic-search-arch.png b/docs/source/_static/images/tutorials/llms/semantic-search-arch.png
diff --git a/docs/source/_static/images/tutorials/llms/sentence-transformers-architecture.png b/docs/source/_static/images/tutorials/llms/sentence-transformers-architecture.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -163,6 +163,17 @@ LLM Guides and Tutorials
                     </p>
                 </a>
             </div>
+            <div class="simple-card">
+                <a href="llms/sentence-transformers/index.html" >
+                    <div class="header">
+                        Guide to using Sentence Transformers in MLflow
+                    </div>
+                    <p>
+                        Learn how to leverage the advanced semantic sentence embedding capabilities of the Sentence Transformers package, using MLflow to simplify
+                        inference, create custom deployable applications, and more.
+                    </p>
+                </a>
+            </div>
         </article>
     </section>
 

diff --git a/docs/source/llms/custom-pyfunc-for-llms/notebooks/custom-pyfunc-advanced-llm.ipynb b/docs/source/llms/custom-pyfunc-for-llms/notebooks/custom-pyfunc-advanced-llm.ipynb
@@ -617,7 +617,7 @@
    "source": [
     "#### Set the experiment that we're going to be logging our custom model to\n",
     "\n",
-    "If the the experiment doesn't already exist, MLflow will create a new experiment with this name and will alert you that it has created a new experiment."
+    "If the experiment doesn't already exist, MLflow will create a new experiment with this name and will alert you that it has created a new experiment."
    ]
   },
   {

diff --git a/docs/source/llms/index.rst b/docs/source/llms/index.rst
@@ -250,7 +250,7 @@ Select the integration below to read the documentation on how to leverage MLflow
             </div>
         </a>
 
-        <a href="../models.html#sentencetransformers-sentence-transformers-experimental">
+        <a href="sentence-transformers/guide/index.html">
             <div class="logo-card">
                 <img src="../_static/images/logos/sentence-transformers-logo.png" alt="Sentence Transformers Logo"/>
             </div>
@@ -288,6 +288,17 @@ Native Integration Guides and Tutorials
                     </p>
                 </a>
             </div>
+            <div class="simple-card">
+                <a href="sentence-transformers/index.html">
+                    <div class="header">
+                        Sentence Transformers
+                    </div>
+                    <p>
+                        Learn about MLflow's native integration with the Sentence Transformers library and see example notebooks that leverage 
+                        MLflow and Sentence Transformers to perform operations with encoded text such as semantic search, text similarity, and information retrieval.
+                    </p>
+                </a>
+            </div>
         </article>
     </section>
 
@@ -296,18 +307,14 @@ Native Integration Guides and Tutorials
     :hidden:
 
     transformers/index
-
+    sentence-transformers/index
 
 Native Integration Examples
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 If you'd like to directly explore code examples for how to get started with using our official library integrations, you can navigate 
 directly to our up-to-date examples on GitHub below:
 
-* **sentence-transformers**
-
-    * `Text Encoding Example <https://github.com/mlflow/mlflow/blob/master/examples/sentence_transformers/simple.py>`_
-
 * **langchain**
 
     * `Logging and using a Chain <https://github.com/mlflow/mlflow/blob/master/examples/langchain/simple_chain.py>`_
@@ -366,6 +373,8 @@ Interested in learning how to leverage MLflow for your LLM projects?
 
 Look in the tutorials and guides below to learn more about interesting use cases that could help to make your journey into leveraging LLMs a bit easier!
 
+Note that there are additional tutorials within the `Native Integration Guides and Tutorials section above <#native-integration-guides-and-tutorials>`_, so be sure to check those out as well!
+
 .. toctree::
     :maxdepth: 1
     :hidden:

diff --git a/docs/source/llms/sentence-transformers/guide/index.rst b/docs/source/llms/sentence-transformers/guide/index.rst
@@ -0,0 +1,124 @@
+Sentence Transformers within MLflow
+====================================
+
+.. attention::
+    The ``sentence_transformers`` flavor is in active development and is marked as Experimental. Public APIs may change and new features may
+    be added as additional functionality is brought to the flavor.
+
+The ``sentence_transformers`` model flavor enables logging of
+`sentence-transformers models <https://www.sbert.net/>`_ in MLflow format via
+the :py:func:`mlflow.sentence_transformers.save_model()` and :py:func:`mlflow.sentence_transformers.log_model()` functions. Using these
+functions also adds the ``python_function`` flavor to the MLflow Models, enabling the model to be
+interpreted as a generic Python function for inference via :py:func:`mlflow.pyfunc.load_model()`.
+Additionally, :py:func:`mlflow.sentence_transformers.load_model()` can be used to load a saved or logged MLflow
+Model with the ``sentence_transformers`` flavor in the native sentence-transformers format.
+
+Tutorials for Sentence Transformers
+-----------------------------------
+
+Looking to jump right into some usable examples and tutorials that show how to leverage this library with MLflow?
+
+.. raw:: html
+
+    <a href="../index.html#getting-started-with-the-mlflow-sentence-transformers-flavor-tutorials-and-guides" class="download-btn">See the Tutorials</a>
+
+Input and Output Types for PyFunc
+---------------------------------
+
+The ``sentence_transformers`` :ref:`python_function (pyfunc) model flavor <pyfunc-model-flavor>` standardizes
+the process of embedding sentences and computing semantic similarity. This standardization allows for serving
+and batch inference by adapting the required data structures for ``sentence_transformers`` into formats compatible with JSON serialization and casting to Pandas DataFrames.
+
+.. note::
+    The ``sentence_transformers`` flavor supports various models for tasks such as embedding generation, semantic similarity, and paraphrase mining. The specific input and output types will depend on the model and task being performed.
+
+Saving and Logging Sentence Transformers Models
+-----------------------------------------------
+
+You can save and log sentence-transformers models in MLflow. Here's an example of both saving and logging a model:
+
+.. code-block:: python
+
+    import mlflow
+    from sentence_transformers import SentenceTransformer
+
+    model = SentenceTransformer("model_name")
+
+    # Saving the model
+    mlflow.sentence_transformers.save_model(model=model, path="path/to/save/directory")
+
+    # Logging the model
+    with mlflow.start_run():
+        mlflow.sentence_transformers.log_model(
+            sentence_transformers_model=model, artifact_path="model_artifact_path"
+        )
+
+Custom Python Function Implementation
+-------------------------------------
+
+In addition to using pre-built models, you can create custom Python functions with the ``sentence_transformers`` flavor. Here's an example of a custom
+implementation for comparing the similarity between text documents:
+
+.. code-block:: python
+
+    import mlflow
+    from mlflow.pyfunc import PythonModel
+    import pandas as pd
+    import numpy as np
+    from sentence_transformers import SentenceTransformer, util
+
+
+    class DocumentSimilarityModel(PythonModel):
+        def load_context(self, context):
+            """Load the model context for inference."""
+            self.model = SentenceTransformer.load(context.artifacts["model_path"])
+
+        def predict(self, context, model_input):
+            """Predict method for comparing similarity between documents."""
+            if isinstance(model_input, pd.DataFrame) and model_input.shape[1] == 2:
+                documents = model_input.values
+            else:
+                raise ValueError("Input must be a DataFrame with exactly two columns.")
+
+            # Compute embeddings for each document separately
+            embeddings1 = self.model.encode(documents[:, 0], convert_to_tensor=True)
+            embeddings2 = self.model.encode(documents[:, 1], convert_to_tensor=True)
+
+            # Calculate cosine similarity
+            similarity_scores = util.cos_sim(embeddings1, embeddings2)
+
+            return pd.DataFrame(similarity_scores.numpy(), columns=["similarity_score"])
+
+
+    # Save a pretrained model to a local directory so it can be logged as an artifact
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    model_path = "/tmp/sentence_transformers_model"
+    model.save(model_path)
+
+    # Log the custom pyfunc model; "model_path" is the artifact key read in load_context
+    with mlflow.start_run():
+        model_info = mlflow.pyfunc.log_model(
+            artifact_path="document_similarity_model",
+            python_model=DocumentSimilarityModel(),
+            artifacts={"model_path": model_path},
+        )
+
+    loaded = mlflow.pyfunc.load_model(model_info.model_uri)  # reload as a generic pyfunc model
+
+    # Test prediction: a single row holding the pair of documents to compare
+    df = pd.DataFrame(
+        {
+            "doc1": ["Sentence Transformers is a wonderful package!"],
+            "doc2": ["MLflow is pretty great too!"],
+        }
+    )
+
+    result = loaded.predict(df)
+    print(result)
+
+This will generate the similarity score for the documents passed, as shown below:
+
+.. code-block:: bash
+
+       similarity_score
+    0          0.275423