In [0]:
# Create the sentence-transformer model and log/register it with MLflow (can be skipped if the model already exists)

import mlflow
import mlflow.pyfunc
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

class SentenceTransformerModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that turns a column of text into sentence embeddings.

    ``load_context`` loads the SentenceTransformer named by ``MODEL_NAME``;
    ``predict`` encodes the first column of the input DataFrame and returns
    one row of embedding values per input row.
    """

    # Model identifier passed to SentenceTransformer in load_context.
    MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'  # Use your model name here

    # Zero-vector width used only when the loaded model does not report its
    # own embedding dimension (384 is assumed for this model -- TODO confirm).
    FALLBACK_EMBEDDING_DIM = 384

    def load_context(self, context):
        """Load the sentence transformer model and keep it on ``self.model``.

        Args:
            context: Context object supplied by MLflow; not used here.
        """
        self.model = SentenceTransformer(self.MODEL_NAME)

    def _zero_embedding(self):
        """Return the all-zero placeholder vector used for texts that fail to encode."""
        dim = self.model.get_sentence_embedding_dimension() or self.FALLBACK_EMBEDDING_DIM
        return np.zeros(dim)

    def _encode_one(self, text):
        """Encode a single text, returning a zero vector (and printing) on failure."""
        try:
            return self.model.encode(text, show_progress_bar=False)
        except Exception as e:
            # Deliberate best-effort: one bad row must not fail the whole request.
            print(f"Error encoding text: {text}. Error: {str(e)}")
            return self._zero_embedding()

    def predict(self, context, model_input):
        """Embed the text in the first column of ``model_input``.

        Args:
            context: Context object supplied by MLflow; not used here.
            model_input: DataFrame whose first column is assumed to hold the text.

        Returns:
            A DataFrame with one row per input row and one column per
            embedding component. Rows whose text cannot be encoded are
            filled with zeros.
        """
        texts = model_input.iloc[:, 0].tolist()  # Assuming the first column is the text
        if not texts:
            return pd.DataFrame([])

        try:
            # One batched call lets the model process all rows together
            # instead of invoking it once per row.
            vectors = self.model.encode(texts, show_progress_bar=False)
        except Exception:
            # At least one value broke the batch; retry row by row so that
            # only the offending rows are replaced by zero vectors.
            vectors = [self._encode_one(text) for text in texts]

        return pd.DataFrame(list(vectors))

# Build a one-row example input and the embedding the model produces for it;
# the pair is used below to infer the signature logged with the model.
sample_text = "This is a sample sentence"
sample_input = pd.DataFrame({'text': [sample_text]})
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode the example sentence and wrap the vector as a single-row DataFrame
sample_vector = model.encode(sample_text)
sample_output = pd.DataFrame([sample_vector.tolist()])

# Derive the model signature from the example input/output pair
from mlflow.models.signature import infer_signature
signature = infer_signature(sample_input, sample_output)

run_name = "all-MiniLM-L6-v2-run"

# Log the pyfunc model inside a named MLflow run and register it
with mlflow.start_run(run_name=run_name) as run:
    mlflow.pyfunc.log_model(
        artifact_path="hugging_face_sentence_transformer_model",
        python_model=SentenceTransformerModel(),
        signature=signature,
        input_example=sample_input,
        # Name under which the model is registered in the registry
        registered_model_name="hugging_face_sentence_transformer_model",
    )

  module = self._original_builtins_import(name, *args, **kwargs)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Successfully registered model 'llm_workspace.default.hugging_face_sentence_transformer_model'.


Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Created version '1' of model 'llm_workspace.default.hugging_face_sentence_transformer_model'.


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2024/10/23 06:06:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run all-MiniLM-L6-v2-run at: https://dbc-077f5204-3af4.cloud.databricks.com/ml/experiments/26fc5ff7060643a7a96d401413b2ab0e/runs/403ec23637954d94b02cabcdab138bfa.
2024/10/23 06:06:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dbc-077f5204-3af4.cloud.databricks.com/ml/experiments/26fc5ff7060643a7a96d401413b2ab0e.
