In [0]:
%pip install --upgrade databricks-sdk

In [0]:
# Restart the notebook's Python process.
# NOTE(review): presumably needed so the databricks-sdk upgrade installed by the
# %pip cell above is the version that gets imported afterwards — confirm.
dbutils.library.restartPython()

In [0]:
# Per-user schema name: the part of current_user() before '@', with every '.'
# replaced by '_'. The query yields a single row with a single column.
schema_use = spark.sql("SELECT REPLACE(SPLIT(current_user(), '@')[0], '.', '_')").first()[0]
schema_use

In [0]:
# Directory the files are copied from (raw synthetic FHIR output, judging by the path).
source_volume = "/Volumes/fhir_workshop/synthea/synthetic_files_raw/output/fhir/"
# Directory the files are copied to: the "landing" path under the per-user schema
# name held in schema_use.
target_volume = f"/Volumes/fhir_workshop/{schema_use}/landing/"
# Show the resolved target path as the cell output.
target_volume

In [0]:
import mlflow
import shutil
import glob
import os

In [0]:
class VolumeCopyFiles(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc "model" that copies files from one directory to another.

    Each row of the input DataFrame describes one copy job through the columns
    ``source_volume``, ``target_volume`` and ``file_pattern``; ``predict``
    returns one status string per row.
    """

    def load_context(self, context):
        """Nothing to load: the model has no artifacts and uses only the standard library.

        The modules are imported where they are used (in ``copy_files``); an
        ``import`` statement placed here would only bind function-local names.
        """

    def copy_files(self, source_volume: str, target_volume: str, file_pattern: str = None):
        """Copy the files in ``source_volume`` that match ``file_pattern`` into ``target_volume``.

        Args:
            source_volume: Directory to copy from; a trailing slash is optional.
            target_volume: Directory to copy to; a trailing slash is optional.
                It is created if it does not exist and there is something to copy.
            file_pattern: Glob pattern relative to ``source_volume``. ``None``,
                an empty string or a non-string value (e.g. NaN coming from a
                pandas row) selects every file (``'*'``).

        Returns:
            The message ``"Copied <n> files."`` where ``<n>`` is the number of
            files actually copied.

        Raises:
            OSError: If a file cannot be removed or copied.
        """
        # Imported locally so the method is self-contained wherever the pickled
        # model is loaded.
        import glob
        import os
        import shutil

        # Normalise both directories so that they end with a slash.
        if not source_volume.endswith('/'):
            source_volume += '/'

        if not target_volume.endswith('/'):
            target_volume += '/'

        # A missing pattern means "everything".
        if not isinstance(file_pattern, str) or not file_pattern:
            file_pattern = '*'

        matches = glob.glob(os.path.join(source_volume, file_pattern))
        # glob also returns sub-directories, which shutil.copy2 cannot copy.
        files = [path for path in matches if os.path.isfile(path)]

        if files:
            os.makedirs(target_volume, exist_ok=True)

        copied = 0
        for file in files:
            target_file = os.path.join(target_volume, os.path.basename(file))
            if os.path.exists(target_file):
                if os.path.samefile(file, target_file):
                    # Source and target are the same file: removing the target
                    # would delete the source before it could be copied.
                    continue
                os.remove(target_file)
            shutil.copy2(file, target_file)
            copied += 1

        return f"Copied {copied} files."

    def predict(self, context, model_input, params = None):
        """Run ``copy_files`` once per row of ``model_input``.

        Args:
            context: MLflow model context (unused).
            model_input: Table-like object exposing ``itertuples`` whose rows
                carry ``source_volume``, ``target_volume`` and ``file_pattern``
                attributes (a pandas DataFrame in this notebook).
            params: Optional inference parameters (unused).

        Returns:
            A list with one status string per input row, in row order.
        """
        return [
            self.copy_files(
                source_volume=row.source_volume,
                target_volume=row.target_volume,
                file_pattern=row.file_pattern,
            )
            for row in model_input.itertuples(index=False)
        ]

In [0]:
import pandas as pd

# Single-row input frame: one copy job described by its source directory,
# target directory and file pattern.
df = pd.DataFrame(
    [
        {
            "source_volume": source_volume,
            "target_volume": target_volume,
            "file_pattern": "A*.json",
        }
    ]
)
df

In [0]:
import pandas as pd

# The single-row Pandas DataFrame `df` created above is used to infer the
# model signature, which is mandatory for the Unity Catalog model registry.
from mlflow.models import infer_signature

In [0]:
# Infer the model signature from the sample input frame and a sample output
# (a list holding one string).
signature = infer_signature(model_input=df, model_output=["volume_copy_files"])
signature

In [0]:
with mlflow.start_run() as run:
    # MLflow's default pyfunc environment is used; the model itself relies only
    # on the standard-library modules glob, shutil and os.
    conda_env = mlflow.pyfunc.get_default_conda_env()

    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=VolumeCopyFiles(),
        signature=signature,
        conda_env=conda_env,
    )

    # Remember the run so the logged model can be loaded and registered later.
    run_id = run.info.run_id

In [0]:
# URI of the model artifact logged under the run identified by run_id.
logged_model = f"runs:/{run_id}/model"

# Load it back as a generic PyFuncModel ...
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

# ... and run it on the sample Pandas DataFrame.
loaded_model.predict(df)

In [0]:
# Three-level Unity Catalog name of the registered model: <catalog>.<schema>.<model>.
catalog = "fhir_workshop"
schema = "synthea"
model_name = "volume_copy_files"
full_model_name = ".".join([catalog, schema, model_name])

# Point the MLflow registry at Unity Catalog, then register the logged model there.
mlflow.set_registry_uri("databricks-uc")

mlflow.register_model(model_uri=f"runs:/{run_id}/model", name=full_model_name)

In [0]:
client = mlflow.MlflowClient()

# Search for all versions of the model
model_version_infos = client.search_model_versions(f"name = '{full_model_name}'")

# Find the latest version. Versions are compared numerically: MLflow can report
# `version` as a string, and a plain string comparison would rank "9" above "10".
latest_version = max(model_version_infos, key=lambda info: int(info.version)).version
print(f"""
    The latest version of the model {full_model_name} in Unity Catalog is version {latest_version}.
""")

In [0]:
# Load Unity Catalog model as a PyFuncModel.
# Load the registered Unity Catalog model version as a PyFuncModel.
registered_model_uri = f"models:/{full_model_name}/{latest_version}"
loaded_model = mlflow.pyfunc.load_model(model_uri=registered_model_uri)

# Run it on the sample Pandas DataFrame.
loaded_model.predict(df)

In [0]:
# Set the model alias
# Attach the "Staged" alias to the latest registered version of the model.
client.set_registered_model_alias(
    name=full_model_name,
    alias="Staged",
    version=latest_version,
)

## Use the GUI to Serve the Model with Mosaic AI Model Serving 

Then test the model with the browser query using:

```json
{
    "dataframe_split": {
        "index": [0],
        "columns": ["source_volume", "target_volume", "file_pattern"],
        "data": [ ["/Volumes/fhir_workshop/synthea/synthetic_files_raw/output/fhir/","/Volumes/fhir_workshop/odl_instructor_1452233/landing/", "A*.json"]
        ]
    }
}
```

In [0]:
# Create the Databricks workspace client used to query Model Serving; without
# this the name `w` is undefined when the notebook is run top to bottom.
# NOTE(review): WorkspaceClient() with no arguments relies on the SDK's default
# authentication — confirm that is what this environment should use.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Look up the serving endpoint that hosts the volume_copy_files model.
w.serving_endpoints.get("fhir_workshop_synthea_volume_copy_files")

In [0]:
%sql
-- Per-user schema name: the part of current_user() before '@', with every '.' replaced by '_'.
DECLARE OR REPLACE VARIABLE schema_use STRING DEFAULT REPLACE(SPLIT(current_user(), '@')[0], '.', '_');
-- Landing path for that schema, built by concatenating schema_use into the volume path.
DECLARE OR REPLACE VARIABLE target_volume STRING DEFAULT "/Volumes/fhir_workshop/" || schema_use || "/landing/";

-- Show the resolved target path.
select target_volume;

In [0]:
%sql
-- Call the 'fhir_workshop_synthea_volume_copy_files' endpoint through ai_query.
-- The request struct carries the three fields source_volume, target_volume and
-- file_pattern; target_volume (unquoted) is the variable declared in the
-- previous %sql cell. The response is requested as a STRING.
SELECT ai_query(
  'fhir_workshop_synthea_volume_copy_files'
  ,request => named_struct(
    "source_volume", "/Volumes/fhir_workshop/synthea/synthetic_files_raw/output/fhir/"
    ,"target_volume", target_volume
    ,"file_pattern", "A*.json")
  ,returnType => "STRING"
  ) as results;