In [1]:
#meta:tag=hide
%env METAFLOW_PROFILE=dev-valay
%env METAFLOW_UI_URL=


env: METAFLOW_PROFILE=dev-valay
env: METAFLOW_UI_URL=


In [2]:
#meta:tag=hide
import os
# Create the scratch directory that the `%%writefile` cells write the flow files into.
os.makedirs("temp_files", exist_ok=True)


# `@huggingface_hub`

<!-- START doctoc -->
<!-- END doctoc -->


The `@huggingface_hub` decorator simplifies the process of downloading, caching, and managing models from the Hugging Face Hub. It provides seamless integration between Metaflow's datastore and the Hugging Face Hub's model repository. This decorator is syntactic sugar over the `@checkpoint` decorator that makes it easy to cache/load models from the Hugging Face Hub. All models are stored the same way the `@checkpoint` decorator stores checkpoints, i.e., objects are stored under the namespace and step name. The decorator injects a `huggingface_hub` object into the `current` singleton. This object has two main capabilities:

1. Exposes a `loaded` property that returns the path to the models loaded via the `@huggingface_hub` decorator's `load` parameter.
2. Provides a wrapper over the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library's [snapshot_download](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/file_download#huggingface_hub.snapshot_download) function. The `current.huggingface_hub.snapshot_download` function returns a reference to the model stored in the datastore. This reference can be used in subsequent steps using the `@model` decorator.


## Usage Patterns

### Loading Static Repos For A `@step`

Some Flows might require statically hard-coded models/datasets coming from the Hugging Face Hub. The `load` parameter of the `@huggingface_hub` decorator can be used to load these models/datasets. This style of loading is very useful when the models/datasets don't change often and can be hard-coded into the Flow. The models specified in the `load` parameter are downloaded from the Hugging Face Hub and stored in the datastore if they are not already present. The path to each loaded model/dataset is accessible via the `loaded` property of the `huggingface_hub` object. The example below shows how to load a static model from the Hugging Face Hub and access it in a `@step`.

The `load` parameter accepts several forms of arguments. The simplest form is a list of strings, each representing a `repo_id`. It can also take a list of `(repo_id, path)` tuples to load a repo into a specific local directory, or a list of dictionaries that provide all the arguments to the [snapshot_download](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/file_download#huggingface_hub.snapshot_download) function.

In [3]:
%%writefile temp_files/hub_deco_flow.py
#meta:tag=hide_output
from metaflow import FlowSpec, step, huggingface_hub, pypi_base, current
import os

@pypi_base(packages={"huggingface-hub": "0.16.4"})
class SimpleHFFlow(FlowSpec):
    """Demonstrate the argument forms accepted by `@huggingface_hub(load=...)`.

    Every step loads `bert-base-uncased` with a different form of the `load`
    argument, then prints the local path of the loaded repo and the files
    found there. `os` comes from the module-level import, so the steps do not
    re-import it.
    """

    @huggingface_hub(load=["bert-base-uncased"])
    @step
    def start(self):
        """Load a repo specified only by its `repo_id` string."""
        # `loaded` is keyed by repo_id and yields the local path of the repo.
        model_path = current.huggingface_hub.loaded["bert-base-uncased"]
        print(f"Model loaded at: {model_path}")
        print(f"Contents: {os.listdir(model_path)}")
        self.next(self.load_to_path)

    @huggingface_hub(load=[("bert-base-uncased", "./model_directory")])
    @step
    def load_to_path(self):
        """Load a repo specified as a `(repo_id, local_path)` tuple."""
        # The lookup key is still the repo_id, not the requested directory.
        model_path = current.huggingface_hub.loaded["bert-base-uncased"]
        print(f"Model loaded at: {model_path}")
        print(f"Contents: {os.listdir(model_path)}")
        self.next(self.end)

    @huggingface_hub(load=[
        {
            "repo_id": "bert-base-uncased",
            "allow_patterns": ["*.json", "tokenizer.txt"],
            "repo_type": "model"
        },
    ])
    @step
    def end(self):
        """Load a repo specified as a dict of `snapshot_download` arguments."""
        # Only files matching `allow_patterns` are expected in the listing.
        model_path = current.huggingface_hub.loaded["bert-base-uncased"]
        print(f"Model loaded at: {model_path}")
        print(f"Contents: {os.listdir(model_path)}")
        

if __name__ == "__main__":
    # Instantiate the flow when this file is executed as a script.
    SimpleHFFlow()

Overwriting temp_files/hub_deco_flow.py


In [4]:
#meta:tag=hide_input
#meta:show_steps=start,load_to_path,end
! python temp_files/hub_deco_flow.py --environment=pypi run

[35m[1mMetaflow 2.12.36.post9-git09d02cb-dirty+obcheckpoint(0.1.4);ob(v1)[0m[35m[22m executing [0m[31m[1mSimpleHFFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:valay@outerbounds.co[0m[35m[22m[K[0m[35m[22m[0m


[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m


[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2024-12-11 07:13:34.971 [0m[22mBootstrapping virtual environment(s) ...[0m


[35m2024-12-11 07:13:35.093 [0m[22mVirtual environment(s) bootstrapped![0m


[35m2024-12-11 07:13:37.881 [0m[1mWorkflow starting (run-id 7472):[0m


[35m2024-12-11 07:13:38.765 [0m[32m[7472/start/47522 (pid 2223466)] [0m[1mTask is starting.[0m


[35m2024-12-11 07:13:40.470 [0m[32m[7472/start/47522 (pid 2223466)] [0m[22m[@huggingface_hub] Loading model from datastore to /tmp/metaflow_hf_9b5c6e8800_kbi7fnx5. Model being loaded: mf.huggingface_hub/checkpoints/artifacts/SimpleHFFlow/start/26ec4b03ee0e/a92ae9615600/8bcbe12d.0.9b5c6e8800.0[0m


[35m2024-12-11 07:14:05.718 [0m[32m[7472/start/47522 (pid 2223466)] [0m[22mModel loaded at: /tmp/metaflow_hf_9b5c6e8800_kbi7fnx5[0m
[35m2024-12-11 07:14:05.719 [0m[32m[7472/start/47522 (pid 2223466)] [0m[22mContents: ['README.md', 'config.json', '.gitattributes', 'LICENSE', 'flax_model.msgpack', 'tf_model.h5', 'tokenizer.json', 'pytorch_model.bin', 'vocab.txt', 'model.onnx', 'rust_model.ot', 'model.safetensors', 'tokenizer_config.json'][0m


[35m2024-12-11 07:14:06.810 [0m[32m[7472/start/47522 (pid 2223466)] [0m[1mTask finished successfully.[0m


[35m2024-12-11 07:14:07.215 [0m[32m[7472/load_to_path/47523 (pid 2223643)] [0m[1mTask is starting.[0m


[35m2024-12-11 07:14:09.521 [0m[32m[7472/load_to_path/47523 (pid 2223643)] [0m[22m[@huggingface_hub] Loading model from datastore to ./model_directory. Model being loaded: mf.huggingface_hub/checkpoints/artifacts/SimpleHFFlow/load_to_path/26ec4b03ee0e/1a69a1ab0540/4b90315a.0.9b5c6e8800.0[0m


[35m2024-12-11 07:14:34.421 [0m[32m[7472/load_to_path/47523 (pid 2223643)] [0m[22mModel loaded at: ./model_directory[0m
[35m2024-12-11 07:14:34.421 [0m[32m[7472/load_to_path/47523 (pid 2223643)] [0m[22mContents: ['README.md', 'config.json', '.gitattributes', 'LICENSE', 'flax_model.msgpack', 'tf_model.h5', 'tokenizer.json', 'pytorch_model.bin', 'coreml', 'vocab.txt', 'model.onnx', 'rust_model.ot', 'model.safetensors', 'tokenizer_config.json'][0m


[35m2024-12-11 07:14:35.200 [0m[32m[7472/load_to_path/47523 (pid 2223643)] [0m[1mTask finished successfully.[0m


[35m2024-12-11 07:14:35.577 [0m[32m[7472/end/47524 (pid 2223804)] [0m[1mTask is starting.[0m


[35m2024-12-11 07:14:37.338 [0m[32m[7472/end/47524 (pid 2223804)] [0m[22m[@huggingface_hub] Loading model from datastore to /tmp/metaflow_hf_9b5c6e8800_yfsk2bdb. Model being loaded: mf.huggingface_hub/checkpoints/artifacts/SimpleHFFlow/end/26ec4b03ee0e/24105546e482/73eef1dc.0.9b5c6e8800.0[0m


[35m2024-12-11 07:14:38.566 [0m[32m[7472/end/47524 (pid 2223804)] [0m[22mModel loaded at: /tmp/metaflow_hf_9b5c6e8800_yfsk2bdb[0m
[35m2024-12-11 07:14:38.566 [0m[32m[7472/end/47524 (pid 2223804)] [0m[22mContents: ['config.json', 'tokenizer.json', 'tokenizer_config.json'][0m


[35m2024-12-11 07:14:39.334 [0m[32m[7472/end/47524 (pid 2223804)] [0m[1mTask finished successfully.[0m


[35m2024-12-11 07:14:39.457 [0m[1mDone![0m


### Loading HF Repos Dynamically

In many cases, Hugging Face models or datasets might be passed down as parameters to the Flow, which makes it challenging to load them using the `load` parameter of the `@huggingface_hub` decorator. For this case, `current.huggingface_hub` provides a `snapshot_download` function that downloads the model/dataset from the Hugging Face Hub and returns a reference to it. This reference can be used in subsequent steps using the `@model` decorator. The core difference between the two is that the `load` parameter is used to load static models/datasets, while the `snapshot_download` function returns a reference that can be loaded in future steps. If the `force_download` parameter is passed to the `snapshot_download` function, it will bust the cache, download the model/dataset again, and store it in the datastore.

The example below shows how to download a model dynamically from the Hugging Face Hub in one `@step` and load it in a subsequent `@step` using the `@model` decorator.

In [5]:
%%writefile temp_files/hub_deco_flow_2.py
#meta:tag=hide_output
from metaflow import FlowSpec, step, current, huggingface_hub, model
import os

class SimpleHFFlow(FlowSpec):
    """Download a Hugging Face repo in one step and load it in the next.

    `start` obtains a reference through `current.huggingface_hub.snapshot_download`
    and stores it on `self`; `end` names that attribute in `@model(load=...)`
    and reads the loaded path from `current.model.loaded`.
    """

    @huggingface_hub
    @step
    def start(self):
        """Snapshot the repo and store the returned reference on `self`."""
        # Download a small model from HuggingFace Hub
        self.hf_model_reference = current.huggingface_hub.snapshot_download(
            repo_id="bert-base-uncased",
            allow_patterns=["*.json"]  # Restrict the download to JSON files to keep it light
        )
        # The returned reference exposes the datastore key under "key".
        print(f"Model Reference saved with key : {self.hf_model_reference['key']}")
        self.next(self.end)

    @model(load="hf_model_reference")
    @step
    def end(self):
        """Print where the referenced model was loaded and list its files."""
        loaded_path = current.model.loaded["hf_model_reference"]
        print(f"Model loaded at: {loaded_path}")
        print(f"Contents: {os.listdir(loaded_path)}")

if __name__ == "__main__":
    # Instantiate the flow when this file is executed as a script.
    SimpleHFFlow()

Overwriting temp_files/hub_deco_flow_2.py


In [6]:
#meta:tag=hide_input
#meta:show_steps=start,end
! python temp_files/hub_deco_flow_2.py run

[35m[1mMetaflow 2.12.36.post9-git09d02cb-dirty+obcheckpoint(0.1.4);ob(v1)[0m[35m[22m executing [0m[31m[1mSimpleHFFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:valay@outerbounds.co[0m[35m[22m[K[0m[35m[22m[0m


[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m


[32m[1m    Pylint is happy![K[0m[32m[1m[0m


[35m2024-12-11 07:14:45.090 [0m[1mWorkflow starting (run-id 7473):[0m


[35m2024-12-11 07:14:46.220 [0m[32m[7473/start/47526 (pid 2223882)] [0m[1mTask is starting.[0m


[35m2024-12-11 07:14:48.438 [0m[32m[7473/start/47526 (pid 2223882)] [0m[22mModel Reference saved with key : mf.huggingface_hub/checkpoints/artifacts/SimpleHFFlow/start/26ec4b03ee0e/a92ae9615600/8bcbe12d.0.9b5c6e8800.0[0m


[35m2024-12-11 07:14:49.258 [0m[32m[7473/start/47526 (pid 2223882)] [0m[1mTask finished successfully.[0m


[35m2024-12-11 07:14:49.563 [0m[32m[7473/end/47527 (pid 2223935)] [0m[1mTask is starting.[0m


[35m2024-12-11 07:14:50.743 [0m[32m[7473/end/47527 (pid 2223935)] [0m[22m[@model] Loading Artifact with name `hf_model_reference` [type:checkpoint] with key: mf.huggingface_hub/checkpoints/artifacts/SimpleHFFlow/start/26ec4b03ee0e/a92ae9615600/8bcbe12d.0.9b5c6e8800.0[0m


[35m2024-12-11 07:15:16.468 [0m[32m[7473/end/47527 (pid 2223935)] [0m[22m[@model] Loaded artifact `hf_model_reference[type:checkpoint]` in 25.73 seconds[0m


[35m2024-12-11 07:15:16.491 [0m[32m[7473/end/47527 (pid 2223935)] [0m[22mModel loaded at: /tmp/metaflow_models_hf_model_reference_4yejtsvy[0m
[35m2024-12-11 07:15:16.491 [0m[32m[7473/end/47527 (pid 2223935)] [0m[22mContents: ['README.md', 'config.json', '.gitattributes', 'LICENSE', 'flax_model.msgpack', 'tf_model.h5', 'tokenizer.json', 'pytorch_model.bin', 'vocab.txt', 'model.onnx', 'rust_model.ot', 'model.safetensors', 'tokenizer_config.json'][0m


[35m2024-12-11 07:15:20.901 [0m[32m[7473/end/47527 (pid 2223935)] [0m[1mTask finished successfully.[0m


[35m2024-12-11 07:15:21.016 [0m[1mDone![0m
