### Upload HF dataset to OSS

Because the dataset can be large, this time we will use a slightly different approach.

We're going to use the OCI **UploadManager**, which supports **multipart upload**.

In [1]:
import oci
from oci.object_storage import UploadManager
from oci.object_storage.transfer.constants import MEBIBYTE

import os
from os import path

# moved namespace to config.py
from config import (
    NAMESPACE,
)

In [2]:
# Name of the HF dataset; it is also the local directory where the dataset is saved.
HF_DIR = "atco2_hf"

# Bucket where the entire dataset will be saved.
BUCKET = "atco2_hf"

# Settings for the UploadManager:
# PART_SIZE is passed as `part_size` to each upload,
# PARALLEL is passed as `parallel_process_count` to the manager.
PART_SIZE = 2 * MEBIBYTE
PARALLEL = 4

In [3]:
def progress_callback(bytes_uploaded):
    """No-op progress hook handed to the upload call.

    Printing is intentionally disabled to avoid thousands of output lines.
    To trace progress, print ``bytes_uploaded`` from here.
    """
    return None

In [4]:
#
# Build the ObjectStorageClient used for the uploads.
# Resource Principal auth is tried first; if anything in that path fails,
# fall back to API keys read from the default OCI config file.
#
try:
    rps = oci.auth.signers.get_resource_principals_signer()

    # reaching this line means a Resource Principal signer was obtained
    print("Using RP for auth...")

    object_storage = oci.object_storage.ObjectStorageClient(config={}, signer=rps)
except Exception:
    # `except Exception` instead of a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback
    print("Using API Key for auth...")

    default_config = oci.config.from_file()

    # validate the default config file before building the client
    oci.config.validate_config(default_config)

    object_storage = oci.object_storage.ObjectStorageClient(config=default_config)

Using API Key for auth...


In [5]:
# Upload every file found under HF_DIR to BUCKET, keeping the directory
# layout relative to HF_DIR as the object name.

# Fail early if the dataset directory is missing: os.walk() would otherwise
# yield nothing and this cell would silently upload no files.
if not os.path.isdir(HF_DIR):
    raise FileNotFoundError(f"Dataset directory not found: {HF_DIR}")

# The manager does not depend on the file being uploaded, so it is created
# once and reused for all files instead of being rebuilt on every iteration.
upload_manager = UploadManager(
    object_storage, allow_parallel_uploads=True, parallel_process_count=PARALLEL
)

# the loop variable is named `dirpath` (not `path`) so it does not
# overwrite the `path` name imported from `os`
for dirpath, subdirs, files in os.walk(HF_DIR):
    for name in files:
        orig_path = os.path.join(dirpath, name)

        # use_path will be like test/state.json (path relative to HF_DIR);
        # "/" is forced as separator so object names are the same on every OS
        use_path = os.path.relpath(orig_path, HF_DIR).replace(os.sep, "/")
        print(f"Copy {use_path} to bucket {BUCKET}...")

        # copy a single file to the bucket
        response = upload_manager.upload_file(
            NAMESPACE,
            BUCKET,
            use_path,
            orig_path,
            part_size=PART_SIZE,
            progress_callback=progress_callback,
        )

Copy dataset_dict.json to bucket atco2_hf...
Copy test/state.json to bucket atco2_hf...
Copy test/dataset_info.json to bucket atco2_hf...
Copy test/data-00000-of-00001.arrow to bucket atco2_hf...
Copy train/state.json to bucket atco2_hf...
Copy train/dataset_info.json to bucket atco2_hf...
Copy train/data-00000-of-00001.arrow to bucket atco2_hf...
