This notebook shows a simple example of how to use some of the `flamingo` utilities to pre-process a dataset
and upload it as a W&B artifact.

Generally, this workflow will be performed in a dev environment on a cluster so that the dataset files
can be saved on a shared volume.
However, this notebook can also be run locally for educational purposes to illustrate the basic functions.

(1) Load and pre-process the base dataset from HuggingFace

In [None]:
from datasets import load_dataset

# Identifier of the HuggingFace dataset used as the starting point for this example
base_dataset = "fka/awesome-chatgpt-prompts"
# Load only the "train" split of that dataset
dataset = load_dataset(base_dataset, split="train")

# Last expression of the cell: display the loaded dataset
dataset

In [None]:
def preprocess_dataset(examples):
    """Attach a ``text`` entry holding every prompt reversed.

    Each item of ``examples["prompt"]`` is reversed via slicing and the
    resulting list is stored under ``examples["text"]``. The same
    ``examples`` mapping that was passed in is returned.
    """
    # Dummy transformation: reverse each prompt
    examples["text"] = [prompt[::-1] for prompt in examples["prompt"]]
    return examples


# Map some preprocessing function over the base dataset (e.g., for prompt formatting).
# `batched=True` passes batches of examples to the function, and `remove_columns` is given
# every column name of the input dataset.
# NOTE(review): `dataset` is rebound to the mapped result, so re-running this cell presumably
# fails because the `prompt` column is no longer present; re-run the load cell first. TODO confirm.
dataset = dataset.map(preprocess_dataset, batched=True, remove_columns=dataset.column_names)

# Last expression of the cell: display the mapped dataset
dataset

(2) Save the dataset to disk

In [None]:
from pathlib import Path

# Add an actual path here to where you want the data to live on shared storage.
# As written, `example_dataset` is made absolute (relative to the current working directory)
# and stored as a string for reuse in the logging cells below.
dataset_save_path = str(Path("example_dataset").absolute())

# Write the pre-processed dataset to that directory
dataset.save_to_disk(dataset_save_path)

(3a) Log the dataset directory as a reference artifact using W&B directly

In [None]:
import wandb

# Open a W&B run for the preprocessing job and log the artifact through the run handle
with wandb.init(
    job_type="preprocessing",
    entity="mozilla-ai",
    project="example-project",
    name="flamingo-preprocessing-example",
) as run:
    # Dataset artifact that points at the saved directory via a file:// URI
    artifact = wandb.Artifact(type="dataset", name="example-dataset-reference")
    artifact.add_reference(uri=f"file://{dataset_save_path}")
    run.log_artifact(artifact)

(3b) Log the dataset directory as an artifact using flamingo helper functions

In [1]:
from flamingo.integrations.wandb import (
    ArtifactType,
    ArtifactURIScheme,
    WandbRunConfig,
    log_directory_contents,
    log_directory_reference,
    wandb_init_from_config,
)
from flamingo.jobs.utils import FlamingoJobType

# Run settings shared by both logging calls below
run_config = WandbRunConfig(
    name="flamingo-preprocessing-example",
    project="example-project",
    entity="mozilla-ai",
)

# Requires `dataset_save_path` from the save-to-disk cell in step (2).
# Artifact names previously read "artfact"; the spelling is corrected to "artifact".
with wandb_init_from_config(run_config, job_type=FlamingoJobType.PREPROCESSING):
    # Log a reference to the directory contents
    log_directory_reference(
        dir_path=dataset_save_path,
        artifact_name="example-dataset-artifact-reference",
        artifact_type=ArtifactType.DATASET,
        scheme=ArtifactURIScheme.FILE,
    )
    # Log and upload the directory contents
    log_directory_contents(
        dir_path=dataset_save_path,
        artifact_name="example-dataset-artifact-upload",
        artifact_type=ArtifactType.DATASET,
    )