In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import os
import json
import wandb
import secrets
from pathlib import Path

from ray.job_submission import JobSubmissionClient

from datasets import DatasetDict, load_dataset, load_from_disk, Dataset

import flamingo
from flamingo.jobs.finetuning import FinetuningJobConfig
from flamingo.jobs.lm_harness import LMHarnessJobConfig
from flamingo.jobs.simple import SimpleJobConfig
from flamingo.jobs.utils import FlamingoJobType
from flamingo.integrations.huggingface import AutoModelConfig
from flamingo.integrations.wandb import WandbArtifactConfig, WandbRunConfig, ArtifactType, ArtifactURIScheme
from flamingo.integrations.wandb.utils import wandb_init_from_config, log_artifact_from_path

## Development

In [41]:
# W&B run settings used by the development cells in this section.
run_config = WandbRunConfig(
    entity="mozilla-ai",
    project="sfriedowitz-dev",
    name="test-dataset-with-actual-upload",
)

# Echo the resulting config as the cell output.
run_config

WandbRunConfig(run_id='vxbvw154', name='test-dataset-with-actual-upload', project='sfriedowitz-dev', run_group=None, entity='mozilla-ai')

In [42]:
# Absolute on-disk location where the test dataset is written.
dataset_dir = Path("datasets") / "test_dataset"
dataset_path = str(dataset_dir.absolute())

# Fetch the train split of a small public dataset and persist it to dataset_path.
dataset = load_dataset("fka/awesome-chatgpt-prompts", split="train")
dataset.save_to_disk(dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/153 [00:00<?, ? examples/s]

In [43]:
# Inside a context created from run_config (tagged as a preprocessing job),
# log the saved dataset directory as a DATASET artifact.
with wandb_init_from_config(run_config, job_type=FlamingoJobType.PREPROCESSING):
    log_artifact_from_path(
        artifact_type=ArtifactType.DATASET,
        path=dataset_path,
        name="test-dataset-actual-upload",
    )

[34m[1mwandb[0m: Adding directory to artifact (/Users/sfriedowitz/workspace/prototyping/debugging/datasets/test_dataset)... Done. 0.0s


VBox(children=(Label(value='0.109 MB of 0.109 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

## Debugging

In [36]:
# Address handed to the Ray job submission client.
# NOTE(review): this IP is environment-specific; update it when targeting another cluster.
RAY_DASHBOARD_URL = "http://10.146.174.91:8265"

# Plain string literal: the original used an f-string with no placeholders.
client = JobSubmissionClient(RAY_DASHBOARD_URL)

# Show the address the client is configured with as the cell output.
client.get_address()

'http://10.146.174.91:8265'

In [37]:
# Load the finetuning job config from YAML, then pretty-print it as JSON for inspection.
config = FinetuningJobConfig.from_yaml_file("configs/finetuning.yaml")

config_json = config.json(indent=2)
print(config_json)

{
  "model": {
    "path": "distilgpt2",
    "trust_remote_code": false,
    "torch_dtype": null
  },
  "dataset": {
    "path": "fka/awesome-chatgpt-prompts",
    "split": "train",
    "text_field": "prompt",
    "test_size": 0.2,
    "seed": null
  },
  "tokenizer": {
    "path": "distilgpt2",
    "trust_remote_code": null,
    "use_fast": null
  },
  "quantization": null,
  "adapter": null,
  "tracking": {
    "run_id": "zw3zkqbp",
    "name": "test-post-refactor",
    "project": "sfriedowitz-dev",
    "run_group": null,
    "entity": "mozilla-ai"
  },
  "trainer": {
    "max_seq_length": 512,
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "learning_rate": null,
    "weight_decay": null,
    "gradient_accumulation_steps": null,
    "gradient_checkpointing": null,
    "evaluation_strategy": null,
    "eval_steps": null,
    "logging_strategy": "epoch",
    "logging_steps": 1.0,
    "save_strategy": "epoch",
    "save_step

In [39]:
# Take the flamingo package directory directly from the imported module instead of
# rebuilding it as <repo>/src/flamingo, which only holds for a src-layout checkout.
flamingo_module = Path(flamingo.__file__).parent

# Two levels above the package directory (same value as Path(flamingo.__file__).parents[2]);
# this is the repository root when the package lives at <repo>/src/flamingo.
flamingo_repo = flamingo_module.parents[1]

# Display the package directory as the cell output.
flamingo_module

PosixPath('/Users/sfriedowitz/workspace/flamingo/src/flamingo')

In [40]:
# Command the submitted job runs.
# NOTE(review): finetuning.yaml presumably resolves inside the uploaded "configs" working_dir; confirm.
entrypoint = "python -m flamingo run finetuning --config finetuning.yaml"

# Runtime environment sent with the job: the local config directory, the W&B API key
# read from this process's environment (raises KeyError if unset), the local flamingo
# package directory, and a pip requirements file.
runtime_env = dict(
    working_dir="configs",
    env_vars={"WANDB_API_KEY": os.environ["WANDB_API_KEY"]},
    py_modules=[str(flamingo_module)],
    pip="requirements.txt",
)

# The value returned by submit_job is shown as the cell output.
client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env)

2024-01-22 13:25:54,392	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_9e7a7ee1f486946b.zip.
2024-01-22 13:25:54,394	INFO packaging.py:530 -- Creating a file package for local directory 'configs'.
2024-01-22 13:25:54,781	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_c5dedd8d5e91ab6d.zip.
2024-01-22 13:25:54,784	INFO packaging.py:530 -- Creating a file package for local directory '/Users/sfriedowitz/workspace/flamingo/src/flamingo'.


'raysubmit_1fRSCCdqCFNdFeDL'