This notebook illustrates how to use LM Buddy as a library to run jobs directly on the host machine.

Jobs are fully specified by a `lm_buddy.jobs.configs.LMBuddyJobConfig`
and are executed with the `lm_buddy.run_job` function.

**Warning**: This workflow is still considered experimental.
Some jobs depend on external services (e.g., W&B, Ray cluster) and host-machine GPU resources,
and may not work without a properly configured local environment.

## Imports

In [8]:
import lm_buddy
from lm_buddy.jobs.configs import (
    FinetuningJobConfig,
    FinetuningRayConfig,
    LMHarnessJobConfig,
    LMHarnessEvaluatorConfig,
)
from lm_buddy.integrations.huggingface import (
    HuggingFaceRepoConfig,
    AutoModelConfig,
    TextDatasetConfig,
    TrainerConfig,
    AdapterConfig,
)
from lm_buddy.integrations.wandb import WandbRunConfig

# Third party
import torch
from peft import PeftType

## Finetuning

In [16]:
# HuggingFace assets: the repositories that supply the base model and the dataset
distilgpt2_repo = HuggingFaceRepoConfig(repo_id="distilgpt2")
imdb_repo = HuggingFaceRepoConfig(repo_id="imdb")

# Base model, requested in bfloat16
model_config = AutoModelConfig(torch_dtype=torch.bfloat16, load_from=distilgpt2_repo)

# Small slice of the train split ("train[:100]") to keep the example quick;
# samples are read from the "text" field
dataset_config = TextDatasetConfig(
    text_field="text",
    split="train[:100]",
    load_from=imdb_repo,
)

# One epoch over sequences capped at 256 tokens;
# logging is step-based (every step), saving is epoch-based
trainer_config = TrainerConfig(
    num_train_epochs=1,
    max_seq_length=256,
    logging_steps=1,
    logging_strategy="steps",
    save_steps=1,
    save_strategy="epoch",
)

# LoRA adapter for causal language modeling: rank 8, alpha 16, dropout 0.2
adapter_config = AdapterConfig(
    task_type="CAUSAL_LM",
    peft_type="LORA",
    lora_dropout=0.2,
    lora_alpha=16,
    r=8,
)

In [17]:
# Job setup

# W&B run that tracks the finetuning job.
# NOTE(review): the project name looks specific to the original author's W&B
# account -- change it to a project you can write to before running.
tracking_config = WandbRunConfig(
    name="example-finetuning",
    project="sfriedowitz-dev",
)

# Ray settings for the finetuning job
ray_config = FinetuningRayConfig(
    use_gpu=False,  # In case the local machine does not have a GPU
    num_workers=2,
)

# Bundle the HuggingFace assets, tracking, and Ray settings into one job specification
job_config = FinetuningJobConfig(
    model=model_config,
    dataset=dataset_config,
    trainer=trainer_config,
    adapter=adapter_config,
    tracking=tracking_config,
    ray=ray_config,
)

In [18]:
# Run the finetuning job described by `job_config` on the host machine.
# The config carries W&B tracking and Ray settings, so this call may fail
# unless those services are usable from the local environment.

lm_buddy.run_job(job_config)

2024-02-16 15:09:37,329	ERROR tune_controller.py:1374 -- Trial task failed for trial TorchTrainer_08e3c_00000
Traceback (most recent call last):
  File "/Users/sfriedowitz/miniconda3/envs/lm-buddy/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/sfriedowitz/miniconda3/envs/lm-buddy/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/sfriedowitz/miniconda3/envs/lm-buddy/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/sfriedowitz/miniconda3/envs/lm-buddy/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): [36mray::_Inner.train()[39m (pid=96385, ip=127.0.0.1, actor_id=bf3aee7e1968b8c89f1f5e8a01000000, repr=TorchTrainer)
  

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("/Users/sfriedowitz/ray_results/example-fietuning")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

## Evaluation