# Notebook launcher example 

[Launching Multi-Node Training from a Jupyter Environment](https://huggingface.co/docs/accelerate/basic_tutorials/notebook)

[How to use DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed)

In [None]:
!pip install deepspeed==0.7.2

## General Utils

#### load and tokenize dataset

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from functools import partial


def preprocess(sample, tokenizer=None):
    """Tokenize the ``"text"`` entry of ``sample`` and carry its label over, if present.

    Args:
        sample: Mapping with a ``"text"`` entry and, optionally, a ``"label"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True)``; its return
            value must support item assignment.

    Returns:
        The tokenizer output, with ``sample["label"]`` stored under ``"labels"`` when
        the sample has a ``"label"`` entry.
    """
    encoded = tokenizer(sample["text"], truncation=True)
    # Unlabelled samples are returned as plain encodings.
    if "label" not in sample:
        return encoded
    encoded["labels"] = sample["label"]
    return encoded


def tokenize_dataset(dataset_id, tokenizer=None, preprocess_fn=preprocess):
    """Load a dataset and tokenize it with ``preprocess_fn``.

    Args:
        dataset_id: Identifier passed to ``load_dataset``.
        tokenizer: Tokenizer forwarded to ``preprocess_fn`` as the ``tokenizer`` keyword.
        preprocess_fn: Callable applied through ``dataset.map(..., batched=True)``;
            it receives the batch plus ``tokenizer=...``. Defaults to ``preprocess``.

    Returns:
        The mapped dataset, with the original columns of the ``"train"`` split
        removed (a pre-existing ``"labels"`` column is kept).
    """
    dataset = load_dataset(dataset_id)
    # Drop every original column after mapping, except an already-present "labels" column.
    remove_columns = [name for name in dataset["train"].column_names if name != "labels"]
    # Tokenize with the caller-supplied function (previously `preprocess` was hard-coded
    # here, so a custom `preprocess_fn` was silently ignored).
    dataset = dataset.map(
        partial(preprocess_fn, tokenizer=tokenizer),
        batched=True,
        remove_columns=remove_columns,
    )

    # print some stats
    print(f"Train dataset size: {len(dataset['train'])}")
    print(f"Dataset columns: {dataset['train'].column_names}")
    return dataset


#### create dataloader function

In [2]:
from torch.utils.data import DataLoader


def create_dataloaders(dataset, collate_fn=None, train_batch_size=8, eval_batch_size=32):
    """Build the training and evaluation ``DataLoader`` objects for ``dataset``.

    Args:
        dataset: Mapping that provides a ``"train"`` and a ``"validation"`` split.
        collate_fn: Optional batch collation callable shared by both loaders.
        train_batch_size: Batch size of the (shuffled) training loader.
        eval_batch_size: Batch size of the (unshuffled) evaluation loader.

    Returns:
        Tuple ``(train_dataloader, eval_dataloader)``.
    """
    # Both loaders share the same collation function.
    shared_kwargs = {"collate_fn": collate_fn}
    train_loader = DataLoader(dataset["train"], batch_size=train_batch_size, shuffle=True, **shared_kwargs)
    eval_loader = DataLoader(dataset["validation"], batch_size=eval_batch_size, shuffle=False, **shared_kwargs)
    return train_loader, eval_loader


#### train function

In [3]:
import datasets
import transformers
from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
from transformers import (
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    set_seed,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from datasets import DatasetDict
from torch.optim import AdamW
from tqdm import tqdm
import torch
import evaluate

from dataclasses import dataclass

###### Hyperparameters ######
@dataclass
class Hyperparameters:
    """Tunable training settings read by ``training_function``."""
    # Batch size passed to the training DataLoader.
    train_batch_size: int = 64
    # Batch size passed to the evaluation DataLoader.
    eval_batch_size: int = 64
    # Learning rate handed to the optimizer.
    learning_rate: float = 5e-5
    # Number of passes over the training dataloader.
    num_epochs: int = 3


def training_function(
    model: PreTrainedModel,
    dataset: DatasetDict,
    tokenizer: PreTrainedTokenizer = None,
    hp: Hyperparameters = None,
):
    """Fine-tune ``model`` with Accelerate + DeepSpeed (ZeRO stage 2) and print accuracy after each epoch.

    Args:
        model: Model to train; its forward output must expose ``.loss`` and ``.logits``.
        dataset: Tokenized dataset providing ``"train"`` and ``"validation"`` splits
            (batches must contain a ``"labels"`` entry).
        tokenizer: Tokenizer given to ``DataCollatorWithPadding`` to pad each batch.
        hp: Training hyperparameters. Defaults to ``Hyperparameters()`` when omitted.
            An optional ``weight_decay`` attribute is honoured (0.0 when absent).
    """
    if hp is None:
        hp = Hyperparameters()

    gradient_accumulation_steps = 2
    # Initialize accelerator and DeepSpeed.
    # DeepSpeed needs to know the gradient accumulation steps beforehand, so they are passed to the
    # plugin as well as to the Accelerator.
    # NOTE(review): `offload_param_device` is combined with ZeRO stage 2 here; DeepSpeed documents
    # parameter offload as a stage-3 feature, so this setting may have no effect — confirm.
    deepspeed_plugin = DeepSpeedPlugin(
        zero_stage=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        offload_optimizer_device="cpu",
        offload_param_device="cpu",
    )
    # `mixed_precision="fp16"` replaces the deprecated `fp16=True` flag.
    accelerator = Accelerator(
        mixed_precision="fp16",
        deepspeed_plugin=deepspeed_plugin,
        gradient_accumulation_steps=gradient_accumulation_steps,
    )

    # To have only one message (and not one per process) in the logs of Transformers or Datasets,
    # the verbose levels are enabled on the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # The dataloaders must exist before the scheduler is built, because the schedule length is
    # derived from the number of training batches.
    train_dataloader, eval_dataloader = create_dataloaders(
        dataset, collate_fn=data_collator, train_batch_size=hp.train_batch_size, eval_batch_size=hp.eval_batch_size
    )
    # Fix the random seed before training starts (the model itself is instantiated by the caller).
    set_seed(34)

    # Optimizer: no weight decay on biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    weight_decay = getattr(hp, "weight_decay", 0.0)
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    # A real optimizer/scheduler is created here: the DeepSpeed plugin is configured in code, without a
    # DeepSpeed config file, so there is no config-defined optimizer or scheduler for the
    # `DummyOptim` / `DummyScheduler` placeholders to stand in for.
    optimizer = AdamW(optimizer_grouped_parameters, lr=hp.learning_rate)
    # NOTE(review): the schedule length counts every training batch; with gradient accumulation the
    # number of optimizer updates is smaller — confirm this is the intended decay horizon.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * hp.num_epochs,
    )

    # Load the metric once instead of once per epoch.
    metric = evaluate.load("accuracy")

    # Prepare everything
    model, optimizer, lr_scheduler, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, lr_scheduler, train_dataloader, eval_dataloader
    )

    # Progress bar tracking training steps; only displayed on the main process.
    progress_bar = tqdm(range(hp.num_epochs * len(train_dataloader)), disable=not accelerator.is_main_process)
    # Now we train the model
    for epoch in range(hp.num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # perform gradient accumulation
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)

                optimizer.step()
                lr_scheduler.step()
                # Clear gradients only after the step so accumulated gradients are never
                # discarded before they are applied.
                optimizer.zero_grad()
                progress_bar.update(1)

        model.eval()
        all_predictions = []
        all_labels = []

        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)

            # Gather predictions and labels from all processes to have them all.
            all_predictions.append(accelerator.gather(predictions))
            all_labels.append(accelerator.gather(batch["labels"]))

        # Concatenate all predictions and labels, then truncate them to the size of the validation
        # split, as the prepared evaluation dataloader may hold a few extra elements to make
        # batches of the same size on each process.
        all_predictions = torch.cat(all_predictions)[: len(dataset["validation"])]
        all_labels = torch.cat(all_labels)[: len(dataset["validation"])]

        # Use accelerator.print to print only on the main process.
        eval_metric = metric.compute(predictions=all_predictions, references=all_labels)
        accelerator.print(f"epoch {epoch+1}:", eval_metric)


### Train model

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, DataCollatorWithPadding


###### Load model and dataset ######
# Checkpoint to fine-tune and the dataset identifier passed to `load_dataset`.
model_id = "distilbert-base-uncased"
dataset_id = "emotion"

# NOTE(review): num_labels=6 presumably matches the number of classes in the dataset — confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load and tokenize the dataset with the `tokenize_dataset` helper (also prints basic stats).
dataset = tokenize_dataset(dataset_id, tokenizer)
# Use the default batch sizes, learning rate and number of epochs.
hyperparameters = Hyperparameters()

### Launch training

In [5]:
from accelerate import notebook_launcher

# Positional arguments forwarded to `training_function`, in signature order.
args = (
    model,
    dataset,
    tokenizer,
    hyperparameters,
)

# Start training from inside the notebook: a single process, fp16 mixed precision.
notebook_launcher(
    training_function,
    args,
    num_processes=1,
    mixed_precision="fp16",
)


Launching training on one GPU.


 33%|███▎      | 250/750 [01:11<02:33,  3.26it/s]

epoch 0: {'accuracy': 0.929}


 67%|██████▋   | 500/750 [02:29<01:16,  3.25it/s]

epoch 1: {'accuracy': 0.934}


100%|██████████| 750/750 [03:53<00:00,  3.21it/s]

epoch 2: {'accuracy': 0.934}



