# Migrate form PyTorch to Accelerate

### Distilbert example

## General Utils

### Load Model & Datasets

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

model_id="distilbert-base-uncased"
dataset_id="emotion"

model=AutoModelForSequenceClassification.from_pretrained(model_id,num_labels=6)
tokenizer=AutoTokenizer.from_pretrained(model_id)

dataset= load_dataset(dataset_id)

def preprocess(sample):
  enc = tokenizer(sample["text"],truncation=True)
  if "label" in sample:
    enc["labels"] = sample["label"]
  return enc

dataset=dataset.map(preprocess,batched=True,remove_columns=dataset["train"].column_names)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Dataset columns: {dataset['train'].column_names}")
print(f"Validation dataset size: {len(dataset['validation'])}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-d80fa21c019591e3.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-fb9eb107896e5216.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-1bf19eb875cbef47.arrow


Train dataset size: 16000
Dataset columns: ['input_ids', 'attention_mask', 'labels']
Validation dataset size: 2000


### Hyperparameters, Dataloader, Optimizer

In [10]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup,DataCollatorWithPadding

###### Hyperparameters ######
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 3e-5
NUM_EPOCHS = 3

###### Data Loaders ######
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Instantiate dataloaders.
train_dataloader = DataLoader(
  dataset["train"], shuffle=True, collate_fn=data_collator, batch_size=TRAIN_BATCH_SIZE
)
eval_dataloader = DataLoader(
  dataset["validation"], shuffle=False, collate_fn=data_collator, batch_size=EVAL_BATCH_SIZE
)

###### Optimizer ######
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * NUM_EPOCHS)

## Pytorch example

### PyTorch Training

In [11]:
import torch
import evaluate
from tqdm import tqdm
import logging

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# epoch train loop
for epoch in range(NUM_EPOCHS):
  model.train()
  # mini-batch train loop
  with tqdm(train_dataloader) as pbar:
      pbar.set_description(f"Epoch {epoch+1}")
      for batch in pbar:
        # reset gradient
        optimizer.zero_grad()
        # move to device
        inputs = {k : v.to(device) for k,v in batch.items()}
        # forward pass
        outputs = model(**inputs)
        # backward pass
        outputs.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        pbar.set_postfix(loss=float(outputs.loss))



Epoch 1: 100%|██████████| 250/250 [01:13<00:00,  3.42it/s, loss=0.175]
Epoch 2: 100%|██████████| 250/250 [01:15<00:00,  3.33it/s, loss=0.111] 
Epoch 3: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s, loss=0.0334]


###  Evaluate model

In [12]:
# evaluate model
metric = evaluate.load("accuracy")
model.eval()
with tqdm(eval_dataloader) as pbar:
    for batch in pbar:
      # move to device
      inputs = {k : v.to(device) for k,v in batch.items()}
      # forward pass
      with torch.no_grad():
        outputs = model(**inputs)
      # get predicted label
      predictions = outputs.logits.argmax(dim=-1)
      # add to metric
      metric.add_batch(references=batch["labels"].tolist(), predictions=predictions.tolist())
cur_metric = metric.compute()
print(f"Accuracy: {cur_metric['accuracy']*100}%")


100%|██████████| 32/32 [00:03<00:00,  9.50it/s]

Accuracy: 93.4%





## accelerate migration

[[Documentation]: Migrating your code to 🤗 Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/migration)

```diff
  import torch
  import torch.nn.functional as F
  from datasets import load_dataset
+ from accelerate import Accelerator

+ accelerator = Accelerator()
- device = 'cpu'
+ device = accelerator.device

+ model, optimizer, data = accelerator.prepare(model, optimizer, data)

  model.train()
  for epoch in range(10):
      for source, targets in data:
-         source = source.to(device)
-         targets = targets.to(device)

          optimizer.zero_grad()

          output = model(source)
          loss = F.cross_entropy(output, targets)

-         loss.backward()
+         accelerator.backward(loss)

          optimizer.step()
```

### Accelerate Training

In [13]:
import torch
import evaluate
from tqdm import tqdm
import logging
from accelerate import Accelerator

accelerator = Accelerator()

device = accelerator.device
print(f"Device: {device}")

# make sure everything is setup in the current environment for you to start training:
# correct device etc. 
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

# epoch train loop
for epoch in range(NUM_EPOCHS):
  model.train()
  # mini-batch train loop
  with tqdm(train_dataloader) as pbar:
      pbar.set_description(f"Epoch {epoch+1}")
      for batch in pbar:
        # reset gradient
        optimizer.zero_grad()
        # forward pass
        outputs = model(**batch)
        # backward pass
        accelerator.backward(outputs.loss)
        optimizer.step()
        lr_scheduler.step()
        pbar.set_postfix(loss=float(outputs.loss))

Device: cuda


Epoch 1:  34%|███▍      | 85/250 [00:26<00:49,  3.33it/s, loss=0.186] 

###  Evaluate model

In [None]:
metric = evaluate.load("accuracy")

model,  eval_dataloader = accelerator.prepare(
    model, eval_dataloader
)

model.eval()
with tqdm(eval_dataloader) as pbar:
    for batch in pbar:
      # forward pass
      with torch.no_grad():
        outputs = model(**batch)
      # get predicted label
      predictions = outputs.logits.argmax(dim=-1)
      # add to metric
      metric.add_batch(references=batch["labels"].tolist(), predictions=predictions.tolist())
cur_metric = metric.compute()
print(f"Accuracy: {cur_metric['accuracy']*100}%")

100%|██████████| 32/32 [00:03<00:00, 10.42it/s]

Accuracy: 93.4%



