# Migrate from PyTorch to Accelerate

### DistilBERT example

## 1. Load Model & Datasets

In [62]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Identifiers of the pretrained checkpoint and of the dataset to fine-tune on.
model_id = "distilbert-base-uncased"
dataset_id = "emotion"

# The tokenizer and the sequence-classification model are loaded from the same
# checkpoint; the classification head is created with 6 output labels.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=6)

# Load the dataset (the splits are accessed by name further below).
dataset = load_dataset(dataset_id)

def preprocess(sample):
  """Tokenize the ``text`` field of ``sample`` and carry its labels along.

  The text is tokenized with truncation enabled. When ``sample`` has a
  ``label`` entry, it is copied into the result under the ``labels`` key.

  Returns the tokenizer output, with ``labels`` added when available.
  """
  encoded = tokenizer(sample["text"], truncation=True)
  has_label = "label" in sample
  if has_label:
    encoded["labels"] = sample["label"]
  return encoded

# Run `preprocess` over the dataset in batches and drop the original columns,
# so only the fields produced by `preprocess` remain.
dataset = dataset.map(
  preprocess,
  batched=True,
  remove_columns=dataset["train"].column_names,
)

# Quick sanity check of the processed splits.
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Dataset columns: {dataset['train'].column_names}")
print(f"Validation dataset size: {len(dataset['validation'])}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Train dataset size: 16000
Dataset columns: ['input_ids', 'attention_mask', 'labels']
Validation dataset size: 2000


## 2. Hyperparameters, Dataloader, Optimizer

In [63]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup,DataCollatorWithPadding

###### Hyperparameters ######
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3

###### Data Loaders ######
# Batches are padded by the collator at collation time (to a multiple of 8),
# since the tokenized examples were not padded during preprocessing.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Instantiate dataloaders.
train_dataloader = DataLoader(
  dataset["train"], shuffle=True, collate_fn=data_collator, batch_size=TRAIN_BATCH_SIZE
)
eval_dataloader = DataLoader(
  dataset["validation"], shuffle=False, collate_fn=data_collator, batch_size=EVAL_BATCH_SIZE
)

###### Optimizer ######

optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per training batch, so the total number of
# training steps is (batches per epoch) * (epochs).
# NOTE: `len(dataset)` must not be used here: `dataset` holds several named
# splits, so its length is the number of splits rather than the number of
# training examples. That made the step count evaluate to 0, which drives the
# linear schedule's learning-rate multiplier to 0 from the very first step.
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)



## 3. Vanilla PyTorch Training

In [64]:
import torch
import logging
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# `logging.info` records are discarded under the root logger's default WARNING
# level, so the loss would never be displayed. Install a default handler (if
# none exists yet) and raise the root logger to INFO.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# Report the loss every LOGGING_STEPS batches instead of on every step, to
# keep the cell output readable.
LOGGING_STEPS = 100

# epoch train loop
for epoch in range(NUM_EPOCHS):
  model.train()
  # mini-batch train loop
  for step, batch in enumerate(train_dataloader):
    # reset gradient
    optimizer.zero_grad()
    # move to device
    inputs = {k: v.to(device) for k, v in batch.items()}
    # forward pass
    outputs = model(**inputs)
    loss = outputs.loss
    if step % LOGGING_STEPS == 0:
      # `.item()` converts the loss tensor to a plain Python number for the log
      logging.info("epoch %d | step %d | loss %.4f", epoch, step, loss.item())
    # backward pass
    loss.backward()
    optimizer.step()
    # advance the learning-rate schedule once per optimizer step
    lr_scheduler.step()
  

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.