<a href="https://colab.research.google.com/github/lordWalnuts/hf-nlpCourse-notes/blob/main/hfChap3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install --upgrade accelerate

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset; the result is a
# DatasetDict with train/validation/test splits (see the output below).
raw_datasets = load_dataset("glue","mrpc")


In [None]:
# Display the DatasetDict to inspect its splits, column names and row counts.
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
# Keep a handle on the training split and look at its first example.
raw_train_dts = raw_datasets["train"]
raw_train_dts[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
# Inspect the column types; `label` is a ClassLabel whose names are
# not_equivalent (0) and equivalent (1), as the output below shows.
raw_train_dts.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
raw_validation_dts = raw_datasets["validation"]
# A cell only renders its last expression, so the validation row used to be
# silently discarded. Return both rows as a tuple so each one is shown.
raw_validation_dts[87], raw_train_dts[15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

## Preprocessing the dataset

In [None]:
# `tokenizer` was only created in a later cell, so this cell raised a NameError
# on Restart & Run All. Build it here so the notebook runs top to bottom.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Index the row first: raw_datasets["train"]["sentence1"] materialises the
# entire column just to read a single entry.
example15 = raw_datasets["train"][15]
input15 = tokenizer(example15["sentence1"], example15["sentence2"])
input15

In [None]:
# Map the ids back to tokens to see how the sentence pair was encoded.
tokenizer.convert_ids_to_tokens(input15["input_ids"])

In [None]:
# Side experiment: walk two dicts in lockstep and merge each pair of entries
# into one, joining the keys as text and summing the values.
d1 = {'a':1, 'b':2}
d2 = {'c':3, 'd':4}

d3 = {
    f"{key1} + {key2}": value1 + value2
    for (key1, value1), (key2, value2) in zip(d1.items(), d2.items())
}

d3


In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint  = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Kept for reference: tokenizing each sentence column on its own instead of as pairs.
# tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
# tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [None]:
# Bad approach: tokenizing the whole columns in a single call
# returns a plain dict and is limited by RAM.

# tokenized_dataset = tokenizer(
#     raw_datasets["train"]["sentence1"],
#     raw_datasets["train"]["sentence2"],
#     truncation= True,
#     padding = True
# )

def tokenize_function(example):
  """Tokenize the `sentence1`/`sentence2` fields of `example` as a pair, with truncation enabled."""
  return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
  
# Apply the function to every split; batched=True passes batches of examples rather than single rows.
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
tokenized_dataset


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## Fine-tuning with the Trainer API

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Reload the dataset and the tokenizer for this section.
raw_dataset = load_dataset("glue","mrpc")
checkpoint = "bert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  """Tokenize the sentence pair in `example` with truncation enabled."""
  return tokenizer(example["sentence1"],example["sentence2"],truncation=True)

tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
# Collator built from the same tokenizer; it is handed to the Trainer in a later cell.
data_collator = DataCollatorWithPadding(tokenizer =tokenizer)




  0%|          | 0/3 [00:00<?, ?it/s]



Map:   0%|          | 0/408 [00:00<?, ? examples/s]



In [None]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

# Only the first positional argument ("test-trainer") is given; every other setting keeps its default.
training_args = TrainingArguments("test-trainer")
# Two labels, matching the not_equivalent/equivalent classes of the dataset.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from transformers import Trainer
# Bundle the model, training arguments, tokenized train/validation splits,
# the padding collator and the tokenizer into a single Trainer.
trainer = Trainer(
    model,
    training_args,
    train_dataset= tokenized_dataset["train"],
    eval_dataset= tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer = tokenizer
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4949
1000,0.2623


TrainOutput(global_step=1377, training_loss=0.3053090435573824, metrics={'train_runtime': 188.0042, 'train_samples_per_second': 58.531, 'train_steps_per_second': 7.324, 'total_flos': 406183858377360.0, 'train_loss': 0.3053090435573824, 'epoch': 3.0})

## Fine-tuning with the Trainer API and evaluation metrics

In [None]:
!pip install transformers 
!pip install datasets
!pip install --upgrade accelerate
!pip install evaluate

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Set the checkpoint and load its tokenizer.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Download the dataset.
raw_dataset = load_dataset("glue","mrpc")

# Tokenization function applied to the dataset below.
def tokenize_function(example):
  """Tokenize the sentence pair in `example` with truncation enabled."""
  return tokenizer(example["sentence1"],example["sentence2"], truncation= True)


# Tokenize the dataset, then build the padding collator from the same tokenizer.
tokenized_dataset = raw_dataset.map(tokenize_function,batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)






  0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# metric computation
import evaluate
import numpy as np


def compute_metrics(eval_preds):
  """Score a set of predictions with the GLUE/MRPC metric.

  `eval_preds` unpacks into (logits, labels). The arg-max over the last axis
  of the logits is used as the predicted class and compared against the
  labels. Returns whatever the metric's `compute` call returns.
  """
  mrpc_metric = evaluate.load("glue","mrpc")
  raw_scores, gold_labels = eval_preds
  predicted_classes = np.argmax(raw_scores, axis=-1)
  return mrpc_metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

# NOTE(review): no evaluation schedule is configured here, so compute_metrics
# presumably only runs on an explicit trainer.evaluate() call. Confirm, and set
# the evaluation strategy in TrainingArguments if per-epoch metrics are wanted.
training_args = TrainingArguments("test-trainer", optim="adamw_torch")
# Spell out num_labels=2 (the binary MRPC task) to match the earlier Trainer
# section instead of relying on the config default.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same Trainer wiring as before, plus compute_metrics for scoring evaluation runs.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.51
1000,0.2649


TrainOutput(global_step=1377, training_loss=0.3107976006180286, metrics={'train_runtime': 213.7033, 'train_samples_per_second': 51.492, 'train_steps_per_second': 6.444, 'total_flos': 420167799858720.0, 'train_loss': 0.3107976006180286, 'epoch': 3.0})

## In pure PyTorch

In [None]:
# Download and tokenize the dataset

from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Set the checkpoint and load its tokenizer.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Download the dataset.
raw_dataset = load_dataset("glue","mrpc")

# Tokenization function applied to the dataset below.
def tokenize_function(example):
  """Tokenize the sentence pair in `example` with truncation enabled."""
  return tokenizer(example["sentence1"],example["sentence2"], truncation= True)


# Tokenize the dataset, then build the padding collator used by the DataLoaders.
tokenized_datasets = raw_dataset.map(tokenize_function,batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# List the columns of the training split before cleaning them up.
tokenized_datasets["train"].column_names


['sentence1',
 'sentence2',
 'label',
 'idx',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [None]:
# Extra preprocessing for the DataLoaders (the Trainer class does this automatically):
# drop the raw text/index columns, rename the target column to "labels",
# and have the datasets return torch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")


In [None]:
# Check which columns remain after the cleanup.
tokenized_datasets["train"].column_names


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Build the batch iterators: shuffled batches of 8 for training,
# unshuffled batches of 8 for validation, both padded by data_collator.

from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)

In [None]:
from transformers import AutoModelForSequenceClassification

# State the two-class head explicitly, as the first Trainer section does,
# rather than relying on the config default.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


In [None]:
# transformers' own AdamW is deprecated and no longer importable in recent
# releases; use the PyTorch implementation instead.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the old transformers.AdamW
# (torch's own defaults are 1e-8 and 0.01) so training behaves as before.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from transformers import get_scheduler

num_epochs = 3
# The training loop steps the scheduler once per batch, so the schedule
# spans epochs * batches-per-epoch steps.
num_training_steps = num_epochs * len(train_dataloader)

# "linear" schedule with no warmup steps, tied to the optimizer created above.
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps = num_training_steps
)

print(num_training_steps)

1377


In [None]:
import torch 

# Use the GPU when torch reports one as available, otherwise stay on the CPU,
# then move the model there and show which device was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    # Move every tensor in the batch to the device the model was moved to.
    batch = {k: v.to(device) for k, v in batch.items()}
    # The batch carries a "labels" entry, and the loss is read from the model output.
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    # Apply the update, advance the LR schedule, then clear gradients for the next batch.
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

    


  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")
# Switch the model to evaluation mode before scoring the validation set.
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Accumulate this batch; the final scores are computed once after the loop.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8578431372549019, 'f1': 0.9026845637583893}