In [1]:
!pip install transformers[torch] accelerate -U



In [2]:
!pip install datasets



In [31]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, DataCollatorWithPadding
from datasets import load_dataset
import numpy as np

## Fine-tuning a pre-trained model

### Sample short code for training using two sentences

In [4]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-cased"

# Tokenizer that matches the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pre-trained encoder with a sequence-classification head on top
# (the head's weights are newly initialised, as the warning printed below this cell says).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Minimal end-to-end training step on two hand-written sentences:
# tokenize -> attach labels -> forward pass -> loss -> backward pass -> optimizer step.
text = ['I Love Coding', 'I hate that I forget syntax often']
model_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
model_input['labels'] = torch.tensor([1, 0])  # one label per sentence

# torch.optim.AdamW is the supported optimizer; the AdamW class shipped with
# transformers is deprecated in favour of it.
optimizer = torch.optim.AdamW(model.parameters(), betas=(0.9, 0.95), lr=1e-3, weight_decay=0.1)

# Clear any gradients left over from a previous run of this cell so that
# re-running it does not accumulate gradients across runs.
optimizer.zero_grad()

# Passing `labels` makes the model return the loss alongside the logits.
loss = model(**model_input).loss
loss.backward()
optimizer.step()



## Load and Explore datasets

In [6]:
# Load MRPC (Microsoft Research Paraphrase Corpus) from the GLUE benchmark:
# pairs of sentences labelled as paraphrases of each other or not.
raw_dataset = load_dataset("glue", "mrpc")

In [7]:
# The dataset is a dictionary-like DatasetDict with three splits: train (3668 rows),
# validation (408 rows) and test (1725 rows). Every split has the same columns:
# the two sentences, the label (paraphrase or not) and an idx identifier.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [8]:
# Explore the train split: its Python type, one full example, and that same example's label.
# Note that the `idx` column is a dataset-provided identifier, not the row position:
# the example at row position 14 carries idx 15.
example_index = 14
train_split = raw_dataset['train']
example = train_split[example_index]  # indexing by row first avoids materialising the whole label column
print(type(train_split), '\n', example, '\n', example['label'])

<class 'datasets.arrow_dataset.Dataset'> 
 {'sentence1': 'Gyorgy Heizler , head of the local disaster unit , said the coach was carrying 38 passengers .', 'sentence2': 'The head of the local disaster unit , Gyorgy Heizler , said the coach driver had failed to heed red stop lights .', 'label': 0, 'idx': 15} 
 0


In [9]:
# Inspect the column schema of the train split: the dtype of each column and,
# for `label`, the mapping between integer ids and class names.
raw_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## Preprocess datasets

In [10]:
# Compare three ways of calling the tokenizer:
#   1. a single sentence,
#   2. a list of two independent sentences (padded to a common length),
#   3. two sentences passed as ONE pair.
# token_type_ids mark which sentence of a pair each token belongs to (0 = first, 1 = second).
first_sentence, second_sentence = text[0], text[1]

model_input_single = tokenizer(first_sentence, padding=True)
model_input_double = tokenizer(text, padding=True)
model_input_double_pair = tokenizer(first_sentence, second_sentence, padding=True)

# Same output layout as printing with '\n\n' between the items.
print(model_input_single, model_input_double, model_input_double_pair, sep=' \n\n ')

{'input_ids': [101, 146, 2185, 3291, 3408, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]} 

 {'input_ids': [[101, 146, 2185, 3291, 3408, 102, 0, 0, 0], [101, 146, 4819, 1115, 146, 5042, 24426, 1510, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1]]} 

 {'input_ids': [101, 146, 2185, 3291, 3408, 102, 146, 4819, 1115, 146, 5042, 24426, 1510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
# Tokenize the whole train split in one call by passing the two sentence columns as pairs.
# padding=True pads every pair to the longest one in this call.
first_sentences = raw_dataset['train']['sentence1']
second_sentences = raw_dataset['train']['sentence2']
model_input_pair = tokenizer(
    first_sentences,
    second_sentences,
    padding=True,
    truncation=True,
)
print(type(model_input_pair))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [12]:
# Alternative: tokenize through Dataset.map so the result stays a dataset.
# No padding is applied here; it is deferred to batch-collation time.
def tokenizer_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example`, truncating over-long inputs.

    With `batched=True`, `map` passes a batch whose 'sentence1' and 'sentence2'
    entries are lists, so a single tokenizer call handles the whole batch.
    """
    first, second = example['sentence1'], example['sentence2']
    return tokenizer(first, second, truncation=True)


tokenized_datasets = raw_dataset.map(tokenizer_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## Dynamic Padding
Padding is applied per batch, up to the length of the longest sequence in that batch, which avoids padding every shorter sequence to the longest sequence in the whole dataset.
- Improves speed on CPUs and GPUs
- TPUs, in contrast, prefer fixed-shape inputs, so padding to one fixed length is the better choice there

In [13]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch when the batch is assembled,
# using the padding settings of the given tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Sanity-check batch-wise (dynamic) padding on the first 8 training examples.
# The raw text columns and `idx` are dropped first, leaving only the tokenized fields and the label.
columns_to_drop = ("idx", "sentence1", "sentence2")
first_eight = tokenized_datasets['train'][:8]
samples = {
    column: values
    for column, values in first_eight.items()
    if column not in columns_to_drop
}
batch = data_collator(samples)

# Every tensor in the batch should share the same padded sequence length.
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 69]),
 'token_type_ids': torch.Size([8, 69]),
 'attention_mask': torch.Size([8, 69]),
 'labels': torch.Size([8])}

## Fine-tuning with Trainer API

In [26]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [27]:
from transformers import TrainingArguments
import evaluate

# "test-trainer" is the output directory for checkpoints; evaluation runs at the end of every epoch.
# `eval_strategy` is the current name of this option. The older `evaluation_strategy`
# spelling was deprecated in transformers 4.41 and dropped in later releases, which is
# what the `-U` install at the top of this notebook pulls in.
training_args = TrainingArguments("test-trainer", eval_strategy='epoch')

In [28]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: a (logits, labels) pair, as unpacked below.

    Returns:
        The result of the metric's `compute` call on the predicted class ids
        (argmax of the logits over the last axis) versus the reference labels.
    """
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    mrpc_metric = evaluate.load("glue", "mrpc")
    return mrpc_metric.compute(predictions=predicted_classes, references=labels)

In [29]:
from transformers import Trainer

# Start fine-tuning from a fresh copy of the pre-trained checkpoint. The `model` object
# created earlier has already taken one optimizer step on the two toy sentences, so
# reusing it would make the fine-tuning results depend on that demo cell.
# MRPC is a two-class task (not_equivalent / equivalent), hence num_labels=2.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# The Trainer bundles the model, hyper-parameters, data, batching and metrics.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,        # dynamic, per-batch padding
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,    # reported at every evaluation
)

In [32]:
# Run fine-tuning with the settings in `training_args`; the returned training
# summary is displayed as the cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.075525,0.818627,0.878689
2,0.188000,0.905302,0.85049,0.897133
3,0.069400,1.04877,0.85049,0.895009


Epoch,Training Loss,Validation Loss


Checkpoint destination directory test-trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1377, training_loss=0.09825788136401208, metrics={'train_runtime': 205.1013, 'train_samples_per_second': 53.652, 'train_steps_per_second': 6.714, 'total_flos': 560389547924640.0, 'train_loss': 0.09825788136401208, 'epoch': 3.0})

In [33]:
# Run the fine-tuned model over the test split.
predictions = trainer.predict(tokenized_datasets["test"])

# Shape of the raw model outputs (one row per example) and of the label ids.
output_shape = predictions.predictions.shape
label_shape = predictions.label_ids.shape
print(output_shape, label_shape)

(1725, 2) (1725,)


In [37]:
# Metrics collected during trainer.predict on the test split
# (loss, the compute_metrics scores, and runtime statistics).
predictions.metrics

{'test_loss': 1.1947238445281982,
 'test_accuracy': 0.831304347826087,
 'test_f1': 0.8770595690747782,
 'test_runtime': 10.112,
 'test_samples_per_second': 170.589,
 'test_steps_per_second': 21.361}