In [1]:
# source: https://huggingface.co/learn/nlp-course/chapter3/2

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"
# Tokenizer loaded from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Sequence-classification model loaded from the same checkpoint. The warning
# printed below reports that some checkpoint weights were not used and that
# some weights of the classification model were not initialized from it.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Report the model size in millions of parameters, rounded to two decimals.
print(f"Number of Model Parameters: {round(model.num_parameters()/1e6, 2)} M")

Number of Model Parameters: 109.48 M


In [5]:
# Two example sentences of different lengths, used as a toy input batch.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [6]:
# Tokenize both sentences in one call with padding and truncation enabled,
# returning PyTorch tensors ("pt").
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [7]:
batch

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [8]:
# Attach one label per sequence to the batch; both examples are labeled 1.
batch["labels"] = torch.tensor([1, 1])

In [9]:
batch

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1])}

In [10]:
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favor of torch.optim.AdamW. Hyperparameters are left at the
# optimizer's defaults.
# NOTE: torch.optim.AdamW applies weight_decay=0.01 by default; pass
# weight_decay explicitly if a specific value is required.
optimizer = torch.optim.AdamW(model.parameters())



In [11]:
# Forward pass on the labeled batch; the output shown below carries both a
# loss and the raw logits.
pred = model(**batch)
pred

SequenceClassifierOutput(loss=tensor(0.5666, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0625,  0.3754],
        [-0.0055,  0.2254]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
# Turn the logits into per-class probabilities with a softmax over the last dimension.
pred.logits.softmax(dim = -1)

tensor([[0.4224, 0.5776],
        [0.4425, 0.5575]], grad_fn=<SoftmaxBackward0>)

In [13]:
# Run the forward pass again and keep only the loss from the output.
loss = model(**batch).loss
loss

tensor(0.5666, grad_fn=<NllLossBackward0>)

In [14]:
# One optimization step: backpropagate the loss, update the weights, then
# clear the gradients so that re-running the loss/step cells does not add new
# gradients on top of stale ones.
loss.backward()
optimizer.step()
optimizer.zero_grad()

### Loading Dataset

In [15]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset. The result is a
# DatasetDict with train, validation and test splits (shown below).
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [16]:
# Shortcut to the training split, used for inspecting individual examples.
tr_data = raw_datasets['train']

In [17]:
tr_data[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [18]:
from transformers import AutoTokenizer

# NOTE(review): this repeats the import and the checkpoint/tokenizer setup from
# the first cells of the notebook; it is redundant if those cells have already run.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [19]:
# Tokenize each sentence column of the training split on its own, i.e. without
# pairing sentence1 with sentence2.
# NOTE(review): neither result is referenced again in the cells visible here.
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [45]:
# Layout of raw_datasets (split, then its columns):
# raw_datasets
# - train
#     - 'sentence1'
#     - 'sentence2'
#     - 'label'
#     - 'idx'

# Number of entries in the 'sentence1' column of the train split.
len(raw_datasets["train"]["sentence1"])

3668

In [49]:
# Tokenize a single text: in the output below every token_type_id is 0.
inputs = tokenizer("My name is Md Abul Hayat")
inputs

{'input_ids': [101, 2026, 2171, 2003, 9108, 8273, 2140, 10974, 4017, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [52]:
# Tokenize two texts as a pair: in the output below the ids of the second text
# follow an extra 102 id, and that second part carries token_type_id 1.
inputs = tokenizer("My name is", "Md Abul Hayat")
inputs

{'input_ids': [101, 2026, 2171, 2003, 102, 9108, 8273, 2140, 10974, 4017, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [77]:
# Tokenize all training sentence pairs in a single call, with padding and
# truncation enabled. The result exposes input_ids, token_type_ids and
# attention_mask (see the keys printed below).
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [79]:
tokenized_dataset.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [80]:
len(tokenized_dataset['input_ids'])

3668

In [81]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` fields of ``example`` as a pair.

    ``example`` is indexed with the keys ``"sentence1"`` and ``"sentence2"``.
    Both values are passed to the notebook-level ``tokenizer`` with truncation
    enabled and no padding requested; the tokenizer's result is returned as is.
    """
    first_texts = example["sentence1"]
    second_texts = example["sentence2"]
    return tokenizer(first_texts, second_texts, truncation=True)

In [82]:
# Apply tokenize_function to every split; batched=True passes the function
# batches of examples instead of one example at a time. In the output below the
# tokenizer fields appear as new columns next to the original ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|█████████████████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 6153.04 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [84]:
# tokenized_datasets['train'].keys()

In [85]:
from transformers import DataCollatorWithPadding

# Padding collator built around the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [86]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [87]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [88]:
# Take the first 8 examples of the tokenized training split and display their
# token ids.
samples = tokenized_datasets["train"][:8]
samples['input_ids']

[[101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102],
 [101,
  9805,
  3540,
  11514,
  2050,
  3079,
  11282,
  2243,
  1005,
  1055,
  2077,
  4855,
  1996,
  4677,
  2000,
  3647,
  4576,
  1999,
  2687,
  2005,
  1002,
  1016,
  1012,
  1019,
  4551,
  1012,
  102,
  9805,
  3540,
  11514,
  2050,
  4149,
  11282,
  2243,
  1005,
  1055,
  1999,
  2786,
  2005,
  1002,
  6353,
  2509,
  2454,
  1998,
  2853,
  2009,
  2000,
  3647,
  4576,
  2005,
  1002,
  1015,
  1012,
  1022,
  4551,
  1999,
  2687,
  1012,
  102],
 [101,
  2027,
  2018,
  2405,
  2019,
  15147,
  2006,
  1996,
  4274,
  2006,
  2238,
  2184,
  1010,
  5378,
  1996,
  6636

In [89]:
# Drop the index and raw-text columns, keeping every other field of the samples.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("idx", "sentence1", "sentence2")
}

In [93]:
# Number of token ids in each selected sample. The bare `samples` expression
# that used to precede this line was a no-op (only a cell's last expression is
# displayed) and has been removed.
temp = [len(ids) for ids in samples["input_ids"]]

In [94]:
min(temp)

32

In [95]:
max(temp)

67

In [103]:
# Collate the selected samples into a single batch.
batch = data_collator(samples)

# Show the shape of every field of the collated batch.
{field: tensor.shape for field, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [104]:
# Do the tasks