# `BERT in Detail`

In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 29.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [2]:
from transformers import BertModel,BertTokenizer
import torch

## Download and load the pre-trained bert-base-uncased model
## Download and load the tokenizer that was used to pre-train the ber-base-uncased model

In [3]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

## Preprocess the input before feeding it to bert

In [4]:
sentence = 'I love Paris'

In [5]:
tokens = tokenizer.tokenize(sentence)
tokens

['i', 'love', 'paris']

In [6]:
tokens = ['[CLS]'] + tokens + ['[SEP]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]']

## Next we add PAD token to make size same

In [7]:
tokens = tokens + ['[PAD]'] + ['[PAD]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']

## Attention MASK

In [8]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]
attention_mask

[1, 1, 1, 1, 1, 0, 0]

## Token ID's

In [9]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[101, 1045, 2293, 3000, 102, 0, 0]

## Now we convert token_ids and attention_mask to tensors

In [10]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
token_ids

tensor([[ 101, 1045, 2293, 3000,  102,    0,    0]])

In [11]:
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
attention_mask

tensor([[1, 1, 1, 1, 1, 0, 0]])

## Getting the embeddings, we feed the token_ids and attention_mask to model get the embeddings.

## Note that model returns the output as a tuple with two values. The first value indicates the hidden state representation i.e **hidden_rep** and it consists of the representation of all the tokens obtained from the final encoder(EN 12) and the second value **cls_head** consists of the representation of the [CLS] token.  
## hidden_rep contains the embedding of all the tokens in our input.
## [1,7,768] => [batch_size,sequence_length,hidden_size]


In [12]:
hidden_rep = model(token_ids,attention_mask=attention_mask)[0]

In [13]:
hidden_rep.shape

torch.Size([1, 7, 768])

In [14]:
cls_head = model(token_ids,attention_mask=attention_mask)[1]

In [15]:
cls_head.shape

torch.Size([1, 768])

## Representation of each token
### [CLS]

In [16]:
hidden_rep[0][0].shape

torch.Size([768])

### I

In [17]:
hidden_rep[0][1].shape

torch.Size([768])

### love

In [18]:
hidden_rep[0][3].shape

torch.Size([768])

## I this way we can obtain the contextual representation of all the tokens. This is basically the contextualized word embeddings of all the words in the given sentence.
## We learned that cls_head holds the aggregate representation of the sentence, so we can use cls_head as the representation of the sentence I love Paris.
## cls_head => [1,768] => [batch_size,hidden_state]

In [19]:
hidden_rep[0].shape

torch.Size([7, 768])

# `Extracting embeddings from all layers, not just final encoder`
## output_hidden_states=True help obtain embeddings from all the encoder layers
## It will return 3 values
### last_hidden_state: contains the representation of all the tokens obtained only from the final encoder.
### pooler_output: indicates the representation of the [CLS] token from the final encoder layer, which is further processed by a linear and tanh activation function.
### hidden_states: contains the representation of all the tokens obtained from all the encoder layers.

In [20]:
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
last_hidden_state = model(token_ids,attention_mask=attention_mask)[0]

In [22]:
pooler_output = model(token_ids,attention_mask=attention_mask)[1]

In [23]:
hidden_states = model(token_ids,attention_mask=attention_mask)[2]

In [24]:
last_hidden_state.shape

torch.Size([1, 7, 768])

## contains tuple containing 13 values holding the representation of all the encoder layers, from **h0 to h12**

In [25]:
len(hidden_states)

13

### h0

In [26]:
hidden_states[0].shape

torch.Size([1, 7, 768])

### h1

In [27]:
hidden_states[4].shape

torch.Size([1, 7, 768])

### h12

In [28]:
hidden_states[12].shape

torch.Size([1, 7, 768])

# `Fine tuning the BERT model`
1. Text Classification
2. NER
3. Natural language inference
4. Question answering

## 1. `Sentiment Analysis`

In [31]:
pip install nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 28.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 74.2 MB/s 
Installing collected packages: xxhash, nlp
Successfully installed nlp-0.4.0 xxhash-3.0.0


In [32]:
import transformers
import nlp

In [33]:
from transformers import BertForSequenceClassification,BertTokenizerFast,Trainer,TrainingArguments
from nlp import load_dataset
import numpy as np

In [34]:
!gdown https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')

Downloading...
From: https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
To: /content/imdbs.csv
  0% 0.00/132k [00:00<?, ?B/s]100% 132k/132k [00:00<00:00, 78.7MB/s]


Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]



Downloading and preparing dataset csv/default-11046c2826f07a01 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.


In [35]:
dataset = dataset.train_test_split(test_size=0.3)
dataset

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70),
 'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30)}

In [36]:
train = dataset['train']
test = dataset['test']

In [37]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [38]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [39]:
tokenizer(sentence)

{'input_ids': [101, 1045, 2293, 3000, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [40]:
tokenizer(['I love Paris','birds fly','snow fall'],padding=True,max_length=5)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


{'input_ids': [[101, 1045, 2293, 3000, 102], [101, 5055, 4875, 102, 0], [101, 4586, 2991, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0]]}

## Working with dataset

In [41]:
def preprocess(data):
  return tokenizer(data['text'],padding=True,truncation=True)

In [42]:
train_set = train.map(preprocess,batched=True,batch_size=len(train))
test_set = train.map(preprocess,batched=True,batch_size=len(test))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [43]:
train_set.set_format('torch',columns=['input_ids','attention_mask','label'])
test_set.set_format('torch',columns=['input_ids','attention_mask','label'])

## Train the model

In [44]:
batch_size = 8
epochs = 3
warmup_steps = 500
weight_decay = 0.01

In [45]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    logging_dir='./logs',
)

In [46]:
trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = test_set
)

In [47]:
trainer.train()

***** Running training *****
  Num examples = 70
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 27


Epoch,Training Loss,Validation Loss
1,No log,0.682399
2,No log,0.643641
3,No log,0.598948


***** Running Evaluation *****
  Num examples = 70
  Batch size = 8
***** Running Evaluation *****
  Num examples = 70
  Batch size = 8
***** Running Evaluation *****
  Num examples = 70
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=27, training_loss=0.6654167175292969, metrics={'train_runtime': 28.1171, 'train_samples_per_second': 7.469, 'train_steps_per_second': 0.96, 'total_flos': 55253321625600.0, 'train_loss': 0.6654167175292969, 'epoch': 3.0})

## Inference on a single text

In [135]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
sent = input()
class_names = ['negative','positive']
input_ids = torch.tensor(tokenizer(sent)['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(tokenizer(sent)['attention_mask']).unsqueeze(0)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
output = model(input_ids, attention_mask)
out = np.argmax(output.logits.softmax(dim=-1).tolist()[0])
if out == 0:
  print('negative sentiment')
else:
  print('positive sentiment')

good  good good
negative sentiment
