In [62]:
# pip install transformers

In [1]:
from transformers import BertModel,BertTokenizer
import torch

## Download and load the pre-trained bert-base-uncased model
## Download and load the tokenizer that was used to pre-train the ber-base-uncased model

In [2]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


OSError: Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

## Preprocess the input before feeding it to bert

In [None]:
sentence = 'I love Paris'

In [None]:
tokens = tokenizer.tokenize(sentence)
tokens

In [67]:
tokens = ['[CLS]'] + tokens + ['[SEP]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]']

## Next we add PAD token to make size same

In [68]:
tokens = tokens + ['[PAD]'] + ['[PAD]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']

## Attention MASK

In [69]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]
attention_mask

[1, 1, 1, 1, 1, 0, 0]

## Token ID's

In [70]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[101, 1045, 2293, 3000, 102, 0, 0]

## Now we convert token_ids and attention_mask to tensors

In [71]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
token_ids

tensor([[ 101, 1045, 2293, 3000,  102,    0,    0]])

In [72]:
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
attention_mask

tensor([[1, 1, 1, 1, 1, 0, 0]])

## Getting the embeddings, we feed the token_ids and attention_mask to model get the embeddings.

In [73]:
hidden_rep,cls_head = model(token_ids,attention_mask=attention_mask)

## Note that model returns the output as a tuple with two values. The first value indicates the hidden state representation i.e **hidden_rep** and it consists of the representation of all the tokens obtained from the final encoder(EN 12) and the second value **cls_head** consists of the representation of the [CLS] token.  

In [74]:
hidden_rep

'last_hidden_state'