# Install Transformers

In [22]:
!pip install transformers



# Import Tokenizer

In [23]:
from transformers import AutoTokenizer

# Choose a model checkpoint and load the matching tokenizer

In [24]:
# Name of the pretrained checkpoint; the same name is reused later to load the model.
checkpoint = "bert-base-uncased"
# Build the tokenizer object that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [25]:
# Displaying the tokenizer prints its key settings: checkpoint name, vocabulary size,
# maximum length, padding/truncation side and the special tokens.
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Passing String to Tokenizer

In [26]:
# Calling the tokenizer on a string returns a dictionary with three keys:
#   input_ids      - the integer id of each token. There are four ids for two words
#                    because the BERT special tokens [CLS] and [SEP] are added.
#   token_type_ids - identifies which sequence a token belongs to when more than one
#                    sequence is passed. Not every model's tokenizer returns this key.
#   attention_mask - tells the model which tokens take part in the computation.
tokenizer("hello world")

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [27]:
# Passing a second string encodes a sentence pair; in the output, token_type_ids
# switches from 0 to 1 where the second sequence starts.
tokenizer("hello world", " Hugging face")

{'input_ids': [101, 7592, 2088, 102, 17662, 2227, 102], 'token_type_ids': [0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

# Inspect the tokenization steps individually

In [28]:
# tokenize() shows the first step the tokenizer performs behind the scenes:
# it only splits the string into token strings (no ids, no special tokens).
tokens = tokenizer.tokenize("hello world")
tokens

['hello', 'world']

In [29]:
# Second step: convert each token string into its integer id.
ids = tokenizer.convert_tokens_to_ids(tokens)

In [30]:
# Display the ids; there is one id per token and no special-token ids yet.
ids

[7592, 2088]

In [31]:
# Reverse mapping: turn the integer ids back into token strings.
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [32]:
# decode() not only converts the ids back into tokens,
# it also joins those tokens into a single string.
tokenizer.decode(ids)

'hello world'

In [33]:
# encode() goes from string to ids in a single call. Unlike the manual steps,
# the result is wrapped in two extra ids (101 at the start, 102 at the end).
# Note that this rebinds `ids`.
ids = tokenizer.encode("hello world")
ids

[101, 7592, 2088, 102]

In [34]:
# Converting back to tokens shows that the two extra ids are the special tokens [CLS] and [SEP].
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'world', '[SEP]']

In [35]:
# decode() keeps the special tokens in the reconstructed string.
tokenizer.decode(ids)

'[CLS] hello world [SEP]'

# Creating Model Inputs

In [36]:
# Without return_tensors, the values in the returned dictionary are plain Python lists.
model_inputs = tokenizer("hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

# Import the AutoModelForSequenceClassification class

In [37]:
from transformers import AutoModelForSequenceClassification

In [38]:
# Load the model from the same checkpoint as the tokenizer.
# With no number of labels specified, the model is a binary classifier by default.
# The warning below reports that classifier.weight and classifier.bias are newly
# initialized, so the model has to be trained before its predictions can be used.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Intentional failure: model_inputs holds plain Python lists at this point, but the
# model needs PyTorch tensors. The error is caught and printed (instead of being left
# to propagate) so that "Restart & Run All" continues past this demonstration cell.
try:
    outputs = model(**model_inputs)
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'list' object has no attribute 'size'

In [40]:
# return_tensors="pt" makes the tokenizer return PyTorch tensors instead of lists;
# each tensor has a leading batch dimension of size 1 for the single input string.
model_inputs = tokenizer("hello world", return_tensors="pt")
model_inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [41]:
# ** unpacks the tokenizer output so that every key is passed as a keyword argument.
outputs = model(**model_inputs)
# The result contains logits, but they are meaningless because the model has not
# been trained yet. Two logits in the output show that it is a binary classifier.
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1507, -0.3637]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [42]:
# Reload the same checkpoint, this time asking for a three-class classification head.
# This rebinds `model`, replacing the binary classifier loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Run the same inputs through the three-class model.
outputs = model(**model_inputs)
# The output now has three logits, one per class.
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0957,  0.1155,  0.5319]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [44]:
# Indexing the output with 0 returns the logits tensor (loss is None here).
outputs[0]

tensor([[-0.0957,  0.1155,  0.5319]], grad_fn=<AddmmBackward0>)

In [45]:
# Detach the logits from the autograd graph, move them to the CPU and convert to a NumPy array.
outputs.logits.detach().cpu().numpy()

array([[-0.09570765,  0.11546849,  0.5318826 ]], dtype=float32)

# Create a batch of sentences and tokenize it

In [46]:
data = [
    "I like deep learning.",
    "Do you like deep learning too?",
]
# Intentional failure: the two sentences have different lengths, so without padding
# they cannot be stacked into one tensor. The error is caught and printed (instead of
# being left to propagate) so that "Restart & Run All" continues past this cell.
# model_inputs keeps its previous value when the call fails.
try:
    model_inputs = tokenizer(data, return_tensors="pt")
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [47]:
# padding=True pads the shorter sentence so both rows have the same length;
# truncation=True enables truncation of inputs that are too long for the model.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 2784, 4083, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 2784, 4083, 2205, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [48]:
# The first row ends with two 0 ids: padding added so it matches the length of the second row.
model_inputs["input_ids"]

tensor([[ 101, 1045, 2066, 2784, 4083, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 2784, 4083, 2205, 1029,  102]])

In [49]:
# In the attention mask, 1 marks real tokens and 0 marks padded positions.
model_inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])