#### Behind the pipeline

In [4]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
classifier(['I have been waiting for the next episode of Harry Potter.',
           'I hate the waiting period.'])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9981814622879028},
 {'label': 'NEGATIVE', 'score': 0.9983410835266113}]

3 steps the pipeline does internally:
1. preprocessing
2. passing preprocessed inputs
3. postprocessing

### Preprocessing:

Converting words to its numerical form using tokenizer.
1. Splitting the setence into words (tokens)
2. Each token is given a input id
3. Adding additional input id for better understanding

In [15]:
#Loading a tokenizer
from transformers import AutoTokenizer

#Load the weights. Checkpoints are basically the weights that is used with any model
checkpoints = "distilbert-base-uncased-finetuned-sst-2-english"
#Load the tokenizer using from_pretrained from AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoints)

In [16]:
#Now pass a sentence
inputs = ['Harry Potter book is way better than the movie.', 'In the movie there are so many contexts which are missed out']
tokenize = tokenizer(inputs, padding = True, return_tensors = 'pt', truncation = True) #return_tensor makes sure the input passed are tensor.
#'pt' for pyTorch tensors
#'tf' for tensorflow tensors
#'np' for numpy arrays
print(tokenize)

#Note that in output we have 2 keys, input_ids and attention_mask

{'input_ids': tensor([[  101,  4302, 10693,  2338,  2003,  2126,  2488,  2084,  1996,  3185,
          1012,   102,     0,     0],
        [  101,  1999,  1996,  3185,  2045,  2024,  2061,  2116, 18046,  2029,
          2024,  4771,  2041,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [28]:
#Loading the model
#This is the same as loading a tokenizer
from transformers import AutoModel

model = AutoModel.from_pretrained(checkpoints)

In [29]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

The model returns a high dimensional vector


A high dimensional vector has 3 properties:
1. Batch size
2. Sequence length
3. Hidden size

In [14]:
output = model(**tokenize)
print(output[0].shape)

torch.Size([2, 14, 768])


There are many different architectures in transformer:
1. ForSequenceClassification
2. ForTokenClassification
3. ForQuestionAnswering
4. ForMultipleChoice
5. ForCasualLM
6. ForMaskedLM


For eg: If we need to classify our sequence we will use AutoModelForSequenceClassification

### Model Head

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoints)
output = model(**tokenize)
print(output[0].shape)

torch.Size([2, 2])


### Post processing the outputs

In [18]:
output.logits

tensor([[-3.6463,  4.0243],
        [ 4.0716, -3.3318]], grad_fn=<AddmmBackward0>)

Note: The above code produces logits not probabilities because for probabilities the sum has to be 1 for all. Hence, we will make it pass thrugh a softmax layer.

In [20]:
import torch

predictions = torch.nn.functional.softmax(output.logits, dim = 1)
print(predictions)

tensor([[4.6611e-04, 9.9953e-01],
        [9.9939e-01, 6.0877e-04]], grad_fn=<SoftmaxBackward0>)


Note here the predictions are for 2 sentences

For first sentence:
NEGATIVE: 0.00046, POSITIVE: 0.999

For second sentence:
NEGATIVE: 0.09993, POSITIVE = 0.00608

In [21]:
#To find the label use id2label
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

# Creating and Using Models

The configuration tells abut the model's architecture

In [30]:
from transformers import BertModel, BertConfig

#Loading the config
config = BertConfig()

#Building the model from config
model = BertModel(config)
print(config)

#For this example we have to train our model from scratch which is unnecessary efforts

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [32]:
#To load a already trained model we use from_pretrained() method
from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [33]:
#Save the methods
model.save_pretrained('directory_on_computer')

# Encoding

In [34]:
#Below is a representation of how encoding works
#Encoding is the process of converting texts into numbers
from transformers import AutoTokenizer

sequence = 'Harry Potter is my favorite movie'
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize(sequence)
print(tokens)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Harry', 'Potter', 'is', 'my', 'favorite', 'movie']


In [35]:
#Convert the tokens into input_ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[3466, 11434, 1110, 1139, 5095, 2523]


In [36]:
#Decode - convert numbers into text
decoded_string = tokenizer.decode([3466, 11434, 1110, 1139, 5095, 2523])
print(decoded_string)

Harry Potter is my favorite movie


Note: Decode not only converts the input_ids back to its tokens but also groups together the same wrd to make the sentence more readable

# Handling multiple sequences

When handling multiple sequences we have to pass the padding and the attention mask along with it

In [38]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

#Padding the inputs
batched_ids = [
    [200,200,200],
    [200,200]
]

#Give the padding to make the length of the 2 sequences equal
padding_id = 100

batched_ids = [
    [200,200,200],
    [200,200,padding_id]
]

sequence1_ids = [[200,200,200]]
sequence2_ids = [[200,200]]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 0.9907, -0.9139]], grad_fn=<AddmmBackward0>)


Note that the logits of second row is not equal to last row. This is because when attention mask is not provided the mdel will attend to all tokens

In [39]:
batched_ids = [
    [200,200,200],
    [200,200,padding_id]
]

attention_mask = [
    [1,1,1],
    [1,1,0]
]

model(torch.tensor(batched_ids), attention_mask = torch.tensor(attention_mask)).logits

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

Now note that the values are same

In [41]:
sequence = 'Voldemort is the only villian with no nose'

tokenize = tokenizer(sequence)
print(tokenize['input_ids'])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 5285, 3207, 5302, 5339, 2003, 1996, 2069, 6819, 6894, 2319, 2007, 2053, 4451, 102]
[5285, 3207, 5302, 5339, 2003, 1996, 2069, 6819, 6894, 2319, 2007, 2053, 4451]


In [43]:
print(tokenizer.decode(tokenize['input_ids']))
print(tokenizer.decode(ids))

[CLS] voldemort is the only villian with no nose [SEP]
voldemort is the only villian with no nose


Note that since the model is pretrained to add [CLS] and [SEP] hence the tokens