# Behind the pipeline (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [2]:
from transformers import pipeline

# No model is passed, so the pipeline falls back to its default
# sentiment-analysis checkpoint (named in the warning printed below).
classifier = pipeline(task="sentiment-analysis")

# A list input is scored as a batch: one {label, score} dict per sentence.
classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598046541213989},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

## Preprocessing with a Tokenizer

In [5]:
from transformers import AutoTokenizer

# Same checkpoint name the default sentiment-analysis pipeline reported using,
# so this is the tokenizer associated with that model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Tokenize both sentences as a single batch:
#   padding=True        -> shorter sentences are padded to the longest one
#   truncation=True     -> over-long sentences are cut to the model's limit
#   return_tensors="pt" -> the result holds PyTorch tensors
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


The output is a dictionary containing two keys:
- input_ids: Two rows of integers (one row per sentence) holding the unique identifiers of the tokens in each sentence.
- attention_mask: Marks which tokens the model should take into account (1) and which are padding to be ignored (0).

## Going through the Model

In [23]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# AutoModel loads the checkpoint without a task-specific head; its output
# (printed in the following cells) is a BaseModelOutput with last_hidden_state.
model = AutoModel.from_pretrained(checkpoint)

In [24]:
# Unpack the tokenizer output (input_ids, attention_mask) as keyword arguments.
outputs = model(**inputs)
# Shape is (batch size, sequence length, hidden size).
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [25]:
# Print the full output object to see every field the base model returns.
print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[-0.1798,  0.2333,  0.6321,  ..., -0.3017,  0.5008,  0.1481],
         [ 0.2758,  0.6497,  0.3200,  ..., -0.0760,  0.5136,  0.1329],
         [ 0.9046,  0.0985,  0.2950,  ...,  0.3352, -0.1407, -0.6464],
         ...,
         [ 0.1466,  0.5661,  0.3235,  ..., -0.3376,  0.5100, -0.0561],
         [ 0.7500,  0.0487,  0.1738,  ...,  0.4684,  0.0030, -0.6084],
         [ 0.0519,  0.3729,  0.5223,  ...,  0.3584,  0.6500, -0.3883]],

        [[-0.2937,  0.7283, -0.1497,  ..., -0.1187, -1.0227, -0.0422],
         [-0.2206,  0.9384, -0.0951,  ..., -0.3643, -0.6605,  0.2407],
         [-0.1536,  0.8988, -0.0728,  ..., -0.2189, -0.8528,  0.0710],
         ...,
         [-0.3017,  0.9002, -0.0200,  ..., -0.1082, -0.8412, -0.0861],
         [-0.3338,  0.9674, -0.0729,  ..., -0.1952, -0.8181, -0.0634],
         [-0.3454,  0.8824, -0.0426,  ..., -0.0993, -0.8329, -0.1065]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)


- Batch Size = 2 = Number of sequences processed at a time
- Sequence Length = 16 = Number of tokens in the numerical representation of each sequence (after padding)
- Hidden Size = 768 = Dimension of the vector the model outputs for each token

## Making sense out of the numbers

In [26]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# This class adds a sequence-classification head, so the output exposes
# `logits` (one score per label) rather than only hidden states.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Reuse the tokenized batch built in the preprocessing section.
outputs = model(**inputs)

In [27]:
# One row per input sentence, one column per label.
print(outputs.logits.shape)

torch.Size([2, 2])


In [28]:
# Logits are raw, unnormalized scores, not probabilities.
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [29]:
# The full output also carries loss, hidden_states and attentions fields
# alongside the logits.
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Postprocessing the output

In [16]:
import torch

# Normalize each row of logits into probabilities that sum to 1
# (softmax taken over the last dimension, i.e. across the labels).
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [17]:
# Maps each class index (a column of the logits/probabilities) to its label name.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

## Exercise

Choose two (or more) texts of your own and run them through the sentiment-analysis pipeline. Then replicate the steps you saw here yourself and check that you obtain the same results!

### Basic Usage

In [48]:
from transformers import pipeline

# Basic usage: run the default sentiment-analysis pipeline on custom sentences.
classifier = pipeline("sentiment-analysis")
sentences = [
    "Hi I am happy because I am learning NLP",
    "Hi I am sad because NLP is hard",
    "Hi I am stressed",
]
print(classifier(sentences))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.999839186668396}, {'label': 'NEGATIVE', 'score': 0.9954878687858582}, {'label': 'NEGATIVE', 'score': 0.9857692122459412}]




### Step by step usage

#### Step 1: Preprocessing with a tokenizer

In [73]:
from transformers import pipeline, AutoTokenizer

# Checkpoint the default sentiment-analysis pipeline reported falling back to.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sentences = [
    "Hi I am happy because I am learning NLP",
    "Hi I am sad because NLP is hard",
    "Hi I am stressed",
]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the batch: pad to the longest sentence, truncate anything over the
# model's limit, and return PyTorch tensors.
inputs = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  7632,  1045,  2572,  3407,  2138,  1045,  2572,  4083, 17953,
          2361,   102],
        [  101,  7632,  1045,  2572,  6517,  2138, 17953,  2361,  2003,  2524,
           102,     0],
        [  101,  7632,  1045,  2572, 13233,   102,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


#### Step 2: Processing with the model

In [71]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint together with its sequence-classification head; without
# the head the forward pass would only return hidden states, not logits.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

# Full output object first, followed by a blank line, then just the logits.
print(outputs, end="\n\n")
print(outputs.logits)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.1913,  4.5440],
        [ 2.9072, -2.4892],
        [ 2.2723, -1.9657]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

tensor([[-4.1913,  4.5440],
        [ 2.9072, -2.4892],
        [ 2.2723, -1.9657]], grad_fn=<AddmmBackward0>)


#### Step 3: Postprocessing Logits with SoftMax function

In [70]:
import torch

# Softmax over the last dimension turns each row of logits into probabilities.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

# For every sentence, take the index of the most probable class and translate
# it to its label name via the model config. argmax runs once over the whole
# batch; .tolist() yields plain Python ints, matching the keys of id2label.
predicted_labels = [
    model.config.id2label[label_index]
    for label_index in predictions.argmax(dim=-1).tolist()
]

print(f"Result: {predicted_labels}")

tensor([[1.6077e-04, 9.9984e-01],
        [9.9549e-01, 4.5122e-03],
        [9.8577e-01, 1.4231e-02]], grad_fn=<SoftmaxBackward0>)
Result: ['POSITIVE', 'NEGATIVE', 'NEGATIVE']
