In [None]:
# Following NLP Course here:
# https://huggingface.co/learn/nlp-course/chapter2/2

In [1]:
from transformers import pipeline

In [2]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [4]:
classifier(
    [
        "I've been waiting for a Hugging Face course",
        "I hate this so much!",
        "Just another sentence"
    ]
)

[{'label': 'POSITIVE', 'score': 0.9930523633956909},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455},
 {'label': 'NEGATIVE', 'score': 0.998630166053772}]

In [7]:
# Let's dig into what happens

# STEP 1 - Tokenizer
from transformers import AutoTokenizer

In [8]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
raw_inputs = [
    "I've been waiting for a Hugging Face course",
    "I hate this so much!",
    "Just another sentence"
]

In [16]:
inputs = tokenizer(raw_inputs,
                   padding=True, # Using same length of longest sentence
                   truncation=True, # Truncate at the longest sentence
                   return_tensors="pt" # Use pytorch
                  )
inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,  2227,
          2607,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0],
        [  101,  2074,  2178,  6251,   102,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [17]:
# STEP 2 - Model
from transformers import AutoModel

In [18]:
model = AutoModel.from_pretrained(checkpoint)

In [28]:
outputs = model(**inputs)
# printing last hidden layer shape.
outputs.last_hidden_state.shape

torch.Size([3, 12, 768])

In [29]:
# Using Model for Sequence Classification
from transformers import AutoModelForSequenceClassification

In [30]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
# printing last hidden layer shape
outputs.logits.shape

torch.Size([3, 2])

In [32]:
outputs.logits

tensor([[-2.4317,  2.5307],
        [ 4.1692, -3.3464],
        [ 3.5786, -3.0130]], grad_fn=<AddmmBackward0>)

In [33]:
# STEP 3 - Post Processing
import torch

In [35]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions

tensor([[6.9477e-03, 9.9305e-01],
        [9.9946e-01, 5.4418e-04],
        [9.9863e-01, 1.3699e-03]], grad_fn=<SoftmaxBackward0>)

In [36]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}