In [1]:
# !pip install datasets evaluate transformers[sentencepiece]

In [80]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", device=0)
classifier(
    [
    "I love cats",
    "I like dogs better than cats",
    "I do not like skunks"
    ]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9995536208152771},
 {'label': 'POSITIVE', 'score': 0.9973645806312561},
 {'label': 'NEGATIVE', 'score': 0.9924862384796143}]

In [81]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [82]:
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [83]:
raw_inputs = [
    "I love cats",
    "I like dogs better than cats",
    "I do not like skunks"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print("Output is a dictionary with a key for each input: input_ids and the attention mask")
print("input_ids contains a row of integers, one for each input, which are the uniqie IDs for the tokens in each input")
print("attention mask is TBD on info")
print(inputs)

Output is a dictionary with a key for each input: input_ids and the attention mask
input_ids contains a row of integers, one for each input, which are the uniqie IDs for the tokens in each input
attention mask is TBD on info
{'input_ids': tensor([[  101,  1045,  2293,  8870,   102,     0,     0,     0,     0],
        [  101,  1045,  2066,  6077,  2488,  2084,  8870,   102,     0],
        [  101,  1045,  2079,  2025,  2066, 15315, 16814,  2015,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [84]:
print("Go through the model")

Go through the model


In [85]:
from transformers import AutoModel
# download the checkpoint
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# instantiate a model
model = AutoModel.from_pretrained(checkpoint)

In [86]:
# use the same input sentences from above
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([3, 9, 768])


In [87]:
# same but using the model for sequence classification
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)
print("the 2 values are because we have 2 sentences and 2 labels")

torch.Size([3, 2])
the 2 values are because we have 2 sentences and 2 labels


In [88]:
# logits
print("get a value for each input sentence")
print("not probabilites but logits which are raw unnormalized scores from the models last layer")
print("Softmax layer normalizes them")
print(outputs.logits)

first 2 values are the for the 1st sentence, and the 2nd values are the 2nd sentence
not probabilites but logits which are raw unnormalized scores from the models last layer
Softmax layer normalizes them
tensor([[-3.7413,  3.9726],
        [-2.9056,  3.0305],
        [ 2.6433, -2.2401]], grad_fn=<AddmmBackward0>)


In [89]:
import torch
print("convert logits to probabilities")
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

convert logits to probabilities
tensor([[4.4638e-04, 9.9955e-01],
        [2.6355e-03, 9.9736e-01],
        [9.9249e-01, 7.5138e-03]], grad_fn=<SoftmaxBackward0>)


In [90]:
print("print the labels")
model.config.id2label

print the labels


{0: 'NEGATIVE', 1: 'POSITIVE'}