In [12]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn.functional as F

In [13]:
# Default model is distilbert-base-uncased-finetuned-sst-2-english
classifier = pipeline("sentiment-analysis")
res = classifier(["We are very happy to show you the HuggingFace Transformer library.",
                  "We hope you don't hate it."])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [14]:
print(res)

[{'label': 'POSITIVE', 'score': 0.9998084902763367}, {'label': 'NEGATIVE', 'score': 0.5308645963668823}]


In [15]:
# Choose a specific model (same in this case, nothing should change)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
res = classifier(["We are very happy to show you the HuggingFace Transformer library.",
                  "We hope you don't hate it."])

In [17]:
print(res)

[{'label': 'POSITIVE', 'score': 0.9998084902763367}, {'label': 'NEGATIVE', 'score': 0.5308645963668823}]


In [20]:
# Converts into list of tokens
tokens = tokenizer.tokenize("We are very happy to show you the HuggingFace Transformer library")

# Turns it into a list of token IDs (number representation of the words)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Dictionary of expanded token_ids (inlcudes start, end, and other tokens) and attention mask
input_ids = tokenizer("We are very happy to show you the HuggingFace Transformer library")

In [22]:
print(f"   Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
print(f"Input IDs: {input_ids}")

   Tokens: ['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', 'hugging', '##face', 'transform', '##er', 'library']
Token IDs: [2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 17662, 12172, 10938, 2121, 3075]
Input IDs: {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 17662, 12172, 10938, 2121, 3075, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [24]:
X_train = ["We are very happy to show you the HuggingFace Transformer library.",
                  "We hope you don't hate it."]

batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)


{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 17662,
         12172, 10938,  2121,  3075,  1012,   102],
        [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,  1012,
           102,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}


In [26]:
with torch.no_grad(): # disable gradient updating and tracking for inference
    outputs = model(**batch) # unpacks the values of the dictionary
    print(outputs)
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)
    labels = torch.argmax(predictions, dim=1) # Could have just passed in outputs
    print(labels)
    labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.1426,  4.4177],
        [ 0.0818, -0.0418]]), hidden_states=None, attentions=None)
tensor([[1.9153e-04, 9.9981e-01],
        [5.3086e-01, 4.6914e-01]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


In [27]:
# Save pretrained model
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [28]:
# Load pretrained model
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [29]:
# Use models from huggingface.co/models
model_name = "oliverguhr/german-sentiment-bert"

# Load and save model
# tokenizer.save_pretrained(save_directory)
# model.save_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [31]:
X_train = ["Mit keinem guten Ergenbis", "Das war unfair", "Das ist gar nicht mal so gut",
         "nicht so schlecht wie erwartet", "Das war gut!", "Sie fahrt ein grunes Auto."]

batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

{'input_ids': tensor([[  101, 10210, 26679, 25832,  9535,  2368,  9413,  6914, 18477,   102,
             0,     0,     0,     0],
        [  101,  8695,  2162, 15571,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  8695, 21541, 11721,  2099, 27969, 11039, 15451,  2061,  9535,
           102,     0,     0,     0],
        [  101, 27969, 11039,  2061,  8040,  7317, 15937,  2102, 15536,  2063,
          9413, 18367,  3388,   102],
        [  101,  8695,  2162,  9535,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  9033,  2063,  6904,  8093,  2102, 16417, 24665, 26639,  8285,
          1012,   102,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    

In [34]:
with torch.no_grad():
    outputs = model(**batch)
    label_ids = torch.argmax(outputs.logits, dim=1)
    print(label_ids)
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

tensor([0, 0, 0, 0, 0, 0])
['NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE']
