In [2]:
# https://www.youtube.com/watch?v=GSt00_-0ncQ
from transformers import pipeline
import torch
import torch.nn.functional as F

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Col

In [5]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

classifier = pipeline("sentiment-analysis", model=model_name)
results = classifier(["We are very happy to show you the transformers library.",
                      "We hope you don't hate it."])

for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9997994303703308}
{'label': 'NEGATIVE', 'score': 0.5308590531349182}


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
results = classifier(["We are very happy to show you the transformers library.",
                      "We hope you don't hate it."])

for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9997994303703308}
{'label': 'NEGATIVE', 'score': 0.5308590531349182}


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
results = classifier(["We are very happy to show you the transformers library.",
                      "We hope you don't hate it."])

for result in results:
    print(result)

tokens = tokenizer.tokenize("We are very happy to show you the transformers library.")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tokenizer("We are very happy to show you the transformers library.")

print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
print(f"Input IDs: {input_ids}") # you can pass input_ids to models later to do predictions manually

{'label': 'POSITIVE', 'score': 0.9997994303703308}
{'label': 'NEGATIVE', 'score': 0.5308590531349182}
Tokens: ['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', 'transformers', 'library', '.']
Token IDs: [2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012]
Input IDs: {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:

X_train = ["We are very happy to show you the transformers library.",
          "We hope you don't hate it."]

batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    # raw values
    outputs = model(**batch, labels=torch.tensor([1, 0]))
    print(outputs)
    # probabilities and predictions
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)
    # taking the prediction/index with the highest probability
    # returns a tensor with labels 1 and 0
    labels = torch.argmax(predictions, dim=1)
    print(labels)
    # convert each label to the actual class name, which is "POSITIVE" and "NEGATIVE"
    labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print(labels)

SequenceClassifierOutput(loss=tensor(0.3167), logits=tensor([[-4.1329,  4.3811],
        [ 0.0818, -0.0418]]), hidden_states=None, attentions=None)
tensor([[2.0060e-04, 9.9980e-01],
        [5.3086e-01, 4.6914e-01]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


In [12]:
# saving this
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [14]:
# german example

model_name = "oliverguhr/german-sentiment-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

X_train_german = [
    "Mit keinem guten Ergebnis",
    "Das war unfair",
    "Das ist gar nicht mal so gut",
    "nicht so schlecht wie erwartet",
    "Das war gut!",
    "Sie fährt ein grünes Auto."
]

batch = tokenizer(X_train_german, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

with torch.no_grad():
    outputs = model(**batch)
    label_ids = torch.argmax(outputs.logits, dim=1)
    print(label_ids)
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

Downloading (…)okenizer_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

{'input_ids': tensor([[    3,   304,  8524,  5569,  2011,     4,     0,     0,     0],
        [    3,   295,   185,   174,  8716,   124,     4,     0,     0],
        [    3,   295,   127,  2523,   149,  2723,   181,  1522,     4],
        [    3,   149,   181,  6975,   246,  6303,     4,     0,     0],
        [    3,   295,   185,  1522, 26982,     4,     0,     0,     0],
        [    3,   371,  9755,    39, 19044, 26902,  3512, 26914,     4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([1, 1, 1, 0, 0, 2])
['negative', 'negative', 'negative'

In [15]:
# another version of the code from above
# DOESN'T WORK

model_name = "oliverguhr/german-sentiment-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

X_train_german = [
    "Mit keinem guten Ergebnis",
    "Das war unfair",
    "Das ist gar nicht mal so gut",
    "nicht so schlecht wie erwartet",
    "Das war gut!",
    "Sie fährt ein grünes Auto."
]

# omit return_tensors="pt"
batch = tokenizer(X_train_german, padding=True, truncation=True, max_length=512)
# then do this
bacth = torch.tensor(batch["input_ids"])
print(batch)

with torch.no_grad():
    # don't unpack the dict
    outputs = model(batch)
    label_ids = torch.argmax(outputs.logits, dim=1)
    print(label_ids)
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

{'input_ids': [[3, 304, 8524, 5569, 2011, 4, 0, 0, 0], [3, 295, 185, 174, 8716, 124, 4, 0, 0], [3, 295, 127, 2523, 149, 2723, 181, 1522, 4], [3, 149, 181, 6975, 246, 6303, 4, 0, 0], [3, 295, 185, 1522, 26982, 4, 0, 0, 0], [3, 371, 9755, 39, 19044, 26902, 3512, 26914, 4]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}


TypeError: ignored

In [16]:
# https://huggingface.co/transformers/v4.0.1/custom_datasets.html
# fine tune our own model

from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# 1. prepare dataset
# 2. load pretrained Tokenizer, call it with dataset -> encoding
# 3. build PyTorch dataset with encodings
# 4. load pretrained model
# 5. a.) load Trainer and train it
#    b.) or use native PyTorch training pipeline

model_name = "distilbert-base-uncased"

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)

    return texts, labels

# Large Movie Review Dataset
# http://ai.stanford.edu/~amaas/data/sentiment
train_texts, train_labels = read_imdb_split("aclImdb/train")
test_texts, test_labels = read_imdb_split("aclImdb/test")

train_text, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# ensure that all of our sequences are padded to the same length and are truncated to be no longer than model's
# maximum input length. This will allow us to feed batches of sequences into the model at the same time.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir="./results",         # output directory
    num_train_epochs=2,             # totral number of training epochs
    per_device_train_batch_size=16, # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=500,               # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,             # learning rate
    weight_decay=0.01,              # strength of weight decay
    logging_dir="./logs",           # directory for storing logs
    logging_steps=10
)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model=model,                  # the instantiated Transformers model to be trained
    args=training_args,           # training arguents, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)

trainer.train()

FileNotFoundError: ignored

In [None]:
# or native PyTorch

from torch.utils.data import DataLoader
from transformers import AdamW

device  = torch.device("cuda") if torch.cuda.if_available() else torch.device("cpu")

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

num_train_epochs = 2
for epoch in range(num_train_epochs):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()