In [1]:
import torch
import wandb
import time
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
tqdm.pandas()

from datasets import load_dataset
from transformers import AutoTokenizer, pipeline

from trl import AutoModelForCausalLMWithValueHead
from trl import PPOTrainer

In [2]:
# обрезать отзыв до размера n
def cut_review(example):
    example["review"] = example["review"][0:5000]
    return example

In [3]:
ds = load_dataset('imdb', split='test')
ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'})
ds = ds.filter(lambda x: len(x["review"])>200, batched=False)
# ds = ds.map(cut_review, batched=False)
ds = ds.shuffle(seed=1)
ds

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 24872
})

In [4]:
ds.features

{'review': Value(dtype='string', id=None),
 'sentiment': ClassLabel(names=['neg', 'pos'], id=None)}

In [5]:
ds['review'][0]

"Wretched. Talk about botched. BEYOND THE POSEIDON ADVENTURE is bad in every respect. Salvagers Michael Caine and Karl Malden decide to tow the wreck of the eponymous ocean liner with a really creaky tug boat. They're challenged by ruthless Telly Savalas and his gang of machine-gun toting goons. This part sequel, part remake has Caine, Malden and ANOTHER group of Poseidon survivors making a similarly dangerous trek out of the sinking ship. Among this group are Shirley Jones, Slim Pickens, Peter Boyle, Shirley Knight and Slim Pickens. Jack Warden plays a blind man. Surely, you'll wish you were blind after seeing this mess. Sally Field is particularly annoying as a stowaway on board Caine's tug.<br /><br />Disaster master Irwin Allen not only produced this one, he decided to direct it as well."

In [6]:
n_test_examples = 100

In [7]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16
n_test_examples = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
labels = torch.tensor(ds['sentiment'][:n_test_examples]).to(device)
labels

tensor([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 0, 1], device='cuda:0')

### DistilBertForSequenceClassification + distilbert-base-uncased-finetuned-sst-2-english

In [9]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification


checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier_2 = DistilBertForSequenceClassification.from_pretrained(checkpoint).to(device)
tokenizer_2 = DistilBertTokenizer.from_pretrained(checkpoint)

In [10]:
inputs_2 = tokenizer_2(ds['review'][0:n_test_examples], padding=True, truncation=True, return_tensors='pt').to(device)
dataset_2 = TensorDataset(inputs_2['input_ids'], inputs_2['attention_mask'], labels)
dataloader_2 = DataLoader(dataset_2, batch_size=batch_size, shuffle=True)

In [11]:
%%time
with torch.no_grad():
    logits_2 = []
    labels = []
    for batch in dataloader_2:
        batch_inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        logits = classifier_2(**batch_inputs).logits
        logits_2.append(logits)
        labels.append(batch[2])
        
logits_2 = torch.cat(logits_2, dim=0).to('cpu')
labels = torch.cat(labels).to('cpu')

display(logits_2[0:5])
print()
display(labels[0:5])
print()

tensor([[ 3.8222, -3.2315],
        [-2.9453,  3.0676],
        [-3.7527,  3.9749],
        [-3.5014,  3.6809],
        [ 1.0436, -0.7575]])




tensor([0, 1, 1, 1, 1])


CPU times: total: 3.05 s
Wall time: 3.11 s


In [12]:
predicted_class_id_2 = logits_2.argmax(dim=1)
predicted_class_id_2

tensor([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 1, 1])

In [13]:
torch.tensor(ds['sentiment'][0:n_test_examples])

tensor([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 0, 1])

In [14]:
(predicted_class_id_2 == labels).sum().item() / len(predicted_class_id_2)

0.85

### DistilBertForSequenceClassification + distilbert-base-uncased

In [15]:
checkpoint = 'distilbert-base-uncased'
classifier_3 = DistilBertForSequenceClassification.from_pretrained(checkpoint)
tokenizer_3 = DistilBertTokenizer.from_pretrained(checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
inputs_3 = tokenizer_3(ds['review'][0:n_test_examples], padding=True, truncation=True, return_tensors='pt')
inputs_3

{'input_ids': tensor([[  101, 23277, 29574,  ...,     0,     0,     0],
        [  101,  6874,  9443,  ...,     0,     0,     0],
        [  101,  1996,  2200,  ...,     0,     0,     0],
        ...,
        [  101,  2065, 17551,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  1999,  1996,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [17]:
%%time
with torch.no_grad():
    logits_3 = classifier_3(**inputs_3).logits

CPU times: total: 2min 44s
Wall time: 34.3 s


In [18]:
predicted_class_id_3 = logits_3.argmax(dim=1)
predicted_class_id_3

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1])

In [19]:
(predicted_class_id_3 == torch.tensor(ds['sentiment'][0:n_test_examples])).sum().item() / len(predicted_class_id_3)

0.52

### AutoModelForSequenceClassification + kurianbenoy/distilbert-base-uncased-finetuned-imdb

In [20]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = 'kurianbenoy/distilbert-base-uncased-finetuned-imdb'
classifier_4 = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer_4 = AutoTokenizer.from_pretrained(checkpoint)

In [21]:
inputs_4 = tokenizer_4(ds['review'][0:n_test_examples], padding=True, truncation=True, return_tensors='pt')
inputs_4

{'input_ids': tensor([[  101, 23277, 29574,  ...,     0,     0,     0],
        [  101,  6874,  9443,  ...,     0,     0,     0],
        [  101,  1996,  2200,  ...,     0,     0,     0],
        ...,
        [  101,  2065, 17551,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  1999,  1996,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [22]:
%%time
with torch.no_grad():
    logits_4 = classifier_4(**inputs_4).logits

CPU times: total: 2min 44s
Wall time: 34.1 s


In [23]:
predicted_class_id_4 = logits_4.argmax(dim=1)
predicted_class_id_4

tensor([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 1, 0])

In [24]:
(predicted_class_id_4 == torch.tensor(ds['sentiment'][0:n_test_examples])).sum().item() / len(predicted_class_id_4)

0.92