pip install trl evaluate scikit-learn

In [1]:
import torch
import numpy as np
from tqdm import tqdm

### DATASET 

In [2]:
from datasets import load_dataset

# Load IMDB, give the columns descriptive names, keep only reviews whose
# length exceeds 200 (as measured by len()), and shuffle with a fixed seed.
ds = (
    load_dataset('imdb')
    .rename_columns({'text': 'review', 'label': 'sentiment'})
    .filter(lambda example: len(example["review"]) > 200, batched=False)
    .shuffle(seed=1)
)
ds

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 49776
    })
})

In [3]:
from torch.utils.data import DataLoader, TensorDataset

# Run-wide settings.
# NOTE(review): the DataLoader cell visible later in this notebook passes
# batch_size=8 literally instead of reading `batch_size` — confirm which is intended.
batch_size, n_test_examples = 16, 100

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# Map class names to integer ids in the order the dataset feature lists them,
# then build the inverse mapping from it.
label2id = {name: idx for idx, name in enumerate(ds['train'].features['sentiment'].names)}

id2label = {idx: name for name, idx in label2id.items()}

print(f'{id2label=}')
print(f'{label2id=}')

id2label={0: 'neg', 1: 'pos'}
label2id={'neg': 0, 'pos': 1}


### MODEL 

In [5]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [6]:
# Checkpoint name passed to from_pretrained() for both the model and the tokenizer.
checkpoint = "distilbert-base-uncased"

In [7]:
# Load the checkpoint with a two-class sequence-classification head and attach
# the label mappings to the model config.
classifier = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Move the model to the selected device; as the last expression, this also
# displays the module structure.
classifier.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### TOKENIZER 

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)


def preprocess_func(examples):
    """Tokenize the 'review' column of a batch of examples with truncation enabled.

    Called by `Dataset.map(..., batched=True)`; `examples` is indexed by
    column name, and the tokenizer's output is returned unchanged.
    """
    reviews = examples['review']
    return tokenizer(reviews, truncation=True)


# Applied to every split of `ds`.
tokenized_ds = ds.map(preprocess_func, batched=True)
tokenized_ds

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Map:   0%|          | 0/24872 [00:00<?, ? examples/s]

Map:   0%|          | 0/49776 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 49776
    })
})

In [9]:
from transformers import DataCollatorWithPadding

In [10]:
# Collator used as `collate_fn` by the DataLoaders; padding is deferred to
# batch time instead of being applied during tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### DATALOADER

In [11]:
# Inspect which columns the tokenized training split currently has.
tokenized_ds['train'].column_names

['review', 'sentiment', 'input_ids', 'attention_mask']

In [12]:
# Drop the raw text and rename the target column to "labels", then switch the
# dataset output format to torch tensors.
# NOTE: this cell is not idempotent — a second run fails because "review" and
# "sentiment" no longer exist.
tokenized_ds = (
    tokenized_ds
    .remove_columns(["review"])
    .rename_column("sentiment", "labels")
)
tokenized_ds.set_format("torch")
tokenized_ds['train'].column_names

['labels', 'input_ids', 'attention_mask']

In [13]:
from torch.utils.data import DataLoader

In [14]:
# Training batches are shuffled; evaluation batches keep the dataset order.
# Both loaders use batches of 8 and the padding collator.
train_dataloader = DataLoader(
    tokenized_ds["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_ds["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [15]:
# Number of batches in the training and evaluation loaders.
print(len(train_dataloader), len(eval_dataloader))

3112 3109


In [16]:
# Print the tensor shapes of the first five training batches.
# After the loop, `batch` stays bound to the last batch that was printed.
for i, batch in enumerate(train_dataloader, start=1):
    print({name: tensor.shape for name, tensor in batch.items()})
    if i == 5:
        break

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 409]), 'attention_mask': torch.Size([8, 409])}
{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 481]), 'attention_mask': torch.Size([8, 481])}
{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512])}
{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 373]), 'attention_mask': torch.Size([8, 373])}
{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512])}


In [17]:
# Display `batch`, the variable left bound by the preview loop in the previous cell.
batch

{'labels': tensor([0, 1, 1, 1, 0, 1, 0, 1]), 'input_ids': tensor([[  101,  5432,  1024,  ...,     0,     0,     0],
        [  101,  3185,  1000,  ...,     0,     0,     0],
        [  101, 11063, 10852,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  6925,  ...,     0,     0,     0],
        [  101,  1037,  5621,  ...,  3526, 18845,   102],
        [  101,  2028,  1997,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}

### checking model | batch dimensions

In [18]:
# Move the batch to the selected device. The result is only displayed, not
# assigned: the next cell passes `batch` itself to the model.
batch.to(device)

{'labels': tensor([0, 1, 1, 1, 0, 1, 0, 1], device='cuda:0'), 'input_ids': tensor([[  101,  5432,  1024,  ...,     0,     0,     0],
        [  101,  3185,  1000,  ...,     0,     0,     0],
        [  101, 11063, 10852,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  6925,  ...,     0,     0,     0],
        [  101,  1037,  5621,  ...,  3526, 18845,   102],
        [  101,  2028,  1997,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [19]:
# Single forward pass as a smoke test. The batch is unpacked into keyword
# arguments, so its 'labels' entry is passed to the model as well.
outputs = classifier(**batch)
outputs

SequenceClassifierOutput(loss=tensor(0.6967, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0200, -0.0193],
        [-0.0414, -0.0650],
        [-0.0099, -0.0366],
        [-0.0723, -0.0483],
        [-0.0472,  0.0099],
        [-0.0355, -0.0423],
        [-0.0129, -0.0504],
        [-0.0377, -0.0398]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### test model (eval_dataloader)

In [20]:
def compute_accuracy(predictions, labels):
    """Print and return the share of predictions that equal their labels.

    Args:
        predictions: predicted class ids; must support element-wise ``==``
            against ``labels`` with a result exposing ``.sum().item()``
            (the notebook passes torch tensors).
        labels: ground-truth class ids; ``len(labels)`` is the denominator.

    Returns:
        The accuracy as a Python float. The value is also printed with
        three decimal places.
    """
    n_correct = (predictions == labels).sum().item()
    n_total = len(labels)
    accuracy = n_correct / n_total
    print(f'{accuracy=:.3f}')
    return accuracy

In [21]:
# Accumulators for per-batch predictions and ground-truth labels.
preds, labels = [], []

classifier.eval()  # evaluation mode
with torch.no_grad():  # no gradients are needed for scoring
    for batch in tqdm(eval_dataloader):
        batch.to(device)  # return value unused; `batch` itself is passed on below
        # Predicted class = index of the largest logit in each row.
        pred = classifier(**batch).logits.argmax(dim=1)

        preds.append(pred)
        labels.append(batch['labels'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3109/3109 [03:56<00:00, 13.13it/s]


In [22]:
# Join the per-batch tensors into one tensor each and score them.
preds_, labels_ = torch.cat(preds), torch.cat(labels)

compute_accuracy(preds_, labels_);

accuracy=0.506


Точность модели до дообучения (классификационная голова инициализирована случайно) на задаче классификации примерно равна 51%, то есть на уровне случайного угадывания. 

### TRAIN

In [23]:
from torch.optim import AdamW

# AdamW over all model parameters with lr=5e-5; every other hyperparameter
# keeps its library default.
optimizer = AdamW(classifier.parameters(), lr=5e-5)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)

In [24]:
from transformers import get_scheduler

# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) x (number of epochs).
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

3112


In [25]:
# Rebinds the name `tqdm`, which the first cell imported from plain `tqdm`;
# cells executed after this one use the `tqdm.auto` variant.
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step; it is advanced manually below.
progress_bar = tqdm(range(num_training_steps))

# Switch to training mode (the earlier evaluation cell called classifier.eval()).
classifier.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains 'labels'; the loss is read from the model output.
        outputs = classifier(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the update, advance the learning-rate schedule,
        # then clear the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3112 [00:00<?, ?it/s]

In [26]:
# Same evaluation pass as before training; the accumulators are reset first.
preds, labels = [], []

classifier.eval()  # leave training mode before scoring
with torch.no_grad():  # no gradients are needed for scoring
    for batch in tqdm(eval_dataloader):
        batch.to(device)  # return value unused; `batch` itself is passed on below
        # Predicted class = index of the largest logit in each row.
        pred = classifier(**batch).logits.argmax(dim=1)

        preds.append(pred)
        labels.append(batch['labels'])

  0%|          | 0/3109 [00:00<?, ?it/s]

In [27]:
# Join the per-batch tensors into one tensor each and score the trained model.
preds_trained, labels_trained = torch.cat(preds), torch.cat(labels)

compute_accuracy(preds_trained, labels_trained);

accuracy=0.930


### pushing model to hub

In [28]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (shown as a widget in the notebook),
# run before the upload cells that follow.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
# Upload the fine-tuned model to the 'bert_imdb_classifier' repository on the Hub.
classifier.push_to_hub('bert_imdb_classifier')

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxfil333/bert_imdb_classifier/commit/16d119afd2d01596d9060aa41d44204641eda4f3', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='16d119afd2d01596d9060aa41d44204641eda4f3', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub('bert_imdb_classifier')

CommitInfo(commit_url='https://huggingface.co/maxfil333/bert_imdb_classifier/commit/8267472f4c78ed63c2222ff634c854c1a916e849', commit_message='Upload tokenizer', commit_description='', oid='8267472f4c78ed63c2222ff634c854c1a916e849', pr_url=None, pr_revision=None, pr_num=None)

Точность после обучения модели примерно равна 93%. 