In [None]:
# if u run this notebook in collab, install this dependences

# !pip install transformers
# !pip install datasets
# !pip install wandb

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from datasets import Dataset, load_dataset
import wandb
from datetime import datetime
import pandas as pd
import datasets

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [2]:
dataset = load_dataset("glue", "mrpc")

In [3]:
def preproces_dataset(raw_dataset):
  df = pd.DataFrame(raw_dataset)
  one_hot_encoded = pd.get_dummies(df['label'], prefix='label')
  df = pd.concat([df, one_hot_encoded], axis=1)
  df.drop('label', axis=1, inplace=True)
  df.drop('idx', axis=1, inplace=True)
  new_dataset = Dataset.from_pandas(df)
  return new_dataset

def preproces_dict_dataset(raw_dataset):
  train_dataset = raw_dataset['train']
  validation_dataset = raw_dataset['validation']
  test_dataset = raw_dataset['test']

  train_dataset = preproces_dataset(train_dataset)
  validation_dataset = preproces_dataset(validation_dataset)
  test_dataset = preproces_dataset(test_dataset)

  dataset = datasets.dataset_dict.DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
  })
  return dataset


In [4]:
dataset = preproces_dict_dataset(dataset)

In [5]:
labels = ['label_0', 'label_1']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [6]:
MODEL_NAME = "google/mobilebert-uncased"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

if model.config.pad_token_id == None:
    model.config.pad_token_id = model.config.eos_token_id
    model.config.pad_token = tokenizer.pad_token

In [None]:
# # let's explore internal structure of model.config
# print(model.config)

BertConfig {
  "_name_or_path": "huawei-noah/TinyBERT_General_4L_312D",
  "attention_probs_dropout_prob": 0.1,
  "cell": {},
  "classifier_dropout": null,
  "emb_size": 312,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "id2label": {
    "0": "label_0",
    "1": "label_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1200,
  "label2id": {
    "label_0": 0,
    "label_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "pre_trained": "",
  "problem_type": "multi_label_classification",
  "structure": [],
  "transformers_version": "4.40.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [10]:
def preprocess_data(examples):
  # take a batch of texts
  text = [examples["sentence1"], examples["sentence2"]]
  # encode them
  encoding = tokenizer(*text, padding="max_length", truncation=True, max_length=128)
  n_samples, sample_len = np.shape(np.array(encoding['input_ids']))
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((n_samples, len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [11]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map: 100%|██████████| 3668/3668 [00:00<00:00, 5367.83 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 4815.52 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4532.23 examples/s]


In [None]:
# # Alternative way to preprocess data
# def tokenize_function(example):
#     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# # here we use datacollator, that means, that batching and paddig will be applied to the dataset during the training
# tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [12]:
example = encoded_dataset['train'][0]
print(example.keys())
print(tokenizer.decode(example['input_ids']))
print(example['labels'])
print([id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
[CLS] amrozi accused his brother, whom he called " the witness ", of deliberately distorting his evidence. [SEP] referring to him as only " the witness ", amrozi accused his brother of deliberately distorting his evidence. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[0.0, 1.0]
['label_1']


In [13]:
encoded_dataset.set_format("torch")

In [14]:
type(encoded_dataset['train']['input_ids'])

torch.Tensor

In [15]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    f"{MODEL_NAME}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

In [16]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

TODO: try to use raw model to predict some label

In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
date = datetime.strftime(datetime.now(), "%d.%m.%Y-%H.%M.%S")
wandb.init(
    # set the wandb project where this run will be logged
    project="nlp-classifier",
    name=f"{MODEL_NAME}-cls-{date}",

    # # track hyperparameters and run metadata
    # config={
    # "learning_rate": 0.02,
    # "architecture": "CNN",
    # "dataset": "CIFAR-100",
    # "epochs": 10,
    # }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrubikpf2002[0m ([33mrubikpf2002-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
trainer.train()

                                                    
 10%|█         | 459/4590 [13:21<1:34:19,  1.37s/it]

{'eval_loss': 0.4312116503715515, 'eval_f1': 0.7682926829268293, 'eval_roc_auc': 0.767156862745098, 'eval_accuracy': 0.75, 'eval_runtime': 24.6327, 'eval_samples_per_second': 16.563, 'eval_steps_per_second': 2.07, 'epoch': 1.0}


 11%|█         | 500/4590 [14:27<1:46:25,  1.56s/it] 

{'loss': 22112.762, 'grad_norm': 5.626082420349121, 'learning_rate': 1.7821350762527233e-05, 'epoch': 1.09}


                                                    
 20%|██        | 918/4590 [26:03<1:30:39,  1.48s/it]

{'eval_loss': 0.33675694465637207, 'eval_f1': 0.8606658446362515, 'eval_roc_auc': 0.8615196078431373, 'eval_accuracy': 0.8480392156862745, 'eval_runtime': 24.266, 'eval_samples_per_second': 16.814, 'eval_steps_per_second': 2.102, 'epoch': 2.0}


 22%|██▏       | 1000/4590 [28:14<1:35:52,  1.60s/it]

{'loss': 0.4006, 'grad_norm': 24.380786895751953, 'learning_rate': 1.5642701525054468e-05, 'epoch': 2.18}


                                                     
 30%|███       | 1377/4590 [38:28<1:19:47,  1.49s/it]

{'eval_loss': 0.4792243242263794, 'eval_f1': 0.8410757946210269, 'eval_roc_auc': 0.8406862745098039, 'eval_accuracy': 0.8357843137254902, 'eval_runtime': 23.0181, 'eval_samples_per_second': 17.725, 'eval_steps_per_second': 2.216, 'epoch': 3.0}


 33%|███▎      | 1500/4590 [41:40<1:17:17,  1.50s/it]

{'loss': 0.2802, 'grad_norm': 1.8744786977767944, 'learning_rate': 1.3464052287581701e-05, 'epoch': 3.27}


                                                     
 40%|████      | 1836/4590 [51:43<1:04:55,  1.41s/it]

{'eval_loss': 0.6627210974693298, 'eval_f1': 0.8480392156862745, 'eval_roc_auc': 0.8480392156862746, 'eval_accuracy': 0.8455882352941176, 'eval_runtime': 25.5036, 'eval_samples_per_second': 15.998, 'eval_steps_per_second': 2.0, 'epoch': 4.0}


 44%|████▎     | 2000/4590 [56:47<1:38:19,  2.28s/it]

{'loss': 0.2361, 'grad_norm': 0.20242375135421753, 'learning_rate': 1.1285403050108935e-05, 'epoch': 4.36}


                                                       
 50%|█████     | 2295/4590 [1:08:24<1:13:48,  1.93s/it]

{'eval_loss': 0.6542292833328247, 'eval_f1': 0.8760736196319019, 'eval_roc_auc': 0.8762254901960784, 'eval_accuracy': 0.875, 'eval_runtime': 35.0215, 'eval_samples_per_second': 11.65, 'eval_steps_per_second': 1.456, 'epoch': 5.0}


 54%|█████▍    | 2500/4590 [1:16:13<59:55,  1.72s/it]  

{'loss': 0.4516, 'grad_norm': 82.70254516601562, 'learning_rate': 9.106753812636166e-06, 'epoch': 5.45}


                                                       
 60%|██████    | 2754/4590 [1:24:22<56:56,  1.86s/it]

{'eval_loss': 0.7753095626831055, 'eval_f1': 0.8578431372549019, 'eval_roc_auc': 0.857843137254902, 'eval_accuracy': 0.8578431372549019, 'eval_runtime': 32.0033, 'eval_samples_per_second': 12.749, 'eval_steps_per_second': 1.594, 'epoch': 6.0}


 65%|██████▌   | 3000/4590 [1:32:33<47:15,  1.78s/it]  

{'loss': 0.144, 'grad_norm': 17.5367374420166, 'learning_rate': 6.928104575163399e-06, 'epoch': 6.54}


                                                       
 70%|███████   | 3213/4590 [1:40:02<43:38,  1.90s/it]

{'eval_loss': 0.9489344954490662, 'eval_f1': 0.8421052631578947, 'eval_roc_auc': 0.8419117647058824, 'eval_accuracy': 0.8406862745098039, 'eval_runtime': 33.994, 'eval_samples_per_second': 12.002, 'eval_steps_per_second': 1.5, 'epoch': 7.0}


 76%|███████▋  | 3500/4590 [1:50:14<42:16,  2.33s/it]  

{'loss': 0.0958, 'grad_norm': 0.09160790592432022, 'learning_rate': 4.749455337690632e-06, 'epoch': 7.63}


                                                     
 80%|████████  | 3672/4590 [1:55:37<20:09,  1.32s/it]

{'eval_loss': 1.0753490924835205, 'eval_f1': 0.8501228501228502, 'eval_roc_auc': 0.8504901960784313, 'eval_accuracy': 0.8480392156862745, 'eval_runtime': 23.9862, 'eval_samples_per_second': 17.01, 'eval_steps_per_second': 2.126, 'epoch': 8.0}


 87%|████████▋ | 4000/4590 [2:04:07<14:39,  1.49s/it]  

{'loss': 0.078, 'grad_norm': 0.024523083120584488, 'learning_rate': 2.570806100217865e-06, 'epoch': 8.71}


                                                     
 90%|█████████ | 4131/4590 [2:07:54<09:57,  1.30s/it]

{'eval_loss': 1.0042163133621216, 'eval_f1': 0.8539877300613496, 'eval_roc_auc': 0.8541666666666666, 'eval_accuracy': 0.8529411764705882, 'eval_runtime': 22.615, 'eval_samples_per_second': 18.041, 'eval_steps_per_second': 2.255, 'epoch': 9.0}


 98%|█████████▊| 4500/4590 [2:17:56<02:22,  1.58s/it]  

{'loss': 0.056, 'grad_norm': 0.027429426088929176, 'learning_rate': 3.921568627450981e-07, 'epoch': 9.8}


                                                     
100%|██████████| 4590/4590 [2:20:40<00:00,  1.38s/it]

{'eval_loss': 1.0514111518859863, 'eval_f1': 0.8553921568627451, 'eval_roc_auc': 0.8553921568627451, 'eval_accuracy': 0.8553921568627451, 'eval_runtime': 23.9285, 'eval_samples_per_second': 17.051, 'eval_steps_per_second': 2.131, 'epoch': 10.0}


100%|██████████| 4590/4590 [2:20:40<00:00,  1.84s/it]

{'train_runtime': 8440.8931, 'train_samples_per_second': 4.346, 'train_steps_per_second': 0.544, 'train_loss': 2408.988544213642, 'epoch': 10.0}





TrainOutput(global_step=4590, training_loss=2408.988544213642, metrics={'train_runtime': 8440.8931, 'train_samples_per_second': 4.346, 'train_steps_per_second': 0.544, 'total_flos': 575037137203200.0, 'train_loss': 2408.988544213642, 'epoch': 10.0})

In [20]:
# Закрыть сессию WandB
wandb.finish()

0,1
eval/accuracy,▁▆▆▆█▇▆▆▇▇
eval/f1,▁▇▆▆█▇▆▆▇▇
eval/loss,▂▁▂▄▄▅▇█▇█
eval/roc_auc,▁▇▆▆█▇▆▆▇▇
eval/runtime,▂▂▁▃█▆▇▂▁▂
eval/samples_per_second,▆▇█▆▁▂▁▇█▇
eval/steps_per_second,▆▇█▆▁▂▁▇█▇
train/epoch,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇███
train/grad_norm,▁▃▁▁█▂▁▁▁

0,1
eval/accuracy,0.85539
eval/f1,0.85539
eval/loss,1.05141
eval/roc_auc,0.85539
eval/runtime,23.9285
eval/samples_per_second,17.051
eval/steps_per_second,2.131
total_flos,575037137203200.0
train/epoch,10.0
train/global_step,4590.0


In [None]:
# model.save_pretrained("./seved_model")

In [None]:
# text = dataset['test'][18]['sentence1'] + ' ' + dataset['test'][18]['sentence2']
# true_label = [dataset['test'][18]['label_0'], dataset['test'][18]['label_1']]
# if true_label[0]:
#   true_label = 'label_0'
# else:
#   true_label = 'label_1'
# print(text)
# encoding = tokenizer(text, return_tensors="pt")
# encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
# print(encoding)
# outputs = trainer.model(**encoding)
# logits = outputs.logits
# print(logits.shape)
# sigmoid = torch.nn.Sigmoid()
# probs = sigmoid(logits.squeeze().cpu())
# print(probs)
# predictions = np.zeros(probs.shape)
# predictions[np.where(probs >= 0.5)] = 1

# # turn predicted id's into actual label names
# predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
# print(predicted_labels)
# print(true_label)

In [21]:
# устанавливаем модель в inference режим
model.eval()

MobileBertForSequenceClassification(
  (mobilebert): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0-23): 24 x MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_fe

In [22]:
test_dataset = encoded_dataset['test'] # заметим, что здесь test_dataset уже подготовлен к инференсу см. выше

In [23]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(test_dataset, batch_size=16)

In [None]:
for batch in test_dataloader:
    print(batch)
    break

In [24]:
from sklearn.metrics import classification_report

In [25]:
predictions = []
true_labels = []

for batch in test_dataloader:
  # Move batch data to the same device as the model
  batch = {k: v.to(model.device) for k, v in batch.items()}

  with torch.no_grad():
    outputs = model(**batch)

  logits = outputs.logits
  preds = np.argmax(logits.cpu().numpy(), axis=1)
  predictions.extend(preds)

  true_batch_labels = batch['labels'].cpu().numpy()
  true_labels.extend(true_batch_labels.T[1])  # Добавляем истинные метки для оценки

# 5. Оценка качества
f1_micro_average = f1_score(y_true=true_labels, y_pred=predictions, average='micro')
roc_auc = roc_auc_score(true_labels, predictions, average = 'micro')
accuracy = accuracy_score(true_labels, predictions)
# return as dictionary
metrics = {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}
report = classification_report(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print(metrics)
print(report)

Accuracy: 0.8452173913043478
{'f1': 0.8452173913043478, 'roc_auc': 0.8231017880253286, 'accuracy': 0.8452173913043478}
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77       578
         1.0       0.88      0.89      0.88      1147

    accuracy                           0.85      1725
   macro avg       0.83      0.82      0.83      1725
weighted avg       0.84      0.85      0.84      1725

