<a href="https://colab.research.google.com/github/medbar/maga_sis/blob/main/2/NLP/BERT_clf_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Полезно: [Fine-tuning a pretrained model](https://huggingface.co/transformers/training.html)

* Если модель есть в репозитории Huggingface, её можно загрузить и проинициализировать по названию.
Например:
https://huggingface.co/DeepPavlov/rubert-base-cased
```
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=2)
```



In [None]:
# Install the Hugging Face `transformers` package; the tokenizer, model and Trainer
# classes imported later in this notebook come from it.
!pip install transformers

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Name of the pretrained checkpoint shared by the tokenizer and the model below.
checkpoint = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,  # number of classes in the target dataset
)

# Prepare data
[Loading a dataset](https://huggingface.co/docs/datasets/quicktour.html#loading-a-dataset)

In [None]:
!pip install datasets  # also a Hugging Face library

In [None]:
### Generate a trivial toy dataset

import random

# Fixed seed so the generated texts (and therefore the train/eval/test files
# built from them) are identical on every "Restart & Run All".
SEED = 42
rng = random.Random(SEED)  # private generator: leaves the global `random` state untouched

data = {0: [], 1: []}  # class 0 holds the "ham" texts, class 1 the "spam" texts

for _ in range(1000):
    label = rng.randint(0, 1)
    word = 'spam' if label == 1 else 'ham'
    # A sample is its class word repeated 10..1000 times, separated by spaces.
    text = " ".join([word] * rng.randint(10, 1000))

    data[label].append(text)

for value in data.values():  # shuffle each class before splitting
    rng.shuffle(value)

In [None]:
# Choose the proportions for splitting into the train, eval and test subsets

from math import ceil, isclose

partitions = ("train", "eval", "test")
split_sizes = (0.8, 0.1, 0.1)
assert len(split_sizes) == len(partitions)
# Compare with a tolerance: an exact `== 1` check can reject perfectly valid
# fractions, e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
assert isclose(sum(split_sizes), 1.0)

# Take the required share of every class separately
# (like sklearn.model_selection.train_test_split with the `stratify` argument)

data_split = {key: {} for key in partitions}
for no, partition in enumerate(partitions):
    for label, samples in data.items():
        sample_count = len(samples)

        # Cumulative fractions give the slice borders; the `stop` of one partition
        # is computed by the same expression as the `start` of the next one, so
        # consecutive partitions neither overlap nor leave a gap.
        start = ceil(sum(split_sizes[:no]) * sample_count)
        stop = ceil(sum(split_sizes[:no+1]) * sample_count)

        data_split[partition][label] = samples[start:stop]

In [None]:
### Write every partition to its own CSV file

import csv

for partition in ["train", "eval", "test"]:
    # newline="" lets the csv module control line endings itself (without it,
    # platforms that translate "\n" to "\r\n" produce blank rows between records);
    # the explicit encoding keeps the files independent of the system locale.
    # Plain "w" is enough: the file is only written, never read back here.
    with open(f"dummy_data_{partition}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        writer.writerow(['text', 'label'])  # header row
        for label, texts in data_split[partition].items():
            writer.writerows((text, label) for text in texts)

In [None]:
from datasets import load_dataset

# Map every split name to the CSV file written for it earlier in the notebook.
data_files = {
    'train': 'dummy_data_train.csv',
    'eval': 'dummy_data_eval.csv',
    'test': 'dummy_data_test.csv',
}
dataset = load_dataset('csv', data_files=data_files)

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: a mapping with a "text" entry; that entry is handed to the
            tokenizer unchanged.

    Returns:
        Whatever ``tokenizer`` returns for those texts, called with padding to
        "max_length" and truncation enabled.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply tokenize_function to the loaded dataset; batched=True asks `map` to hand it
# batches of examples rather than one example at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Show the tokenized training split as the cell output.
tokenized_dataset["train"]

# Train
[Trainer](https://huggingface.co/transformers/main_classes/trainer.html)

[TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments)

In [None]:
from transformers import TrainingArguments


# The original note describes these values as the defaults — TODO confirm against
# the installed transformers version. Feel free to experiment.
training_args = TrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir='checkpoints',
    logging_dir='logs',
    save_total_limit=20,
    # --- optimisation schedule ---
    num_train_epochs=100,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=0,
    weight_decay=0.0,
    # --- optional: validate and save every n-th step ---
    # (the original note says that otherwise a checkpoint is saved after each
    # epoch — TODO confirm for the installed transformers version)
    # evaluation_strategy="steps",
    # save_strategy="steps",
    # eval_steps=50,
    # save_steps=50,
)

In [None]:
import numpy as np
from datasets import load_metric

# Names of the metrics reported by compute_metrics.
metric_names = ["accuracy", "precision", "recall", "f1"]
# One loaded metric object per name, keyed by that name.
# NOTE(review): `load_metric` may be deprecated or removed in newer `datasets`
# releases — confirm against the installed version.
metrics = dict(zip(metric_names, map(load_metric, metric_names)))

def compute_metrics(eval_pred):
    """Turn raw model outputs into one flat ``{metric_name: value}`` dict.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        A single dict that merges the result dicts of every metric in the
        notebook-level ``metrics`` mapping.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for metric in metrics.values():
        # Every `compute` call already returns a dict such as {"accuracy": 0.9}.
        # Merging those dicts keeps the result flat; storing them under their
        # metric name again would nest them ({"accuracy": {"accuracy": 0.9}}),
        # and such non-scalar values cannot be logged as plain numbers.
        scores.update(metric.compute(predictions=predictions, references=labels))
    return scores

In [None]:
from transformers import Trainer

# Bundle the hyper-parameters, the model, the metric callback and the
# train/eval splits into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["eval"],
    train_dataset=tokenized_dataset["train"],
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

# Tensorboard
Помогает наблюдать за обучением (запускать предварительно, до начала обучения). Обычно запускается локально на некотором порту и открывается в браузере, но Колаб позволяет запустить его прямо в ячейке.

In [None]:
# Load the TensorBoard notebook extension; the %tensorboard magic used further
# down is only available after this.
%load_ext tensorboard

In [None]:
import tensorflow as tf
import datetime, os

In [None]:
# Open TensorBoard inside the notebook, reading from 'logs' — the directory passed
# as logging_dir to TrainingArguments.
%tensorboard --logdir logs

# Test

In [None]:
# Evaluate on the held-out test split; the result is shown as the cell output.
trainer.evaluate(tokenized_dataset["test"])

# Infer


In [None]:
import torch

def infer(texts):
    """Run the notebook-level ``model`` on raw texts and return its output.

    Args:
        texts: a list of strings; tokenized with the notebook-level
            ``tokenizer`` using padding to "max_length" and truncation.

    Returns:
        The object returned by ``model`` for the tokenized batch (callers read
        its "logits" entry). No gradients are tracked.
    """
    # Make sure layers that behave differently during training are in
    # inference mode; the model may still be in training mode at this point.
    model.eval()
    with torch.no_grad():
        tokenized_texts = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")
        # Send the inputs to whatever device the model is on instead of a
        # hard-coded 'cuda', so this also works on a CPU-only runtime.
        tokenized_texts = tokenized_texts.to(model.device)
        # Call the module itself rather than .forward() so registered hooks run.
        return model(**tokenized_texts)

In [None]:
# Two short example texts, one built from each class word.
sample_texts = [
    "spam spam spam",
    "ham ham ham ham",
]
# Take the logits from the model output, move them to the CPU and convert to NumPy.
predictions = infer(sample_texts)['logits'].cpu().numpy()

predictions