# Дообучение моделей с Hugging Face

In [None]:
import pandas as pd
import numpy as np

In [None]:
%%capture
# Install the Hugging Face libraries used by this notebook; %%capture hides pip's output.
%pip install datasets evaluate transformers[sentencepiece]

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import DatasetDict, Features, Value, ClassLabel, Dataset

In [None]:
# Load data.xls into a DataFrame and display it as the cell output.
vac_df = pd.read_excel("data.xls")
vac_df

In [None]:
# Load the augmented CSV; on_bad_lines='warn' makes pandas warn about and skip
# malformed rows instead of raising.
aug_df = pd.read_csv('vacancies_augmented.csv', on_bad_lines='warn')
aug_df

In [None]:
def _strip_section_words(raw_text):
    """Lower-case the text, blank out the two section words and drop colons."""
    cleaned = str(raw_text).lower()
    for section_word in ('условия', 'обязанности'):
        cleaned = cleaned.replace(section_word, ' ')
    return cleaned.replace(':', '')


def _labelled_frame(texts, label):
    """Pair a column of texts with one constant integer class id."""
    return pd.DataFrame({'text' : texts, 'labels' : label})


# Class ids: 0 - responsibilities, 1 - requirements, 2 - terms, 3 - notes.
# Only the responsibilities columns are cleaned; the others are taken as-is.
# NOTE(review): 'terns' looks like a typo for 'terms' - confirm against the
# actual column header in data.xls before changing it.
_labelled_parts = [
    # Columns from the original table.
    _labelled_frame(vac_df['responsibilities'].dropna().apply(_strip_section_words), 0),
    _labelled_frame(vac_df['requirements'], 1),
    _labelled_frame(vac_df['terns'], 2),
    _labelled_frame(vac_df['notes'], 3),
    # Columns from the augmented table (its headers carry a leading space).
    _labelled_frame(aug_df[' responsibilities'].dropna().apply(_strip_section_words), 0),
    _labelled_frame(aug_df[' requirements'], 1),
    _labelled_frame(aug_df[' terms'], 2),
    _labelled_frame(aug_df[' notes'], 3),
]

# Stack everything into one table and drop rows with a missing text.
train_df = pd.concat(_labelled_parts).dropna()
train_df

In [None]:
# Force plain str texts and int labels in a single cast.
train_df = train_df.astype({'text': str, 'labels': int})

In [None]:
%%capture
% pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split
# A fixed seed makes the split reproducible, so a model evaluated later on
# test_df is scored on rows it was not trained on.
RANDOM_SEED = 42

# 80% train; the remaining 20% is split 70/30 into test and validation.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_SEED)
test_df, valid_df = train_test_split(test_df, test_size=0.3, random_state=RANDOM_SEED)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

feature_names = ['responsibilities', 'requirements', 'terms', 'notes']

# One shared schema: plain string text and a ClassLabel whose ids 0..3 map to
# feature_names in order.
label_features = Features({
    'text': Value('string'),
    'labels': ClassLabel(num_classes=len(feature_names), names=feature_names),
})

raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_df).cast(label_features)
    for split_name, split_df in (
        ('train', train_df),
        ('test', test_df),
        ('valid', valid_df),
    )
})

raw_datasets

In [None]:
# Check that the class labels were assigned correctly
raw_datasets['train'].features

In [None]:
# Weight of a class = 1 - its share of the training rows, so rarer classes
# weigh more. Reindexing over every label id keeps the vector aligned with the
# class ids (one entry per class) even if a class is absent from the split.
label_share = (
    train_df['labels']
    .value_counts(normalize=True)
    .reindex(range(len(feature_names)), fill_value=0.0)
)
class_weights = (1 - label_share).values
class_weights

In [None]:
# Base checkpoint to fine-tune; the commented-out line is an alternative model.
# checkpoint = "DeepPavlov/rubert-base-cased"
checkpoint = "cointegrated/rubert-tiny"
# look later rubert-base-cased-sentence

# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the 'text' field of a (batched) example.

    Sequences are truncated to at most 512 tokens. No padding is applied here:
    padding inside a batched ``map`` would pad every chunk to its longest
    member, while the ``DataCollatorWithPadding`` used for training already
    pads each mini-batch to its own longest sequence.
    """
    return tokenizer(example['text'], truncation=True, max_length=512)

In [None]:
# Tokenize every split; batched=True passes groups of rows to tokenize_function at once.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator that pads the examples of a batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets

In [None]:
# Sanity check: collate the whole training split and look at the tensor shapes.
non_model_columns = ('idx', 'text')
train_columns = tokenized_datasets['train'][:]
samples = {
    name: values
    for name, values in train_columns.items()
    if name not in non_model_columns
}

batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
from transformers import AutoModelForSequenceClassification  # AutoModelForPreTraining ?

# Load the checkpoint once, with a 4-way classification head and human-readable
# label names stored in the model config (the earlier duplicate load without
# label names was immediately overwritten, so it is dropped).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=4,
    label2id={'responsibilities' : 0, 'requirements' : 1, 'terms' : 2, 'notes' : 3},
    id2label={0 : 'responsibilities', 1 : 'requirements', 2 : 'terms', 3 : 'notes'},
)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(pred):
    """Return macro-averaged precision, recall and F1 for one evaluation pass.

    ``pred`` must expose ``label_ids`` (gold class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    The metrics are also printed, because the notebook front end in use did
    not show them otherwise.
    """
    predicted_ids = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(pred.label_ids, predicted_ids, average='macro')
    metrics = dict(zip(('precision', 'recall', 'F1-score'), scores[:3]))
    print(metrics)
    return metrics

In [None]:
import torch
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain 'labels'.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with newer
                ``Trainer`` versions that pass it; unused here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Put the weights on whatever device the logits live on instead of
        # hard-coding 'cuda', so the same code also runs on CPU.
        weight = torch.as_tensor(class_weights, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
%%capture
# Install the PyTorch extras of transformers and upgrade accelerate.
%pip install transformers[torch]
%pip install accelerate -U

In [None]:
from transformers import TrainingArguments

import torch  # needed for the CUDA availability check below

batch_size = 8
epochs_num = 5
checkpoints_path = "checkpoints"
# logging_steps = len(raw_datasets['train']) // batch_size

# Evaluate, log and save a checkpoint once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=checkpoints_path,
    overwrite_output_dir=True,
    num_train_epochs=epochs_num,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.001,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    # fp16 mixed precision requires a CUDA device; train in full precision otherwise.
    fp16=torch.cuda.is_available(),
)

# Train on the 'train' split and evaluate on 'valid'; 'test' is kept for the
# final check at the end of the notebook.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# The F1 score shows up after the first epoch
trainer.train()

### Сохранение модели

In [None]:
%%capture
# NOTE(review): nothing visible in this notebook imports pytest-shutil;
# presumably this install is unnecessary - confirm before removing.
%pip install pytest-shutil

In [None]:
# distutils was removed from the standard library in Python 3.12;
# shutil.copytree with dirs_exist_ok=True gives the same "merge into the
# destination" behaviour as distutils' copy_tree.
import shutil
from pathlib import Path

# Name of the checkpoint folder to publish; it must exist under 'checkpoints/'.
checkpoint_name = 'checkpoint-1172'
new_model_name = "rubert-tiny-vacancy-information-extractor"
save_path = new_model_name

shutil.copytree(Path('checkpoints') / checkpoint_name, save_path, dirs_exist_ok=True)

In [None]:
%%capture
# huggingface_hub provides notebook_login, used in the next cell.
%pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

print("Go to: https://huggingface.co/settings/tokens")
notebook_login() # Paste your own generated access token here [https://huggingface.co/settings/tokens]

In [None]:
# Reload the tokenizer from the copied checkpoint folder and push it to the Hub repo.
AutoTokenizer.from_pretrained(save_path).push_to_hub(new_model_name)

In [None]:
  # Pytorch
pt_model = AutoModelForSequenceClassification.from_pretrained(save_path)
# pt_model.save_pretrained("drive/MyDrive/ОБЩЕЕ/РСОДПО/AI/emotional/finetuned/to_hf/rubert-name-you-picked")
pt_model.push_to_hub(new_model_name)

In [None]:
  # TensorFlow
# tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
# tf_model.save_pretrained("path/to/awesome-name-you-picked")

In [None]:
  # JAX
# flax_model = FlaxDistilBertForSequenceClassification.from_pretrained(
#     "path/to/awesome-name-you-picked", from_pt=True
# )

## Проверка модели

In [None]:
from transformers import pipeline

In [None]:
# Build a text-classification pipeline from the model published on the Hub.
model_name = "seninoseno/rubert-tiny-vacancy-information-extractor"
face = pipeline('text-classification', model=model_name)

In [None]:
# Classify every held-out text. Truncate to the 512-token limit used during
# fine-tuning so that long texts do not overflow the model's maximum input length.
res = face(test_df['text'].to_list(), truncation=True, max_length=512)

In [None]:
from sklearn.metrics import f1_score

# Keep only the predicted label of each pipeline result and show the list.
res = list(map(lambda prediction: prediction['label'], res))
res

In [None]:
# Turn the gold integer class ids of the test split into their label names.
gold_label_names = {0 : 'responsibilities', 1 : 'requirements', 2 : 'terms', 3 : 'notes'}
labels = test_df['labels'].replace(gold_label_names).to_list()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as alotmetrics

# Macro-averaged (precision, recall, F1) of the predicted label names against the gold ones.
alotmetrics(labels, res, average='macro')[:3]