In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install -U sentence-transformers
!pip install evaluate
!pip install transformers[torch]

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import evaluate

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer

In [2]:
# Hub identifier shared by the tokenizer and the model so both come from the
# same pretrained checkpoint.
checkpoint = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model with two output labels
# (is_request = 0 / 1 in the data prepared below).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2

Toxic comments dataset (negative class, `is_request = 0`)  

In [None]:
# Keep only the comment text, expose it under the shared `text` column name and
# label every row as "not a request".
comments = (
    pd.read_csv('data_commets_toxic.csv')[['comment']]
    .rename(columns={'comment': 'text'})
    .assign(is_request=0)
)
comments

Promobot dataset of incident texts (positive class, `is_request = 1`)  

In [None]:
# Keep only the incident text column, expose it under the shared `text` column
# name and label every row as a request.
requests = (
    pd.read_csv('data_corrected_spell_ner_full_text.csv')[['Текст инцидента']]
    .rename(columns={'Текст инцидента': 'text'})
    .assign(is_request=1)
)
requests

In [None]:
# Stack requests on top of comments row-wise and renumber the index from zero.
data = pd.concat(objs=[requests, comments], axis=0, ignore_index=True)
data

In [3]:
# Hold out 10% of the combined frame for validation; the fixed seed keeps the
# split reproducible between runs.
# NOTE(review): the combined frame was at some point reloaded via
# pd.read_csv('data_request_predict.csv') instead of being rebuilt by the cells
# above — presumably a saved copy of `data`; confirm before relying on it.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

data

Unnamed: 0,text,is_request
0,"Добрый день. Сегодня, 20 августа, моя мать шла...",1
1,"Пермь, г. , +791692145. В Перми с ноября 2021 ...",1
2,"Добрый день! Скажите, пожалуйста, если подала ...",1
3,Каждая из них не о чем. Люди на остановках хот...,1
4,"В Березниках у сына, привитого от коронавируса...",1
...,...,...
36897,Вонючий совковый скот прибежал и ноет. А вот и...,0
36898,А кого любить? Гоблина тупорылого что-ли? Или ...,0
36899,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0
36900,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,0


In [4]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text row per access.

    Args:
        data_df: Frame providing a ``"text"`` column (model inputs) and an
            ``"is_request"`` column (labels).
        tokenizer: Callable invoked on a single text as
            ``tokenizer(text, truncation=..., padding=..., max_length=...)``;
            it must return a mapping of field name to tensor-convertible
            values.
        max_length: Length passed to the tokenizer, which is asked to truncate
            (``"longest_first"``) and pad (``"max_length"``) to it.
    """

    def __init__(self, data_df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.sentences = data_df["text"].values
        self.labels = data_df['is_request'].values

    def __len__(self):
        """Return the number of labelled rows."""
        return self.labels.shape[0]

    def __getitem__(self, i):
        """Tokenize row ``i`` and return its fields plus ``labels`` as int64 tensors."""
        sentence, label = self.sentences[i], self.labels[i]

        # Use the tokenizer handed to the constructor, not a module-level global:
        # otherwise the argument is silently ignored and the class only works
        # when a global named `tokenizer` happens to exist.
        tokens = self.tokenizer(
            sentence,
            truncation="longest_first",
            padding="max_length",
            max_length=self.max_length,
        )

        tokens['labels'] = label

        # Every field (including the label) is converted to an int64 tensor.
        tokens = {key: torch.tensor(val).long() for key, val in tokens.items()}

        return tokens


# Wrap both splits with the same tokenizer (default max_length).
train_dataset, val_dataset = (
    TextDataset(split_df, tokenizer) for split_df in (train_data, val_data)
)

In [5]:
# Metric objects used by compute_metrics during evaluation.
accuracy, f1_metric = map(evaluate.load, ("accuracy", "f1"))

In [6]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into a single dict of evaluation metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        The accuracy result merged with the F1 result (on a key collision the
        F1 entry wins, as it is merged last).
    """
    scores, references = eval_pred
    # Predicted class = index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)

    return {
        **accuracy.compute(predictions=predicted, references=references),
        **f1_metric.compute(predictions=predicted, references=references),
    }

In [7]:
training_args = TrainingArguments(
    # Directory that receives the training checkpoints.
    output_dir="models/rubert_tiny_request",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; the matching strategies let the
    # trainer restore the best checkpoint when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [8]:
# Assemble the training loop: model + hyper-parameters, the two dataset splits,
# and the metric callback evaluated on the validation split.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [9]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



  0%|          | 0/5190 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.232, 'learning_rate': 1.8073217726396917e-05, 'epoch': 0.48}
{'loss': 0.1266, 'learning_rate': 1.6146435452793836e-05, 'epoch': 0.96}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.10093213617801666, 'eval_accuracy': 0.9636954754808995, 'eval_f1': 0.9706140350877193, 'eval_runtime': 12.348, 'eval_samples_per_second': 298.915, 'eval_steps_per_second': 9.394, 'epoch': 1.0}
{'loss': 0.0837, 'learning_rate': 1.4219653179190754e-05, 'epoch': 1.45}
{'loss': 0.0797, 'learning_rate': 1.2292870905587671e-05, 'epoch': 1.93}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.09592889994382858, 'eval_accuracy': 0.9677594147927391, 'eval_f1': 0.974000436967446, 'eval_runtime': 10.1454, 'eval_samples_per_second': 363.811, 'eval_steps_per_second': 11.434, 'epoch': 2.0}
{'loss': 0.0605, 'learning_rate': 1.0366088631984585e-05, 'epoch': 2.41}
{'loss': 0.0588, 'learning_rate': 8.439306358381504e-06, 'epoch': 2.89}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.09358404576778412, 'eval_accuracy': 0.9712814955296668, 'eval_f1': 0.9766725352112675, 'eval_runtime': 10.0983, 'eval_samples_per_second': 365.507, 'eval_steps_per_second': 11.487, 'epoch': 3.0}
{'loss': 0.0509, 'learning_rate': 6.512524084778421e-06, 'epoch': 3.37}
{'loss': 0.0452, 'learning_rate': 4.585741811175338e-06, 'epoch': 3.85}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.09912583976984024, 'eval_accuracy': 0.9726361419669466, 'eval_f1': 0.977787552232241, 'eval_runtime': 10.0689, 'eval_samples_per_second': 366.575, 'eval_steps_per_second': 11.521, 'epoch': 4.0}
{'loss': 0.0357, 'learning_rate': 2.658959537572254e-06, 'epoch': 4.34}
{'loss': 0.0366, 'learning_rate': 7.321772639691716e-07, 'epoch': 4.82}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.10722693055868149, 'eval_accuracy': 0.9712814955296668, 'eval_f1': 0.976805251641138, 'eval_runtime': 10.0167, 'eval_samples_per_second': 368.486, 'eval_steps_per_second': 11.581, 'epoch': 5.0}
{'train_runtime': 1329.5693, 'train_samples_per_second': 124.894, 'train_steps_per_second': 3.904, 'train_loss': 0.07927754177982867, 'epoch': 5.0}


TrainOutput(global_step=5190, training_loss=0.07927754177982867, metrics={'train_runtime': 1329.5693, 'train_samples_per_second': 124.894, 'train_steps_per_second': 3.904, 'train_loss': 0.07927754177982867, 'epoch': 5.0})

In [10]:
# Export the fine-tuned model together with its tokenizer so the directory can
# be loaded on its own (model and tokenizer from one path) for inference.
save_dir = "request_ruBert-tiny/"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)