In [None]:
torch.cuda.empty_cache()

In [1]:
import kagglehub, zipfile, os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split

from transformers import BertConfig, BertModel, BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, get_linear_schedule_with_warmup, AdamW

In [2]:
# Download latest version
path = kagglehub.dataset_download("blackmoon/russian-language-toxic-comments")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/blackmoon/russian-language-toxic-comments?dataset_version_number=1...


100%|██████████| 1.49M/1.49M [00:01<00:00, 1.50MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/blackmoon/russian-language-toxic-comments/versions/1


In [3]:
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
data = pd.read_csv('/root/.cache/kagglehub/datasets/blackmoon/russian-language-toxic-comments/versions/1/labeled.csv')

In [5]:
ds_train, ds_test = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)
ds_val, ds_test = train_test_split(ds_test, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
ds_train['comment'].tolist()[0:5]

['Ну я смотрю - давай, облей меня говном.\n',
 'Любимый мой анекдот. Ибо часто попадаются клиенты с вопросом- КУЛЕ ТАК ДОРОГО??? Работаю автоэлектриком',
 'Все зависит от региона. В своей области я устраивался и зп была 15-18к, а клиентоориентированность похоже покурить вышла, установка одна - абонент всегда врет, если хочет отключиться - делай что хочешь, но 5 возражений должен отработать. И на сладкое. Меня лишили части зарплаты, за то что программа зафиксоровала мое якобы отсутствие в течении 3-4х часов, на протяжении месяца.\n',
 'Вы может быть не поняли? Вы хорошо владеете русским языком? Повторю еще раз: Вам не трудно будет дать список ценностей принятых в нашем социуме? Где он? Кто его видел?\n',
 'ррряяяяя! Пшла нахуй в свой петушиный загон, вахтёропидарашка.\n']

In [81]:
def slice_token(index, sentences, labels, tokenizer, max_length):
    start, stop, step = index.indices(len(sentences))
    result = []
    for i in range(start, stop, step):
        encoding = tokenizer(
                [sentences[i]],
                padding='max_length',
                truncation = True,
                max_length = max_length,
                return_tensors = 'pt'
            )
        #print(torch.tensor([labels[i]], dtype=torch.float64))
        encoding['labels'] = torch.tensor([labels[i]], dtype=torch.float32).unsqueeze(1)
        if any(v is None for v in encoding.values()):
            print("Warning: None value detected in encoding!")
            print(encoding)
        result.append(encoding)

    return result

In [82]:
class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences.tolist()
        self.labels = labels.astype(float).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):

        if isinstance(idx, slice):
            # Обработка среза
            return slice_token(idx, self.sentences, self.labels, self.tokenizer, self.max_length)
        elif isinstance(idx, int):
            tokens = self.sentences[idx]
            tag = self.labels[idx]
            # токенизируем
            encoding = self.tokenizer(
                [tokens],
                padding='max_length',
                truncation = True,
                max_length = self.max_length,
                return_tensors = 'pt'
            )
            encoding['labels'] = torch.tensor([tag], dtype=torch.float32).unsqueeze(1)
            if any(v is None for v in encoding.values()):
                print("Warning: None value detected in encoding!")
                print(encoding)
            return encoding

            #return {key : value[0] for key, value in encoding.items()}

In [8]:
tokenizer = BertTokenizerFast.from_pretrained('google-bert/bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [83]:
dataset_train = NERDataset(ds_train['comment'], ds_train['toxic'], tokenizer, 512)
dataset_test = NERDataset(ds_test['comment'], ds_test['toxic'], tokenizer, 512)
dataset_val = NERDataset(ds_val['comment'], ds_val['toxic'], tokenizer, 512)

In [72]:
little_dataset_train = NERDataset(ds_train['comment'][0:3000], ds_train['toxic'], tokenizer, 512)
little_dataset_val = NERDataset(ds_val['comment'], ds_val['toxic'], tokenizer, 512)

In [84]:
test_loader = DataLoader(dataset_test, batch_size=16)
train_loader = DataLoader(dataset_train, batch_size=16)
val_loader = DataLoader(dataset_val, batch_size = 16)

In [55]:
little_train_loader = DataLoader(dataset_train[0:3000], batch_size = 32)
little_val_loader = DataLoader(dataset_val, batch_size = 32)

In [42]:
next(iter(train_loader))

{'input_ids': tensor([[  101,  1192, 29748,  ...,     0,     0,     0],
         [  101,  1190, 29757,  ...,     0,     0,     0],
         [  101,  1182, 29747,  ...,     0,     0,     0],
         ...,
         [  101,  1195, 14150,  ...,     0,     0,     0],
         [  101,  1193, 29746,  ...,     0,     0,     0],
         [  101,  1193, 29742,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
        dtype=torch.float64)}

In [56]:
next(iter(little_train_loader))

{'input_ids': tensor([[  101,  1192, 29748,  ...,     0,     0,     0],
         [  101,  1190, 29757,  ...,     0,     0,     0],
         [  101,  1182, 29747,  ...,     0,     0,     0],
         ...,
         [  101,  1190, 14150,  ...,     0,     0,     0],
         [  101,  1189, 16856,  ...,     0,     0,     0],
         [  101,  1184, 29436,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
         0., 0., 1., 0., 0., 0., 1., 0

In [20]:
torch.unsqueeze(torch.tensor([1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
        dtype=torch.float64), 0)

tensor([[1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.]],
       dtype=torch.float64)

In [None]:
next(iter(train_loader))

{'input_ids': tensor([[  101,  1192, 29748,  ...,     0,     0,     0],
         [  101,  1190, 29757,  ...,     0,     0,     0],
         [  101,  1182, 29747,  ...,     0,     0,     0],
         ...,
         [  101,  1190, 14150,  ...,     0,     0,     0],
         [  101,  1189, 16856,  ...,     0,     0,     0],
         [  101,  1184, 29436,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [0.],

In [12]:
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels = len(set(ds_train['toxic'])))
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, correct_bias=False)
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    eval_strategy="epoch",    # Evaluate after each epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=32, # Batch size for training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Strength of weight decay
    logging_dir="./logs",           # Directory for storing logs
    logging_steps=10,               # что это такое?
    save_strategy="epoch",          # Save model after each epoch
    load_best_model_at_end=True,    # Load the best model after training
    metric_for_best_model="f1", # Use F1 score to choose the best model
    seed=RANDOM_SEED
)



In [14]:
def compute_metrics(p):
    print(p)
    predictions, labels = p
    print(predictions, labels)
    # логиты в индексы
    predictions = predictions.argmax(axis=-1)

    # пихнем в метрику и получим результат
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [85]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,  # Training dataset
    eval_dataset=dataset_val,   # Evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )


  trainer = Trainer(


In [86]:
# собственно обучение - автоматически делает логи
trainer.train()

# оценим модельку
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Сохраним, что получилось
trainer.save_model("./ner_model")
#b5a31e3a762dc4fdbd905c7a205899ee8116917a

ValueError: too many values to unpack (expected 2)