In [1]:
import kagglehub, zipfile, os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split

from transformers import BertConfig, BertModel, BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, get_linear_schedule_with_warmup, AdamW

In [2]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not
# currently using, so a re-run of the notebook starts with as much free GPU
# memory as possible.
torch.cuda.empty_cache()

In [3]:
# Download the latest version of the dataset through kagglehub; the call
# returns the local directory that holds the extracted files.
path = kagglehub.dataset_download("blackmoon/russian-language-toxic-comments")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/blackmoon/russian-language-toxic-comments?dataset_version_number=1...


100%|██████████| 1.49M/1.49M [00:01<00:00, 1.49MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/blackmoon/russian-language-toxic-comments/versions/1





In [4]:
# One seed shared by every source of randomness used in this notebook
# (PyTorch, NumPy, and later the data splits and the Trainer).
RANDOM_SEED = 1
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [5]:
# Read the labelled comments from the directory kagglehub returned in the
# download cell, rather than from a hard-coded, machine-specific cache path.
data = pd.read_csv(os.path.join(path, 'labeled.csv'))

In [6]:
# 90 % of the rows go to training; the remaining 10 % hold-out is then cut in
# half to give validation and test splits of about 5 % each.
ds_train, ds_holdout = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)
ds_val, ds_test = train_test_split(ds_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [7]:
def slice_token(index, sentences, labels, tokenizer, max_length):
    """Tokenize every sentence selected by the slice ``index``.

    Args:
        index: ``slice`` object, resolved against ``len(sentences)``.
        sentences: Indexable collection of texts.
        labels: Indexable collection of labels aligned with ``sentences``.
        tokenizer: Callable invoked as ``tokenizer([text], padding='max_length',
            truncation=True, max_length=max_length, return_tensors='pt')``.
            Its result must offer ``.items()`` yielding tensors whose leading
            (batch) dimension can be squeezed away.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        A list with one dict per selected position. Each dict holds the
        tokenizer outputs without the batch dimension plus a scalar
        ``labels`` tensor of dtype ``torch.long``.
    """

    def encode_one(position):
        encoded = tokenizer(
            [sentences[position]],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        sample = {}
        for name, tensor in encoded.items():
            # The tokenizer got a one-element batch; drop that batch dimension.
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(labels[position], dtype=torch.long)
        return sample

    positions = range(*index.indices(len(sentences)))
    return [encode_one(position) for position in positions]

In [8]:
class NERDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item carries a single scalar label, so the items are shaped for
    sequence classification.

    Args:
        sentences: Array-like of texts exposing ``.tolist()`` (e.g. a pandas
            Series or NumPy array).
        labels: Array-like of numeric labels exposing ``.astype`` and
            ``.tolist()``, aligned with ``sentences``.
        tokenizer: Callable invoked as ``tokenizer([text], padding='max_length',
            truncation=True, max_length=max_length, return_tensors='pt')``.
        max_length: Length every example is padded / truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences.tolist()
        self.labels = labels.astype(float).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def _encode(self, position):
        """Tokenize the text at ``position`` and attach its label."""
        encoding = self.tokenizer(
            [self.sentences[position]],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer received a one-element batch; drop the batch dimension.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Scalar class id (no extra dimension).
        item['labels'] = torch.tensor(self.labels[position], dtype=torch.long)
        return item

    def __getitem__(self, idx):
        """Return one encoded item, or a list of items for a slice.

        Raises:
            TypeError: If ``idx`` is neither a slice nor an integer. Such
                indices previously made the method return ``None`` silently.
        """
        if isinstance(idx, slice):
            return [self._encode(i) for i in range(*idx.indices(len(self)))]
        # Accept NumPy integer scalars too (e.g. indices taken from an array).
        if isinstance(idx, (int, np.integer)):
            return self._encode(int(idx))
        raise TypeError(
            f"NERDataset indices must be integers or slices, not {type(idx).__name__}"
        )

In [9]:
# Load the tokenizer of the same checkpoint that the model cell below loads.
# NOTE(review): the dataset downloaded above is Russian-language, while
# 'bert-base-uncased' is (to my knowledge) an English-only checkpoint —
# consider a multilingual or Russian BERT. The tokenizer and the model must
# then be switched to the same checkpoint together.
tokenizer = BertTokenizerFast.from_pretrained('google-bert/bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Wrap every split in a dataset built from its 'comment' texts and 'toxic'
# labels; each example is padded / truncated to MAX_LENGTH tokens.
MAX_LENGTH = 512
dataset_train, dataset_test, dataset_val = (
    NERDataset(split['comment'], split['toxic'], tokenizer, MAX_LENGTH)
    for split in (ds_train, ds_test, ds_val)
)

In [11]:
# Sequential (unshuffled) batch iterators over the three splits.
# Only `test_loader` is consumed further down (by eval_model); the Trainer
# batches the train / validation datasets itself.
BATCH_SIZE = 16
test_loader, train_loader, val_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (dataset_test, dataset_train, dataset_val)
)

In [12]:
# Pretrained BERT encoder with a two-class classification head, placed
# directly on the selected device.
model = BertForSequenceClassification.from_pretrained(
    'google-bert/bert-base-uncased',
    num_labels=2,
).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Stand-alone optimizer over the trainable parameters, built with PyTorch's
# AdamW instead of the deprecated `transformers.AdamW`.
# `eps` and `weight_decay` are spelled out to keep the values the previous
# call relied on implicitly; torch's AdamW always applies bias correction
# (it has no `correct_bias` switch).
# NOTE: this optimizer is NOT passed to the Trainer in this notebook, so the
# Trainer builds its own from `training_args`. Pass
# `optimizers=(optimizer, None)` to `Trainer` if this one should be used.
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)
training_args = TrainingArguments(
    output_dir='./results',          # Where checkpoints are written
    eval_strategy="epoch",           # Evaluate after each epoch
    learning_rate=2e-5,              # Initial learning rate
    per_device_train_batch_size=32,  # Batch size for training
    per_device_eval_batch_size=32,   # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Strength of weight decay
    logging_dir="./logs",            # Directory for storing logs
    logging_steps=10,                # Log the training loss every 10 update steps
    save_strategy="epoch",           # Save a checkpoint after each epoch
    load_best_model_at_end=True,     # Reload the best checkpoint when training ends
    metric_for_best_model="f1",      # "Best" = highest F1 reported by compute_metrics
    seed=RANDOM_SEED
)



In [14]:
def compute_metrics(p):
    """Turn the Trainer's evaluation output into classification metrics.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores whose last axis is arg-maxed into class ids; ``labels``
            holds the reference class ids.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``;
        precision, recall and F1 use ``average='binary'``.
    """
    scores, labels = p
    # Logits -> predicted class index.
    predicted = scores.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average='binary'
    )
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [16]:
# Assemble the Trainer. The tokenizer is passed as `processing_class`:
# the `tokenizer=` keyword is deprecated in recent transformers releases and
# raised a warning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,   # Training dataset
    eval_dataset=dataset_val,      # Evaluation dataset
    processing_class=tokenizer,
    compute_metrics=compute_metrics
    )


  trainer = Trainer(


In [17]:
# The actual training run; the Trainer logs progress automatically.
trainer.train()

# Evaluate the model on the Trainer's evaluation dataset.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Save the resulting model.
trainer.save_model("./ner_model")
# NOTE: a 40-character hex token that had been pasted here as a comment was
# removed. If it was an API key, revoke it, and supply credentials from outside
# the notebook (e.g. the WANDB_API_KEY environment variable) instead.

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlizaolva123[0m ([33mlizaolva123-rggu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3053,0.380293,0.85853,0.780172,0.870192,0.707031
2,0.3393,0.316942,0.872399,0.820312,0.820312,0.820312
3,0.2898,0.324929,0.893204,0.843177,0.880851,0.808594


Evaluation Results: {'eval_loss': 0.32492873072624207, 'eval_accuracy': 0.8932038834951457, 'eval_f1': 0.8431771894093686, 'eval_precision': 0.8808510638297873, 'eval_recall': 0.80859375, 'eval_runtime': 23.2211, 'eval_samples_per_second': 31.049, 'eval_steps_per_second': 0.99, 'epoch': 3.0}


In [18]:
def eval_model(model, data_loader, device, average='macro'):
    """Run ``model`` over ``data_loader`` and score its class predictions.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``['logits']``.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the inputs are moved to before the forward pass.
        average: Averaging mode forwarded to
            ``precision_recall_fscore_support``. Defaults to ``'macro'``.
            Note that ``compute_metrics`` uses ``'binary'``, so pass
            ``average='binary'`` for numbers comparable with validation.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    model = model.eval()

    # Collect per-batch results on the CPU and concatenate once at the end.
    # This keeps class ids as integers and avoids re-allocating a growing
    # tensor on the device for every batch.
    pred_batches = []
    true_batches = []

    with torch.no_grad():
        for batch in data_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            pred_batches.append(torch.argmax(outputs['logits'], dim=-1).cpu())
            true_batches.append(batch["labels"].cpu())

    if pred_batches:
        all_preds = torch.cat(pred_batches).numpy()
        all_trues = torch.cat(true_batches).numpy()
    else:
        # Empty loader: hand empty arrays to the metric functions, as before.
        all_preds = torch.empty(0, dtype=torch.long).numpy()
        all_trues = torch.empty(0, dtype=torch.long).numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_trues, all_preds, average=average
    )
    acc = accuracy_score(all_trues, all_preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
# Score the model on the held-out test split; eval_model macro-averages
# precision / recall / F1.
test = eval_model(model, test_loader, device)

In [20]:
# Display the test-set metrics computed in the previous cell.
test

{'accuracy': 0.8848821081830791,
 'f1': 0.8657798352386528,
 'precision': 0.879727720472075,
 'recall': 0.8554682742662283}

In [21]:
# Constant baseline: predict class 0 for every comment in the full dataset and
# score it with the same macro-averaged metrics as eval_model.
baseline_true = data['toxic'].to_numpy()
# Size the prediction vector from the data instead of a hard-coded row count.
baseline_pred = np.zeros(len(baseline_true))
# Class 1 is never predicted, so its precision is undefined; zero_division=0
# reports it as 0 (the value used before) without emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    baseline_true, baseline_pred, average='macro', zero_division=0
)
acc = accuracy_score(baseline_true, baseline_pred)
print({
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall})

{'accuracy': 0.6651401609769636, 'f1': 0.3994499541628469, 'precision': 0.3325700804884818, 'recall': 0.5}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
