In [1]:
import pandas as pd
import numpy as np
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import re



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed=42):
    import random
    import numpy as np
    import os
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [4]:
def pr_text_editing(a):
    # удаляем ссылки
    a = re.sub(r'http\S+', '', a)
    # удаление номера телефона
    a = re.sub('(\+7|8)(-| | \(|\(|)\d{3}(-| |\)|\) )\d{3}(-| |)\d{2}(-| |)\d{2}', '', a)
    # удаление почты
    a = re.sub('([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+', '', a)
    # удаление скобок
    a = re.sub('\([^)]*\)', '', a)
    # удаление множественных пробелов
    a = re.sub(' +', ' ', a)
    # удаление спецсимволов
    a = re.sub('<rating>', '', a)
    a = re.sub('[^A-zA-Z0-9А-Яа-яЁё\.\,%\- ]', '', a)
    a = re.sub('-+', '-', a)
    return a


def preprocess_names(a):
    """
  Удаляет личные данные экспертов в самом начале всех текстов от одной конторы
  """
    b = re.search(
        '([1-9]|[12][0-9]|3[0-1]) (января|февраля|марта|апреля|мая|июня|июля|августа|сентября|октября|декабря) [12][0-9][0-9][0-9]',
        a)
    if b:
        if b.span()[0] == 0:
            return a[re.search(' [А-Я]+ ', a).span()[0] + 1:]
        else:
            return a
    else:
        return a


In [5]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('/kaggle/input/nn-hack-data/CRA_train_1200_train.csv')
valid = pd.read_csv('/kaggle/input/nn-hack-data/CRA_train_1200_valid.csv')
test = pd.read_csv('/kaggle/input/nn-hack-data/CRA_train_1200_test.csv')

In [6]:
le = LabelEncoder()
le.fit(train['Уровень рейтинга'])

train['target'] = le.transform(train['Уровень рейтинга'])
valid['target'] = le.transform(valid['Уровень рейтинга'])
test['target'] = le.transform(test['Уровень рейтинга'])

In [7]:
train['clear_text'] = train['pr_txt'].apply(pr_text_editing)
valid['clear_text'] = valid['pr_txt'].apply(pr_text_editing)
test['clear_text'] = test['pr_txt'].apply(pr_text_editing)

train['clear_text'] = train['clear_text'].apply(preprocess_names)
valid['clear_text'] = valid['clear_text'].apply(preprocess_names)
test['clear_text'] = test['clear_text'].apply(preprocess_names)

In [8]:
model = BertForSequenceClassification.from_pretrained('sberbank-ai/sbert_large_mt_nlu_ru', num_labels=17).to("cuda")
tokenizer = BertTokenizer.from_pretrained('sberbank-ai/sbert_large_mt_nlu_ru')

Downloading (…)lve/main/config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/sbert_large_mt_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

In [9]:
tokens_train = tokenizer.batch_encode_plus(
    train['clear_text'],
    max_length=512,
    pad_to_max_length=True,
    truncation = True,
    add_special_tokens=True,
)
tokens_valid = tokenizer.batch_encode_plus(
    valid['clear_text'],
    max_length=512,
    pad_to_max_length=True,
    truncation = True,
    add_special_tokens=True,
)

In [10]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item
    def __len__(self):
        return len(self.labels)
    
train_dataset = Data(tokens_train, train['target'])
test_dataset = Data(tokens_valid, valid['target'])

In [11]:
training_args = TrainingArguments(
    output_dir = '/kaggle/working/', #Выходной каталог
    num_train_epochs = 18, #Кол-во эпох для обучения
    per_device_train_batch_size = 4, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 8, #Размер пакета для каждого устройства во время валидации
    weight_decay =0.01, #Понижение весов
    logging_dir = '/kaggle/working/logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = 1e-5, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=42
)

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average = 'macro')
    return {'F1': f1}

In [13]:
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics = compute_metrics)

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016671395183334426, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,2.5688,2.330562,0.100521
2,1.99,1.801122,0.241356
3,1.4816,1.446548,0.295538
4,1.0594,1.210994,0.460854
5,0.7092,1.155064,0.489686
6,0.4767,0.96064,0.578664
7,0.2793,1.021253,0.597816
8,0.1524,1.144207,0.565872
9,0.0841,1.166098,0.56611
10,0.0468,1.397888,0.561617
