##### Импорты

In [38]:
import os

import pandas as pd
import numpy as np

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint  # сохранение весов
from pytorch_lightning.callbacks import LearningRateMonitor # автоматическое отслеживание lr
from pytorch_lightning.callbacks.early_stopping import EarlyStopping  # ранние остановки

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from torchmetrics import Accuracy

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import warnings
import matplotlib.pyplot as plt

from tqdm.autonotebook import tqdm

warnings.filterwarnings("ignore", "FutureWarning")
plt.style.use("dark_background")

##### Загрузка данных

In [39]:
# def get_file_from_csv(path: str) -> pd.DataFrame:
#     return pd.read_csv(path).dropna()

In [40]:
# FILE_PATH = 'data.csv'

# data = get_file_from_csv(FILE_PATH)

In [41]:
# data['Review']


In [42]:
# data['Rating'].value_counts()

##### Уберем нейтральные отзывы и маловстречающиеся оценки -> 0, 3, 7, 9

In [43]:
# # 
# excluded_grades = [0, 3 ,7 ,9]

# data = data[~data.Rating.isin(excluded_grades)]

# data['Rating'].value_counts()

In [44]:
# # Добавим колонку с оценками на тональность и заменим значения 0 - negative, 1 - positive

# sentiments_decriptor = { 'negative': 0, 'positive': 1 }

# data = data.rename(columns={'Rating': 'sentiment'})

# data.loc[data['sentiment'] <= 2, 'sentiment'] = sentiments_decriptor['negative']
# data.loc[data['sentiment'] >= 4, 'sentiment'] = sentiments_decriptor['positive']

# data

##### Определение устройства для обучения

In [45]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [46]:
# from sklearn.model_selection import train_test_split


# data = get_file_from_csv('small_data.csv')
# data.rename(columns={'Review': 'text', 'sentiment': 'label'}, inplace=True)
# data
# train_data, test_data = train_test_split(data, test_size=0.3)
# train_data, val_data = train_test_split(data, test_size=0.3)
# # data['sentiment'].value_counts()
# train_data = pd.DataFrame(train_data)
# test_data = pd.DataFrame(test_data)
# val_data = pd.DataFrame(val_data)
# train_data['label'].value_counts()
# test_data['label'].value_counts()
# train_data.to_csv('train.csv', index=False)
# test_data.to_csv('test.csv', index=False)
# val_data.to_csv('val.csv', index=False)


##### Класс преобработки текста

In [47]:
import pymorphy2

import re

from pymystem3 import Mystem

from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")


class TextPreproccessor:
    def __init__(self):
        self.__html_pattern = re.compile(
            '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        self.__alphabet_pattern = r"([^а-яА-Я!?:.)(\s])|(\n|\r)"

        self.__mystem = Mystem()  # for lemmatization
        self.morph = pymorphy2.MorphAnalyzer()

        self.__russian_stopwords = stopwords.words("russian")

    def __remove_html(self, text: str) -> str:
        return re.sub(self.__html_pattern, '', text)

    def __filter_symbols(self, text: str) -> str:
        result = re.sub(self.__alphabet_pattern, '', text)
        result = re.sub(r'[\t\r\n]', '', result)
        result = re.sub(r'(\s{2,})', ' ', result)

        return result

    def __lemmatization(self, text: str) -> str:

        return ''.join(self.__mystem.lemmatize(text))

    def __strip(self, text: str) -> str:
        return text.strip(' ')

    def __lower(self, text: str) -> str:
        return text.lower()

    def __remove_stopwords_and_lemmatization(self, text: str) -> str:
        words = text.split(' ')

        words_without_stopwords = []

        for word in words:
            if word not in self.__russian_stopwords:
                words_without_stopwords.append(self.morph.parse(word)[0].normal_form)


        return ' '.join(words_without_stopwords)
    
    def preproccess_text(self, text: str) -> str:
        text = self.__remove_html(text)
        text = self.__lower(text)
        text = self.__strip(text)
        text = text.replace('ё', 'e')
        text = self.__filter_symbols(text)
        text = self.__remove_stopwords_and_lemmatization(text)
        # text = self.__lemmatization(text)

        return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SsaWin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
morph = pymorphy2.MorphAnalyzer()
res = morph.parse('раму')[0]
res.normal_form


'рама'

In [51]:
class CSVDataset(Dataset):
    def __init__(self, file_path, tokenizer_path="cointegrated/rubert-tiny2"):
        super().__init__()
        
        df = self.read_csv(file_path)
        
        self.texts = df['text'].values.astype(str)
        self.labels = df['label'].values.astype(int)

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        self.text_preproccessor = TextPreproccessor()

    def read_csv(self, path: str) -> pd.DataFrame:
        return pd.read_csv(path).dropna()

    def tokenize(self, text: str):
        t = self.tokenizer(text, padding='max_length',
                           truncation=True, return_tensors='pt')

        input_ids = t['input_ids']
        token_type_ids = t['token_type_ids']
        attention_mask = t['attention_mask']

        return input_ids.squeeze(), token_type_ids.squeeze(), attention_mask.squeeze()

    def __getitem__(self, idx: int):
        text: str = self.texts[idx]
        label: int = self.labels[idx]
        
        text = self.text_preproccessor.preproccess_text(text)

        input_ids, token_type_ids, attention_mask = self.tokenize(text)

        return input_ids, token_type_ids, attention_mask, label

    def __len__(self):
        return len(self.texts)

In [52]:
class SentenceDataModule(pl.LightningDataModule):
    def __init__(self,
                 batch_size: int = 32,
                 data_dir: str = './data/',
                 num_workers: int = 12,
                ):
        super().__init__()
        
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.num_workers = num_workers

    def setup(self, stage=None):
        self.train_set = CSVDataset(os.path.join(self.data_dir, 'train.csv'))
        self.val_set = CSVDataset(os.path.join(self.data_dir, 'val.csv'))
        self.test_set = CSVDataset(os.path.join(self.data_dir, 'test.csv'))

    def train_dataloader(self):       
        return DataLoader(dataset=self.train_set, batch_size=self.batch_size, shuffle=True)#, num_workers=self.num_workers)

    def val_dataloader(self):
        if __name__ == '__main__':
            return DataLoader(dataset=self.val_set, batch_size=self.batch_size, shuffle=False)#, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(dataset=self.test_set, batch_size=self.batch_size, shuffle=False)#, num_workers=self.num_workers)

    def prepare_data(self):
        pass


In [53]:
class SemanticClassifier(pl.LightningModule):
    def __init__(self, 
                 model="cointegrated/rubert-tiny2",
                 out_channels=1,
                 dropout=0.25,
                 eta=3e-4,
                 criterion = None,
                 **kwargs
                 ):
        super().__init__()
        self.save_hyperparameters()

        self.model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=out_channels)

        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.criterion = criterion if criterion is not None else nn.BCEWithLogitsLoss().to(device)

        self.out_channels = out_channels
        
        self.dropout = nn.Dropout(p=dropout)

        self.eta = eta

        self.metrics = {'accuracy': Accuracy(task='binary').to(device)}

        print('INIT MODEL')

    def forward(self, input_ids, token_type_ids, attention_mask):
        # print('FORWARD')
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)[0]
        return output


    def shared_step(self, sample, stage):
        # print('SHARED STEP', stage)
        input_ids, token_type_ids, attention_mask, label = sample

        logits = self.forward(input_ids, token_type_ids, attention_mask)
        
        # label = label.unsqueeze(1).float()
        # print('logits', logits.shape)
        # print('label', label.shape)
        preds = torch.argmax(logits, 1)

        loss = self.criterion(logits, label.unsqueeze(1).float())
        # print('LOSS', loss.shape)

        return {
            'loss': loss,
            'accuracy': self.metrics["accuracy"](preds, label)
        }

    def shared_epoch_end(self, outputs, stage):
        loss = np.mean([x["loss"].item() for x in outputs])
        acc = np.mean([x["accuracy"].item() for x in outputs])

        metrics = {
            f"{stage}_loss": loss,
            f"{stage}_acc": acc
        }

        self.log_dict(metrics, prog_bar=True)

    def configure_optimizers(self):
        # print('CONFIGURING OPTIMIZERS')

        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.eta)

        scheduler_dict = {
            "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                patience=5
            ),
            "interval": "epoch",
            "monitor": "valid_loss"
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler_dict}
    
    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, 'train')

    def training_epoch_end(self, outputs):
        return self.shared_epoch_end(outputs, 'train')

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, 'valid')

    def validation_epoch_end(self, outputs):
        return self.shared_epoch_end(outputs, 'valid')

    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, 'test')

    def test_epoch_end(self, outputs):
        return self.shared_epoch_end(outputs, 'test')

## свернул

In [54]:
BATCH_SIZE = 4

dm = SentenceDataModule(batch_size=BATCH_SIZE)
dm.setup()

In [55]:

model = SemanticClassifier()

model

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

INIT MODEL


SemanticClassifier(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(83828, 312, padding_idx=0)
        (position_embeddings): Embedding(2048, 312)
        (token_type_embeddings): Embedding(2, 312)
        (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=312, out_features=312, bias=True)
                (key): Linear(in_features=312, out_features=312, bias=True)
                (value): Linear(in_features=312, out_features=312, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=312, out_features=312, bi

In [56]:
callbacks = [
    ModelCheckpoint(
        dirpath='models',
        filename='{epoch}_{valid_acc:.2f}_{valid_loss:.2f}',
        save_top_k=2,
        monitor='valid_loss',
        mode='min'
    ),
    LearningRateMonitor(logging_interval="step"),
    EarlyStopping(
        monitor="valid_loss",
        min_delta=2e-4,
        patience=10,
        verbose=False,
        mode="min"
    )
]

LOG_PATH = './logs'
logger = TensorBoardLogger(LOG_PATH, name='tiny_bert')

CHECKPOINT = None

In [57]:
trainer = pl.Trainer(
    accelerator='cuda', 
    devices=1,
    max_epochs=100,
    logger=logger,
    callbacks=callbacks,
    resume_from_checkpoint=CHECKPOINT
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [58]:
%env CUDA_VISIBLE_DEVICES=1
%env CUDA_LAUNCH_BLOCKING = 1

env: CUDA_VISIBLE_DEVICES=1
env: CUDA_LAUNCH_BLOCKING=1


## не свернул

In [59]:
CUDA_LAUNCH_BLOCKING = "1"
trainer.fit(model, dm)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name      | Type                          | Params
------------------------------------------------------------
0 | model     | BertForSequenceClassification | 29.2 M
1 | criterion | BCEWithLogitsLoss             | 0     
2 | dropout   | Dropout                       | 0     
------------------------------------------------------------
29.2 M    Trainable params
0         Non-trainable params
29.2 M    Total params
116.776   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [60]:
trainer.test(model, dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.49533333333333335
        test_loss           0.3488016105443239
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.3488016105443239, 'test_acc': 0.49533333333333335}]

In [None]:
!kill 6348
!kill 6006

kill: 6348: No such process
kill: 6006: No such process


In [61]:
%load_ext tensorboard
%tensorboard --logdir ./logs/tiny_bert

Reusing TensorBoard on port 6006 (pid 6348), started 3:08:47 ago. (Use '!kill 6348' to kill it.)