In [1]:
!pip install -q sentence-transformers pylatexenc natasha razdel 

[0m

In [2]:
import numpy as np
from numpy.random import default_rng
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from pylatexenc.latex2text import LatexNodes2Text
from sklearn.model_selection import train_test_split
from razdel import tokenize
from nltk.corpus import stopwords
from string import punctuation
from natasha import Doc, MorphVocab, Segmenter, NewsEmbedding, NewsMorphTagger

In [3]:
# Load the corpus from a CSV file (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('../input/ru-data/habr_cyberleninka.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,data_clean,data_unclean,url
0,0,изложить метод проектирование устройство подач...,Изложен метод проектирования устройств подачи ...,https://cyberleninka.ru/article/n/matematiches...
1,1,статья исследовательский метод оценка панель л...,В статье представлены исследовательские методы...,https://cyberleninka.ru/article/n/inzhenernaya...
2,2,,,https://cyberleninka.ru/article/n/mirovoy-fina...
3,3,исследование актуальный философский проблема р...,Представлено исследование актуальной философск...,https://cyberleninka.ru/article/n/metodologich...
4,4,статья анализироваться суть понятие определить...,В статье анализируется суть как и понятия. О...,https://cyberleninka.ru/article/n/sotsiokultur...


In [4]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0        0
data_clean      485
data_unclean    485
url               0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column.
# Rebinding `df` (instead of inplace=True) avoids mutating the frame object that earlier
# cells already displayed and keeps the step chainable.
df = df.dropna()

In [6]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

Unnamed: 0      0
data_clean      0
data_unclean    0
url             0
dtype: int64

In [7]:
# (rows, columns) of the frame at this point.
df.shape

(7577, 4)

In [8]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split reproducible.
# train_test_split accepts a single array-like, so no dummy index array is needed, and not
# assigning to `_` leaves IPython's "last output" variable intact.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [9]:
# Shapes of the train and test splits.
train_df.shape, test_df.shape

((5303, 4), (2274, 4))

In [10]:
# Values of the `data_unclean` column of the training split, as a plain Python list.
data_comb = train_df['data_unclean'].tolist()

In [11]:
# Peek at the first training text.
data_comb[0]

'Конечно, вы можете не заниматься всеми этими связанными с кокусом хлопотами, а просто собрать вместе сторонников мистера Честняги, сформировать из них предвыборный комитет, и выдвинуть кандидатуру....'

In [12]:
import re

def get_sentences(texts, val=False, min_len=10, max_sentences=100_000):
    """Split raw texts into sentence-like fragments.

    Every text is split on a period optionally followed by one whitespace
    character and one newline. Fragments are kept only if they are longer than
    ``min_len`` characters; they are not stripped, and the terminating period
    is removed by the split.

    Args:
        texts: Iterable of strings to split.
        val: When True, the ``max_sentences`` cap is ignored and every text is
            processed.
        min_len: Fragments with ``len(fragment) <= min_len`` are discarded.
        max_sentences: Stop processing further texts once more than this many
            fragments have been collected (ignored when ``val`` is True).
            ``None`` disables the cap.

    Returns:
        A list of the kept fragments, in input order.
    """
    splitter = re.compile(r'\.\s?\n?')
    sentences = []
    for row in texts:
        sentences.extend(part for part in splitter.split(row) if len(part) > min_len)
        if val or max_sentences is None:
            continue
        # The cap is checked only after a whole text has been added, so the
        # result can exceed max_sentences by the fragments of that last text.
        if len(sentences) > max_sentences:
            # Sentence transformers recommends 10-100K sentences for training
            print(f'Exceeded {max_sentences} sentences')
            break
    return sentences

# Split the training texts into sentences (default mode, i.e. val=False).
sentences_train = get_sentences(data_comb)

In [13]:
# Number of extracted sentences and the first one, as a quick sanity check.
len(sentences_train), sentences_train[0]

(13028,
 'Конечно, вы можете не заниматься всеми этими связанными с кокусом хлопотами, а просто собрать вместе сторонников мистера Честняги, сформировать из них предвыборный комитет, и выдвинуть кандидатуру')

In [14]:
# Natasha components created once at module level and reused by natasha_lemmatize():
# `segmenter` is passed to Doc.segment, `morph_tagger` (built on the news embedding)
# to Doc.tag_morph, and `morph_vocab` to token.lemmatize.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

def natasha_lemmatize(text):
    """Return the lemmas of all tokens in ``text``.

    The text is converted from LaTeX markup to plain text (best effort),
    lower-cased, segmented and morphologically tagged with the module-level
    natasha objects, and every token is lemmatized. No filtering is done here.

    Args:
        text: Input string, possibly containing LaTeX markup.

    Returns:
        A list with the lemma of each token, in document order.
    """
    # Convert LaTeX *before* lower-casing: macro names are case-sensitive
    # (e.g. \Rightarrow vs \rightarrow), so lower-casing first can change or break them.
    try:
        text = LatexNodes2Text().latex_to_text(text)
    except Exception:
        # Best effort: keep the unconverted text if the LaTeX parser fails.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        pass
    text = text.lower()

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return [token.lemma for token in doc.tokens]

def cleaner(txt, noise):
    """Join the items of ``txt`` that are not in ``noise`` with single spaces.

    Args:
        txt: Iterable of string tokens.
        noise: Container of tokens to drop (anything supporting ``in``).

    Returns:
        The surviving tokens, in their original order, as one space-separated string.
    """
    kept = [token for token in txt if token not in noise]
    return ' '.join(kept)

def prep(txt_list):
    """Lemmatize and de-noise every string of ``txt_list`` in place.

    Each item is replaced by the space-joined lemmas returned by
    natasha_lemmatize() after removing Russian stop words, punctuation
    characters and a hand-picked list of extra words.

    Args:
        txt_list: Mutable list of strings; it is modified in place.

    Returns:
        None. Returning nothing is deliberate: the function is called as the last
        statement of a notebook cell, where a return value would be displayed.
    """
    # A set gives O(1) membership tests in cleaner(); with a list every token
    # lookup scanned the whole noise vocabulary.
    # NOTE(review): 'сатья' looks like a typo of 'статья' and, if so, never matches a
    # lemma — left unchanged because fixing it would alter the training data; confirm intent.
    noise = set(stopwords.words('russian')) | set(punctuation) | {
        'что', 'то', 'кто', 'привет', 'весь', 'всем', 'какой', 'ваш', 'внимание',
        'добрый', 'время', 'сатья', 'данный', 'хотеть', 'рассказать', 'посвятить',
        'результат', 'результаты', 'представить', 'привести',
    }
    for i in tqdm(range(len(txt_list))):
        txt_list[i] = cleaner(natasha_lemmatize(txt_list[i]), noise)



# prep() rewrites the items of its argument in place, so work on a copy of the list
# to keep the raw sentences in sentences_train untouched.
cleaned_sentences_train = sentences_train.copy()
prep(cleaned_sentences_train)

  0%|          | 0/13028 [00:00<?, ?it/s]

In [15]:
# Length check: prep() replaces items one-for-one, so this should equal len(sentences_train).
print(len(cleaned_sentences_train))

13028


In [16]:
# Inspect the first cleaned sentence.
cleaned_sentences_train[0]

'мочь заниматься связать кокус хлопоты просто собрать вместе сторонник мистер честняга сформировать предвыборный комитет выдвинуть кандидатура'

In [17]:
def train_model(sentences_train, model_name, epochs=None, batch_size=8, output_dir='output'):
    """Fine-tune a transformer checkpoint with the denoising auto-encoder loss and save it.

    Builds a SentenceTransformer from ``model_name`` with CLS pooling, trains it
    on ``sentences_train`` with DenoisingAutoEncoderLoss (encoder and decoder
    weights tied, decoder loaded from the same checkpoint), and saves the result
    to ``<output_dir>/ru-tsdae-<last path component of model_name>``.

    Args:
        sentences_train: List of training sentences (strings).
        model_name: Hugging Face model id or path, e.g. ``'org/model'``.
        epochs: Number of training epochs. ``None`` keeps the original defaults:
            1 for ``'sentence-transformers/all-distilroberta-v1'``, otherwise 5.
        batch_size: Training batch size.
        output_dir: Directory under which the trained model is saved.

    Returns:
        None. The trained model is written to disk.
    """
    # load data
    print('Loading data')
    train_data = DenoisingAutoEncoderDataset(sentences_train)
    # drop_last=True discards the final incomplete batch of each epoch.
    loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)

    # initialize model
    print('Initializing model')
    transformer = models.Transformer(model_name)
    pooling = models.Pooling(transformer.get_word_embedding_dimension(), 'cls')
    model = SentenceTransformer(modules=[transformer, pooling])

    # initialize loss
    print('Initializing loss')
    train_loss = DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

    # start training
    print('Start training')
    if epochs is None:
        epochs = 1 if model_name == 'sentence-transformers/all-distilroberta-v1' else 5
    model.fit(
        train_objectives=[(loader, train_loss)],
        epochs=epochs,
        weight_decay=0,
        scheduler='constantlr',
        optimizer_params={'lr': 3e-5},
        show_progress_bar=True,
    )

    # [-1] rather than [1]: identical for 'org/name' ids, but does not raise
    # IndexError for ids without a namespace (e.g. 'bert-base-uncased').
    model.save(output_dir + '/ru-tsdae-' + model_name.split('/')[-1])

In [18]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings — consider a narrower filter.
warnings.filterwarnings("ignore")

In [19]:
# Fine-tune each candidate checkpoint in turn on the cleaned training sentences.
# The list is wrapped in tqdm so the progress bar description can show the current model.
lst_models = tqdm(['sberbank-ai/ruBert-base', 'cimm-kzn/rudr-bert',
              'DeepPavlov/rubert-base-cased-sentence', 'cointegrated/roberta-base-formality'])
for mdl in lst_models:
    # Release cached, currently unused GPU memory before the next model is loaded.
    torch.cuda.empty_cache()
    lst_models.set_description(f"Processing {mdl}")
    train_model(cleaned_sentences_train, mdl)

  0%|          | 0/4 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/683M [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruBert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Initializing loss


Some weights of the model checkpoint at sberbank-ai/ruBert-base were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at sberbank-ai/ruBert-base and are newly initialized: ['bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.8.crossattention.self.query.bias', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.self.query.bias', 'bert.

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/521 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at cimm-kzn/rudr-bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Initializing loss


Some weights of the model checkpoint at cimm-kzn/rudr-bert were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at cimm-kzn/rudr-bert and are newly initialized: ['bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.8.crossattention.self.query.bias', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.self.query.bias', 'bert.encoder.la

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/678M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Initializing loss


Some weights of BertLMHeadModel were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['cls.predictions.transform.dense.weight', 'encoder.layer.1.crossattention.output.LayerNorm.bias', 'encoder.layer.9.crossattention.self.value.weight', 'encoder.layer.8.crossattention.output.LayerNorm.weight', 'encoder.layer.11.crossattention.self.key.bias', 'encoder.layer.2.crossattention.self.query.bias', 'encoder.layer.1.crossattention.self.key.weight', 'encoder.layer.0.crossattention.output.LayerNorm.bias', 'encoder.layer.3.crossattention.self.value.weight', 'encoder.layer.2.crossattention.self.query.weight', 'encoder.layer.5.crossattention.self.key.bias', 'encoder.layer.4.crossattention.self.value.weight', 'encoder.layer.2.crossattention.self.value.weight', 'encoder.layer.10.crossattention.output.dense.weight', 'encoder.layer.2.crossattention.output.dense.bias', 'encoder.layer.11.crossattention.self.value.bias', 'encoder.layer.11.crossat

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/roberta-base-formality were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cointegrated/roberta-base-formality and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Initializing loss


Some weights of the model checkpoint at cointegrated/roberta-base-formality were not used when initializing RobertaForCausalLM: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at cointegrated/roberta-base-formality and are newly initialized: ['roberta.encoder.layer.10.crossattention.self.key.bias', 'roberta.encoder.layer.11.crossattention.self.query.bias', 'roberta.encoder.layer.9.

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1628 [00:00<?, ?it/s]