In [1]:
!pip install -q sentence-transformers pylatexenc natasha razdel bertopic hdbscan octis optuna

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
dask-cudf 21.10.1 requires cupy-cuda114, which is not installed.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
tensorflow 2.6.3 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.3 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.3 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.3 requires wrapt~=1.12.1, but you have wrapt 1.14.0 which is incompatible.
tensorflow-transform 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tensorflow-transform 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!

In [6]:
import numpy as np
from numpy.random import default_rng
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import umap
import optuna
import hdbscan
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.model_selection import train_test_split
from pylatexenc.latex2text import LatexNodes2Text

In [3]:
df = pd.read_csv('../input/ru-data/habr_cyberleninka.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,data_clean,data_unclean,url
0,0,изложить метод проектирование устройство подач...,Изложен метод проектирования устройств подачи ...,https://cyberleninka.ru/article/n/matematiches...
1,1,статья исследовательский метод оценка панель л...,В статье представлены исследовательские методы...,https://cyberleninka.ru/article/n/inzhenernaya...
2,2,,,https://cyberleninka.ru/article/n/mirovoy-fina...
3,3,исследование актуальный философский проблема р...,Представлено исследование актуальной философск...,https://cyberleninka.ru/article/n/metodologich...
4,4,статья анализироваться суть понятие определить...,В статье анализируется суть как и понятия. О...,https://cyberleninka.ru/article/n/sotsiokultur...


In [4]:
df.dropna(inplace=True)

In [7]:
train_df, test_df, _, _ = train_test_split(df, np.arange(df.shape[0]), test_size=0.3, random_state=42)

In [19]:
data_test = test_df['data_clean'].tolist()
data_val = train_df['data_clean'].tolist()

In [9]:
def get_crps(txts: list):
    ret = []
    for el in txts:
        ret.append(el.split())
    return ret


def get_dict(data):
    return dict(zip(data['Topic'].tolist(), [el.split('_')[1:] for el in data['Name'].tolist()]))


def transform_topics(lst, dct):
    ret = []
    for el in lst:
        ret.append(dct[el])
    return ret


def compute_coherence(topic_model, name_c, data, topics_, topn):
    cleaned_docs = topic_model._preprocess_text(data)

    # Extract vectorizer and tokenizer from BERTopic
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [tokenizer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic) if words!='']
                   for topic in range(len(set(topics_))-1)]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=name_c, topn=topn)
    coherence = coherence_model.get_coherence()
    return coherence


@torch.no_grad()
def compute_metrics(topics_, topic_model_, data, top_k=5):
    try:
        output = {'topics' : [[words for words, _ in topic_model.get_topic(topic)]for topic in range(len(set(topics_[0]))-1)]}
        topic_diversity = TopicDiversity(topk=top_k)
        topic_diversity_score = topic_diversity.score(output)
    except:
        topic_diversity_score = None

    npmi_score = compute_coherence(topic_model_,'c_npmi', data, topics_[0], top_k)
    cv_score = compute_coherence(topic_model_,'c_v', data, topics_[0], top_k)

    return topic_diversity_score, npmi_score, cv_score

In [12]:
lst_models = ['sberbank-ai/ruBert-base', 'cimm-kzn/rudr-bert',
              'DeepPavlov/rubert-base-cased-sentence', 'cointegrated/roberta-base-formality']
scores_untrained = []
with torch.no_grad():
    for md in tqdm(lst_models):
        sentence_model = SentenceTransformer(md, device="cuda")
        topic_model = BERTopic(embedding_model=sentence_model)
        topics = topic_model.fit_transform(np.array(data_test))
        scores_untrained.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_untrained)

  0%|          | 0/4 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/392 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/716M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sberbank-ai_ruBert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/521 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/cimm-kzn_rudr-bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/976 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/cointegrated_roberta-base-formality were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/cointegrated_roberta-base-formality and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRA

[(0.9333333333333333, 0.16194672960041037, 0.5603898695375928), (0.9, -0.018609326341781328, 0.3378534509592917), (0.95, 0.09267507882017453, 0.5451520036113584), (0.7, 0.00448571080655997, 0.39418963591097755)]


In [14]:
lst_models_2 = ['../input/tsdae-ru/output/ru-tsdae-ruBert-base', '../input/tsdae-ru/output/ru-tsdae-rudr-bert',
              '../input/tsdae-ru/output/ru-tsdae-rubert-base-cased-sentence', '../input/tsdae-ru/output/ru-tsdae-roberta-base-formality']
scores_trained = []
with torch.no_grad():
    for md in tqdm(lst_models_2):
        sentence_model = SentenceTransformer(md, device="cuda")
        topic_model = BERTopic(embedding_model=sentence_model)
        topics = topic_model.fit_transform(np.array(data_test))
        scores_trained.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_trained)

  0%|          | 0/4 [00:00<?, ?it/s]

[(0.9, -0.01860932634178133, 0.3378534509592917), (1.0, -0.08771433886233847, 0.1431919917535657), (0.9, -0.01860932634178133, 0.3378534509592917), (0.9333333333333333, 0.16030667767088386, 0.5568735820056916)]


In [15]:
@torch.no_grad()
def objective(trial):
    torch.cuda.empty_cache()
    tnw = trial.suggest_int("top_n_words", 10, 30, log=True)
    ngr = trial.suggest_int("n_gram_range", 1, 3, log=True)
    mts = trial.suggest_int("min_topic_size", 5, 50, log=True)

    min_dist = trial.suggest_float("min_dist", 0.000001, 1, log=True)
    n_neigh = trial.suggest_int("n_neighbors", 2, 100, log=True)
    n_comp = trial.suggest_int("n_components", 10, 250, log=True)
    umap_model = umap.UMAP(
        n_neighbors=n_neigh,
        min_dist=min_dist,
        n_components=n_comp,
        random_state=42,
    )
    # cse = trial.suggest_float("cluster_selection_epsilon", 0.0001, 10, log=True)
    # mcs = trial.suggest_int("min_cluster_size", 2, 100, log=True)
    ms = trial.suggest_int("min_samples", 2, 40, log=True)
    hdbscan_model = hdbscan.HDBSCAN(min_samples=ms)
    topic_model = BERTopic(embedding_model=sentence_model, top_n_words=tnw, n_gram_range=(1, ngr), min_topic_size=mts,
                           umap_model=umap_model, hdbscan_model=hdbscan_model)
    try:
        topics = topic_model.fit_transform(np.array(data_val))
        return compute_metrics(topics, topic_model, np.array(data_val))[1]
    except:
        return -1

In [20]:
lst_models_3 = ['../input/tsdae-ru/output/ru-tsdae-ruBert-base', '../input/tsdae-ru/output/ru-tsdae-rudr-bert',
              '../input/tsdae-ru/output/ru-tsdae-rubert-base-cased-sentence', '../input/tsdae-ru/output/ru-tsdae-roberta-base-formality']
params_for_models = []
for md in lst_models_3:
    sentence_model = SentenceTransformer(md, device="cuda")
    study = optuna.create_study(direction="maximize")
    study.enqueue_trial(
        {
            'top_n_words': 10,
            'n_gram_range': 1,
            'min_topic_size': 10,
            'min_dist': 0.000001,
            'n_neighbors': 15,
            'n_components': 5,
            # 'cluster_selection_epsilon': 0.0,
            # 'min_cluster_size': 5,
            'min_samples': 5
        }
    )
    study.optimize(objective, n_trials=30, n_jobs=1)
    params_for_models.append(study.best_trial)

[32m[I 2022-05-16 14:12:52,952][0m A new study created in memory with name: no-name-e4ab6309-2bbb-4a11-b652-d112ba6f789a[0m
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m[I 2022-05-16 14:14:43,925][0m Trial 0 finished with value: 0.054315324188291235 and parameters: {'top_n_words': 10, 'n_gram_range': 1, 'min_topic_size': 10, 'min_dist': 1e-06, 'n_neighbors': 15, 'n_components': 140, 'min_samples': 5}. Best is trial 0 with value: 0.054315324188291235.[0m
[32m[I 2022-05-16 14:15:24,486][0m Trial 1 finished with value: 0.01141420616454743 and parameters: {'top_n_words': 14, 'n_gram_range': 2, 'min_topic_size': 12, 'min_dist': 0.15977730992723652, 'n_neighbors': 7, 'n_components': 46, 'min_samples': 22}. Best is trial 0 with value: 0.054315324188291235.[0m
[32m[I 2022-05-16 14:16:07,343][0m Trial 2 finished with value: -1.0 and parameters: {'top_n_words': 13, 'n_gram_range': 2, 'min_topic_size': 23, 'min_dist': 0.662400566068187, 'n_neighbors': 41, 'n_components': 16, 'min_samples': 3}. Best is trial 0 with value: 0.054315324188291235.[0m
[32m[I 2022-05-16 14:17:37,686][0m Trial 3 finished with value: 0.3461085399101968 and parameters: {'to

In [21]:
def extract_pars(dct):
    tmp = dict()
    tmp['min_dist'] = dct['min_dist']
    tmp['n_neighbors'] = dct['n_neighbors']
    tmp['n_components'] = dct['n_components']
    tmp_1 = dict()
    # tmp_1['cluster_selection_epsilon'] = dct['cluster_selection_epsilon']
    # tmp_1['min_cluster_size'] = dct['min_cluster_size']
    tmp_1['min_samples'] = dct['min_samples']
    tmp_2 = dict()
    tmp_2['top_n_words'] = dct['top_n_words']
    tmp_2['n_gram_range'] = (1, dct['n_gram_range'])
    tmp_2['min_topic_size'] = dct['min_topic_size']
    return tmp, tmp_1, tmp_2

In [22]:
scores_paramed= []
lst_models_3 = ['../input/tsdae-ru/output/ru-tsdae-ruBert-base', '../input/tsdae-ru/output/ru-tsdae-rudr-bert',
              '../input/tsdae-ru/output/ru-tsdae-rubert-base-cased-sentence', '../input/tsdae-ru/output/ru-tsdae-roberta-base-formality']
for i, md in tqdm(enumerate(lst_models_3)):
    params_ = extract_pars(params_for_models[i].params)
    sentence_model = SentenceTransformer(md, device="cuda")
    umap_model = umap.UMAP(**params_[0], random_state=42)
    hdbscan_model = hdbscan.HDBSCAN(**params_[1])
    topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, hdbscan_model=hdbscan_model, **params_[2])
    topics = topic_model.fit_transform(np.array(data_test))
    scores_paramed.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_paramed)

0it [00:00, ?it/s]

[(0.9, -0.01860932634178133, 0.3378534509592917), (0.9333333333333333, 0.24343068962560233, 0.5586868706789779), (0.9116279069767442, 0.04480111235868773, 0.6104256262338547), (0.9333333333333333, 0.16194672960041037, 0.5603898695375928)]
