In [1]:
# Install the extra libraries this notebook imports (embeddings, topic modelling, metrics, tuning).
# NOTE(review): versions are unpinned, so a rerun may resolve different releases — consider pinning.
!pip install -q sentence-transformers pylatexenc natasha razdel bertopic hdbscan octis optuna plotly

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
dask-cudf 21.10.1 requires cupy-cuda114, which is not installed.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
tensorflow 2.6.3 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.3 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.3 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.3 requires wrapt~=1.12.1, but you have wrapt 1.14.0 which is incompatible.
tensorflow-transform 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tensorflow-transform 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!

In [2]:
import numpy as np
from numpy.random import default_rng
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import umap
import optuna
import hdbscan
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.model_selection import train_test_split
from pylatexenc.latex2text import LatexNodes2Text

In [3]:
# Seed the global NumPy and PyTorch RNGs once each (the original seeded PyTorch twice).
np.random.seed(42)
torch.manual_seed(42)
# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()

In [4]:
# Load the `habr_cyberleninka.csv` dataset and preview its first rows.
# The preview shows the text columns `data_clean` / `data_unclean` plus a source `url`.
df = pd.read_csv('../input/ru-data/habr_cyberleninka.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,data_clean,data_unclean,url
0,0,изложить метод проектирование устройство подач...,Изложен метод проектирования устройств подачи ...,https://cyberleninka.ru/article/n/matematiches...
1,1,статья исследовательский метод оценка панель л...,В статье представлены исследовательские методы...,https://cyberleninka.ru/article/n/inzhenernaya...
2,2,,,https://cyberleninka.ru/article/n/mirovoy-fina...
3,3,исследование актуальный философский проблема р...,Представлено исследование актуальной философск...,https://cyberleninka.ru/article/n/metodologich...
4,4,статья анализироваться суть понятие определить...,В статье анализируется суть как и понятия. О...,https://cyberleninka.ru/article/n/sotsiokultur...


In [5]:
# Keep only rows without missing values (the preview above contains a row with empty text).
df = df.dropna()

In [6]:
# Hold out 30% of the rows with a fixed seed.
# Only the frame is split: the original also split a dummy index array and unpacked it into `_`,
# which shadows IPython's last-output variable. The selected rows are the same for this seed.
# NOTE(review): the cells visible in this notebook fit on the full `df`, not on these splits.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [7]:
# Documents fed to every topic model below: the `data_clean` column of the full
# (NaN-filtered) frame as a plain Python list — not the train/test split.
data=df['data_clean'].tolist()

In [8]:


def get_crps(txts: list):
    """Tokenize each text by whitespace.

    Args:
        txts: Iterable of strings.

    Returns:
        A list with one list of whitespace-separated tokens per input text.
    """
    return [text.split() for text in txts]


def get_dict(data):
    """Map every topic id to the words encoded in its name.

    Args:
        data: Table-like object whose 'Topic' and 'Name' columns expose ``tolist()``.
            Each name is split on '_' and its first piece is discarded
            (presumably the topic id prefix — verify against the caller).

    Returns:
        Dict from the 'Topic' values to the remaining name pieces.
    """
    topic_ids = data['Topic'].tolist()
    word_lists = []
    for name in data['Name'].tolist():
        pieces = name.split('_')
        word_lists.append(pieces[1:])
    return dict(zip(topic_ids, word_lists))


def transform_topics(lst, dct):
    """Translate each element of ``lst`` through the mapping ``dct``.

    Args:
        lst: Iterable of keys.
        dct: Mapping used for the lookup.

    Returns:
        List of ``dct[key]`` in the order of ``lst``.

    Raises:
        KeyError: If an element of ``lst`` is not a key of ``dct``.
    """
    return [dct[key] for key in lst]


def compute_coherence(topic_model, name_c, data, topics_, topn):
    """Score the topics of a fitted BERTopic model with a gensim coherence measure.

    Args:
        topic_model: Fitted BERTopic model. Its vectorizer's tokenizer is reused to
            tokenize the documents for the coherence corpus.
        name_c: Coherence measure name forwarded to gensim's ``CoherenceModel``
            (this notebook passes 'c_npmi' and 'c_v').
        data: Documents to build the reference corpus from.
        topics_: Per-document topic ids; the id -1 (outliers) is excluded from scoring.
        topn: Number of top words per topic forwarded to ``CoherenceModel``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    cleaned_docs = topic_model._preprocess_text(data)

    # Reuse the model's own tokenizer so the corpus matches how topics were extracted.
    # NOTE(review): build_tokenizer() yields single tokens only; with n_gram_range > 1 the
    # n-gram topic words would presumably be missing from the dictionary — confirm.
    tokenizer = topic_model.vectorizer_model.build_tokenizer()

    # The unused `vectorizer.get_feature_names()` call was dropped: its result was never
    # read, and the method no longer exists in recent scikit-learn releases.
    tokens = [tokenizer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Iterate over the topic ids that actually occur instead of assuming that -1 is
    # always present; otherwise the last topic is lost when there are no outliers.
    topic_ids = sorted({topic for topic in topics_ if topic != -1})
    topic_words = [[word for word, _ in topic_model.get_topic(topic) if word != '']
                   for topic in topic_ids]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=name_c, topn=topn)
    return coherence_model.get_coherence()


@torch.no_grad()
def compute_metrics(topics_, topic_model_, data, top_k=5):
    """Evaluate a fitted topic model.

    Args:
        topics_: Result of ``fit_transform``; element 0 holds the per-document topic ids.
        topic_model_: The fitted BERTopic model to evaluate.
        data: Documents the coherence scores are computed on.
        top_k: Number of top words per topic used by every metric.

    Returns:
        Tuple ``(topic_diversity, npmi, c_v)``. ``topic_diversity`` is ``None`` when the
        diversity score could not be computed.
    """
    doc_topics = topics_[0]

    try:
        # Use the model passed in (the original read the global `topic_model` here) and the
        # topic ids that actually occur, skipping the outlier id -1.
        topic_ids = sorted({topic for topic in doc_topics if topic != -1})
        output = {'topics': [[word for word, _ in topic_model_.get_topic(topic)]
                             for topic in topic_ids]}
        topic_diversity_score = TopicDiversity(topk=top_k).score(output)
    except Exception:
        # Best effort: diversity is optional, but do not swallow KeyboardInterrupt/SystemExit.
        topic_diversity_score = None

    npmi_score = compute_coherence(topic_model_, 'c_npmi', data, doc_topics, top_k)
    cv_score = compute_coherence(topic_model_, 'c_v', data, doc_topics, top_k)

    return topic_diversity_score, npmi_score, cv_score

In [9]:
# BERTopic with its default UMAP/HDBSCAN settings on top of a multilingual MiniLM encoder.
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
# Fall back to CPU when no GPU is visible instead of failing while loading the encoder.
sentence_model = SentenceTransformer(model_name,
                                     device="cuda" if torch.cuda.is_available() else "cpu")
topic_model = BERTopic(embedding_model=sentence_model,verbose=True, calculate_probabilities=True)

# `topics` holds the whole fit_transform result; compute_metrics reads element 0 of it.
topics = topic_model.fit_transform(np.array(data))
# Displays (topic diversity, NPMI coherence, C_v coherence).
compute_metrics(topics, topic_model, np.array(data))

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Batches:   0%|          | 0/237 [00:00<?, ?it/s]

2022-05-17 07:34:35,997 - BERTopic - Transformed documents to Embeddings


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-05-17 07:35:06,097 - BERTopic - Reduced dimensionality
2022-05-17 07:35:11,482 - BERTopic - Clustered reduced embeddings


(0.9148148148148149, 0.09596231362848608, 0.6039738089885154)

In [10]:
# Show BERTopic's topic visualization for the model fitted in the preceding cell.
topic_model.visualize_topics()

In [11]:
# Same default BERTopic pipeline, now with the multilingual MPNet encoder.
model_name = 'paraphrase-multilingual-mpnet-base-v2'
# Fall back to CPU when no GPU is visible instead of failing while loading the encoder.
sentence_model = SentenceTransformer(model_name,
                                     device="cuda" if torch.cuda.is_available() else "cpu")
topic_model = BERTopic(embedding_model=sentence_model,verbose=True, calculate_probabilities=True)

# `topics` holds the whole fit_transform result; compute_metrics reads element 0 of it.
topics = topic_model.fit_transform(np.array(data))
# Displays (topic diversity, NPMI coherence, C_v coherence).
compute_metrics(topics, topic_model, np.array(data))

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Batches:   0%|          | 0/237 [00:00<?, ?it/s]

2022-05-17 07:36:22,434 - BERTopic - Transformed documents to Embeddings
2022-05-17 07:36:35,504 - BERTopic - Reduced dimensionality
2022-05-17 07:36:39,219 - BERTopic - Clustered reduced embeddings


(0.8989690721649485, 0.06958486500931085, 0.5981981314468424)

In [12]:
# Show BERTopic's topic visualization for the model fitted in the preceding cell.
topic_model.visualize_topics()

In [13]:
# Same default BERTopic pipeline with the `cointegrated/roberta-base-formality` checkpoint.
# NOTE(review): the load log reports unused classifier weights, so this looks like a
# classification checkpoint wrapped as an encoder — confirm that this is intended.
model_name = 'cointegrated/roberta-base-formality'
# Fall back to CPU when no GPU is visible instead of failing while loading the encoder.
sentence_model = SentenceTransformer(model_name,
                                     device="cuda" if torch.cuda.is_available() else "cpu")
topic_model = BERTopic(embedding_model=sentence_model,verbose=True, calculate_probabilities=True)

# `topics` holds the whole fit_transform result; compute_metrics reads element 0 of it.
topics = topic_model.fit_transform(np.array(data))
# Displays (topic diversity, NPMI coherence, C_v coherence).
compute_metrics(topics, topic_model, np.array(data))

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/cointegrated_roberta-base-formality were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/cointegrated_roberta-base-formality and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Batches:   0%|          | 0/237 [00:00<?, ?it/s]

2022-05-17 07:38:11,362 - BERTopic - Transformed documents to Embeddings
2022-05-17 07:38:25,307 - BERTopic - Reduced dimensionality
2022-05-17 07:38:26,492 - BERTopic - Clustered reduced embeddings


(0.7090909090909091, 0.03637621428579752, 0.5119819786435423)

In [14]:
# Show BERTopic's topic visualization for the model fitted in the preceding cell.
topic_model.visualize_topics()

In [15]:
def extract_pars(dct):
    """Split a flat hyper-parameter dict into three keyword-argument dicts.

    Args:
        dct: Mapping with the keys 'min_dist', 'n_neighbors', 'n_components',
            'min_samples', 'top_n_words', 'n_gram_range' and 'min_topic_size'.

    Returns:
        Tuple of three dicts, in this order:
        1. 'min_dist', 'n_neighbors', 'n_components';
        2. 'min_samples';
        3. 'top_n_words', 'min_topic_size' and 'n_gram_range', where the latter is
           expanded from the upper bound ``k`` to the tuple ``(1, k)``.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    reduction_kwargs = {key: dct[key] for key in ('min_dist', 'n_neighbors', 'n_components')}
    clustering_kwargs = {'min_samples': dct['min_samples']}
    topic_kwargs = {
        'top_n_words': dct['top_n_words'],
        'n_gram_range': (1, dct['n_gram_range']),
        'min_topic_size': dct['min_topic_size'],
    }
    return reduction_kwargs, clustering_kwargs, topic_kwargs

# Locally stored encoder checkpoint (path suggests TSDAE fine-tuning — TODO confirm).
model_name = '../input/tsdae-ru/output/ru-tsdae-roberta-base-formality'

# Fixed hyper-parameters, split into three keyword-argument groups by extract_pars.
# NOTE(review): the values look like the result of a hyper-parameter search; the search
# itself is not part of the cells shown here.
params_ = extract_pars({
    'top_n_words': 14,
    'n_gram_range': 1,
    'min_topic_size': 5,
    'min_dist': 1.3517643802848983e-05,
    'n_neighbors': 5,
    'n_components': 83,
    'min_samples': 9,
})

In [16]:
# BERTopic with explicitly configured UMAP / HDBSCAN built from the three `params_` groups.
# Fall back to CPU when no GPU is visible instead of failing while loading the encoder.
sentence_model = SentenceTransformer(model_name,
                                     device="cuda" if torch.cuda.is_available() else "cpu")
umap_model = umap.UMAP(**params_[0], random_state=42)
hdbscan_model = hdbscan.HDBSCAN(**params_[1], prediction_data = True)
# NOTE(review): with an explicit hdbscan_model, BERTopic presumably ignores
# `min_topic_size` from params_[2] — confirm against the BERTopic documentation.
topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
                       **params_[2], verbose=True, calculate_probabilities=True)

# `topics` holds the whole fit_transform result; compute_metrics reads element 0 of it.
topics = topic_model.fit_transform(np.array(data))
# Displays (topic diversity, NPMI coherence, C_v coherence).
compute_metrics(topics, topic_model, np.array(data))

Batches:   0%|          | 0/237 [00:00<?, ?it/s]

2022-05-17 07:39:40,408 - BERTopic - Transformed documents to Embeddings
2022-05-17 07:40:28,813 - BERTopic - Reduced dimensionality
2022-05-17 07:40:35,635 - BERTopic - Clustered reduced embeddings


(1.0, 0.28830287707533647, 0.7615008213885747)

In [17]:
# Show BERTopic's topic visualization for the model fitted in the preceding cell.
topic_model.visualize_topics()

In [18]:
# Same default BERTopic pipeline with the `sberbank-ai/ruRoberta-large` checkpoint.
# NOTE(review): the load log reports unused LM-head weights, so this looks like a plain
# language-model checkpoint wrapped as an encoder — confirm that this is intended.
model_name = 'sberbank-ai/ruRoberta-large'
# Fall back to CPU when no GPU is visible instead of failing while loading the encoder.
sentence_model = SentenceTransformer(model_name,
                                     device="cuda" if torch.cuda.is_available() else "cpu")
topic_model = BERTopic(embedding_model=sentence_model,verbose=True, calculate_probabilities=True)

# `topics` holds the whole fit_transform result; compute_metrics reads element 0 of it.
topics = topic_model.fit_transform(np.array(data))
# Displays (topic diversity, NPMI coherence, C_v coherence).
compute_metrics(topics, topic_model, np.array(data))

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sberbank-ai_ruRoberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/sberbank-ai_ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.poo

Batches:   0%|          | 0/237 [00:00<?, ?it/s]

2022-05-17 07:50:18,336 - BERTopic - Transformed documents to Embeddings
2022-05-17 07:50:32,259 - BERTopic - Reduced dimensionality
2022-05-17 07:50:33,070 - BERTopic - Clustered reduced embeddings


(0.75, 0.2898069013394424, 0.7705169444128144)

In [19]:
# Show BERTopic's topic visualization for the model fitted in the preceding cell.
topic_model.visualize_topics()