In [1]:
!pip install -q sentence-transformers pylatexenc natasha razdel bertopic hdbscan octis optuna plotly
# paraphrase-multilingual-MiniLM-L12-v2

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
dask-cudf 21.10.1 requires cupy-cuda114, which is not installed.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
tensorflow 2.6.3 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.3 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.3 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.3 requires wrapt~=1.12.1, but you have wrapt 1.14.0 which is incompatible.
tensorflow-transform 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tensorflow-transform 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,

In [2]:
import numpy as np
from numpy.random import default_rng
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import umap
import optuna
import hdbscan
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.model_selection import train_test_split
from pylatexenc.latex2text import LatexNodes2Text


In [3]:
def extract_pars(dct):
    tmp = dict()
    tmp['min_dist'] = dct['min_dist']
    tmp['n_neighbors'] = dct['n_neighbors']
    tmp['n_components'] = dct['n_components']
    tmp_1 = dict()
    # tmp_1['cluster_selection_epsilon'] = dct['cluster_selection_epsilon']
    # tmp_1['min_cluster_size'] = dct['min_cluster_size']
    tmp_1['min_samples'] = dct['min_samples']
    tmp_2 = dict()
    tmp_2['top_n_words'] = dct['top_n_words']
    tmp_2['n_gram_range'] = (1, dct['n_gram_range'])
    tmp_2['min_topic_size'] = dct['min_topic_size']
    return tmp, tmp_1, tmp_2

torch.manual_seed(42)
np.random.seed(42)
torch.manual_seed(42)
tqdm.pandas()

In [4]:
df = pd.read_csv('../input/ru-data/habr_cyberleninka.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,data_clean,data_unclean,url
0,0,изложить метод проектирование устройство подач...,Изложен метод проектирования устройств подачи ...,https://cyberleninka.ru/article/n/matematiches...
1,1,статья исследовательский метод оценка панель л...,В статье представлены исследовательские методы...,https://cyberleninka.ru/article/n/inzhenernaya...
2,2,,,https://cyberleninka.ru/article/n/mirovoy-fina...
3,3,исследование актуальный философский проблема р...,Представлено исследование актуальной философск...,https://cyberleninka.ru/article/n/metodologich...
4,4,статья анализироваться суть понятие определить...,В статье анализируется суть как и понятия. О...,https://cyberleninka.ru/article/n/sotsiokultur...


In [5]:
df.dropna(inplace=True)

In [6]:
train_df, test_df, _, _ = train_test_split(df, np.arange(df.shape[0]), test_size=0.3, random_state=42)

In [7]:
data_test = test_df['data_clean'].tolist()
data_val = train_df['data_clean'].tolist()

In [8]:


def get_crps(txts: list):
    ret = []
    for el in txts:
        ret.append(el.split())
    return ret


def get_dict(data):
    return dict(zip(data['Topic'].tolist(), [el.split('_')[1:] for el in data['Name'].tolist()]))


def transform_topics(lst, dct):
    ret = []
    for el in lst:
        ret.append(dct[el])
    return ret


def compute_coherence(topic_model, name_c, data, topics_, topn):
    cleaned_docs = topic_model._preprocess_text(data)

    # Extract vectorizer and tokenizer from BERTopic
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [tokenizer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic) if words!='']
                   for topic in range(len(set(topics_))-1)]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=name_c, topn=topn)
    coherence = coherence_model.get_coherence()
    return coherence


@torch.no_grad()
def compute_metrics(topics_, topic_model_, data, top_k=5):
    try:
        output = {'topics' : [[words for words, _ in topic_model.get_topic(topic)]for topic in range(len(set(topics_[0]))-1)]}
        topic_diversity = TopicDiversity(topk=top_k)
        topic_diversity_score = topic_diversity.score(output)
    except:
        topic_diversity_score = None

    npmi_score = compute_coherence(topic_model_,'c_npmi', data, topics_[0], top_k)
    cv_score = compute_coherence(topic_model_,'c_v', data, topics_[0], top_k)

    return topic_diversity_score, npmi_score, cv_score

In [9]:


lst_models = ['paraphrase-multilingual-MiniLM-L12-v2', 'paraphrase-multilingual-mpnet-base-v2']
scores_untrained = []
with torch.no_grad():
    for md in tqdm(lst_models):
        sentence_model = SentenceTransformer(md, device="cuda")
        topic_model = BERTopic(embedding_model=sentence_model)
        topics = topic_model.fit_transform(np.array(data_test))
        scores_untrained.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_untrained)



  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

[(0.9485714285714286, 0.006829672494870022, 0.5310892970937127), (0.9314285714285714, 0.07366114475266222, 0.6137037075643514)]


In [10]:
@torch.no_grad()
def objective(trial):
    torch.cuda.empty_cache()
    tnw = trial.suggest_int("top_n_words", 10, 30, log=True)
    ngr = trial.suggest_int("n_gram_range", 1, 3, log=True)
    mts = trial.suggest_int("min_topic_size", 5, 50, log=True)

    min_dist = trial.suggest_float("min_dist", 0.000001, 1, log=True)
    n_neigh = trial.suggest_int("n_neighbors", 2, 100, log=True)
    n_comp = trial.suggest_int("n_components", 10, 250, log=True)
    umap_model = umap.UMAP(
        n_neighbors=n_neigh,
        min_dist=min_dist,
        n_components=n_comp,
        random_state=42,
    )
    # cse = trial.suggest_float("cluster_selection_epsilon", 0.0001, 10, log=True)
    # mcs = trial.suggest_int("min_cluster_size", 2, 100, log=True)
    ms = trial.suggest_int("min_samples", 2, 40, log=True)
    hdbscan_model = hdbscan.HDBSCAN(min_samples=ms)
    topic_model = BERTopic(embedding_model=sentence_model, top_n_words=tnw, n_gram_range=(1, ngr), min_topic_size=mts,
                           umap_model=umap_model, hdbscan_model=hdbscan_model)
    try:
        topics = topic_model.fit_transform(np.array(data_val))
        return compute_metrics(topics, topic_model, np.array(data_val))[1]
    except:
        return -1


In [11]:
params_for_models = []
for md in lst_models:
    sentence_model = SentenceTransformer(md, device="cuda")
    study = optuna.create_study(direction="maximize")
    study.enqueue_trial(
        {
            'top_n_words': 10,
            'n_gram_range': 1,
            'min_topic_size': 10,
            'min_dist': 0.000001,
            'n_neighbors': 15,
            'n_components': 5,
            # 'cluster_selection_epsilon': 0.0,
            # 'min_cluster_size': 5,
            'min_samples': 5
        }
    )
    study.optimize(objective, n_trials=30, n_jobs=1)
    params_for_models.append(study.best_trial)

[32m[I 2022-05-17 09:33:12,931][0m A new study created in memory with name: no-name-3318c0b1-ddbf-450e-a39b-41c9f795cf69[0m
  from ipykernel import kernelapp as app
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m[I 2022-05-17 09:34:19,264][0m Trial 0 finished with value: 0.04614603363845691 and parameters: {'top_n_words': 10, 'n_gram_range': 1, 'min_topic_size': 10, 'min_dist': 1e-06, 'n_neighbors': 15, 'n_components': 56, 'min_samples': 5}. Best is trial 0 with value: 0.04614603363845691.[0m
[32m[I 2022-05-17 09:37:02,868][0m Trial 1 finished with value: 0.06008853349265184 and parameters: {'top_n_words': 15, 'n_gram_range': 1, 'min_topic_size': 6, 'min_dist': 0.9308184165949985, 'n_neighbors': 38, 'n_components': 234, 'min_samples': 4}. Best is trial 1 with value: 0.06008853349265184.[0m
[32m[I 2022-05-17 09:37:31,581][0m Trial 2 finished with value: -1.0 and parameters: {'top_n_words': 21, 'n_gram_range': 3, 'min_topic_size': 46, 'min_dist': 5.075513515602615e-06, 'n_neighbors': 19, 'n_components': 13, 'min_samples': 2}. Best is trial 1 with value: 0.06008853349265184.[0m
[32m[I 2022-05-17 09:38:47,144][0m Trial 3 finished with value: 0.04440967574240297 and parameters: {'top

In [12]:
scores_paramed= []
lst_models_3 = ['paraphrase-multilingual-MiniLM-L12-v2', 'paraphrase-multilingual-mpnet-base-v2']
for i, md in tqdm(enumerate(lst_models_3)):
    params_ = extract_pars(params_for_models[i].params)
    sentence_model = SentenceTransformer(md, device="cuda")
    umap_model = umap.UMAP(**params_[0], random_state=42)
    hdbscan_model = hdbscan.HDBSCAN(**params_[1])
    topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, hdbscan_model=hdbscan_model, **params_[2])
    topics = topic_model.fit_transform(np.array(data_test))
    scores_paramed.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_paramed)

0it [00:00, ?it/s]

[(0.88, 0.04354432263031198, 0.5928658638302011), (1.0, 0.005253884690778005, 0.5218015971366817)]


In [13]:
!pip install langdetect

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=764436c75d44f67e22652feaf5b5024f0314840a6b7e856fa2fb1ef6cb1a5e9b
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: lan

In [14]:
from langdetect import detect

data = df['data_clean'].tolist()
total = 0
eng = 0
for sent in tqdm(data):
    for word in sent.split():
        try:
            if detect(word) != 'ru':
                eng += 1
            total += 1
        except:
            continue
print(eng/total * 100)

  0%|          | 0/7577 [00:00<?, ?it/s]

32.2663029741343
