In [1]:
!pip install -q sentence-transformers pylatexenc bertopic hdbscan octis

You should consider upgrading via the '/home/paul/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
from numpy.random import default_rng
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import umap
import optuna
import hdbscan
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

from pylatexenc.latex2text import LatexNodes2Text

In [3]:
def split_data(data, train_size=5000, optim_size=6000):
    test_size = len(data) - train_size - optim_size
    rng = np.random.default_rng(49)
    labels = np.array(['train'] * train_size + ['val'] * optim_size + ['test'] * test_size)
    rng.shuffle(labels)
    return labels

torch.manual_seed(42)
np.random.seed(42)
torch.manual_seed(42)
transformers.set_seed(42)
tqdm.pandas()
docs_df = pd.read_csv('to_use.csv')
df = docs_df.sample(25000, random_state=42)
df['type_set'] = split_data(df)
np.unique(df['type_set'], return_counts=True)

  docs_df = pd.read_csv('to_use.csv')


(array(['test', 'train', 'val'], dtype=object), array([14000,  5000,  6000]))

In [4]:
def clean_abstract(txt, stp_wrds):
    txt = txt.lower()
    try:
        txt = LatexNodes2Text().latex_to_text(txt)
    except:
        txt = txt.lower()
    sup = word_tokenize(txt, language="english")
    lemmatizer = WordNetLemmatizer()
    sup = [lemmatizer.lemmatize(word) for word in sup]
    ret = []
    for el in sup:
        el.replace('\\', '')
        if el not in stp_wrds and el not in string.punctuation:
            ret.append(el)
    return ' '.join(ret)

In [5]:
stop_words = nltk.corpus.stopwords.words("english")
stop_words += ['we', 'paper', 'new', 'article', "''", "``", "”", 'et', 'al', 'study', 'state', 'of', 'the', 'art']
cleaned_abstract = df['abstract'].progress_apply(clean_abstract, args=[stop_words])
df['abstract_uncleaned'] = df['abstract'].copy()
df['abstract'] = cleaned_abstract
data_val = df[df['type_set'] == 'val']['abstract'].tolist()

  0%|          | 0/25000 [00:00<?, ?it/s]

In [6]:
data_test = df[df['type_set'] == 'test']['abstract'].tolist()

In [7]:
data_val[0]

'convergence stationarity random walk dynamic random digraph given degree sequence digraph undergo full regeneration independent geometrically distributed random time interval parameter α. relaxation stationarity result competition regeneration mixing static digraph number vertex n tends infinity parameter α tends zero find three scenario according whether αlog n converges zero infinity finite positive value limit zero relaxation stationarity occurs two separate stage first due mixing static digraph second due regeneration limit infinite enough time static digraph mix relaxation stationarity dictated regeneration finally limit finite positive value find mixed behaviour interpolating two extreme crucial ingredient analysis control suitable approximation unknown stationary distribution'

In [8]:
def get_crps(txts: list):
    ret = []
    for el in txts:
        ret.append(el.split())
    return ret


def get_dict(data):
    return dict(zip(data['Topic'].tolist(), [el.split('_')[1:] for el in data['Name'].tolist()]))


def transform_topics(lst, dct):
    ret = []
    for el in lst:
        ret.append(dct[el])
    return ret


def compute_coherence(topic_model, name_c, data, topics_, topn):
    cleaned_docs = topic_model._preprocess_text(data)

    # Extract vectorizer and tokenizer from BERTopic
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [tokenizer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic) if words!='']
                   for topic in range(len(set(topics_))-1)]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=name_c, topn=topn)
    coherence = coherence_model.get_coherence()
    return coherence


@torch.no_grad()
def compute_metrics(topics_, topic_model_, data, top_k=5):
    try:
        output = {'topics' : [[words for words, _ in topic_model.get_topic(topic)]for topic in range(len(set(topics_[0]))-1)]}
        topic_diversity = TopicDiversity(topk=top_k)
        topic_diversity_score = topic_diversity.score(output)
    except:
        topic_diversity_score = None

    npmi_score = compute_coherence(topic_model_,'c_npmi', data, topics_[0], top_k)
    cv_score = compute_coherence(topic_model_,'c_v', data, topics_[0], top_k)

    return topic_diversity_score, npmi_score, cv_score

In [9]:
lst_models = ['johngiorgi/declutr-base', 'google/electra-base-discriminator', 'sentence-transformers/all-distilroberta-v1',
              'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'all-mpnet-base-v2']
scores_untrained = []
with torch.no_grad():
    for md in lst_models:
        sentence_model = SentenceTransformer(md, device="cuda")
        topic_model = BERTopic(embedding_model=sentence_model)
        topics = topic_model.fit_transform(np.array(data_test))
        scores_untrained.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_untrained)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/johngiorgi_declutr-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/390 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/google_electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

[(0.8216867469879519, 0.1491818372449558, 0.6768210967742299), (0.8371681415929203, -0.1863928028104267, 0.3271564661763955), (0.8012578616352202, 0.1649150506281666, 0.6895854913749707), (0.781578947368421, 0.12828512277294654, 0.6679900561014893), (0.8241758241758241, 0.17764859323812446, 0.700082176733079)]


In [10]:
lst_models_2 = ['model_w/tsdae-declutr-base', 'model_w/tsdae-electra-base-discriminator',
                'model_w/tsdae-all-distilroberta-v1', 'model_w/tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract']
scores_trained = []
for md in tqdm(lst_models_2):
    sentence_model = SentenceTransformer(md, device="cuda")
    topic_model = BERTopic(embedding_model=sentence_model)
    topics = topic_model.fit_transform(np.array(data_test))
    scores_trained.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_trained)

  0%|          | 0/4 [00:00<?, ?it/s]

[(0.8189189189189189, 0.16313223258919463, 0.695366367610592), (1.0, -0.03706645614338682, 0.449954260949123), (1.0, 0.06886228904641256, 0.4568813321059566), (0.8166666666666667, 0.14485702070906367, 0.6903788020275959)]


In [11]:
@torch.no_grad()
def objective(trial):
    torch.cuda.empty_cache()
    tnw = trial.suggest_int("top_n_words", 10, 30, log=True)
    ngr = trial.suggest_int("n_gram_range", 1, 3, log=True)
    mts = trial.suggest_int("min_topic_size", 5, 50, log=True)

    min_dist = trial.suggest_float("min_dist", 0.000001, 1, log=True)
    n_neigh = trial.suggest_int("n_neighbors", 2, 100, log=True)
    n_comp = trial.suggest_int("n_components", 10, 250, log=True)
    umap_model = umap.UMAP(
        n_neighbors=n_neigh,
        min_dist=min_dist,
        n_components=n_comp,
        random_state=42,
    )
    # cse = trial.suggest_float("cluster_selection_epsilon", 0.0001, 10, log=True)
    # mcs = trial.suggest_int("min_cluster_size", 2, 100, log=True)
    ms = trial.suggest_int("min_samples", 2, 40, log=True)
    hdbscan_model = hdbscan.HDBSCAN(min_samples=ms)
    topic_model = BERTopic(embedding_model=sentence_model, top_n_words=tnw, n_gram_range=(1, ngr), min_topic_size=mts,
                           umap_model=umap_model, hdbscan_model=hdbscan_model)
    try:
        topics = topic_model.fit_transform(np.array(data_val))
        return compute_metrics(topics, topic_model, np.array(data_val))[1]
    except:
        return -1

In [12]:
lst_models_3 = ['model_w/tsdae-declutr-base', 'model_w/tsdae-electra-base-discriminator',
                'model_w/tsdae-all-distilroberta-v1', 'model_w/tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract',
                'all-mpnet-base-v2']
params_for_models = []
for md in lst_models_3:
    sentence_model = SentenceTransformer(md, device="cuda")
    study = optuna.create_study(direction="maximize")
    study.enqueue_trial(
        {
            'top_n_words': 10,
            'n_gram_range': 1,
            'min_topic_size': 10,
            'min_dist': 0.000001,
            'n_neighbors': 15,
            'n_components': 5,
            # 'cluster_selection_epsilon': 0.0,
            # 'min_cluster_size': 5,
            'min_samples': 5
        }
    )
    study.optimize(objective, n_trials=30, n_jobs=1)
    params_for_models.append(study.best_trial)

[32m[I 2022-05-15 15:45:30,966][0m A new study created in memory with name: no-name-7c13ef6b-9088-4cfd-aedc-c9eec061244c[0m
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
[32m[I 2022-05-15 15:47:21,668][0m Trial 0 finished with value: 0.033956954270832417 and parameters: {'top_n_words': 10, 'n_gram_range': 1, 'min_topic_size': 10, 'min_dist': 1e-06, 'n_neighbors': 15, 'n_components': 25, 'min_samples': 5}. Best is trial 0 with value: 0.033956954270832417.[0m
[32m[I 2022-05-15 15:48:53,315][0m Trial 1 finished with value: 0.06132189501261065 and parameters: {'top_n_words': 21, 'n_gram_range': 1, 'min_topic_size': 11, 'min_dist': 7.535772053962225e-06, 'n_neighbors': 7, 'n_components': 22, 'min_samples': 9}. Best is trial 1 with value: 0.06132189501261065.[0m
[32m[I 2022-05-15 15:51:30,519][0m Trial 2 finished with value: 0.0197056796061845 and parameters: {'top_n_w

In [13]:
print(params_for_models)

[FrozenTrial(number=14, values=[0.14658467980924528], datetime_start=datetime.datetime(2022, 5, 15, 16, 13, 46, 500291), datetime_complete=datetime.datetime(2022, 5, 15, 16, 16, 28, 56658), params={'top_n_words': 13, 'n_gram_range': 1, 'min_topic_size': 24, 'min_dist': 0.002488734602240512, 'n_neighbors': 99, 'n_components': 69, 'min_samples': 21}, distributions={'top_n_words': IntLogUniformDistribution(high=30, low=10, step=1), 'n_gram_range': IntLogUniformDistribution(high=3, low=1, step=1), 'min_topic_size': IntLogUniformDistribution(high=50, low=5, step=1), 'min_dist': LogUniformDistribution(high=1.0, low=1e-06), 'n_neighbors': IntLogUniformDistribution(high=100, low=2, step=1), 'n_components': IntLogUniformDistribution(high=250, low=10, step=1), 'min_samples': IntLogUniformDistribution(high=40, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=14, state=TrialState.COMPLETE, value=None), FrozenTrial(number=29, values=[0.11922874311657132], datetime_s

In [14]:
def extract_pars(dct):
    tmp = dict()
    tmp['min_dist'] = dct['min_dist']
    tmp['n_neighbors'] = dct['n_neighbors']
    tmp['n_components'] = dct['n_components']
    tmp_1 = dict()
    # tmp_1['cluster_selection_epsilon'] = dct['cluster_selection_epsilon']
    # tmp_1['min_cluster_size'] = dct['min_cluster_size']
    tmp_1['min_samples'] = dct['min_samples']
    tmp_2 = dict()
    tmp_2['top_n_words'] = dct['top_n_words']
    tmp_2['n_gram_range'] = (1, dct['n_gram_range'])
    tmp_2['min_topic_size'] = dct['min_topic_size']
    return tmp, tmp_1, tmp_2

In [15]:
scores_paramed= []
lst_models_3 = ['model_w/tsdae-declutr-base', 'model_w/tsdae-electra-base-discriminator',
                'model_w/tsdae-all-distilroberta-v1', 'model_w/tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract',
                'all-mpnet-base-v2']
for i, md in tqdm(enumerate(lst_models_3)):
    params_ = extract_pars(params_for_models[i])
    sentence_model = SentenceTransformer(md, device="cuda")
    umap_model = umap.UMAP(**params_[0], random_state=42)
    hdbscan_model = hdbscan.HDBSCAN(**params_[1])
    topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, hdbscan_model=hdbscan_model, **params_[2])
    topics = topic_model.fit_transform(np.array(data_test))
    scores_paramed.append(compute_metrics(topics, topic_model, np.array(data_test)))
print(scores_paramed)

0it [00:00, ?it/s]

[(0.9153846153846154, 0.1635431657215938, 0.7149958402605795), (0.8578947368421053, 0.13978068583345116, 0.6686617369147273), (0.8555555555555555, 0.15337788179734224, 0.6919941448233904), (0.8631578947368421, 0.13549285052236973, 0.7085891360388634), (0.8571428571428571, 0.1573613540241867, 0.6823278178506873)]


In [16]:
scores_trained.append((None, None, None))

In [17]:
lst_models_3 = ['model_w/tsdae-declutr-base', 'model_w/tsdae-electra-base-discriminator',
                'model_w/tsdae-all-distilroberta-v1', 'model_w/tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract',
                'kek/all-mpnet-base-v2']
tmp = list(zip([x.split('/')[1] + ' untrained' for x in lst_models_3], scores_untrained)) + list(zip([x.split('/')[1] +
' trained'for x in lst_models_3], scores_trained)) + list(zip([x.split('/')[1] + ' params' for x in lst_models_3], scores_paramed))

In [18]:
tmp

[('tsdae-declutr-base untrained',
  (0.8216867469879519, 0.1491818372449558, 0.6768210967742299)),
 ('tsdae-electra-base-discriminator untrained',
  (0.8371681415929203, -0.1863928028104267, 0.3271564661763955)),
 ('tsdae-all-distilroberta-v1 untrained',
  (0.8012578616352202, 0.1649150506281666, 0.6895854913749707)),
 ('tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract untrained',
  (0.781578947368421, 0.12828512277294654, 0.6679900561014893)),
 ('all-mpnet-base-v2 untrained',
  (0.8241758241758241, 0.17764859323812446, 0.700082176733079)),
 ('tsdae-declutr-base trained',
  (0.8189189189189189, 0.16313223258919463, 0.695366367610592)),
 ('tsdae-electra-base-discriminator trained',
  (1.0, -0.03706645614338682, 0.449954260949123)),
 ('tsdae-all-distilroberta-v1 trained',
  (1.0, 0.06886228904641256, 0.4568813321059566)),
 ('tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract trained',
  (0.8166666666666667, 0.14485702070906367, 0.6903788020275959)),
 ('all-mpnet-base-v2 trained', (None, No

In [19]:
dct = dict(tmp)
results = pd.DataFrame(dct)
results.index = ['topic diversity score', 'npmi score', 'cv score']
results.T.sort_index().to_csv('results_my.csv')

In [21]:
results.T.sort_index()

Unnamed: 0,topic diversity score,npmi score,cv score
all-mpnet-base-v2 params,0.857143,0.157361,0.682328
all-mpnet-base-v2 trained,,,
all-mpnet-base-v2 untrained,0.824176,0.177649,0.700082
tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract params,0.863158,0.135493,0.708589
tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract trained,0.816667,0.144857,0.690379
tsdae-BiomedNLP-PubMedBERT-base-uncased-abstract untrained,0.781579,0.128285,0.66799
tsdae-all-distilroberta-v1 params,0.855556,0.153378,0.691994
tsdae-all-distilroberta-v1 trained,1.0,0.068862,0.456881
tsdae-all-distilroberta-v1 untrained,0.801258,0.164915,0.689585
tsdae-declutr-base params,0.915385,0.163543,0.714996


In [None]:
# https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0