In [1]:
!pip install -q -U transformers sentence-transformers pylatexenc contextualized_topic_models optuna octis

[K     |████████████████████████████████| 4.2 MB 5.4 MB/s 
[K     |████████████████████████████████| 79 kB 7.4 MB/s 
[K     |████████████████████████████████| 162 kB 45.0 MB/s 
[K     |████████████████████████████████| 308 kB 35.3 MB/s 
[K     |████████████████████████████████| 129 kB 49.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 34.2 MB/s 
[K     |████████████████████████████████| 596 kB 46.6 MB/s 
[K     |████████████████████████████████| 84 kB 3.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 45.9 MB/s 
[K     |████████████████████████████████| 121 kB 47.6 MB/s 
[K     |████████████████████████████████| 783 kB 41.9 MB/s 
[K     |████████████████████████████████| 24.1 MB 9.6 MB/s 
[K     |████████████████████████████████| 366 kB 42.2 MB/s 
[K     |████████████████████████████████| 381 kB 10.4 MB/s 
[K     |████████████████████████████████| 1.4 MB 41.0 MB/s 
[K     |████████████████████████████████| 1.6 MB 34.7 MB/s 
[K     |████████████████████

In [2]:
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, davies_bouldin_score
import optuna
from numpy.random import default_rng
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import optuna

from pylatexenc.latex2text import LatexNodes2Text

In [3]:
# Mount Google Drive at /content/drive (Colab only); the input CSV is read
# from 'My Drive' under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Register tqdm's pandas integration so `progress_apply` is available on Series.
tqdm.pandas()

# Number of documents drawn from the full corpus; defined once so the sample
# and the split labels cannot drift apart.
N_SAMPLES = 15000

docs_df = pd.read_csv('/content/drive/My Drive/to_use.csv')
df = docs_df.sample(N_SAMPLES, random_state=42)

# Assign every sampled row to train/val/test uniformly at random (seeded).
rng = default_rng(49)
df['type_set'] = rng.choice(np.array(['train', 'val', 'test']), len(df))

  


In [5]:
# Fetch the NLTK resources the cleaning step relies on: the English stop-word
# list, the punkt tokenizer models and the WordNet corpus.
# NOTE(review): newer NLTK releases may need extra resources for the tokenizer
# or lemmatizer — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
def clean_abstract(txt, stp_wrds):
    """Normalise one abstract into a space-separated string of content tokens.

    Steps: lower-case, convert LaTeX markup to plain text (best effort),
    tokenize, lemmatize, strip backslashes from every token, then drop
    stop words and punctuation.

    Args:
        txt: Raw abstract text.
        stp_wrds: Iterable of (lower-case) stop words to remove.

    Returns:
        The surviving tokens joined by single spaces.
    """
    txt = txt.lower()
    try:
        txt = LatexNodes2Text().latex_to_text(txt)
    except Exception:
        # Best effort: if the LaTeX cannot be parsed, keep the lower-cased raw
        # text. `Exception` (not a bare except) lets Ctrl-C still interrupt.
        pass

    lemmatizer = WordNetLemmatizer()
    blocked = set(stp_wrds)  # O(1) membership instead of scanning a list per token

    kept = []
    for token in word_tokenize(txt, language="english"):
        # str.replace returns a new string; the result must be re-bound, otherwise
        # leftover LaTeX backslashes survive in the output.
        token = lemmatizer.lemmatize(token).replace('\\', '')
        # `in string.punctuation` is a substring test, so it also rejects the
        # empty string left behind when a token consisted only of backslashes.
        if token not in blocked and token not in string.punctuation:
            kept.append(token)
    return ' '.join(kept)


# Corpus-specific filler terms removed in addition to NLTK's English stop words.
extra_stop_words = [
    'we', 'paper', 'new', 'article', "''", "``", "”",
    'et', 'al', 'study', 'state', 'of', 'the', 'art',
]
stop_words = nltk.corpus.stopwords.words("english") + extra_stop_words

# Clean every abstract (with a progress bar) and overwrite the column in place;
# all later cells read the cleaned text from df['abstract'].
cleaned_abstract = df['abstract'].progress_apply(clean_abstract, args=[stop_words])
df['abstract'] = cleaned_abstract

  0%|          | 0/15000 [00:00<?, ?it/s]

In [7]:
# Materialise the cleaned abstracts of each split as plain Python lists,
# selecting rows by their 'type_set' label.
train, val, test = (
    df.loc[df['type_set'] == split_name, 'abstract'].tolist()
    for split_name in ('train', 'val', 'test')
)

In [13]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, CoherenceCV, InvertedRBO, TopicDiversity


def compute_metrics(train_model, textss, top_k=10):
    """Score a fitted topic model on topic diversity and coherence.

    Args:
        train_model: Fitted model exposing ``get_topic_lists(k)``.
        textss: Iterable of documents as whitespace-separated strings; used as
            the reference corpus for the coherence measures.
        top_k: Number of top words per topic used for the NPMI and C_V
            coherence scores (default 10).

    Returns:
        Tuple ``(topic_diversity, npmi, cv)``. Topic diversity is always
        computed over the top 25 words per topic.
    """
    tokenized_docs = [doc.split() for doc in textss]

    # Both coherence measures are evaluated on the same top-k word lists.
    coherence_topics = train_model.get_topic_lists(top_k)
    diversity_topics = train_model.get_topic_lists(25)

    topic_diversity_score = TopicDiversity(topics=diversity_topics).score()
    # Pass topk explicitly: score() otherwise falls back to its own default and
    # the top_k argument of this function would have no effect.
    npmi_score = CoherenceNPMI(texts=tokenized_docs, topics=coherence_topics).score(topk=top_k)
    cv_score = CoherenceCV(texts=tokenized_docs, topics=coherence_topics).score(topk=top_k)

    return topic_diversity_score, npmi_score, cv_score

In [15]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation


# Prepare the training split: the same cleaned text feeds both the contextual
# embedding model and the bag-of-words representation.
tp = TopicModelDataPreparation('sentence-transformers/all-mpnet-base-v2')
training_dataset = tp.fit(
    text_for_contextual=train,
    text_for_bow=train,
)

# Baseline CombinedTM with fixed hyper-parameters, trained before any tuning.
# NOTE(review): contextual_size=768 presumably matches the embedding width of
# the sentence-transformers model above — confirm if the model is changed.
ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=50,
    num_epochs=5,
)
ctm.fit(training_dataset)

In [14]:
# Baseline scores on the training texts: (topic diversity, NPMI, C_V).
compute_metrics(ctm, train)

(0.5008, -0.03513024593027903, 0.504954223950875)

In [16]:
def objective(trial):
    """Optuna objective: train a CombinedTM and return its NPMI coherence.

    Samples ``num_epochs`` (5-15) and ``n_components`` (10-150), both on a log
    scale, fits a CombinedTM on the module-level ``training_dataset`` and
    scores it on the module-level ``train`` texts.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The NPMI score (second element of ``compute_metrics``), or a large
        negative sentinel when the metrics cannot be computed so that the
        study continues and ranks the trial last.
    """
    # NPMI lies in [-1, 1], so this value is worse than any real score.
    failed_trial_score = -49

    # Release cached GPU memory left over from the previous trial's model.
    torch.cuda.empty_cache()

    epochs = trial.suggest_int("num_epochs", 5, 15, log=True)
    n_comp = trial.suggest_int("n_components", 10, 150, log=True)
    ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=n_comp, num_epochs=epochs)
    ctm.fit(training_dataset)
    try:
        return compute_metrics(ctm, train)[1]
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C can still stop the
        # study, and report the failure instead of hiding it.
        print(f"Trial {trial.number}: metric computation failed ({err!r}); "
              f"returning {failed_trial_score}")
        return failed_trial_score

In [17]:
# Maximise the value returned by `objective` over 25 trials, run one at a time.
# NOTE(review): no sampler seed is passed, so the search is presumably not
# reproducible from run to run — confirm whether that matters here.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, n_jobs=1)

[32m[I 2022-05-16 10:34:02,628][0m A new study created in memory with name: no-name-26f7680a-3d5f-4b03-a031-5fb0bb4e393f[0m
Epoch: [13/13]	 Seen Samples: [65533/65533]	Train Loss: 915.5513825691827	Time: 0:00:03.523247: : 13it [00:51,  3.93s/it]
Sampling: [20/20]: : 20it [00:56,  2.80s/it]
[32m[I 2022-05-16 10:36:02,756][0m Trial 0 finished with value: 0.0023599992079648183 and parameters: {'num_epochs': 13, 'n_components': 26}. Best is trial 0 with value: 0.0023599992079648183.[0m
Epoch: [14/14]	 Seen Samples: [70574/70574]	Train Loss: 911.5462767804007	Time: 0:00:03.639111: : 14it [00:50,  3.61s/it]
Sampling: [20/20]: : 20it [00:57,  2.85s/it]
[32m[I 2022-05-16 10:38:04,813][0m Trial 1 finished with value: 0.01051321646426849 and parameters: {'num_epochs': 14, 'n_components': 28}. Best is trial 1 with value: 0.01051321646426849.[0m
Epoch: [9/9]	 Seen Samples: [45369/45369]	Train Loss: 924.9398440599583	Time: 0:00:03.578174: : 9it [00:34,  3.78s/it]
Sampling: [20/20]: : 20it 

In [18]:
# Hyper-parameters of the best trial; these are unpacked into CombinedTM when
# the final model is built.
study.best_trial.params

{'n_components': 50, 'num_epochs': 15}

In [19]:
# Final model: refit the data preparation and a CombinedTM on every sampled
# abstract (train, val and test rows together) with the best hyper-parameters
# found by the study.
# NOTE(review): this rebinds `tp`, which `objective` reads; re-running the study
# after this cell would combine the new vocabulary size with the earlier
# `training_dataset` — confirm that is never done.
all_abstracts = df['abstract'].tolist()

tp = TopicModelDataPreparation('sentence-transformers/all-mpnet-base-v2')
test_dataset = tp.fit(
    text_for_contextual=all_abstracts,
    text_for_bow=all_abstracts,
)

ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    **study.best_trial.params,
)
ctm.fit(test_dataset)



Batches:   0%|          | 0/75 [00:00<?, ?it/s]

Epoch: [15/15]	 Seen Samples: [225000/225000]	Train Loss: 956.7072411458333	Time: 0:00:14.156317: : 15it [03:34, 14.30s/it]
Sampling: [20/20]: : 20it [03:15,  9.79s/it]


In [20]:
# Score the final model on the same texts it was fitted on (all sampled
# abstracts): (topic diversity, NPMI, C_V).
compute_metrics(ctm, df['abstract'].tolist())

(0.6472, 0.04274921193542138, 0.6143327177688397)