In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()  # noqa
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from python.cogtext.pubmed.preprocessing import PubMedPreprocessor

In [14]:
# Data: build the working PubMed sample in a single pipeline.
#   1. read the compressed abstracts table and drop rows without an abstract;
#   2. reset_index() renumbers the rows 0..n-1 (the old row labels are kept as
#      a column) -- the embeddings cell later selects rows of the stored
#      embedding matrix with PUBMED.index, so these labels presumably have to
#      match the row order of that matrix (TODO confirm);
#   3. draw a reproducible 5% random sample (random_state=0);
#   4. apply the project preprocessors (by their names: journal selection and
#      removal of short abstracts -- see PubMedPreprocessor for the criteria).
PUBMED = (
    pd.read_csv('data/pubmed/abstracts.csv.gz')
    .dropna(subset=['abstract'])
    .reset_index()
    .sample(frac=0.05, random_state=0)
    .pipe(PubMedPreprocessor.select_relevant_journals)
    .pipe(PubMedPreprocessor.remove_short_abstracts)
)

In [18]:
# Inputs for the topic model: the abstract texts and the subcategory of each
# abstract encoded as integer category codes.
X = PUBMED['abstract'].values
y = PUBMED['subcategory'].astype('category').cat.codes

# Sentence-embedding model handed to BERTopic in the next cell.
# NOTE(review): the precomputed embeddings loaded below live under a
# "universal-sentence-encoder-v4" folder, while this model is
# 'all-distilroberta-v1'. If BERTopic uses this model for anything besides the
# supplied embeddings, the two vector spaces will not match -- confirm the
# intended pairing.
doc_embedding_model = SentenceTransformer('all-distilroberta-v1')

# Pretrained document embeddings (stored as the first unnamed array of the archive).
embeddings_file = 'models/universal-sentence-encoder-v4/abstracts_embeddings.npz'
# embeddings_file = 'models/all-MiniLM-L6-v2/abstracts_embeddings.npz'

# np.load() on an .npz returns a lazily-read archive that holds an open file
# handle; the context manager closes it once the array has been read into memory.
with np.load(embeddings_file) as embeddings_archive:
    doc_embeddings = embeddings_archive['arr_0']

# Keep only the rows belonging to the sampled/filtered abstracts.
# Assumes row i of the stored matrix corresponds to index label i of PUBMED
# -- TODO confirm against the script that produced the embeddings.
doc_embeddings = doc_embeddings[PUBMED.index]

# or retrain the document embedding model from scratch
# doc_embeddings = doc_embedding_model.encode(X, show_progress_bar=True)

In [19]:

# UMAP: reduces the document embeddings to 2 dimensions before clustering.
# random_state is pinned (same seed as the sampling and the label-level UMAP
# elsewhere in this notebook) so that a fresh "Restart & Run All" reproduces
# the same projection and therefore the same topics. Note that a fixed seed
# may make UMAP slower, since it limits parallelism.
umap_model = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=0
)

# HDBSCAN: clusters the reduced embeddings; clusters need at least 5 documents.
# prediction_data=True is presumably needed for the per-topic probabilities
# requested from BERTopic below (calculate_probabilities=True) -- TODO confirm.
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    prediction_data=True
)

# Term extraction for the topic descriptions: uni- to tri-grams, English stop
# words removed, terms must have a document frequency of at least 3.
# max_df is given as an absolute count: (fraction * number of abstracts); with
# the fraction at 1.0 the cap equals the number of abstracts.
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words='english',
                                   min_df=3, max_df=int(X.shape[0] * 1.0))

# BERTopic: wires the components together; nr_topics='auto' asks BERTopic to
# reduce the number of topics automatically after clustering.
model = BERTopic(calculate_probabilities=True,
                 nr_topics='auto',
                 embedding_model=doc_embedding_model,
                 umap_model=umap_model,
                 hdbscan_model=hdbscan_model,
                 vectorizer_model=vectorizer_model,
                 verbose=True)

# Fit the topic model on the precomputed embeddings. The subcategory codes `y`
# are forwarded to BERTopic as supervision labels.
# `topics` is the topic assigned to each abstract; `scores` holds the
# probabilities returned by BERTopic (expected to be documents x topics with
# calculate_probabilities=True -- TODO confirm for the installed version).
topics, scores = model.fit_transform(
    documents=X, y=y, embeddings=doc_embeddings
)

model.get_topic_info()

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2021-11-01 19:40:06,925 - BERTopic - Reduced dimensionality with UMAP
2021-11-01 19:43:53,211 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-11-01 19:49:23,202 - BERTopic - Reduced number of topics from 476 to 293


Unnamed: 0,Topic,Count,Name
0,-1,5102,-1_schizophrenia_stimuli_depression_information
1,0,735,0_hyperactivity_disorder adhd_hyperactivity di...
2,1,317,1_mutations_sequencing_mutation_exome sequencing
3,2,254,2_speech_language_bilingual_languages
4,3,167,3_dementia_alzheimer_alzheimer disease_ad pati...
...,...,...,...
276,288,5,288_patients pediatric ms_aβ levels_pediatric ...
275,289,5,289_information flow_level language_caucasian_...
274,290,5,290_verbal figural fluency_verbal figural_figu...
273,282,5,282_sentence comprehension_sentence_psychosis_...


In [1]:
# Depends on `scores` and `PUBMED` from the cells above (and on the imports
# cell); run those first in the same kernel session.

# Mean topic-probability profile per subcategory: one row per label, one
# column per column of `scores`.
scores_df = (
    pd.DataFrame(scores, index=PUBMED.index)
    .assign(label=PUBMED['subcategory'])
    .groupby('label')
    .mean()
)

# Project the label profiles to 2-D for plotting (seeded for reproducibility).
# PCA(n_components=2, random_state=0) is a linear alternative to UMAP here.
projected_scores = pd.DataFrame(
    UMAP(n_components=2, random_state=0).fit_transform(scores_df),
    index=scores_df.index,
)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# No `palette` argument: without a `hue` variable seaborn ignores it.
sns.scatterplot(data=projected_scores, x=0, y=1, ax=ax, s=200)
ax.set(xlabel='UMAP dimension 1', ylabel='UMAP dimension 2')

# Annotate every point with its (possibly shortened) label.
# - The loop variables are deliberately not called `x`/`y`, so the label
#   vector `y` used by the model-fitting cell is not overwritten.
# - A label is shortened only when it is longer than the kept prefix, so "..."
#   always stands for characters that were actually removed.
# - The small vertical jitter comes from a seeded generator, so label
#   positions are identical between runs.
max_label_chars = 12
jitter_rng = np.random.default_rng(0)

for label, x_pos, y_pos in projected_scores.itertuples():
    if len(label) > max_label_chars:
        label = f'{label[:max_label_chars]}...'
    ax.text(x_pos + 0.01, y_pos - jitter_rng.random() * .01, f'{label}',
            alpha=0.5, fontsize=10)

fig.suptitle('2-D projection of the topic embeddings.')
plt.show()

NameError: name 'pd' is not defined