# Topic Modeling
This notebook aims to map vectorized documents to a topic space. The topic space reflects the underlying structure of the documents.

In [81]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
import pandas as pd
from python.cogtext.datasets.pubmed import PubMedDataLoader
from python.cogtext.topic_model import TopicModel

In [None]:
# Disabled alternative inputs, kept for reference:
#   PUBMED = PubMedDataLoader(preprocessed=False, drop_low_occurred_labels=False).load()
#   EMBEDDINGS = np.load('models/embeddings/abstracts_UMAPv4.npz')['arr_0']

# Each array is read from the 'arr_0' entry of its .npz archive; the `with`
# block closes the archive as soon as the array has been extracted.
with np.load('models/embeddings/abstracts_all-MiniLM-L6-v2.npz') as embeddings_npz:
    embeddings = embeddings_npz['arr_0']

# Table that later receives the `cluster` column; the first CSV column is
# used as the index.
documents = pd.read_csv(
    'models/cogtext/abstracts_clusters.csv.gz',
    index_col=0,
)

# Presumably a 5-dimensional UMAP projection of the same abstracts (going by
# the file name) -- TODO confirm it is row-aligned with `embeddings`.
with np.load('models/embeddings/abstracts_UMAP5d.npz') as reduced_npz:
    reduced_embeddings = reduced_npz['arr_0']

In [None]:
# Fit the topic model on the full embeddings, passing the already-reduced
# embeddings through the `umap_embeddings` keyword.
model = TopicModel(parametric_umap=False, verbose=True)
clusters, weights = model.fit_transform(
    embeddings,
    umap_embeddings=reduced_embeddings,
)

# Rows are NOT removed: documents whose label is negative (the "-1" cluster)
# get NaN, every other label is shifted by one so cluster ids start at 1.
has_cluster = clusters >= 0
one_indexed = clusters + 1
documents['cluster'] = np.where(has_cluster, one_indexed, np.nan)

# Persist the results. Note that the CSV written here is the same file the
# `documents` table was loaded from, so it is overwritten in place.
documents.to_csv(
    'models/cogtext/abstracts_clusters.csv.gz',
    index=True,
)
np.savez(
    'models/embeddings/abstracts_cluster-weights.npz',
    weights,
)

# Cluster sizes as the cell output; value_counts() skips NaN by default, so
# documents without a cluster are not listed.
cluster_sizes = documents['cluster'].value_counts()
cluster_sizes

In [93]:
%reload_ext watermark
%watermark
%watermark -iv -p umap,hdbscan,joblib,numpy,numba,pytorch,tensorflow,python.cogtext

Last updated: 2021-12-05T12:41:20.252167+01:00

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 7.28.0

Compiler    : Clang 11.1.0 
OS          : Darwin
Release     : 21.1.0
Machine     : x86_64
Processor   : i386
CPU cores   : 12
Architecture: 64bit

umap          : 0.5.2
hdbscan       : 0.8.27
joblib        : 1.1.0
numpy         : 1.20.3
numba         : 0.54.1
pytorch       : not installed
tensorflow    : 2.7.0
python.cogtext: 0.1.2021120512

sys   : 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:23:19) 
[Clang 11.1.0 ]
pandas: 1.3.4
numpy : 1.20.3

