In [None]:
import os
# Point the numba, matplotlib and Hugging Face cache directories at /tmp.
# This is done before the third-party imports below, presumably so those
# libraries see the values when they are first imported -- keep this ordering.
os.environ.update({
    'NUMBA_CACHE_DIR': '/tmp/numba_cache',
    'MPLCONFIGDIR': '/tmp/matplotlib_cache',
    'HF_HOME': '/tmp/huggingface_cache',
})

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
from sentence_transformers import SentenceTransformer

In [None]:
# Fetch every split of 20 Newsgroups with headers, footers and quoted replies
# removed, keeping only the raw text of each post.
docs = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)['data']

# Reports the size of the full corpus, i.e. before the truncation below.
print(f'length: {len(docs)}')

# Work on the first 9000 posts only.
docs = docs[:9000]

In [None]:
# Fit a BERTopic model on the document subset and keep the per-document topic
# assignments (`topics`) and the accompanying `probs` output.
#
# BERTopic's dimensionality-reduction step (UMAP) is stochastic, so an unseeded
# model yields different topics on every "Restart & Run All". Passing a UMAP
# instance with a fixed random_state makes the run repeatable. The remaining
# UMAP settings follow the configuration shown in BERTopic's reproducibility
# guidance -- NOTE(review): confirm they match the installed version's defaults.
topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,  # fixed seed for reproducible topics
    ),
)
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Display the first document in `docs` to see what the cleaned text looks like.
docs[0]

In [None]:
# Display the fitted model's per-document information for the same `docs`
# that were passed to it above.
topic_model.get_document_info(docs)

In [None]:
# Render the fitted model's topic visualization with its default settings.
topic_model.visualize_topics()

In [None]:
# Render the fitted model's heatmap visualization.
# NOTE(review): n_clusters=0 is an unusual value -- presumably it is treated
# like "no clustering of the matrix"; confirm against the installed BERTopic
# version, or drop the argument if the default is what is intended.
topic_model.visualize_heatmap(n_clusters=0)

In [None]:
# Prepare embeddings: encode every document once so the same vectors feed both
# the topic model and the 2-D projection below.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic on the precomputed embeddings (this rebinds `topic_model`,
# replacing the model fitted earlier in the notebook).
# UMAP is stochastic; a fixed random_state keeps the discovered topics stable
# across "Restart & Run All". The other UMAP settings follow the configuration
# shown in BERTopic's reproducibility guidance -- NOTE(review): confirm they
# match the installed version's defaults.
topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,  # fixed seed for reproducible topics
    ),
).fit(docs, embeddings)

# Reduce dimensionality: project the embeddings to 2 components for the
# document plots. Seeded as well, so the layout does not change between runs.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed for a reproducible 2-D layout
).fit_transform(embeddings)

In [None]:
# Plot the documents using the precomputed 2-D `reduced_embeddings` instead of
# letting the call derive its own projection.
# hide_document_hover=True is passed, presumably to omit per-document hover
# text from the figure -- confirm against the BERTopic docs.
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True)

In [None]:
# Draw the document datamap, reusing the precomputed 2-D `reduced_embeddings`
# so this figure is built from the same projection as the document plot above.
topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)