as per: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [4]:
import os
import dotenv
import openai
import pandas as pd

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
df = pd.read_feather("analysis.feather")
documents = df["text"].tolist()

In [None]:
# sentences = [sent_tokenize(x) for x in documents]
# sentences = [sentence for doc in sentences for sentence in doc]
# len(sentences)

1756

In [8]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# dotenv.load_dotenv()
# client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# embedding_model = OpenAIBackend(
#     client, "text-embedding-3-large"
# )  # or text-embedding-3-small

Batches: 100%|██████████| 2/2 [00:04<00:00,  2.02s/it]


In [9]:
from umap import UMAP

umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
)

In [14]:
from hdbscan import HDBSCAN

# A higher min_topic_size will generate fewer topics and a lower min_topic_size will generate more topics.

hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [16]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
)

# also consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model,
}

In [17]:
from bertopic import BERTopic
from huggingface_hub import DocumentQuestionAnsweringOutputElement

topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

topics, probs = topic_model.fit_transform(documents, embeddings)

2025-06-11 21:18:33,742 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 21:18:33,872 - BERTopic - Dimensionality - Completed ✓
2025-06-11 21:18:33,873 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 21:18:33,889 - BERTopic - Cluster - Completed ✓
2025-06-11 21:18:33,913 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 3/3 [00:03<00:00,  1.18s/it]
2025-06-11 21:18:57,300 - BERTopic - Representation - Completed ✓


In [18]:
topic_model.save("slop_min5_minilm", serialization="safetensors")

In [None]:
chatgpt_topic_labels = {
    topic: " | ".join(list(zip(*values))[0])
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

In [21]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,18,-1_ai_people_parade_trump,Outlier Topic,"[ai, people, parade, trump, event, halloween, ...","[ai generated, halloween parade, ai, ai slop, ...",[AI-Generated Content and Misinformation],"[ai, trump, event, coca cola, dublin, like, ha...","[people, parade, event, halloween, company, we...",[The MAGA Aesthetic Is AI Slop\nFar-right infl...
1,0,15,0_ai_slop_generated_content,AI-Generated Content and Slop,"[ai, slop, generated, content, ai generated, 2...","[ai slop, ai content, ai generated, ai, artifi...",[AI-Generated Content and Slop],"[ai, content, ai generated, facebook, ai slop,...","[slop, content, images, like, media, people, i...",[This piece is part of a series on the great i...
2,1,8,1_game_ai_enix_square enix,Square Enix AI Game Development,"[game, ai, enix, square enix, square, developm...","[ai generated, square enix, ai, use ai, enix, ...",[Square Enix AI Game Development],"[ai, enix, square enix, voice, activision, gam...","[game, development, new, voice, year, use, gam...","[Square Enix ""dabbled"" with AI technology for ..."


In [23]:
topic_model.visualize_topics(custom_labels=True)

ValueError: zero-size array to reduction operation maximum which has no identity

In [24]:
topic_model.visualize_hierarchy(custom_labels=True)

In [25]:
topic_model.visualize_heatmap()

In [26]:
similar_topics, similarity = topic_model.find_topics("slop", top_n=5)
topic_model.get_topic(similar_topics[1])

[('ai', 0.0582049322667895),
 ('people', 0.037751107217782943),
 ('parade', 0.032528672913983925),
 ('trump', 0.029896528103374054),
 ('event', 0.029834557565890266),
 ('halloween', 0.029566025093705216),
 ('said', 0.028689933852323813),
 ('cola', 0.028379213294383424),
 ('coca cola', 0.02778019835602726),
 ('coca', 0.02778019835602726)]

In [27]:
topic_model.get_topic(1, full=True)

{'Main': [('game', 0.10269207793410494),
  ('ai', 0.09151255531699912),
  ('enix', 0.05372803455140463),
  ('square enix', 0.05372803455140463),
  ('square', 0.05340761664646968),
  ('development', 0.0456267807880588),
  ('foamstars', 0.045259665957552805),
  ('new', 0.03952167200480227),
  ('voice', 0.039311195182210094),
  ('kiryu', 0.03377463292982146)],
 'KeyBERT': [('ai generated', 0.5567626),
  ('square enix', 0.5369373),
  ('ai', 0.50949705),
  ('use ai', 0.49716032),
  ('enix', 0.4778438),
  ('applying ai', 0.44699562),
  ('ai cutting', 0.42207813),
  ('content development', 0.3843106),
  ('generative ai', 0.36696768),
  ('artwork', 0.3465132)],
 'OpenAI': [('Square Enix AI Game Development', 1)],
 'MMR': [('ai', 0.09151255531699912),
  ('enix', 0.05372803455140463),
  ('square enix', 0.05372803455140463),
  ('voice', 0.039311195182210094),
  ('activision', 0.03134135348831937),
  ('games', 0.030162534676338636),
  ('create', 0.028133841097317057),
  ('letter', 0.02398654654451

In [29]:
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(documents, window=8, stride=4)

100%|██████████| 1/1 [00:00<00:00,  4.06it/s]


In [31]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
).fit_transform(embeddings)
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
topic_model.visualize_documents(
    df['title'].tolist(), reduced_embeddings=reduced_embeddings, custom_labels=True
)

In [32]:
# We can also hide the annotation to have a more clear overview of the topics
topic_model.visualize_documents(
    df['title'].tolist(),
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)