as per: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [1]:
import os
import dotenv
import openai
import pandas as pd
import numpy as np

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path handed to `topic_model.save(...)` further down when the fitted model is persisted.
# NOTE(review): the ".ipynb" suffix reads like a notebook file name rather than a model path,
# and "min40" does not match the HDBSCAN `min_cluster_size=10` used in this notebook — confirm.
MODEL_NAME = "slop_min40_openai_sentences.ipynb"

In [3]:
# Read the analysis frame from disk and pull its "text" column out as a plain Python list.
df = pd.read_feather("analysis.feather")
documents = df["text"].tolist()

In [4]:
# Sentence-level mode: split every document into sentences, then flatten the
# per-document lists into a single list of sentences.
# Comment this cell out to keep working at document level.
sentences = list(map(sent_tokenize, documents))
documents = [piece for per_doc in sentences for piece in per_doc]
len(documents)

# Rebuild the frame so that it holds one row per sentence.
df = pd.DataFrame(dict(text=documents))

In [5]:
# use this for the base embeddings
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [6]:
# use this if you want openai embeddings (better performance)
dotenv.load_dotenv()
# The API key must be present in the environment; a missing key raises KeyError here.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding_model = OpenAIBackend(client, "text-embedding-3-large")

# careful !
# no need to compute more than once here
# df["embedding"] = df["text"].apply(
#     lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
#     .data[0]
#     .embedding
# )
# df.to_feather("analysis_with_embeddings_40_sentences.feather")

# Replace the in-memory frame with the cached one that already carries an "embedding" column.
df = pd.read_feather("analysis_with_embeddings_40_sentences.feather")

# `documents` was built from the freshly loaded input, while the embeddings come from a
# cache written earlier. Row i of the cache must describe documents[i]; otherwise every
# downstream topic assignment silently pairs a text with someone else's vector.
cached_texts = df["text"].tolist()
if cached_texts != documents:
    raise ValueError(
        "Cached embeddings do not match `documents` "
        f"({len(cached_texts)} cached rows vs {len(documents)} documents, or differing text). "
        "Recompute the embeddings or load the matching input file."
    )

embeddings = np.array(df["embedding"].tolist())
embeddings.shape

(1756, 3072)

In [7]:
# import asyncio
# import pandas as pd
# from openai import AsyncOpenAI, OpenAIError
# from tqdm.asyncio import tqdm_asyncio
# import nest_asyncio

# # Enable nested event loop support (for Jupyter etc.)
# nest_asyncio.apply()

# client = AsyncOpenAI()

# # Settings
# BATCH_SIZE = 10
# MAX_RETRIES = 3
# RETRY_BACKOFF = 2
# CONCURRENCY_LIMIT = 5  # Maximum concurrent OpenAI requests

# semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# async def embed_batch(texts, attempt=1):
#     async with semaphore:
#         try:
#             response = await client.embeddings.create(
#                 model="text-embedding-3-large",
#                 input=texts
#             )
#             return [item.embedding for item in response.data]

#         except OpenAIError as e:
#             if attempt <= MAX_RETRIES:
#                 wait = RETRY_BACKOFF ** attempt
#                 print(f"[Retrying in {wait}s] Error: {e}")
#                 await asyncio.sleep(wait)
#                 return await embed_batch(texts, attempt + 1)
#             else:
#                 print(f"[Failed after {MAX_RETRIES} attempts] {e}")
#                 return [None] * len(texts)

# async def embed_all(texts):
#     batches = [texts[i:i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]

#     tasks = [embed_batch(batch) for batch in batches]
#     results = []
#     for coro in tqdm_asyncio.gather(*tasks, desc="Embedding", total=len(batches)):
#         batch_result = await coro
#         results.extend(batch_result)
#     return results

# async def run_embedding(df):
#     texts = df["text"].tolist()
#     embeddings = await embed_all(texts)
#     df["embedding"] = embeddings
#     return df

# # Run the whole thing
# df = asyncio.run(run_embedding(df))

In [8]:
# Count rows whose embedding is None, i.e. rows for which no vector was stored.
failed_mask = df["embedding"].map(lambda vector: vector is None)
num_failed = failed_mask.sum()
print(f"Failed embeddings: {num_failed}")

Failed embeddings: 0


In [9]:
from umap import UMAP

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,  # size of the reduced space handed to the clusterer
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed so repeated runs give the same reduction
)

In [10]:
from hdbscan import HDBSCAN

# Clustering model for the reduced embeddings.
# A higher min_cluster_size will generate fewer topics and a lower min_cluster_size will
# generate more topics. (Presumably this plays the role of BERTopic's `min_topic_size`
# when a custom HDBSCAN model is supplied — confirm against the BERTopic docs.)

hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model used to build the topic vocabularies.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,  # ignore terms that occur in fewer than two documents
    ngram_range=(1, 2),  # unigrams and bigrams
)

In [12]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# Representation models: each one produces an alternative description of every topic.

# KeyBERT-inspired keywords
keybert_model = KeyBERTInspired()

# Part-of-speech based keywords, using the small English spaCy pipeline
pos_model = PartOfSpeech("en_core_web_sm")

# Maximal Marginal Relevance keywords
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-generated labels. [DOCUMENTS] and [KEYWORDS] are placeholders inside the prompt.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    prompt=prompt,
    chat=True,
    exponential_backoff=True,
)

# Alternative chat models, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# Bundle every representation under the aspect name it will be stored as.
representation_model = dict(
    KeyBERT=keybert_model,
    OpenAI=openai_model,
    MMR=mmr_model,
    POS=pos_model,
)

In [13]:
from bertopic import BERTopic
from huggingface_hub import DocumentQuestionAnsweringOutputElement
from numpy import ndarray

# Assemble the topic model from the components configured in the cells above.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,  # type: ignore
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# Fit on the sentences together with the embeddings loaded from the cache.
# NOTE(review): passing `embeddings` presumably spares BERTopic from re-embedding
# `documents` through `embedding_model` during fitting — confirm.
topics, probs = topic_model.fit_transform(documents, embeddings)
# topics, probs = topic_model.fit_transform(documents)

2025-06-11 22:55:07,875 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 22:55:46,146 - BERTopic - Dimensionality - Completed ✓
2025-06-11 22:55:46,149 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 22:55:46,250 - BERTopic - Cluster - Completed ✓
2025-06-11 22:55:46,258 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 40/40 [00:20<00:00,  1.98it/s]
2025-06-11 22:57:05,468 - BERTopic - Representation - Completed ✓


In [14]:
# Persist the fitted model under MODEL_NAME using safetensors serialization.
# NOTE(review): MODEL_NAME ends in ".ipynb"; make sure this path cannot collide with an
# actual notebook file of the same name.
topic_model.save(MODEL_NAME, serialization="safetensors")

In [15]:
# Plot the topics, requesting the custom labels.
# NOTE(review): `set_topic_labels` is only called in the following cell, so on a fresh
# top-to-bottom run no custom labels exist yet when this line executes — confirm the
# figure shows the intended labels, or run the labelling cell first.
topic_model.visualize_topics(custom_labels=True)

In [16]:
# Turn the "OpenAI" aspect of each topic into a single label string:
# take the first field of every entry and join the pieces with " | ".
chatgpt_topic_labels = {}
for topic, values in topic_model.topic_aspects_["OpenAI"].items():
    label_parts = list(zip(*values))[0]
    chatgpt_topic_labels[topic] = " | ".join(label_parts)

# Topic -1 gets a fixed, human-readable name.
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,467,-1_slop_ai_internet_content,Outlier Topic,"[slop, ai, internet, content, ai slop, people,...","[ai generated, generative ai, ai tools, social...",[AI-Generated Content and Misinformation],"[slop, content, spam, generated, social media,...","[slop, internet, content, people, spam, image,...",[That information was readily available elsewh...
1,0,179,0_facebook_images_content_ai,AI-Generated Content Monetization,"[facebook, images, content, ai, money, creator...","[social media, tiktok, content creators, ai ge...",[AI-Generated Content Monetization],"[facebook, images, creators, platforms, ai gen...","[images, content, money, creators, platforms, ...",[These pages make money through Facebook’s Per...
2,1,75,1_activision_game_black_steam,Activision's Use of Generative AI in Gaming,"[activision, game, black, steam, sold, assets,...","[activision, microsoft, video games, ai genera...",[Activision's Use of Generative AI in Gaming],"[activision, steam, assets, generative ai, zom...","[game, black, steam, assets, zombie, generativ...",[Activision is embroiled in a backlash over Ca...
3,2,63,2_cola_coca_coca cola_company,Coca-Cola Christmas AI Controversy,"[cola, coca, coca cola, company, ad, holiday, ...","[coca cola, coca, commercials, company said, a...",[Coca-Cola Christmas AI Controversy],"[coca cola, company, christmas, commercials, b...","[cola, company, ad, holiday, commercial, comme...","[- ^ ""Coca-Cola causes controversy with AI-gen..."
4,3,61,3_event_experience_children_factory,Controversial Willy's Chocolate Event,"[event, experience, children, factory, imagina...","[charlie, house illuminati, venue, generated g...",[Controversial Willy's Chocolate Event],"[experience, imagination, arrived, connell, tr...","[event, experience, children, factory, imagina...",[Put on by London-based event company House of...
5,4,61,4_misinformation_fema_reality_people,Misinformation and Conspiracy During Hurricanes,"[misinformation, fema, reality, people, theori...","[misinformation, conspiracy theories, fema, cr...",[Misinformation and Conspiracy During Hurricanes],"[misinformation, fema, reality, theories, hurr...","[misinformation, reality, people, theories, hu...",[FEMA Administrator Deanne Criswell said that ...
6,5,60,5_parade_dublin_halloween_halloween parade,Dublin Halloween Parade Hoax,"[parade, dublin, halloween, halloween parade, ...","[dublin halloween, halloween parade, parade du...",[Dublin Halloween Parade Hoax],"[dublin, halloween, halloween parade, parade d...","[parade, halloween, street, city, people, even...",[Chaos in Dublin as thousands turn up for AI ‘...
7,6,47,6_google_search_results_websites,Google Search AI Integration,"[google, search, results, websites, engine, se...","[results google, search engines, google, searc...",[Google Search AI Integration],"[google, websites, engine, search results, ans...","[search, results, websites, engine, links, pag...","[into its search results on Bing, and it had s..."
8,7,43,7_times_net_eurogamer_york times,Prominent News and Media Outlets,"[times, net, eurogamer, york times, news, atla...","[york times, new york, nbc news, eurogamer, ne...",[Prominent News and Media Outlets],"[eurogamer, york times, news, reporter, verge,...","[news, reporter, globe, field, sports, coverag...","[The New York Times., The New York Times., The..."
9,8,40,8_kiryu_enix_square enix_square,Square Enix's AI Innovation Strategy,"[kiryu, enix, square enix, square, letter, agg...","[square enix, enix president, applying ai, eni...",[Square Enix's AI Innovation Strategy],"[enix, square enix, applying ai, aggressive ap...","[letter, aggressive, new, technologies, develo...","[In turn, Kiryu and Square Enix plan to be, as..."


In [17]:
# List the "CustomName" of every topic, in the row order of the topic-info table.
topic_model.get_topic_info()["CustomName"].tolist()

['Outlier Topic',
 'AI-Generated Content Monetization',
 "Activision's Use of Generative AI in Gaming",
 'Coca-Cola Christmas AI Controversy',
 "Controversial Willy's Chocolate Event",
 'Misinformation and Conspiracy During Hurricanes',
 'Dublin Halloween Parade Hoax',
 'Google Search AI Integration',
 'Prominent News and Media Outlets',
 "Square Enix's AI Innovation Strategy",
 'Event Cancellation and Refund Controversy',
 'AI-Generated Political Fan Imagery',
 'Balancing AI and Human Content',
 'AI-Generated Film Promotion Controversies',
 '2024 November News and Events',
 'Square Enix AI Game Art',
 'AI-Generated Books in Libraries',
 'AI Debate and Implications',
 'Trump Fast Food AIMockery',
 'Noa App Media and Stories',
 'John Milton Freedom Foundation Activities',
 'IGN News and Editing',
 'Discourse on "Slop" and Economy',
 'AI-Generated Halloween Event Misinformation',
 'Document Retrieval Dates 2024',
 'Personal Interactions and Clarifications',
 '2024 November Event Timeline

In [18]:
# Show the topic hierarchy, using the custom labels.
topic_model.visualize_hierarchy(custom_labels=True)

In [19]:
# Topic-to-topic similarity heatmap. Request the custom labels so the axes use the same
# names as the other figures in this notebook instead of the default keyword names.
topic_model.visualize_heatmap(custom_labels=True)

In [26]:
# Look up the five topics returned for the query term "slop" ...
similar_topics, similarity = topic_model.find_topics("slop", top_n=5)

# ... and display the word/score pairs of the first one.
best_match = similar_topics[0]
topic_model.get_topic(best_match)

[('slop', 0.3351277303495191),
 ('slop slop', 0.2620204291913828),
 ('sector', 0.1375047811120037),
 ('perfect example', 0.07887701224512764),
 ('appropriate', 0.07887701224512764),
 ('requires', 0.07887701224512764),
 ('represent', 0.07887701224512764),
 ('concerns', 0.07887701224512764),
 ('disgusting', 0.07887701224512764),
 ('drowning', 0.07887701224512764)]

In [22]:
# `topic_distr` contains the distribution of topics in each document
# (the second return value is discarded).
# NOTE(review): `window=8, stride=4` presumably set the size and step of the sliding
# token window used for the approximation — confirm against the BERTopic docs.
topic_distr, _ = topic_model.approximate_distribution(documents, window=8, stride=4)

100%|██████████| 2/2 [00:00<00:00, 10.84it/s]


In [27]:
# Project the embeddings to 2-D purely for plotting; this is a separate reduction from
# the 5-component UMAP used inside the topic model.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed, as for the pipeline UMAP, so the layout is reproducible
).fit_transform(embeddings)

# Scatter the documents in the 2-D projection, labelled by custom topic name.
topic_model.visualize_documents(
    documents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)

In [24]:
# Sanity check: (number of rows, embedding dimension) of the embedding matrix.
embeddings.shape

(1756, 3072)