Adapted from this Colab notebook: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [28]:
import os
import dotenv
import openai
import pandas as pd
import numpy as np

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [29]:
# Path handed to `topic_model.save(...)` further down in this notebook.
# NOTE(review): the value ends in ".ipynb" although it names a saved model, not a
# notebook — looks like a leftover from the notebook filename; confirm before renaming,
# since anything that loads the model relies on this exact path.
MODEL_NAME = "slop_min15_df1_openai_sentences.ipynb"

In [30]:
# Read the corpus from disk and pull the "text" column out as a plain Python list.
df = pd.read_feather("analysis.feather")
documents = df.loc[:, "text"].to_list()

In [31]:
# Sentence-level mode: tokenize every document into sentences, then flatten the
# per-document lists into one list. Comment this cell out for document-level modelling.
sentences = list(map(sent_tokenize, documents))
documents = [piece for pieces in sentences for piece in pieces]
len(documents)  # bare expression; not displayed because it is not the cell's last statement

# Rebuild the frame so that it holds one row per entry of `documents`.
df = pd.DataFrame({"text": documents})

In [32]:
# use this for the base embeddings
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [33]:
# use this if you want openai embeddings (better performance)
dotenv.load_dotenv()
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding_model = OpenAIBackend(client, "text-embedding-3-large")

# careful !
# no need to compute more than once here
# df["embedding"] = df["text"].apply(
#     lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
#     .data[0]
#     .embedding
# )
# df.to_feather("analysis_with_embeddings_40_sentences.feather")

# Load the cached frame (text + precomputed embedding per row). This replaces `df`.
df = pd.read_feather("analysis_with_embeddings_40_sentences.feather")

# `documents` was built earlier from analysis.feather, while the embeddings come from
# the cache. BERTopic only verifies that their lengths agree, so a stale cache would
# silently pair texts with the wrong vectors. Fail loudly instead.
cached_texts = df["text"].tolist()
if cached_texts != documents:
    raise ValueError(
        "Cached embeddings do not match the current documents "
        f"({len(cached_texts)} cached texts vs {len(documents)} documents). "
        "Recompute the embeddings and rewrite "
        "'analysis_with_embeddings_40_sentences.feather'."
    )

# Stack the per-row vectors into a single 2-D array (one row per document).
embeddings = np.array(df["embedding"].tolist())
embeddings.shape

(1756, 3072)

In [34]:
# import asyncio
# import pandas as pd
# from openai import AsyncOpenAI, OpenAIError
# from tqdm.asyncio import tqdm_asyncio
# import nest_asyncio

# # Enable nested event loop support (for Jupyter etc.)
# nest_asyncio.apply()

# client = AsyncOpenAI()

# # Settings
# BATCH_SIZE = 10
# MAX_RETRIES = 3
# RETRY_BACKOFF = 2
# CONCURRENCY_LIMIT = 5  # Maximum concurrent OpenAI requests

# semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# async def embed_batch(texts, attempt=1):
#     async with semaphore:
#         try:
#             response = await client.embeddings.create(
#                 model="text-embedding-3-large",
#                 input=texts
#             )
#             return [item.embedding for item in response.data]

#         except OpenAIError as e:
#             if attempt <= MAX_RETRIES:
#                 wait = RETRY_BACKOFF ** attempt
#                 print(f"[Retrying in {wait}s] Error: {e}")
#                 await asyncio.sleep(wait)
#                 return await embed_batch(texts, attempt + 1)
#             else:
#                 print(f"[Failed after {MAX_RETRIES} attempts] {e}")
#                 return [None] * len(texts)

# async def embed_all(texts):
#     batches = [texts[i:i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]

#     tasks = [embed_batch(batch) for batch in batches]
#     results = []
#     for coro in tqdm_asyncio.gather(*tasks, desc="Embedding", total=len(batches)):
#         batch_result = await coro
#         results.extend(batch_result)
#     return results

# async def run_embedding(df):
#     texts = df["text"].tolist()
#     embeddings = await embed_all(texts)
#     df["embedding"] = embeddings
#     return df

# # Run the whole thing
# df = asyncio.run(run_embedding(df))

In [35]:
# Flag rows whose embedding is None, then report how many there are.
missing_mask = df["embedding"].map(lambda vector: vector is None)
num_failed = missing_mask.sum()
print(f"Failed embeddings: {num_failed}")

Failed embeddings: 0


In [36]:
from umap import UMAP

# Dimensionality-reduction step handed to BERTopic: project the embeddings down to
# 5 components using cosine distance. `random_state` is fixed so reruns are repeatable.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

In [37]:
from hdbscan import HDBSCAN

# Clustering step handed to BERTopic. `min_cluster_size` plays the role of BERTopic's
# `min_topic_size` here: raising it yields fewer topics, lowering it yields more.
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=15,
    cluster_selection_method="eom",
    prediction_data=True,
)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words step handed to BERTopic: English stop words removed, unigrams and
# bigrams kept, and no minimum document frequency beyond a single occurrence.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    stop_words="english",
)

In [39]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# Prompt used by the GPT-based labeller. [DOCUMENTS] and [KEYWORDS] are the
# placeholders BERTopic substitutes for each topic.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# Keyword-style representations: KeyBERT, part-of-speech filtering, and MMR.
keybert_model = KeyBERTInspired()
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT labeller (chat endpoint, with exponential backoff enabled).
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    prompt=prompt,
    chat=True,
    exponential_backoff=True,
)

# also consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# Every representation is computed; each one is exposed under its key as a topic aspect.
representation_model = dict(
    KeyBERT=keybert_model,
    OpenAI=openai_model,
    MMR=mmr_model,
    POS=pos_model,
)

In [40]:
from bertopic import BERTopic
from numpy import ndarray

# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,  # type: ignore
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# Fit on the precomputed embeddings so the documents are not embedded again.
# `topics` holds the assigned topic per document; `probs` is the second value
# returned by `fit_transform`.
topics, probs = topic_model.fit_transform(documents, embeddings)
# topics, probs = topic_model.fit_transform(documents)

2025-06-11 23:02:43,880 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 23:03:06,141 - BERTopic - Dimensionality - Completed ✓
2025-06-11 23:03:06,144 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 23:03:06,231 - BERTopic - Cluster - Completed ✓
2025-06-11 23:03:06,238 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 19/19 [00:08<00:00,  2.13it/s]
2025-06-11 23:03:46,457 - BERTopic - Representation - Completed ✓


In [41]:
# Persist the fitted model at the MODEL_NAME path using safetensors serialization.
# NOTE(review): this runs before `set_topic_labels` is called further down, so the
# saved model presumably does not include the custom labels — confirm that is intended.
topic_model.save(MODEL_NAME, serialization="safetensors")

In [42]:
# `custom_labels=True` only takes effect once labels have been registered through
# `set_topic_labels`. On a fresh top-to-bottom run nothing has registered them yet at
# this point, so the plot would silently fall back to the default topic names.
# Register the OpenAI-generated labels here when none are set (idempotent otherwise).
if topic_model.custom_labels_ is None:
    topic_model.set_topic_labels(
        {
            **{
                topic_id: " | ".join(label for label, score in pairs)
                for topic_id, pairs in topic_model.topic_aspects_["OpenAI"].items()
            },
            -1: "Outlier Topic",
        }
    )

# Intertopic distance map, annotated with the custom labels.
topic_model.visualize_topics(custom_labels=True)

In [43]:
# Build one display label per topic from its "OpenAI" aspect: take the first element
# of every entry and join them with " | ".
openai_aspects = topic_model.topic_aspects_["OpenAI"]
chatgpt_topic_labels = dict(
    (topic_id, " | ".join(tuple(zip(*pairs))[0]))
    for topic_id, pairs in openai_aspects.items()
)

# Topic -1 gets a fixed, human-readable name instead of a generated one.
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

# Overview table; the custom labels appear in the CustomName column.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,191,-1_voice_ai_spotify_wrapped,Outlier Topic,"[voice, ai, spotify, wrapped, game, using, use...","[ai generated, using ai, generative ai, spotif...",[AI Impact on Voice Actors],"[voice, spotify, actors, studios, noa, voice a...","[voice, game, users, company, event, actors, s...","[Similarly, Spotify’s new Wrapped AI podcast f..."
1,0,923,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh..."
2,1,91,1_parade_halloween_dublin_halloween parade,Dublin Halloween Parade Hoax,"[parade, halloween, dublin, halloween parade, ...","[halloween parade, parade dublin, dublin hallo...",[Dublin Halloween Parade Hoax],"[parade, halloween parade, connell street, mys...","[parade, halloween, street, website, people, c...","[People waiting for a halloween parade., “All ..."
3,2,66,2_activision_game_duty_ai,Activision AI-Generated Game Content,"[activision, game, duty, ai, black, black ops,...","[activision ai, activision, activision sold, g...",[Activision AI-Generated Game Content],"[activision, black ops, steam, generative ai, ...","[game, black, loading, generative, steam, bund...",[Activision states on the game's product page ...
4,3,60,3_cola_coca cola_coca_company,Coca-Cola's AI Holiday Campaign,"[cola, coca cola, coca, company, ad, holidays,...","[coca cola, cola company, coke, christmas ad, ...",[Coca-Cola's AI Holiday Campaign],"[coca cola, backlash, commercials, brand, holi...","[cola, company, ad, holidays, holiday, commerc...",[Coca-Cola will always remain dedicated to cre...
5,4,60,4_chocolate_willy_event_experience,Willy's Chocolate Experience Fail,"[chocolate, willy, event, experience, chocolat...","[wonka chocolate, willy chocolate, wonka event...",[Willy's Chocolate Experience Fail],"[chocolate experience, willy chocolate, wonka,...","[chocolate, event, experience, children, wareh...",[Note that it’s named after “Willy” but not “W...
6,5,44,5_scammers_highly_mistake_actually,Event Cancellation and Apologies,"[scammers, highly, mistake, actually, highly e...","[really sorry, highly embarrassed, mistake sca...",[Event Cancellation and Apologies],"[scammers, highly embarrassed, tickets, actual...","[scammers, mistake, embarrassed, tickets, scam...",[And you guys should not say that we are the s...
7,6,43,6_times_eurogamer net_eurogamer_net,Prominent News Media Outlets,"[times, eurogamer net, eurogamer, net, news, y...","[york times, times reporter, washington post, ...",[Prominent News Media Outlets],"[eurogamer net, eurogamer, york times, net gam...","[news, reporter, sports, coverage, editor, yea...","[The New York Times., The New York Times., The..."
8,7,38,7_enix_square enix_square_kiryu,Square Enix's AI and Tech Strategy,"[enix, square enix, square, kiryu, aggressive,...","[square enix, enix president, enix intends, en...",[Square Enix's AI and Tech Strategy],"[enix, square enix, aggressive, applying ai, d...","[aggressive, blockchain, letter, year, develop...",[The inclusion of AI in Foamstars is perhaps u...
9,8,35,8_film_ai_poster_generated,Paramount's AI-Generated Film Content,"[film, ai, poster, generated, ai generated, pa...","[promoting film, paramount pictures, used ai, ...",[Paramount's AI-Generated Film Content],"[paramount, civil war, novocaine, a24, posters...","[film, poster, men, posters, voice, quality, n...","[[21]\nIn March 2025, Paramount Pictures was c..."


In [44]:
# List the custom label of every topic, in the row order of `get_topic_info()`.
topic_model.get_topic_info()["CustomName"].tolist()

['Outlier Topic',
 'AI-Generated Content and Slop',
 'Dublin Halloween Parade Hoax',
 'Activision AI-Generated Game Content',
 "Coca-Cola's AI Holiday Campaign",
 "Willy's Chocolate Experience Fail",
 'Event Cancellation and Apologies',
 'Prominent News Media Outlets',
 "Square Enix's AI and Tech Strategy",
 "Paramount's AI-Generated Film Content",
 '2024 Tech and Media Coverage',
 'AI Use in Foamstars Development',
 '2025 Document Retrieval Dates',
 'IGN Editorial Contacts and Activision Commentary',
 '2024 Document Retrieval Timeline',
 '2024 Fans Unwrapping Reaction',
 'Misjudgments and Humanization',
 '404 Media Advertising and Storytelling',
 'Angry Birds Block Puzzle Launch']

In [45]:
# Hierarchical view of the topics, annotated with the custom labels.
topic_model.visualize_hierarchy(custom_labels=True)

In [46]:
# Topic-to-topic similarity heatmap. Uses the custom labels, like every other
# visualization in this notebook, so the axes are readable and consistent.
topic_model.visualize_heatmap(custom_labels=True)

In [47]:
# Ask the model for the 5 topics most similar to the search term "slop";
# returns the topic ids together with their similarity scores.
similar_topics, similarity = topic_model.find_topics("slop", top_n=5)
# Display the (word, score) representation of the first topic returned.
topic_model.get_topic(similar_topics[0])

[('ai', 0.02266886745875234),
 ('slop', 0.01856637863911902),
 ('content', 0.013904396855922901),
 ('generated', 0.012426233049074795),
 ('facebook', 0.010393436838820361),
 ('images', 0.010167166818381961),
 ('ai generated', 0.009838463272417282),
 ('ai slop', 0.009216027165670937),
 ('trump', 0.00911492515010106),
 ('like', 0.008468276306572606)]

In [48]:
# `topic_distr` contains the distribution of topics in each document.
# The second return value is bound to a named throwaway rather than `_`: at notebook
# top level, assigning to `_` overrides IPython's "last output" variable and stops it
# from being updated for the rest of the session.
topic_distr, _unused_token_distr = topic_model.approximate_distribution(
    documents, window=8, stride=4
)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  9.51it/s]


In [49]:
# Project the embeddings to 2-D purely for plotting. The projection is seeded with the
# same value as `umap_model`, so the document map is reproducible across reruns
# (an unseeded UMAP produces a different layout on every execution).
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

# Scatter plot of all documents, coloured by topic and labelled with the custom names.
topic_model.visualize_documents(
    documents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)

In [50]:
# Sanity check: shape of the embedding matrix.
embeddings.shape

(1756, 3072)