as per: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [28]:
import os
import dotenv
import openai
import pandas as pd
import numpy as np

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [29]:
MODEL_NAME = "slop_min40_openai_sentences.ipynb"

In [30]:
df = pd.read_feather("analysis.feather")
documents = df["text"].tolist()

In [31]:
# comment this out for document-level, leave it in for sentence level
sentences = [sent_tokenize(x) for x in documents]
documents = [sentence for doc in sentences for sentence in doc]
len(documents)

df = pd.DataFrame({"text": documents})

In [32]:
# use this for the base embeddings
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [33]:
# use this if you want openai embeddings (better performance)
dotenv.load_dotenv()
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding_model = OpenAIBackend(client, "text-embedding-3-large")

# careful !
# no need to compute more than once here
# df["embedding"] = df["text"].apply(
#     lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
#     .data[0]
#     .embedding
# )
# df.to_feather("analysis_with_embeddings_40_sentences.feather")

df = pd.read_feather("analysis_with_embeddings_40_sentences.feather")

embeddings = np.array(df["embedding"].tolist())
embeddings.shape

(1756, 3072)

In [34]:
# import asyncio
# import pandas as pd
# from openai import AsyncOpenAI, OpenAIError
# from tqdm.asyncio import tqdm_asyncio
# import nest_asyncio

# # Enable nested event loop support (for Jupyter etc.)
# nest_asyncio.apply()

# client = AsyncOpenAI()

# # Settings
# BATCH_SIZE = 10
# MAX_RETRIES = 3
# RETRY_BACKOFF = 2
# CONCURRENCY_LIMIT = 5  # Maximum concurrent OpenAI requests

# semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# async def embed_batch(texts, attempt=1):
#     async with semaphore:
#         try:
#             response = await client.embeddings.create(
#                 model="text-embedding-3-large",
#                 input=texts
#             )
#             return [item.embedding for item in response.data]

#         except OpenAIError as e:
#             if attempt <= MAX_RETRIES:
#                 wait = RETRY_BACKOFF ** attempt
#                 print(f"[Retrying in {wait}s] Error: {e}")
#                 await asyncio.sleep(wait)
#                 return await embed_batch(texts, attempt + 1)
#             else:
#                 print(f"[Failed after {MAX_RETRIES} attempts] {e}")
#                 return [None] * len(texts)

# async def embed_all(texts):
#     batches = [texts[i:i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]

#     tasks = [embed_batch(batch) for batch in batches]
#     results = []
#     for coro in tqdm_asyncio.gather(*tasks, desc="Embedding", total=len(batches)):
#         batch_result = await coro
#         results.extend(batch_result)
#     return results

# async def run_embedding(df):
#     texts = df["text"].tolist()
#     embeddings = await embed_all(texts)
#     df["embedding"] = embeddings
#     return df

# # Run the whole thing
# df = asyncio.run(run_embedding(df))

In [35]:
num_failed = df["embedding"].apply(lambda x: x is None).sum()
print(f"Failed embeddings: {num_failed}")

Failed embeddings: 0


In [36]:
from umap import UMAP

umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
)

In [37]:
from hdbscan import HDBSCAN

# A higher min_topic_size will generate fewer topics and a lower min_topic_size will generate more topics.

hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [39]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
)

# also consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model,
}

In [None]:
from bertopic import BERTopic
from huggingface_hub import DocumentQuestionAnsweringOutputElement
from numpy import ndarray

topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,  # type: ignore
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

topics, probs = topic_model.fit_transform(documents, embeddings)
# topics, probs = topic_model.fit_transform(documents)

2025-06-11 22:50:55,125 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 22:51:17,891 - BERTopic - Dimensionality - Completed ✓
2025-06-11 22:51:17,894 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 22:51:18,029 - BERTopic - Cluster - Completed ✓
2025-06-11 22:51:18,041 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 10/10 [00:05<00:00,  1.87it/s]


In [None]:
topic_model.save(MODEL_NAME, serialization="safetensors")

In [None]:
topic_model.visualize_topics(custom_labels=True)

In [None]:
chatgpt_topic_labels = {
    topic: " | ".join(list(zip(*values))[0])
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,375,-1_ai_spotify_internet_content,Outlier Topic,"[ai, spotify, internet, content, generated, we...","[ai generated, bots, social media, using ai, g...",[AI-Generated Content and Misinformation],"[spotify, content, generated, media, people, a...","[internet, content, website, event, media, peo...",[That information was readily available elsewh...
1,0,273,0_facebook_ai_images_content,AI-Generated Content on Social Media,"[facebook, ai, images, content, generative, ge...","[generative ai, ai generated, social media, ti...",[AI-Generated Content on Social Media],"[facebook, images, generative ai, chatgpt, pla...","[images, content, generative, platforms, creat...","[TikTok, Facebook and other social media platf..."
2,1,201,1_game_ai_square_activision,AI Usage in Gaming Development,"[game, ai, square, activision, development, ne...","[generative ai, ai generated, use ai, generate...",[AI Usage in Gaming Development],"[game, activision, voice, kiryu, angry, genera...","[game, development, new, voice, company, games...",[Activision states on the game's product page ...
3,2,188,2_trump_image_misinformation_images,AI-Generated Political Misinformation,"[trump, image, misinformation, images, account...","[trump, elon musk, followers, generated images...",[AI-Generated Political Misinformation],"[trump, image, misinformation, influencers, fa...","[image, misinformation, images, account, peopl...",[Donald Trump—or at least whoever controls his...
4,3,141,3_slop_term_spam_ai slop,AI-Generated Content and Spam,"[slop, term, spam, ai slop, ai, internet, gene...","[slop isn, slop, slop ai, sloppers, ai slop, s...",[AI-Generated Content and Spam],"[slop, spam, internet, word, generated content...","[slop, term, spam, internet, content, word, ea...","[That’s slop., Why slop?, But nonsynthetic slo..."
5,4,93,4_brands_brand_content_ai,Authentic Branding in Viral Trends,"[brands, brand, content, ai, brainrot, human, ...","[italian brainrot, branding, brainrot, brands,...",[Authentic Branding in Viral Trends],"[brands, content, human, italian brainrot, ent...","[brands, brand, content, brainrot, human, entr...","[As entrepreneurs, our brands are our lifebloo..."
6,5,90,5_parade_dublin_street_connell,Dublin Halloween Parade Events,"[parade, dublin, street, connell, website, com...","[parade, dublin, myspirithalloween, crowd, mys...",[Dublin Halloween Parade Events],"[parade, dublin, street, connell, people, mysp...","[parade, street, website, people, city, listin...","[On Thursday, hundreds of people gathered on O..."
7,6,69,6_ign_news_times_new york,Media and News Outlets,"[ign, news, times, new york, york, net, euroga...","[ign, gaming, eurogamer, activision, gizmodo, ...",[Media and News Outlets],"[news, eurogamer, york times, reporter, atlant...","[news, reporter, editor, comment, updates, gro...","[IGN., IGN., IGN.]"
8,7,61,7_cola_coca_coca cola_company,Coca-Cola AI Holiday Ads Backlash,"[cola, coca, coca cola, company, ad, holiday, ...","[coca cola, coca, commercials, company said, a...",[Coca-Cola AI Holiday Ads Backlash],"[coca cola, company, christmas, commercials, b...","[cola, company, ad, holiday, commercial, comme...","[- ^ ""Coca-Cola causes controversy with AI-gen..."
9,8,55,8_experience_event_children_factory,Disappointing Willy Wonka Event,"[experience, event, children, factory, imagina...","[charlie, generated gibberish, gizmodo, magica...",[Disappointing Willy Wonka Event],"[experience, children, imagination, arrived, c...","[experience, event, children, factory, imagina...","[Held at the Box Hub, a warehouse event space ..."


In [None]:
topic_model.get_topic_info()["CustomName"].tolist()

['Outlier Topic',
 'AI-Generated Content on Social Media',
 'AI Usage in Gaming Development',
 'AI-Generated Political Misinformation',
 'AI-Generated Content and Spam',
 'Authentic Branding in Viral Trends',
 'Dublin Halloween Parade Events',
 'Media and News Outlets',
 'Coca-Cola AI Holiday Ads Backlash',
 'Disappointing Willy Wonka Event',
 '2024-2025 Calendar Dates',
 'AI Integration in Search Engines',
 'AI-Generated Film Posters Controversy',
 '2024 Monthly Event Highlights',
 'AI-Generated Books in Libraries',
 'Personal Identity and Responsibility']

In [None]:
topic_model.visualize_hierarchy(custom_labels=True)

In [None]:
topic_model.visualize_heatmap()

In [None]:
similar_topics, similarity = topic_model.find_topics("slop", top_n=5)
topic_model.get_topic(similar_topics[0])

[('slop', 0.12661864243087656),
 ('term', 0.0625602752775144),
 ('spam', 0.05904437639970761),
 ('ai slop', 0.03645851505399794),
 ('ai', 0.03366185665676671),
 ('internet', 0.03110414749436344),
 ('generated', 0.02649438303872978),
 ('content', 0.026210223929013735),
 ('word', 0.02555226328783056),
 ('think', 0.022382161657595923)]

In [None]:
topic_model.get_topic(0, full=True)

{'Main': [('facebook', 0.04659195431291535),
  ('ai', 0.04607431547314764),
  ('images', 0.035854051333467724),
  ('content', 0.03136514076542845),
  ('generative', 0.024428769465126052),
  ('generative ai', 0.02245685769923068),
  ('chatgpt', 0.022343556885121487),
  ('platforms', 0.020548202360271425),
  ('creators', 0.020174739159686338),
  ('engagement', 0.01999160352879291)],
 'KeyBERT': [('generative ai', 0.4859438540065479),
  ('ai generated', 0.47298957995907676),
  ('social media', 0.4283856548230271),
  ('tiktok', 0.4106319149079104),
  ('generated images', 0.397606181202182),
  ('use ai', 0.36645479565401134),
  ('ai tools', 0.3610315277706539),
  ('fake', 0.354350820060206),
  ('ai image', 0.35112872276745866),
  ('generated content', 0.35051996495914073)],
 'OpenAI': [('AI-Generated Content on Social Media', 1)],
 'MMR': [('facebook', 0.04659195431291535),
  ('images', 0.035854051333467724),
  ('generative ai', 0.02245685769923068),
  ('chatgpt', 0.022343556885121487),
  (

In [None]:
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(documents, window=8, stride=4)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 10.17it/s]


In [None]:
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
).fit_transform(embeddings)
topic_model.visualize_documents(
    documents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    # hide_annotations=True,
)

IndexError: index 312 is out of bounds for axis 0 with size 41

In [None]:
embeddings.shape

(41, 3072)