as per: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [28]:
import os
import dotenv
import openai
import pandas as pd
import numpy as np

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [29]:
# Destination path handed to `topic_model.save(...)` later in the notebook.
# NOTE(review): the ".ipynb" suffix looks like a pasted notebook file name rather
# than a model path; confirm before renaming, because anything that loads the
# saved model has to use the same string.
MODEL_NAME: str = "slop_min40_openai_sentences.ipynb"

In [30]:
# Read the prepared corpus and pull its "text" column out as a plain Python
# list, one entry per row of the feather file.
df = pd.read_feather("analysis.feather")
documents = df.loc[:, "text"].to_list()

In [31]:
# Sentence-level mode: split every document into sentences with NLTK and
# flatten the result so that each sentence becomes its own "document".
# Comment this cell out to model whole documents instead.
sentences = list(map(sent_tokenize, documents))
documents = [sentence for doc_sentences in sentences for sentence in doc_sentences]

# Rebuild the frame so it has exactly one row per sentence.
df = pd.DataFrame({"text": documents})

# Keep the count as the final expression so the notebook actually displays it;
# as a bare expression in the middle of the cell its value was discarded.
len(documents)

In [32]:
# use this for the base embeddings
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [33]:
# Use this cell for OpenAI embeddings (better performance than the base model).
# Every embedding request is billed, so the vectors are cached on disk and are
# only computed when the cache file is missing.
dotenv.load_dotenv()
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding_model = OpenAIBackend(client, "text-embedding-3-large")

EMBEDDINGS_CACHE = "analysis_with_embeddings_40_sentences.feather"

if os.path.exists(EMBEDDINGS_CACHE):
    df = pd.read_feather(EMBEDDINGS_CACHE)
else:
    # careful ! one API request per row; this branch runs only on a cache miss
    df["embedding"] = df["text"].apply(
        lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
        .data[0]
        .embedding
    )
    df.to_feather(EMBEDDINGS_CACHE)

# `documents` and `embeddings` are later handed to `fit_transform` together and
# matched purely by position, so a cache built from a different corpus (for
# example document-level instead of sentence-level) must be rejected here rather
# than silently pairing texts with the wrong vectors.
if len(df) != len(documents):
    raise ValueError(
        f"{EMBEDDINGS_CACHE} has {len(df)} rows but there are {len(documents)} "
        "documents; delete the cache file to recompute the embeddings."
    )
if "text" in df.columns and df["text"].tolist() != documents:
    raise ValueError(
        f"The texts stored in {EMBEDDINGS_CACHE} do not match `documents`; "
        "delete the cache file to recompute the embeddings."
    )

# Stack the per-row vectors into a single 2-D array and show its shape.
embeddings = np.array(df["embedding"].tolist())
embeddings.shape

(1756, 3072)

In [34]:
# import asyncio
# import pandas as pd
# from openai import AsyncOpenAI, OpenAIError
# from tqdm.asyncio import tqdm_asyncio
# import nest_asyncio

# # Enable nested event loop support (for Jupyter etc.)
# nest_asyncio.apply()

# client = AsyncOpenAI()

# # Settings
# BATCH_SIZE = 10
# MAX_RETRIES = 3
# RETRY_BACKOFF = 2
# CONCURRENCY_LIMIT = 5  # Maximum concurrent OpenAI requests

# semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# async def embed_batch(texts, attempt=1):
#     async with semaphore:
#         try:
#             response = await client.embeddings.create(
#                 model="text-embedding-3-large",
#                 input=texts
#             )
#             return [item.embedding for item in response.data]

#         except OpenAIError as e:
#             if attempt <= MAX_RETRIES:
#                 wait = RETRY_BACKOFF ** attempt
#                 print(f"[Retrying in {wait}s] Error: {e}")
#                 await asyncio.sleep(wait)
#                 return await embed_batch(texts, attempt + 1)
#             else:
#                 print(f"[Failed after {MAX_RETRIES} attempts] {e}")
#                 return [None] * len(texts)

# async def embed_all(texts):
#     batches = [texts[i:i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]

#     tasks = [embed_batch(batch) for batch in batches]
#     results = []
#     for coro in tqdm_asyncio.gather(*tasks, desc="Embedding", total=len(batches)):
#         batch_result = await coro
#         results.extend(batch_result)
#     return results

# async def run_embedding(df):
#     texts = df["text"].tolist()
#     embeddings = await embed_all(texts)
#     df["embedding"] = embeddings
#     return df

# # Run the whole thing
# df = asyncio.run(run_embedding(df))

In [35]:
# Count rows that have no embedding. `isna()` is vectorised and, unlike the
# former per-row `x is None` test, also counts NaN/NA placeholders that a
# missing value may turn into after a round trip through a file.
num_failed = int(df["embedding"].isna().sum())
print(f"Failed embeddings: {num_failed}")

Failed embeddings: 0


In [36]:
from umap import UMAP

# Dimensionality-reduction step handed to BERTopic: projects the embeddings
# down to 5 components using cosine distance.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed so repeated runs give the same reduction
)

In [37]:
from hdbscan import HDBSCAN

# A higher minimum cluster size will generate fewer topics and a lower one will
# generate more topics. In this notebook that knob is `min_cluster_size` below.
# NOTE(review): file names elsewhere in this notebook contain "40" while the
# value here is 20; confirm which one is intended.
hdbscan_settings = dict(
    min_cluster_size=20,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser used for the topic keywords.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),    # unigrams and bigrams
    min_df=2,              # drop terms that occur in fewer than two documents
    stop_words="english",  # built-in English stop-word list
)

In [39]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# Prompt for the GPT-based labeller. It asks for a label of at most five words
# in the form "topic: <label>"; [DOCUMENTS] and [KEYWORDS] are placeholders
# left in the text for the representation model.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# One representation model per "aspect" of a topic.
keybert_model = KeyBERTInspired()
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    prompt=prompt,
    chat=True,
    exponential_backoff=True,
)

# Alternative chat models to consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# Aspect name -> representation model; the names reappear as columns in the
# topic-info table.
representation_model = dict(
    KeyBERT=keybert_model,
    OpenAI=openai_model,
    MMR=mmr_model,
    POS=pos_model,
)

In [41]:
from bertopic import BERTopic
from huggingface_hub import DocumentQuestionAnsweringOutputElement
from numpy import ndarray

# NOTE(review): `DocumentQuestionAnsweringOutputElement` and `ndarray` are not
# referenced by any cell visible here and look like accidental editor
# auto-imports; confirm nothing else needs them before removing.

# Sub-models that make up the BERTopic pipeline, configured in earlier cells.
pipeline_models = dict(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)
topic_model = BERTopic(top_n_words=10, verbose=True, **pipeline_models)

# Fit on the texts together with the precomputed embedding matrix. The original
# cell kept `fit_transform(documents)` (no embeddings) as a commented-out
# alternative.
topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)

2025-06-11 22:50:55,125 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 22:51:17,891 - BERTopic - Dimensionality - Completed ✓
2025-06-11 22:51:17,894 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 22:51:18,029 - BERTopic - Cluster - Completed ✓
2025-06-11 22:51:18,041 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 10/10 [00:05<00:00,  1.87it/s]
2025-06-11 22:51:44,157 - BERTopic - Representation - Completed ✓


In [42]:
# Write the fitted model to MODEL_NAME using the "safetensors" serialization.
# NOTE(review): this runs before the later cell that calls `set_topic_labels`,
# so the saved model presumably does not contain the custom labels; confirm
# whether the save should happen after labelling.
topic_model.save(MODEL_NAME, serialization="safetensors")

In [43]:
# Show BERTopic's topic visualisation, asking for custom labels.
# NOTE(review): custom labels are only assigned by `set_topic_labels` in a
# later cell, so on a fresh top-to-bottom run none exist yet at this point;
# confirm the figure is labelled as intended.
topic_model.visualize_topics(custom_labels=True)

In [44]:
# Build one custom label per topic from its "OpenAI" aspect. Each aspect value
# is a list of (label, score) tuples; the label texts are joined with " | ".
# Iterating the tuples directly replaces the `list(zip(*values))[0]` transpose,
# which raised IndexError when a topic's aspect list was empty.
chatgpt_topic_labels = {
    topic: " | ".join(entry[0] for entry in values)
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
# Topic -1 is labelled explicitly instead of using its generated label.
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

# Display the per-topic overview, now including the custom labels.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,108,-1_2024_block_birds_angry birds,Outlier Topic,"[2024, block, birds, angry birds, angry, spoti...","[angry birds, rovio, birds block, block quest,...",[Angry Birds Block Puzzle Game],"[angry birds, spotify, block quest, birds bloc...","[block, users, soft, launch, stories, title, p...",[- Angry Birds Block Quest is a block puzzle g...
1,0,924,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai generated, ai slop, generative ai, a...",[AI-Generated Content and Slop],"[generated, ai generated, ai slop, trump, plat...","[slop, content, images, people, media, social,...","[- ^ ""Slop is the new name for unwanted AI-gen..."
2,1,296,1_halloween_event_said_people,Dublin Halloween Parade False Listing,"[halloween, event, said, people, website, stre...","[halloween, events, event, ireland, irish, adv...",[Dublin Halloween Parade False Listing],"[halloween, people, website, street, events, p...","[halloween, event, people, website, street, ex...","[What happened at the event?, ""Dublin: Hallowe..."
3,2,186,2_game_ai_square_activision,AI Integration in Gaming Development,"[game, ai, square, activision, development, ne...","[generative ai, ai generated, activision, use ...",[AI Integration in Gaming Development],"[game, activision, company, kiryu, generated, ...","[game, development, new, art, voice, company, ...",[Activision recently added a warning label to ...
4,3,61,3_cola_coca cola_coca_company,Coca-Cola AI Holiday Campaign Backlash,"[cola, coca cola, coca, company, ad, holiday, ...","[coca cola, coca, commercials, company said, a...",[Coca-Cola AI Holiday Campaign Backlash],"[coca cola, company, christmas, backlash, comm...","[cola, company, ad, holiday, commercial, backl...","[""Coca-Cola's Holiday Ads Trade the 'Real Thin..."
5,4,50,4_2025_march 2025_march_2024,2024-2025 Monthly Dates,"[2025, march 2025, march, 2024, 24, september ...","[march 2025, april 2025, february 2025, januar...",[2024-2025 Monthly Dates],"[2025, march 2025, 2024, september 2024, novem...","[topics, , , , , , , , , ]","[Retrieved 24 March 2025., Retrieved 24 March ..."
6,5,43,5_times_york_new york_eurogamer,Major News and Media Outlets,"[times, york, new york, eurogamer, net, news, ...","[york times, new york, nbc news, eurogamer, ne...",[Major News and Media Outlets],"[eurogamer, news, york times, reporter, gizmod...","[news, reporter, newsletter, recommendation, g...","[The New York Times., The New York Times., The..."
7,6,34,6_film_poster_ai_generated,AI-Generated Film Promotion Controversy,"[film, poster, ai, generated, ai generated, am...","[criticized ai, used ai, ai generated, using a...",[AI-Generated Film Promotion Controversy],"[film, ai generated, amazon, posters, voice, n...","[film, poster, posters, voice, quality, narrat...","[In a recent Instagram post, users sighted a r..."
8,7,33,7_2024_november 2024_november_january,2024 Monthly Timeline and Personalities,"[2024, november 2024, november, january, augus...","[november 2024, october 2024, august 2024, sep...",[2024 Monthly Timeline and Personalities],"[november 2024, august 2024, october 2024, cha...","[, , , , , , , , , ]","[- ^ Kalita, Parash Jyoti (1 November 2024)., ..."
9,8,21,8_ign_wesley_com_twitter,IGN News and Editorial Reporting,"[ign, wesley, com, twitter, news editor, edito...","[ign, activision, editor ign, news, twitter, c...",[IGN News and Editorial Reporting],"[ign, news editor, asked, activision, wesley u...","[news, comment, reporting, novels, reporter, w...","[IGN., IGN., IGN.]"


In [45]:
# List the custom label of every topic, in the row order of the topic-info table.
topic_model.get_topic_info()["CustomName"].tolist()

['Outlier Topic',
 'AI-Generated Content and Slop',
 'Dublin Halloween Parade False Listing',
 'AI Integration in Gaming Development',
 'Coca-Cola AI Holiday Campaign Backlash',
 '2024-2025 Monthly Dates',
 'Major News and Media Outlets',
 'AI-Generated Film Promotion Controversy',
 '2024 Monthly Timeline and Personalities',
 'IGN News and Editorial Reporting']

In [46]:
# Show BERTopic's hierarchy visualisation of the topics, using the custom labels.
topic_model.visualize_hierarchy(custom_labels=True)

In [47]:
# Show BERTopic's topic heatmap. `custom_labels=True` matches the other
# visualisation cells, so the axes show the readable labels instead of the
# default generated topic names.
topic_model.visualize_heatmap(custom_labels=True)

In [48]:
# Ask the model for the five topics closest to the search term "slop", then
# display the keyword/score pairs of the top-ranked one.
similar_topics, similarity = topic_model.find_topics(search_term="slop", top_n=5)
best_match = similar_topics[0]
topic_model.get_topic(best_match)

[('ai', 0.06015559246170202),
 ('slop', 0.05229313813253623),
 ('content', 0.03968638260799799),
 ('generated', 0.035298260856005764),
 ('facebook', 0.03050414028139656),
 ('images', 0.02970742176758639),
 ('ai generated', 0.028291024438685117),
 ('ai slop', 0.027086456512829923),
 ('trump', 0.026963869504308152),
 ('like', 0.02477570686961289)]

In [49]:
# Show every representation of topic 0: the main keywords plus the KeyBERT,
# OpenAI, MMR and POS aspects configured earlier.
topic_model.get_topic(0, full=True)

{'Main': [('ai', 0.06015559246170202),
  ('slop', 0.05229313813253623),
  ('content', 0.03968638260799799),
  ('generated', 0.035298260856005764),
  ('facebook', 0.03050414028139656),
  ('images', 0.02970742176758639),
  ('ai generated', 0.028291024438685117),
  ('ai slop', 0.027086456512829923),
  ('trump', 0.026963869504308152),
  ('like', 0.02477570686961289)],
 'KeyBERT': [('slop', 0.46801039679850576),
  ('ai generated', 0.46029763303437715),
  ('ai slop', 0.44584036816165035),
  ('generative ai', 0.42663878485197304),
  ('ai tools', 0.3289752381266077),
  ('spam', 0.31418053725131934),
  ('generated content', 0.3137496920012282),
  ('fake', 0.3061717210821604),
  ('social media', 0.304574338670489),
  ('artificial', 0.29525049671040327)],
 'OpenAI': [('AI-Generated Content and Slop', 1)],
 'MMR': [('generated', 0.035298260856005764),
  ('ai generated', 0.028291024438685117),
  ('ai slop', 0.027086456512829923),
  ('trump', 0.026963869504308152),
  ('platforms', 0.0175991068361659

In [50]:
# `topic_distr` contains the distribution of topics in each document.
# Only the first element of the returned pair is kept. Indexing it, rather than
# unpacking the second element into `_`, avoids rebinding `_`, which IPython
# uses for the previous cell output.
topic_distr = topic_model.approximate_distribution(documents, window=8, stride=4)[0]

100%|██████████| 2/2 [00:00<00:00, 10.16it/s]


In [51]:
# Separate 2-D UMAP projection of the embeddings, used only for this plot.
# It is seeded like `umap_model` so the document map looks the same on every
# run; without a seed the layout changed each time the cell was executed.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

# Plot the documents at their projected positions, using the custom labels.
topic_model.visualize_documents(
    documents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    # hide_annotations=True,
)

In [52]:
# Sanity check: display the shape of the embedding matrix (one row per text,
# one column per embedding dimension).
embeddings.shape

(1756, 3072)