as per: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [28]:
import os
import dotenv
import openai
import pandas as pd
import numpy as np

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [61]:
MODEL_NAME = "subset"

In [30]:
df = pd.read_feather("analysis.feather")
documents = df["text"].tolist()

In [31]:
# comment this out for document-level, leave it in for sentence level
sentences = [sent_tokenize(x) for x in documents]
documents = [sentence for doc in sentences for sentence in doc]
len(documents)

df = pd.DataFrame({"text": documents})

In [32]:
# use this for the base embeddings
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [33]:
# use this if you want openai embeddings (better performance)
dotenv.load_dotenv()
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding_model = OpenAIBackend(client, "text-embedding-3-large")

# careful !
# no need to compute more than once here
# df["embedding"] = df["text"].apply(
#     lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
#     .data[0]
#     .embedding
# )
# df.to_feather("analysis_with_embeddings_40_sentences.feather")

df = pd.read_feather("analysis_with_embeddings_40_sentences.feather")

embeddings = np.array(df["embedding"].tolist())
embeddings.shape

(1756, 3072)

In [34]:
# import asyncio
# import pandas as pd
# from openai import AsyncOpenAI, OpenAIError
# from tqdm.asyncio import tqdm_asyncio
# import nest_asyncio

# # Enable nested event loop support (for Jupyter etc.)
# nest_asyncio.apply()

# client = AsyncOpenAI()

# # Settings
# BATCH_SIZE = 10
# MAX_RETRIES = 3
# RETRY_BACKOFF = 2
# CONCURRENCY_LIMIT = 5  # Maximum concurrent OpenAI requests

# semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

# async def embed_batch(texts, attempt=1):
#     async with semaphore:
#         try:
#             response = await client.embeddings.create(
#                 model="text-embedding-3-large",
#                 input=texts
#             )
#             return [item.embedding for item in response.data]

#         except OpenAIError as e:
#             if attempt <= MAX_RETRIES:
#                 wait = RETRY_BACKOFF ** attempt
#                 print(f"[Retrying in {wait}s] Error: {e}")
#                 await asyncio.sleep(wait)
#                 return await embed_batch(texts, attempt + 1)
#             else:
#                 print(f"[Failed after {MAX_RETRIES} attempts] {e}")
#                 return [None] * len(texts)

# async def embed_all(texts):
#     batches = [texts[i:i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]

#     tasks = [embed_batch(batch) for batch in batches]
#     results = []
#     for coro in tqdm_asyncio.gather(*tasks, desc="Embedding", total=len(batches)):
#         batch_result = await coro
#         results.extend(batch_result)
#     return results

# async def run_embedding(df):
#     texts = df["text"].tolist()
#     embeddings = await embed_all(texts)
#     df["embedding"] = embeddings
#     return df

# # Run the whole thing
# df = asyncio.run(run_embedding(df))

In [35]:
num_failed = df["embedding"].apply(lambda x: x is None).sum()
print(f"Failed embeddings: {num_failed}")

Failed embeddings: 0


In [56]:
from umap import UMAP

umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
)

In [73]:
from hdbscan import HDBSCAN

# A higher min_topic_size will generate fewer topics and a lower min_topic_size will generate more topics.

hdbscan_model = HDBSCAN(
    min_cluster_size=18,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))

In [75]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
)

# also consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model,
}

In [76]:
from bertopic import BERTopic
from huggingface_hub import DocumentQuestionAnsweringOutputElement
from numpy import ndarray

topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,  # type: ignore
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

topics, probs = topic_model.fit_transform(documents)
# topics, probs = topic_model.fit_transform(documents)

2025-06-11 23:20:01,286 - BERTopic - Embedding - Transforming documents to embeddings.


2025-06-11 23:20:05,271 - BERTopic - Embedding - Completed ✓
2025-06-11 23:20:05,273 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 23:20:12,549 - BERTopic - Dimensionality - Completed ✓
2025-06-11 23:20:12,553 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 23:20:12,614 - BERTopic - Cluster - Completed ✓
2025-06-11 23:20:12,622 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 12/12 [00:06<00:00,  1.95it/s]
2025-06-11 23:20:40,104 - BERTopic - Representation - Completed ✓


In [77]:
topic_model.save(MODEL_NAME, serialization="safetensors")

In [78]:
topic_model.visualize_topics(custom_labels=True)

In [79]:
chatgpt_topic_labels = {
    topic: " | ".join(list(zip(*values))[0])
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,293,-1_ai_image_generated_content,Outlier Topic,"[ai, image, generated, content, like, internet...","[ai generated, generative ai, bots, fake, soci...",[AI-Generated Content and Misinformation],"[generated, content, chatgpt, ai generated, tr...","[image, content, internet, people, images, med...","[[4]\nIt has been variously defined as ""digita..."
1,0,133,0_slop_term_spam_ai,Artificial Intelligence Spam Content,"[slop, term, spam, ai, ai slop, generated, thi...","[term slop, slop isn, slop term, slop ai, slop...",[Artificial Intelligence Spam Content],"[slop, spam, ai slop, word, slop term, generat...","[slop, term, spam, internet, word, early, cont...","[That’s slop., They’re slop as well., Why slop?]"
2,1,104,1_ai_content_brands_brand,Viral Italian Brainrot and Brands,"[ai, content, brands, brand, ai slop, human, g...","[italian brainrot, authentic content, makes br...",[Viral Italian Brainrot and Brands],"[content, brands, ai slop, italian brainrot, e...","[content, brands, brand, human, brainrot, slop...","[- ^ ""Why Brands Are Embracing 'Italian Brainr..."
3,2,72,2_content_ai_money_make,AI-Driven Content Monetization,"[content, ai, money, make, creators, videos, m...","[generative ai, ai generated, viral content, m...",[AI-Driven Content Monetization],"[content, creators, videos, make money, platfo...","[content, money, creators, videos, platforms, ...","[Since the generative AI boom began, a class o..."
4,3,69,3_facebook_ai_images_meta,AI-Generated Bizarre Social Media Content,"[facebook, ai, images, meta, slop, jesus, ai s...","[slop facebook, facebook ai, facebook just, im...",[AI-Generated Bizarre Social Media Content],"[facebook, meta, ai slop, ai generated, fake, ...","[images, slop, pages, children, fake, birthday...",[Image-generated slop has also blossomed on Fa...
5,4,69,4_ai_generative_generative ai_technology,Challenges and Potential of Generative AI,"[ai, generative, generative ai, technology, cr...","[generative ai, generative, believe generative...",[Challenges and Potential of Generative AI],"[generative, generative ai, creative, ai poten...","[generative, technology, creative, potential, ...","[""I believe that generative AI has the potenti..."
6,5,57,5_fema_misinformation_people_reality,Misinformation and Conspiracy Theories,"[fema, misinformation, people, reality, public...","[misinformation crisis, conspiracy theories, m...",[Misinformation and Conspiracy Theories],"[fema, misinformation, north carolina, hurrica...","[misinformation, people, reality, public, theo...",[To watch as real information is overwhelmed b...
7,6,32,6_books_ai generated_generated_librarians,AI-Generated Books in Libraries,"[books, ai generated, generated, librarians, l...","[librarians, public libraries, books cost, ebo...",[AI-Generated Books in Libraries],"[books, librarians, hoopla, edible, sula, eboo...","[books, librarians, libraries, library, catalo...",[Both companies have a variety of payment opti...
8,7,28,7_trump_harris_donald_donald trump,AI-Generated Political Misinformation,"[trump, harris, donald, donald trump, posted, ...","[swifties trump, trump posted, depictions trum...",[AI-Generated Political Misinformation],"[trump, donald trump, taylor swift, images, tr...","[images, shirts, women, social, depictions, fa...","[One pro-Trump, Elon-Musk-fanboy account recen..."
9,8,26,8_search_google_results_search results,AI-Driven Search Result Reliability,"[search, google, results, search results, over...","[leading google, search engines, like google, ...",[AI-Driven Search Result Reliability],"[google, search results, search engine, answer...","[search, results, overviews, engine, links, an...","[Last week, Google announced an ambitious plan..."


In [80]:
topic_model.get_topic_info()["CustomName"].tolist()

['Outlier Topic',
 'Artificial Intelligence Spam Content',
 'Viral Italian Brainrot and Brands',
 'AI-Driven Content Monetization',
 'AI-Generated Bizarre Social Media Content',
 'Challenges and Potential of Generative AI',
 'Misinformation and Conspiracy Theories',
 'AI-Generated Books in Libraries',
 'AI-Generated Political Misinformation',
 'AI-Driven Search Result Reliability',
 'Trump, AI, and Food Culture',
 "Milton Foundation's Political Advocacy"]

In [82]:
topic_model.visualize_hierarchy(custom_labels=True)

In [83]:
topic_model.visualize_heatmap()

In [84]:
similar_topics, similarity = topic_model.find_topics("slop", top_n=5)
topic_model.get_topic(similar_topics[0])

[('slop', 0.08273776813338832),
 ('term', 0.03536383671684438),
 ('spam', 0.0322262100589285),
 ('ai', 0.02597509483867211),
 ('ai slop', 0.023258743972206463),
 ('generated', 0.016537981581385204),
 ('think', 0.015182720070688561),
 ('internet', 0.014979480124834724),
 ('word', 0.014479035025482007),
 ('early', 0.013120300833840953)]

In [85]:
topic_model.get_topic(0)


[('slop', 0.08273776813338832),
 ('term', 0.03536383671684438),
 ('spam', 0.0322262100589285),
 ('ai', 0.02597509483867211),
 ('ai slop', 0.023258743972206463),
 ('generated', 0.016537981581385204),
 ('think', 0.015182720070688561),
 ('internet', 0.014979480124834724),
 ('word', 0.014479035025482007),
 ('early', 0.013120300833840953)]

In [86]:
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(documents, window=8, stride=4)

100%|██████████| 1/1 [00:00<00:00,  8.03it/s]


In [87]:
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
).fit_transform(embeddings)
topic_model.visualize_documents(
    documents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
)

In [72]:
embeddings.shape

(1756, 3072)

In [None]:
# TODO
# run it through only the ones from the genai and slop category

In [53]:
# Get metadata about each document
doc_info = topic_model.get_document_info(documents)
doc_info.head()

Unnamed: 0,Document,Topic,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs,Top_n_words,Probability,Representative_document
0,"AI slop\n""AI slop"", often simply ""slop"", is a ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
1,"[1][4][5] Coined in the 2020s, the term has a ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
2,"[4]\nIt has been variously defined as ""digital...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
3,"[7]\nJonathan Gilmore, a philosophy professor ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
4,[8]\nOrigin of the term\n[edit]As early large ...,0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False


In [88]:
doc_info.to_feather('subset_documents_classified.feather')

In [54]:
topic_number = 0
filtered_docs = doc_info[doc_info["Topic"] == topic_number]
filtered_docs

Unnamed: 0,Document,Topic,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs,Top_n_words,Probability,Representative_document
0,"AI slop\n""AI slop"", often simply ""slop"", is a ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
1,"[1][4][5] Coined in the 2020s, the term has a ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
2,"[4]\nIt has been variously defined as ""digital...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
3,"[7]\nJonathan Gilmore, a philosophy professor ...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
4,[8]\nOrigin of the term\n[edit]As early large ...,0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,"However, he says, the rise of ChatGPT and simi...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
1719,"""Artificial intelligence (AI) and its potentia...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
1720,"""However, the introduction of ChatGPT, which a...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False
1721,"""I believe that generative AI has the potentia...",0,0_ai_slop_content_generated,AI-Generated Content and Slop,"[ai, slop, content, generated, facebook, image...","[slop, ai slop, ai generated, generative ai, s...",[AI-Generated Content and Slop],"[generated, facebook, ai generated, ai slop, t...","[slop, content, images, people, media, social,...","[- ""AI slop"" images are all over Facebook., Wh...",ai - slop - content - generated - facebook - i...,1.0,False


In [55]:
documents = filtered_docs["Document"].tolist()
documents

['AI slop\n"AI slop", often simply "slop", is a term for low-quality media, including writing and images, made using generative artificial intelligence technology, characterized by an inherent lack of effort, logic, or purpose.',
 '[1][4][5] Coined in the 2020s, the term has a pejorative connotation akin to "spam".',
 '[4]\nIt has been variously defined as "digital clutter", "filler content produced by AI tools that prioritize speed and quantity over substance and quality",[6] and "shoddy or unwanted AI content in social media, art, books and, increasingly, in search results".',
 '[7]\nJonathan Gilmore, a philosophy professor at the City University of New York, describes the "incredibly banal, realistic style" of AI slop as being "very easy to process".',
 '[8]\nOrigin of the term\n[edit]As early large language models (LLMs) and image diffusion models accelerated the creation of high-volume but low-quality written content and images, discussion commenced among journalists and on social