Adapted from this Colab notebook: https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=w3WRXoRP2ej8


In [52]:
import os
import dotenv
import openai
import pandas as pd

from typing import Text
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Path the fitted topic model is saved under. The parts appear to encode the
# run configuration (corpus, min cluster size, embedding model, text unit) --
# TODO confirm and keep in sync with the settings in the cells below.
MODEL_NAME = "_".join(["slop", "min150", "minilm", "sentences"])

In [53]:
# Read the corpus table and pull its "text" column out as a plain Python list.
corpus_path = "analysis.feather"
df = pd.read_feather(corpus_path)
documents = df["text"].to_list()

In [54]:
# Choose the unit of analysis with a flag instead of commenting code in/out.
# The corpus is rebuilt from `df` on every run, so re-executing this cell never
# re-splits text that was already split, and the document-level corpus stays
# one flag flip away.
SENTENCE_LEVEL = True  # False -> one entry per row of df["text"]

documents = df["text"].tolist()
if SENTENCE_LEVEL:
    # One list of sentences per document, then flattened into a single list.
    sentences = [sent_tokenize(text) for text in documents]
    documents = [
        sentence for doc_sentences in sentences for sentence in doc_sentences
    ]
len(documents)

1756

In [56]:
from sentence_transformers import SentenceTransformer

# Encode every entry of `documents` once up front; the resulting vectors are
# handed to BERTopic and reused for the 2-D projection further down.
minilm_checkpoint = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(minilm_checkpoint)
embeddings = embedding_model.encode(
    documents,
    show_progress_bar=True,
)

Batches: 100%|██████████| 55/55 [01:47<00:00,  1.96s/it]


In [57]:
# use this if you want openai embeddings (better performance)
# dotenv.load_dotenv()
# client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# embedding_model = OpenAIBackend(client, "text-embedding-3-large")

# # careful !
# # df["embedding"] = df["text"].apply(
# #     lambda x: client.embeddings.create(model="text-embedding-3-large", input=x)
# #     .data[0]
# #     .embedding
# # )

# # no need to compute more than once here
# # df.to_feather("analysis_with_embeddings.feather")
# df = pd.read_feather("analysis_with_embeddings.feather")
# embeddings = df["embedding"].to_numpy()
# embeddings

In [58]:
from umap import UMAP

# Dimensionality-reduction step handed to BERTopic below. random_state is
# pinned so repeated runs start from the same seed.
umap_settings = {
    "n_neighbors": 15,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

In [None]:
from hdbscan import HDBSCAN

# Clustering step handed to BERTopic below. Because a custom HDBSCAN is
# supplied, min_cluster_size is the knob that plays the role of BERTopic's
# min_topic_size: a higher value generates fewer topics, a lower value more.
hdbscan_settings = dict(
    min_cluster_size=150,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenisation used for the topic keywords: English stop words removed,
# unigrams and bigrams counted, terms below the min_df=2 threshold dropped.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
)

In [61]:
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    OpenAI,
    PartOfSpeech,
)

# Load variables from a local .env file (if one exists) so OPENAI_API_KEY is
# available on a fresh kernel. Variables already present in the environment
# are not overridden. Without this call the lookup below only works when the
# key was exported before the kernel started.
dotenv.load_dotenv()

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech (uses the spaCy pipeline named here; it must be installed)
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT
# [DOCUMENTS] and [KEYWORDS] are placeholders filled in per topic by the
# OpenAI representation model; the "topic: <label>" line fixes the reply format.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]),
    model="gpt-4.1-nano-2025-04-14",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
)

# also consider, in order of pricing (0.010 -> 2.0):
# gpt-4.1-nano-2025-04-14
# gpt-4o-mini-2024-07-18
# o3-mini-2025-01-31
# o1-mini-2024-09-12
# gpt-4.1-2025-04-14
# o3-2025-04-16

# All representation models.
# Each key becomes the name of an extra topic "aspect"; the "OpenAI" aspect is
# read back later to build the custom topic labels.
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model,
}

In [63]:
# Assemble the pipeline from the components configured in the cells above and
# fit it. `BERTopic` is already imported in the notebook's import cell; the
# unused `huggingface_hub` / `numpy.ndarray` imports that used to sit here were
# dropped because nothing references them and the huggingface_hub symbol is
# not available in every installed version.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# The pre-computed `embeddings` are passed alongside `documents` so they are
# reused rather than computed again.
topics, probs = topic_model.fit_transform(documents, embeddings)

2025-06-11 21:53:43,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-11 21:53:53,454 - BERTopic - Dimensionality - Completed ✓
2025-06-11 21:53:53,458 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-11 21:53:53,571 - BERTopic - Cluster - Completed ✓
2025-06-11 21:53:53,585 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 85/85 [00:40<00:00,  2.10it/s]
2025-06-11 21:58:22,409 - BERTopic - Representation - Completed ✓


In [64]:
# Persist the fitted model under MODEL_NAME in safetensors format.
# NOTE(review): this runs before the custom topic labels are assigned in a
# later cell -- confirm whether the saved model is meant to include them.
save_format = "safetensors"
topic_model.save(MODEL_NAME, serialization=save_format)

In [65]:
# Topic map of the fitted model.
# NOTE(review): custom labels are only set by set_topic_labels() in the next
# cell, so on a top-to-bottom run none exist yet at this point -- confirm the
# intended cell order.
topics_fig = topic_model.visualize_topics(custom_labels=True)
topics_fig

In [66]:
# Turn the "OpenAI" aspect into one display label per topic: each aspect value
# is a list of (label, score) pairs, and the labels are joined with " | ".
openai_aspect = topic_model.topic_aspects_["OpenAI"]
chatgpt_topic_labels = dict(
    (topic_id, " | ".join(list(zip(*label_scores))[0]))
    for topic_id, label_scores in openai_aspect.items()
)
# Topic -1 gets a fixed, human-readable name instead of a generated one.
chatgpt_topic_labels.update({-1: "Outlier Topic"})
topic_model.set_topic_labels(chatgpt_topic_labels)

# Display the topic overview table, now including the CustomName column.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,570,-1_ai_slop_content_ai slop,Outlier Topic,"[ai, slop, content, ai slop, people, generated...","[social media, ai generated, facebook, ai, con...",[AI-Generated Content and Viral Trends],"[ai, content, ai slop, internet, ai generated,...","[slop, content, people, internet, use, like, i...",[Experiences like this — staring at a collecti...
1,0,57,0_cola_coca_coca cola_ad,Coca-Cola's AI Holiday Advertising,"[cola, coca, coca cola, ad, company, holiday, ...","[coca cola, cola, coca, commercial, advertisin...",[Coca-Cola's AI Holiday Advertising],"[coca cola, company, ads, artists, ai technolo...","[cola, ad, company, holiday, commercial, backl...","[- ^ ""Coca-Cola causes controversy with AI-gen..."
2,1,44,1_trump_donald trump_donald_posted,Trump AI-Generated Campaign Images,"[trump, donald trump, donald, posted, images, ...","[trump posted, donald, donald trump, trump, de...",[Trump AI-Generated Campaign Images],"[trump, donald trump, donald, images, mcdonald...","[images, photo, women, image, young, president...",[Taylor Swift fans are not endorsing Donald Tr...
3,2,37,2_experience_wonka_event_factory,Controversial Wonka-themed Event,"[experience, wonka, event, factory, children, ...","[wonka, charlie, pure imagination, roald dahl,...",[Controversial Wonka-themed Event],"[wonka, event, factory, children, sinclair, im...","[experience, event, factory, children, love, p...","[Held at the Box Hub, a warehouse event space ..."
4,3,35,3_2024_november 2024_august 2024_november,November 2024 News Updates,"[2024, november 2024, august 2024, november, a...","[november 2024, october 2024, july 2024, septe...",[November 2024 News Updates],"[2024, november 2024, august 2024, october 202...","[, , , , , , , , , ]","[- ^ Kalita, Parash Jyoti (1 November 2024)., ..."
...,...,...,...,...,...,...,...,...,...,...
80,79,6,79_wyp100_ign_twitter_bankhurst,IGN UK News and Staff,"[wyp100, ign, twitter, bankhurst, wesley uk, u...","[wesley uk, news editor, uk news, reporter, we...",[IGN UK News and Staff],"[wyp100, ign, bankhurst, wesley uk, uk news, n...","[reporter, news, writer, , , , , , , ]","[Find him on Twitter at @wyp100., Wesley is th..."
81,80,6,80_revenue_make money_money_make,Income Generation and Revenue Strategies,"[revenue, make money, money, make, people don,...","[generate revenue, make money, earn, revenue, ...",[Income Generation and Revenue Strategies],"[revenue, make money, people don, generate rev...","[revenue, money, economics, viewers, interview...",[Getachew claims viewers can “earn between $25...
82,81,5,81_404 media_404_media_media jason,404 Media and Jason Koebler,"[404 media, 404, media, media jason, telling, ...","[404 media, 404, media, media jason, jason, st...",[404 Media and Jason Koebler],"[404 media, 404, media, media jason, middle, j...","[middle, story, , , , , , , , ]","[404 Media., 404 Media., 404 Media.]"
83,82,5,82_ops_black ops_black_duty black,AI Use in Call of Duty,"[ops, black ops, black, duty black, loading sc...","[black ops, ai slop, ai generated, ai tools, a...",[AI Use in Call of Duty],"[ops, black ops, duty black, screens, artifici...","[screens, loading, artificial intelligence, al...","[But so far, Black Ops 6 has proven generative..."


In [67]:
# Show only the custom label of every topic, in the row order of the
# topic overview table.
topic_info = topic_model.get_topic_info()
topic_info["CustomName"].tolist()

['Outlier Topic',
 "Coca-Cola's AI Holiday Advertising",
 'Trump AI-Generated Campaign Images',
 'Controversial Wonka-themed Event',
 'November 2024 News Updates',
 "Square Enix's Use of AI Art",
 'Dublin Halloween Parade Hoax',
 'Major New York News Coverage',
 'Examination of "Slop" in Economy and Culture',
 'Ethical and Creative Impacts of Generative AI',
 'Google Search Engine and SEO',
 'AI-Generated Low-Quality Content',
 'Facebook Spam and Monetization Tactics',
 'John Milton Freedom Foundation and Social Media',
 '2025 Retrieval Records',
 'AI-Generated Content Industry',
 'Advances in Large Language Models',
 'Angry Birds Block Quest Launch',
 'AI Investment Risks and Outlook',
 'Brand Strategy and Trends',
 'Evolution of Spam and Unwanted Content',
 'Misinformation and Extremism During Hurricanes',
 'AI-Driven Zombie Internet Era',
 'Positive Reassurance and Affirmation',
 'Viral Social Media Strategies',
 'Far-Right Media and Conspiracies',
 "Square Enix's AI and Tech Strate

In [68]:
# Hierarchy view of the topics, drawn with the custom labels.
hierarchy_fig = topic_model.visualize_hierarchy(custom_labels=True)
hierarchy_fig

In [69]:
# Topic heatmap. Unlike the other plots in this notebook it is drawn without
# custom_labels=True.
heatmap_fig = topic_model.visualize_heatmap()
heatmap_fig

In [73]:
# Look up the 5 topics that best match the query term, then display the
# keyword/weight list of the first one returned.
query_term = "slop"
similar_topics, similarity = topic_model.find_topics(query_term, top_n=5)
best_match = similar_topics[0]
topic_model.get_topic(best_match)

[('slop', 0.13814578004036843),
 ('slop slop', 0.0740457632875781),
 ('kind slop', 0.05873067566901936),
 ('slop economy', 0.05482755592537289),
 ('sector', 0.05482755592537289),
 ('perfect', 0.04936384219171873),
 ('economy', 0.0473016311535014),
 ('sloppers', 0.0473016311535014),
 ('food', 0.04257222947341317),
 ('kind', 0.035687195244359586)]

In [76]:
# Show every representation ("Main" plus each configured aspect) of topic 0.
inspected_topic = 0
topic_model.get_topic(inspected_topic, full=True)

{'Main': [('cola', 0.08447092669883464),
  ('coca', 0.08338977741428931),
  ('coca cola', 0.08338977741428931),
  ('ad', 0.03672804106733285),
  ('company', 0.03515853933317457),
  ('holiday', 0.03400377154998854),
  ('christmas', 0.03221030674700048),
  ('commercial', 0.028942297257233778),
  ('backlash', 0.0245502460908316),
  ('ads', 0.019743866683900116)],
 'KeyBERT': [('coca cola', 0.61511385),
  ('cola', 0.57094884),
  ('coca', 0.5154464),
  ('commercial', 0.48320517),
  ('advertising', 0.4608228),
  ('ads', 0.41460913),
  ('christmas', 0.4005837),
  ('marketing', 0.38741866),
  ('holiday', 0.36896348),
  ('artificial', 0.36696038)],
 'OpenAI': [("Coca-Cola's AI Holiday Advertising", 1)],
 'MMR': [('coca cola', 0.08338977741428931),
  ('company', 0.03515853933317457),
  ('ads', 0.019743866683900116),
  ('artists', 0.01548003148615951),
  ('ai technology', 0.014013560212587763),
  ('alex', 0.014013560212587763),
  ('creating', 0.01383649108967224),
  ('level work', 0.0135357473482

In [50]:
# `topic_distr` contains the distribution of topics in each document.
# The second return value is bound to a real name rather than `_`: in IPython,
# assigning to `_` shadows the kernel's last-output shortcut for the rest of
# the session.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
    documents, window=8, stride=4
)

100%|██████████| 1/1 [00:00<00:00,  4.53it/s]


In [78]:
# Project the embeddings to 2-D once, outside BERTopic, and reuse the result
# for the document scatter plot. random_state is pinned (same seed as
# `umap_model`) so the layout is reproducible across Restart & Run All;
# without it every run draws a different projection.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)
topic_model.visualize_documents(
    documents, reduced_embeddings=reduced_embeddings, custom_labels=True
)