# BERTOPIC from cluster unit entities 

In [2]:
from local_host_tester import app

In [3]:
app.get("/scraper_cluster").json()

[{'cluster_entity_id': '68e10e7f9f0e01b7def2e413',
  'created_at': '2025-10-04T12:09:12.453000+00:00',
  'deleted_at': None,
  'id': '68e10e689f0e01b7def2e411',
  'scraper_entity_id': '68e10e699f0e01b7def2e412',
  'stages': {'cluster_prep': 'completed',
   'clustering': 'initialized',
   'initialized': 'completed',
   'scraping': 'completed'},
  'updated_at': '2025-10-04T12:10:05.342000+00:00',
  'user_id': '68ac7022e24c87692ba648f4'},
 {'cluster_entity_id': '68e13064c04b5f3e7ed60c9d',
  'created_at': '2025-10-04T14:32:43.391000+00:00',
  'deleted_at': None,
  'id': '68e1300bc04b5f3e7ed60c93',
  'scraper_entity_id': '68e1300fc04b5f3e7ed60c94',
  'stages': {'cluster_prep': 'completed',
   'clustering': 'initialized',
   'initialized': 'completed',
   'scraping': 'completed'},
  'updated_at': '2025-10-04T14:34:41.389000+00:00',
  'user_id': '68ac7022e24c87692ba648f4'}]

In [4]:
scraper_cluster_id = "68e1300bc04b5f3e7ed60c93"

In [5]:
cluster_unit_entities = app.post("/clustering/get_cluster_units", json_data={"scraper_cluster_id": scraper_cluster_id}).json()["cluster_unit_entities"]
len(cluster_unit_entities)

2536

In [6]:
cluster_unit_entities[0]

{'author': '[deleted]',
 'cluster_entity_id': '68e13064c04b5f3e7ed60c9d',
 'comment_post_id': '68c7ed68e677aa3158ca62e8',
 'created_at': '2025-10-04T14:34:12.181000+00:00',
 'created_utc': 1582885910,
 'deleted_at': None,
 'downvotes': 0,
 'enriched_comment_thread_text': None,
 'id': '68e13064c04b5f3e7ed60c9f',
 'post_id': '68c7ed68e677aa3158ca62f3',
 'reddit_id': 'fj02e8m',
 'text': '[deleted]',
 'thread_path_text': ["I'm a comedian with partial hearing loss, and had a VERY weird heckle related to it. Subtitles in the video. [NSFW]\n"],
 'type': 'comment',
 'updated_at': None,
 'upvotes': 14,
 'usertag': None}

In [7]:
# Number of empty text fields
sorted([cluster_unit for cluster_unit in cluster_unit_entities if not cluster_unit["text"]], key=lambda x: x["upvotes"], reverse=True)

[]

In [8]:
sorted([cluster_unit for cluster_unit in cluster_unit_entities if "[deleted]" in cluster_unit["text"]], key=lambda x: x["upvotes"], reverse=True)

[{'author': '[deleted]',
  'cluster_entity_id': '68e13064c04b5f3e7ed60c9d',
  'comment_post_id': '68c81fee1b7cdb8490b40334',
  'created_at': '2025-10-04T14:34:34.132000+00:00',
  'created_utc': 1698332996,
  'deleted_at': None,
  'downvotes': 0,
  'enriched_comment_thread_text': None,
  'id': '68e1307ac04b5f3e7ed6147b',
  'post_id': '68c81fee1b7cdb8490b40379',
  'reddit_id': 'k6jpps5',
  'text': '[deleted]',
  'thread_path_text': ['2yo learning signs appropriate?\nhello! I’m going to start by saying that I nor anyone in my family or extended circle are Deaf or Hard of Hearing. for some reason this subreddit keeps popping up and after reading some posts I have a couple of questions I hope someone would be willing to answer because I am now super worried about being disrespectful.\n\nI have a 2yo. around 1yr we started teaching her some signs to help her communicate before her speech developed; it’s been a really “popular” thing to do in parenting the last several years I feel like, and 

In [20]:
import os
from typing import List, Optional
import pandas as pd
from pydantic import BaseModel
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, BaseRepresentation
from bertopic.vectorizers import ClassTfidfTransformer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel



def get_standard_bertopic(model_setting_index=0, customer_model_settings = None):
    """
    each of the steps in bertopic are assigned using the model settings, in the future this can be replaced with a grid search system to evaluate different model settings, but this is the most basic version with standard parameters. each of the model settings is in a list
    :param model_setting_index: defines which of the hyperparamter settings to pick
    :return: 
    """
    model_settings = [{
        "language": "",
        "nr_topics": "None",
        "document_embedding": 'SentenceTransformer("all-MiniLM-L6-v2")',
        "reduce_dimensionality": "UMAP()",
        "clustering": "HDBSCAN()",
        "topic_tokenization": 'CountVectorizer(stop_words="english")',
        "topic_representation": 'ClassTfidfTransformer()',
        "representation_model": 'KeyBERTInspired()'
    }]
    
    if customer_model_settings is not None:
        for key in model_settings[0].keys():
            if key not in customer_model_settings.keys():
                raise Exception(f"Missing key: {key}")
        model_settings.append(customer_model_settings)
            
            
    # Step 1 - Extract embeddings
    embedding_model = model_settings[model_setting_index]["document_embedding"]
    
    # Step 2 - Reduce dimensionality
    umap_model = model_settings[model_setting_index]["reduce_dimensionality"]

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = model_settings[model_setting_index]["clustering"]

    # Step 4 - Tokenize topics
    vectorizer_model = model_settings[model_setting_index]["topic_tokenization"]

    # Step 5 - Create topic representation
    ctfidf_model = model_settings[model_setting_index]["topic_representation"]

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = model_settings[model_setting_index]["representation_model"]
    
    language = model_settings[model_setting_index]["language"]
    min_topic_size =  model_settings[model_setting_index].get("min_topic_size", "none")
    calculate_probabilities =  model_settings[model_setting_index].get("calculate_probabilities", False)
    verbose =  model_settings[model_setting_index].get("verbose", False)
    nr_topics =  model_settings[model_setting_index].get("nr_topics", "none")
    zeroshot_topic_list = []


    # All steps together
    topic_model = BERTopic(
        language=language,
        min_topic_size=min_topic_size,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose,
        nr_topics=nr_topics,
        zeroshot_topic_list=zeroshot_topic_list,
        embedding_model=embedding_model,          # Step 1 - Extract embeddings
        umap_model=umap_model,                    # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
        representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )
    print(topic_model.get_params())
    return topic_model

class BertParamters(BaseModel):
    language: str = "english"
    min_topic_size: int = None
    calculate_probabilities: bool = True
    verbose: bool = False
    nr_topics: Optional[int] = None
    zeroshot_topic_list: Optional[List[str]] = None
    sentence_transformer_model: str ="all-MiniLM-L6-v2",
    umap_n_components: int=2,
    umap_n_neighbors: int=5,
    umap_min_dist: float=0.35
    umap_random_state: int=42,
    clustering_min_samples: int=5,
    clustering_gen_min_span_tree: bool =True,
    clustering_prediction_data: bool=True,
    reduce_frequent_words: bool = True
    count_vectorizer_stop_words: List[str] = list(),
    keybert_top_n_words: int=20

    @classmethod
    def from_parameters(cls, 
                        language = "english",
                        min_topic_size=10,
                        calculate_probabilities=False,
                        verbose=False,
                        nr_topics=None,
                        zeroshot_topic_list=None,
                        sentence_transformer_model="all-MiniLM-L6-v2",
                        umap_n_components=2,
                        umap_n_neighbors=5,
                        umap_min_dist=0.35,
                        umap_random_state=42,
                        clustering_min_samples=5,
                        clustering_gen_min_span_tree=True,
                        clustering_prediction_data=True,
                        reduce_frequent_words= True,
                        count_vectorizer_stop_words = list(),
                        keybert_top_n_words=20):
        return cls(
            language = language,
            min_topic_size=min_topic_size,
            calculate_probabilities=calculate_probabilities,
            verbose=verbose,
            nr_topics=nr_topics,
            zeroshot_topic_list=zeroshot_topic_list,
            sentence_transformer_model=sentence_transformer_model,
            umap_n_components=umap_n_components,
            umap_n_neighbors=umap_n_neighbors,
            umap_min_dist=umap_min_dist,
            umap_random_state=umap_random_state,
            clustering_min_samples=clustering_min_samples,
            clustering_gen_min_span_tree=clustering_gen_min_span_tree,
            clustering_prediction_data=clustering_prediction_data,
            reduce_frequent_words=reduce_frequent_words,
            count_vectorizer_stop_words = count_vectorizer_stop_words,
            keybert_top_n_words=keybert_top_n_words
            )
        
    
    def get_betopic(self):
        #return BERTopic()
        topic_model =  BERTopic(
            language=self.language,
            min_topic_size=self.min_topic_size,
            calculate_probabilities=self.calculate_probabilities,
            verbose=self.verbose,
            nr_topics=self.nr_topics,
            zeroshot_topic_list=self.zeroshot_topic_list,
            embedding_model=SentenceTransformer(self.sentence_transformer_model),  # Step 1 - Extract embeddings
            umap_model=UMAP(n_components=self.umap_n_components, n_neighbors=self.umap_n_neighbors, min_dist=self.umap_min_dist, random_state=self.umap_random_state), # Step 2 - Reduce dimensionality
            hdbscan_model=HDBSCAN(min_samples=self.clustering_min_samples, gen_min_span_tree=self.clustering_gen_min_span_tree, prediction_data=self.clustering_prediction_data),  # Step 3 - Cluster reduced embeddings
            vectorizer_model=CountVectorizer(stop_words=self.count_vectorizer_stop_words),   # Step 4 - Tokenize topics
            ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=self.reduce_frequent_words),  # Step 5 - Extract topic words

            representation_model=KeyBERTInspired(top_n_words=self.keybert_top_n_words) # Step 6 - (Optional) Fine-tune topic represenations
        )
        return topic_model

def train_bertopic_model(documents: List[str], bert_parameters: BertParamters):
    """
    trains the bertopic model from the documents that are given as a parameter
    :param documents: an array of documents to get topics made of
    :param model_setting_index: specifies which batch of hyperparameters to pick that define bertopic. Standard = 0
    :return: 
    """
    topic_model = bert_parameters.get_betopic()
    print(topic_model.get_params())
    topics, props = topic_model.fit_transform(documents)
    return topic_model, topics, props
    
def get_topics_that_relate_to_word(topic_model: BERTopic, word_to_relate: str):
    """
    this function finds the topics that relate most to the input words given which is word_to_relate
    :param topic_model: 
    :return: 
    """
    similar_topics, similarities = topic_model.find_topics(word_to_relate, top_n=3)
    for topic_nr, similarity in zip(similar_topics, similarities):
        print(f"topic_nr: {topic_nr} | similarity to {word_to_relate}: {similarity}")
        print(f"words with similarities for topic:")
        print(topic_model.get_topic(topic_nr))
        

def get_coherence(df_, content_col_name, _topic_model, _topics_numbers):
    # add the new topic numbers to the "topic" column
    df_['topic'] = _topics_numbers
    documents_per_topic = df.groupby(['topic'], as_index=False).agg({content_col_name: ' '.join})
    cleaned_docs = _topic_model._preprocess_text(documents_per_topic[content_col_name].values)
    
    # Extract vectorizer and analyzer from BERTopic
    vectorizer = _topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    
    
    # Extract features for Topic Coherence evaluation
    #words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in _topic_model.get_topic(topic)] 
                   for topic in range(len(set(df_["topic"]))-1)]
    
    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words, 
                                     texts=tokens, 
                                     corpus=corpus,
                                     dictionary=dictionary, 
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

def get_topic_diversity(topic_model):
    unique_words = set()
    total_words = 0
    
    for topic_nr, topic_similarity_word_list in topic_model.topic_representations_.items():
        # the first topic is not a topic, these are the outlier documents, so we should disregard these
        if topic_nr == -1:
            continue
        for topic_word, similarity in topic_similarity_word_list:
            
            total_words += 1
            unique_words.add(topic_word)
            
    topic_diversity = len(unique_words)/total_words
    return topic_diversity

def show_bertopic_evaluations(df, content_col_name, bert_parameters: BertParamters, iteration=0):
    print("iteration = ", iteration)
    text_data_list = df[content_col_name]
    _topic_model_content, _topics_numbers, _probs_numbers = train_bertopic_model(text_data_list,
                                                                                 bert_parameters=bert_parameters)
    print("total number of topics = ", len(_topic_model_content.get_topic_info()))
    print("_topic_model_content.get_topic_info(-1) = ", _topic_model_content.get_topic_info(-1))
    print("documents without a topic = ", _topic_model_content.get_topic_info(-1)["Count"].values[0])
    print("topic diversity = ", get_topic_diversity(_topic_model_content))
    print("topic coherence = ", get_coherence(df, content_col_name, _topic_model_content, _topics_numbers))
    for topic_map_nr in range(1000):
        if not os.path.exists(f'plots/intertopic_{topic_map_nr}.jpg'):
            print(f"iteration_nr = {topic_map_nr}")
            iteration = topic_map_nr
            break
    print(f"writing the new image to 'plots/intertopic_{iteration}.jpg' & plots/topicbar_{iteration}.jpg ")
    _topic_model_content.visualize_topics().write_image(f'plots/intertopic_{iteration}.jpg')
    _topic_model_content.visualize_barchart(top_n_topics=60, n_words=10).write_image(f"plots/topicbar_{iteration}.jpg")
    return _topic_model_content, _topics_numbers, _probs_numbers


stopwords = ["post_title:"] # We have to disregard this because we use it in every post



        
custom_bert_parameters = {
    "language": "english",
    #"min_topic_size": 100,
    "calculate_probabilities": True,
    "verbose": False,
    "nr_topics": "none",
    "zeroshot_topic_list"
    "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
    "reduce_dimensionality": UMAP(n_components=5, n_neighbors=5, min_dist=0.35, random_state=42),
    "clustering": HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True),
    "topic_tokenization": CountVectorizer(stop_words=stopwords),
    "topic_representation": ClassTfidfTransformer(reduce_frequent_words=True),
    "representation_model": KeyBERTInspired(top_n_words=20)
}
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
df = pd.DataFrame(cluster_unit_entities)
topic_, topicnum_, probs_ = show_bertopic_evaluations(df=df, content_col_name="text", bert_parameters=BertParamters.from_parameters())


 

iteration =  0
{'calculate_probabilities': False, 'ctfidf_model': ClassTfidfTransformer(reduce_frequent_words=True), 'embedding_model': SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), 'hdbscan_model': HDBSCAN(gen_min_span_tree=True, min_samples=5, prediction_data=True), 'language': None, 'low_memory': False, 'min_topic_size': 10, 'n_gram_range': (1, 1), 'nr_topics': None, 'representation_model': KeyBERTInspired(top_n_words=20), 'seed_topic_list': None, 'top_n_words': 10, 'umap_model': UMAP(min_dist=0.35, n_neighbors=5, random_state=42), 'vectorizer_model': CountVectorizer(stop_words=[]), 'verbose': False, 

ValueError: unable to interpret topic as either a list of tokens or a list of ids

In [19]:
topic_b = BertParamters.from_parameters().get_betopic()
topic_b.fit_transform(df["text"])

([5,
  -1,
  8,
  1,
  1,
  1,
  0,
  -1,
  0,
  0,
  0,
  -1,
  36,
  5,
  5,
  0,
  0,
  -1,
  0,
  0,
  1,
  0,
  -1,
  0,
  0,
  0,
  0,
  21,
  0,
  0,
  3,
  3,
  35,
  0,
  19,
  0,
  0,
  0,
  1,
  0,
  0,
  5,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -1,
  -1,
  0,
  1,
  0,
  0,
  0,
  -1,
  0,
  21,
  0,
  33,
  -1,
  -1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  3,
  35,
  0,
  5,
  0,
  0,
  0,
  3,
  -1,
  35,
  0,
  0,
  -1,
  0,
  39,
  0,
  5,
  0,
  0,
  1,
  0,
  0,
  0,
  35,
  0,
  1,
  0,
  0,
  0,
  21,
  0,
  -1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -1,
  0,
  0,
  0,
  0,
  0,
  3,
  -1,
  -1,
  0,
  0,
  -1,
  15,
  0,
  1,
  0,
  15,
  -1,
  1,
  1,
  1,
  -1,
  15,
  -1,
  0,
  0,
  15,
  0,
  0,
  1,
  -1,
  -1,
  10,
  0,
  -1,
  1,
  -1,
  -1,
  10,
  1,
  3,
  10,
  10,
  10,
  10,
  -1,
  -1,
  0,
  10,
  -1,
  -1,

In [None]:
topic_m = BertParamters.from_parameters().get_betopic()

In [17]:
topic_m.fit_transform(df["text"])

([3,
  8,
  11,
  9,
  -1,
  9,
  -1,
  24,
  0,
  -1,
  -1,
  -1,
  -1,
  3,
  3,
  -1,
  0,
  1,
  -1,
  64,
  48,
  21,
  58,
  -1,
  -1,
  -1,
  -1,
  27,
  70,
  0,
  1,
  1,
  1,
  -1,
  -1,
  -1,
  0,
  110,
  9,
  0,
  -1,
  3,
  1,
  -1,
  108,
  76,
  -1,
  0,
  0,
  -1,
  58,
  58,
  0,
  -1,
  82,
  36,
  -1,
  -1,
  -1,
  -1,
  -1,
  53,
  13,
  -1,
  -1,
  -1,
  0,
  -1,
  76,
  0,
  -1,
  -1,
  32,
  0,
  0,
  0,
  21,
  -1,
  -1,
  68,
  -1,
  68,
  -1,
  1,
  62,
  -1,
  3,
  -1,
  68,
  -1,
  1,
  68,
  -1,
  68,
  68,
  24,
  68,
  -1,
  68,
  3,
  -1,
  96,
  48,
  104,
  -1,
  -1,
  62,
  -1,
  -1,
  82,
  96,
  21,
  8,
  -1,
  20,
  -1,
  104,
  -1,
  0,
  48,
  21,
  104,
  -1,
  82,
  -1,
  21,
  108,
  32,
  -1,
  -1,
  -1,
  1,
  69,
  -1,
  -1,
  69,
  0,
  69,
  69,
  -1,
  76,
  0,
  64,
  69,
  -1,
  1,
  15,
  -1,
  -1,
  -1,
  -1,
  31,
  32,
  -1,
  108,
  15,
  31,
  -1,
  33,
  56,
  -1,
  31,
  -1,
  64,
  -1,
  -1,
  21,
  32,
  -1,
  -1,
  -1,
  -

In [None]:
{'calculate_probabilities': False, 'ctfidf_model': ClassTfidfTransformer(), 'embedding_model': None, 'hdbscan_model': HDBSCAN(min_cluster_size=10, prediction_data=True), 'language': 'english', 'low_memory': False, 'min_topic_size': 10, 'n_gram_range': (1, 1), 'nr_topics': None, 'representation_model': None, 'seed_topic_list': None, 'top_n_words': 10, 'umap_model': UMAP(low_memory=False, metric='cosine', min_dist=0.0, n_components=5), 'vectorizer_model': CountVectorizer(), 'verbose': False, 'zeroshot_min_similarity': 0.7, 'zeroshot_topic_list': None}


In [32]:
df

Unnamed: 0,author,cluster_entity_id,comment_post_id,created_at,created_utc,deleted_at,downvotes,enriched_comment_thread_text,id,post_id,reddit_id,text,thread_path_text,type,updated_at,upvotes,usertag
0,acurrantafair,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62f3,2025-10-04T14:34:12.181000+00:00,1582873446,,0,,68e13064c04b5f3e7ed60c9e,68c7ed68e677aa3158ca62f3,faqub4,"I'm a comedian with partial hearing loss, and ...",[],post,,61,Video
1,[deleted],68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62e8,2025-10-04T14:34:12.181000+00:00,1582885910,,0,,68e13064c04b5f3e7ed60c9f,68c7ed68e677aa3158ca62f3,fj02e8m,[deleted],"[I'm a comedian with partial hearing loss, and...",comment,,14,
2,Indy_Pendant,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62e9,2025-10-04T14:34:12.181000+00:00,1582900725,,0,,68e13064c04b5f3e7ed60ca0,68c7ed68e677aa3158ca62f3,fj0igf1,Handled like a champ! Serious props for handli...,"[I'm a comedian with partial hearing loss, and...",comment,,7,
3,IrishLuigi,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62ea,2025-10-04T14:34:12.181000+00:00,1582898172,,0,,68e13064c04b5f3e7ed60ca1,68c7ed68e677aa3158ca62f3,fj0erw6,"Many laughs were had, your banter game is solid.","[I'm a comedian with partial hearing loss, and...",comment,,7,
4,PockettesMJV,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62eb,2025-10-04T14:34:12.181000+00:00,1582923259,,0,,68e13064c04b5f3e7ed60ca2,68c7ed68e677aa3158ca62f3,fj1mnww,\*claps in ASL aggressively\*,"[I'm a comedian with partial hearing loss, and...",comment,,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2531,BritishDeafMan,68e13064c04b5f3e7ed60c9d,68e0ea2ba792cd3888c83191,2025-10-04T14:34:41.311000+00:00,1643478572,,0,,68e13081c04b5f3e7ed61681,68e0ea2ba792cd3888c83194,huqt0nt,1. Yes it is completely independent from Engli...,[Very basic questions\nI'm interested in langu...,comment,,5,Native
2532,CrazyNinjaJay,68e13064c04b5f3e7ed60c9d,68e0ea2ba792cd3888c83193,2025-10-04T14:34:41.311000+00:00,1643966703,,0,,68e13081c04b5f3e7ed61682,68e0ea2ba792cd3888c83194,hvjiieo,I would disagree with the other comment partia...,[Very basic questions\nI'm interested in langu...,comment,,1,
2533,takhana,68e13064c04b5f3e7ed60c9d,68c8214fd4e901a1fc3de9cc,2025-10-04T14:34:41.363000+00:00,1685552663,,0,,68e13081c04b5f3e7ed61683,68c8214fd4e901a1fc3de9cc,13wrwa7,Where do I find native BSL speakers?\nExcuse t...,[],post,,5,Question
2534,vintagelingstitches,68e13064c04b5f3e7ed60c9d,68c8214fd4e901a1fc3de9c9,2025-10-04T14:34:41.363000+00:00,1685558551,,0,,68e13081c04b5f3e7ed61684,68c8214fd4e901a1fc3de9cc,jmdgiuc,Have a look at zebra access I'm doing a course...,[Where do I find native BSL speakers?\nExcuse ...,comment,,4,


In [24]:
type(SentenceTransformer())

sentence_transformers.SentenceTransformer.SentenceTransformer

In [13]:
df

Unnamed: 0,author,cluster_entity_id,comment_post_id,created_at,created_utc,deleted_at,downvotes,enriched_comment_thread_text,id,post_id,reddit_id,text,thread_path_text,type,updated_at,upvotes,usertag,topic
0,acurrantafair,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62f3,2025-10-04T14:34:12.181000+00:00,1582873446,,0,,68e13064c04b5f3e7ed60c9e,68c7ed68e677aa3158ca62f3,faqub4,"I'm a comedian with partial hearing loss, and ...",[],post,,61,Video,0
1,[deleted],68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62e8,2025-10-04T14:34:12.181000+00:00,1582885910,,0,,68e13064c04b5f3e7ed60c9f,68c7ed68e677aa3158ca62f3,fj02e8m,[deleted],"[I'm a comedian with partial hearing loss, and...",comment,,14,,6
2,Indy_Pendant,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62e9,2025-10-04T14:34:12.181000+00:00,1582900725,,0,,68e13064c04b5f3e7ed60ca0,68c7ed68e677aa3158ca62f3,fj0igf1,Handled like a champ! Serious props for handli...,"[I'm a comedian with partial hearing loss, and...",comment,,7,,7
3,IrishLuigi,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62ea,2025-10-04T14:34:12.181000+00:00,1582898172,,0,,68e13064c04b5f3e7ed60ca1,68c7ed68e677aa3158ca62f3,fj0erw6,"Many laughs were had, your banter game is solid.","[I'm a comedian with partial hearing loss, and...",comment,,7,,-1
4,PockettesMJV,68e13064c04b5f3e7ed60c9d,68c7ed68e677aa3158ca62eb,2025-10-04T14:34:12.181000+00:00,1582923259,,0,,68e13064c04b5f3e7ed60ca2,68c7ed68e677aa3158ca62f3,fj1mnww,\*claps in ASL aggressively\*,"[I'm a comedian with partial hearing loss, and...",comment,,8,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2531,BritishDeafMan,68e13064c04b5f3e7ed60c9d,68e0ea2ba792cd3888c83191,2025-10-04T14:34:41.311000+00:00,1643478572,,0,,68e13081c04b5f3e7ed61681,68e0ea2ba792cd3888c83194,huqt0nt,1. Yes it is completely independent from Engli...,[Very basic questions\nI'm interested in langu...,comment,,5,Native,26
2532,CrazyNinjaJay,68e13064c04b5f3e7ed60c9d,68e0ea2ba792cd3888c83193,2025-10-04T14:34:41.311000+00:00,1643966703,,0,,68e13081c04b5f3e7ed61682,68e0ea2ba792cd3888c83194,hvjiieo,I would disagree with the other comment partia...,[Very basic questions\nI'm interested in langu...,comment,,1,,26
2533,takhana,68e13064c04b5f3e7ed60c9d,68c8214fd4e901a1fc3de9cc,2025-10-04T14:34:41.363000+00:00,1685552663,,0,,68e13081c04b5f3e7ed61683,68c8214fd4e901a1fc3de9cc,13wrwa7,Where do I find native BSL speakers?\nExcuse t...,[],post,,5,Question,5
2534,vintagelingstitches,68e13064c04b5f3e7ed60c9d,68c8214fd4e901a1fc3de9c9,2025-10-04T14:34:41.363000+00:00,1685558551,,0,,68e13081c04b5f3e7ed61684,68c8214fd4e901a1fc3de9cc,jmdgiuc,Have a look at zebra access I'm doing a course...,[Where do I find native BSL speakers?\nExcuse ...,comment,,4,,-1


In [22]:
# Option 1: unpack the tuple from iterrows
for index, row in df.loc[df["topic"] == 0].iterrows():
    print("index = ", index)
    print(row["text"])
    print("----\n\n")

index =  0
I'm a comedian with partial hearing loss, and had a VERY weird heckle related to it. Subtitles in the video. [NSFW]

----


index =  8
Two bills for open captions (on-screen subtitles) in New York and Washington state!
It has been a VERY busy past two weeks for us: BOTH New York AND Washington state have bills for open captions in movie theaters! Rather than post a long Reddit post, pointing you to two updates that went out on the Caption Action 3 petition. The Washington state update just went out. Washington's bill has a hearing very soon, details are in the update. Questions? Post a comment and we will reply. 

1. New York State: [https://www.change.org/p/movie-theaters-open-captions-subtitles-are-better-for-everyone/u/33174905](https://www.change.org/p/movie-theaters-open-captions-subtitles-are-better-for-everyone/u/33174905)

2. Washington state: [https://www.change.org/p/movie-theaters-open-captions-subtitles-are-better-for-everyone/u/33210797](https://www.change.org/p