In [None]:
!pip install bertopic[flair,gensim,spacy,use] transformers sentencepiece

Collecting bertopic[flair,gensim,spacy,use]
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting flair>=0.7 (from bertopic[flair,gensim,spacy,use])
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting gensim>=4.0.0 (from bertopic[flair,gensim,spacy,use])
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting boto3>=1.20.27 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading boto3-1.42.1-py3-none-any.whl.metadata (6.8 kB)
Collecting conllu<5.0.0,>=4.0 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting ftfy>=6.1.0 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.

### 1. Load comments for a video

In [None]:
from google.colab import drive
import pandas as pd
# Make Google Drive available under /content/drive so the CSV below can be read.
drive.mount('/content/drive')

# Identifier of the video whose preprocessed comments are loaded.
VIDEO_ID = 'j31dmodZ-5c'

# The comments CSV sits in a per-video folder on Drive.
df = pd.read_csv(
    filepath_or_buffer=f'/content/drive/MyDrive/CommentsData/{VIDEO_ID}/comments_preprocessed.csv'
)


Mounted at /content/drive


### 2. Fit BERTopic model

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan

# --- Embeddings ---------------------------------------------------------
# Sentence-transformer checkpoint used to embed every comment
# (author's note: MPNet worked better than MiniLM for this data).
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# --- Topic-term extraction ----------------------------------------------
# Unigrams and bigrams, English stop words removed, terms seen in fewer
# than 3 documents ignored.
vectorizer_model = CountVectorizer(
    min_df=3,
    ngram_range=(1, 2),
    stop_words="english",
)

# --- Dimensionality reduction -------------------------------------------
# Fixed random_state keeps the projection repeatable between runs.
umap_model = UMAP(
    metric="cosine",
    min_dist=0.0,
    n_components=12,
    n_neighbors=30,
    random_state=42,
)

# Smallest allowed cluster: 1% of the comment count, but never below 10.
min_cluster_size = max(len(df.index) // 100, 10)

# --- Clustering ---------------------------------------------------------
# "leaf" selection is used for finer-grained topics; prediction_data=True
# is kept from the original configuration.
hdbscan_model = hdbscan.HDBSCAN(
    cluster_selection_method="leaf",
    metric="euclidean",
    min_cluster_size=min_cluster_size,
    min_samples=3,
    prediction_data=True,
)

# --- Topic model --------------------------------------------------------
# Wire the custom components into BERTopic.
topic_model = BERTopic(
    calculate_probabilities=True,
    verbose=True,
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
)

# Fit on the cleaned comment text; `topics` and `probs` are reused by later cells.
topics, probs = topic_model.fit_transform(df['cleanedCommentText'])


2025-12-02 21:38:31,475 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/508 [00:00<?, ?it/s]

2025-12-02 21:39:10,404 - BERTopic - Embedding - Completed ✓
2025-12-02 21:39:10,405 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-02 21:39:48,476 - BERTopic - Dimensionality - Completed ✓
2025-12-02 21:39:48,478 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-02 21:39:51,038 - BERTopic - Cluster - Completed ✓
2025-12-02 21:39:51,044 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-02 21:39:51,518 - BERTopic - Representation - Completed ✓


### 4. Add topic_id to each comment

In [None]:
print(topic_model.get_topic_info())

# Topic assigned to each comment (-1 marks outliers in the topic table above).
df["topic_id"] = topics

# Also store how strongly each comment belongs to its assigned topic.
# get_representative_comments() ranks by this column; without it, that helper
# silently falls back to a shuffled sample instead of the most representative comments.
if getattr(probs, "ndim", 1) == 2:
    # One row per comment, one column per topic: pick the column of the assigned
    # topic. Outliers (-1) have no topic column, so they get 0.0.
    df["topic_probability"] = [
        float(row[topic]) if topic >= 0 else 0.0
        for row, topic in zip(probs, topics)
    ]
elif probs is not None:
    # Already one probability per comment.
    df["topic_probability"] = probs



    Topic  Count                                   Name  \
0      -1   7681              -1_robot_like_people_just   
1       0    850                  0_ai_data_like_robots   
2       1    586                 1_indian_india_ai_jobs   
3       2    584            2_privacy_house_home_hacked   
4       3    501                  3_maid_month_hire_500   
5       4    449           4_marques_video_videos_thank   
6       5    438         5_dishes_dishwasher_wash_clean   
7       6    424         6_humanoid_human_robots_shaped   
8       7    403         7_robot_terminator_movie_video   
9       8    386         8_creepy_looks_looks like_look   
10      9    378              9_tesla_driving_elon_self   
11     10    347           10_neo_hey neo_hey_neo robot   
12     11    333                         11_14_15_16_33   
13     12    278           12_robot_chores_house_vacuum   
14     13    249               13_robot_house_stove_gas   
15     14    248       14_vr_headset_vr headset_control 

### 5.a. Facebook BART large CNN model for summarization

In [None]:
import gc
import torch

# Free GPU memory before loading the next model.
# NOTE(review): objects still bound to notebook variables are not collected by
# this; `del` them first if the intent is to release their memory — confirm.
# Run Python garbage collection first so unreferenced objects are destroyed.
gc.collect()
# Ask PyTorch to release cached CUDA memory that is no longer in use.
torch.cuda.empty_cache()
# Collect CUDA memory that was shared through IPC and has since been released.
torch.cuda.ipc_collect()

In [None]:
from transformers import pipeline

# Summarization pipeline backed by facebook/bart-large-cnn.
# Use the first CUDA GPU when one is available and fall back to CPU (-1)
# otherwise, so the cell also runs on a CPU-only runtime.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)

Device set to use cuda:0


In [None]:
def get_representative_comments(
    topic_model, df, topic_id, text_col="clean_text", n=15, max_len=300
):
    """
    Pick up to `n` comments of one topic to feed into a summarizer.

    Args:
        topic_model: Accepted for call-site symmetry; not used by this function.
        df: DataFrame with a "topic_id" column and the text column `text_col`.
        topic_id: Topic whose comments are selected.
        text_col: Name of the column holding the comment text.
        n: Maximum number of comments to return.
        max_len: Comments longer than this many characters are dropped.

    Returns:
        List of comment strings. When `df` has a "topic_probability" column the
        comments are ordered from highest to lowest probability; otherwise they
        are shuffled with a fixed seed, so the order is arbitrary but repeatable.
    """
    # Keep only the rows assigned to the requested topic.
    in_topic = df["topic_id"] == topic_id
    candidates = df[in_topic]

    # Drop very long comments so a single one cannot dominate the summary.
    short_enough = candidates[text_col].str.len() <= max_len
    candidates = candidates[short_enough]

    if "topic_probability" not in candidates.columns:
        # No probabilities available: shuffle with a fixed seed.
        ranked = candidates.sample(frac=1, random_state=42)
    else:
        # Most probable members of the topic first.
        ranked = candidates.sort_values("topic_probability", ascending=False)

    return ranked[text_col].head(n).tolist()

def summarize_topic_bart(docs, max_len=80, min_len=25):
    """
    Summarize a topic's comments with the notebook-level `summarizer` pipeline.

    The comments are joined with single spaces into one input text.

    Args:
        docs: Sequence of comment strings belonging to one topic.
        max_len: Passed to the pipeline as `max_length` (summary upper bound).
        min_len: Passed to the pipeline as `min_length` (summary lower bound).

    Returns:
        The generated summary text, or a fixed placeholder message when `docs`
        is empty.
    """
    if len(docs) == 0:
        return "Not enough representative comments to summarize."

    combined = " ".join(docs)

    # Many joined comments can exceed the model's maximum input length;
    # truncation=True makes the pipeline cut the input instead of failing.
    result = summarizer(
        combined,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True
    )

    return result[0]["summary_text"]


In [None]:
from pprint import pprint

# Topic whose comments are summarized in this demo.
topic_id = 1

# Collect the comments for that topic and show them before summarizing.
docs_for_topic = get_representative_comments(
    topic_model=topic_model,
    df=df,
    topic_id=topic_id,
    text_col='cleanedCommentText',
)
pprint(docs_for_topic)

# Summarize with the BART pipeline and display the result.
summary = summarize_topic_bart(docs=docs_for_topic)
print(summary)

["so, it's powered by ai, actually indian!!!",
 'so a random indian guy will watch you all the time.',
 'its just outsourcing slave labor to mumbai call centers',
 "can't wait for the minimum wage wfh jobs as a tele operator",
 'think about this. the only way you can make this robot affordable, would be '
 'to pay third world people to operate .the robots so we basically don t have '
 'to look at our help ..',
 'a cleaning worker in india gets about 170 per month. so this company would '
 'proably make money letting these people remote controlle the houshold robot.',
 "why do i feel like there's just going to become this slave class of people "
 "who's job is just driving these robots in people's houses?",
 'ai all indians.',
 'ai stands for an indian',
 'i think we can have robots teleoperated from india they will have job and we '
 'will have robots d',
 'imagine if companies started hiring people in low wage countries to remotely '
 'control robots. you d think you owned a robot at 

In [None]:
def prepare_for_bart(docs):
    """
    Render comments as a dash-bulleted list, one comment per line.

    Args:
        docs: Iterable of comments.

    Returns:
        A single string in which every comment is prefixed with "- " and the
        lines are joined with newlines (empty string for no comments).
    """
    bullet_lines = ["- {}".format(comment) for comment in docs]
    return "\n".join(bullet_lines)


# Prefix the bullet list with a "tl;dr" cue. The bullet formatting comes from
# prepare_for_bart() so the helper and this cell cannot drift apart.
text = "tl;dr: \n" + prepare_for_bart(docs_for_topic)
print(text)

# truncation=True keeps an over-long prompt from exceeding the model's input limit.
summary = summarizer(
    text, max_length=80, min_length=20, do_sample=False, truncation=True
)[0]['summary_text']
print(summary)


tl;dr: 
- so, it's powered by ai, actually indian!!!
- so a random indian guy will watch you all the time.
- its just outsourcing slave labor to mumbai call centers
- can't wait for the minimum wage wfh jobs as a tele operator
- think about this. the only way you can make this robot affordable, would be to pay third world people to operate .the robots so we basically don t have to look at our help ..
- a cleaning worker in india gets about 170 per month. so this company would proably make money letting these people remote controlle the houshold robot.
- why do i feel like there's just going to become this slave class of people who's job is just driving these robots in people's houses?
- ai all indians.
- ai stands for an indian
- i think we can have robots teleoperated from india they will have job and we will have robots d
- imagine if companies started hiring people in low wage countries to remotely control robots. you d think you owned a robot at home, but in reality, you d just hav

### 5.b. Google Pegasus XSUM model for summarization

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint identifier shared by the Pegasus tokenizer and model below.
model_name = "google/pegasus-xsum"
# NOTE(review): `tokenizer` and `model` are generic global names that later
# cells rebind to other checkpoints; functions reading these globals use
# whichever model was loaded last.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# No device or dtype is requested here, unlike the later model-loading cells.
model = PegasusForConditionalGeneration.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
def summarize_pegasus(docs, max_len=60):
    """
    Summarize comments with the notebook-level `tokenizer` and `model`.

    Args:
        docs: Iterable of comment strings.
        max_len: Passed to `model.generate` as `max_length`.

    Returns:
        The decoded summary string of the first generated sequence.
    """
    # Format comments as bullet list
    text = "\n".join(f"- {d}" for d in docs)

    # Move the encoded inputs to the model's device. Leaving them on the CPU
    # while the model sits on the GPU raises
    # "Expected all tensors to be on the same device".
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_length=max_len,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Topic whose comments are summarized in this demo.
topic_id = 1

# This cell asks for up to 25 comments; later cells reuse `docs_for_topic`.
docs_for_topic = get_representative_comments(
    topic_model=topic_model,
    df=df,
    topic_id=topic_id,
    text_col='cleanedCommentText',
    n=25,
)
pprint(docs_for_topic)

# Summarize with the Pegasus helper and display the result.
summary = summarize_pegasus(docs=docs_for_topic)
print(summary)

["so, it's powered by ai, actually indian!!!",
 'so a random indian guy will watch you all the time.',
 'its just outsourcing slave labor to mumbai call centers',
 "can't wait for the minimum wage wfh jobs as a tele operator",
 'think about this. the only way you can make this robot affordable, would be '
 'to pay third world people to operate .the robots so we basically don t have '
 'to look at our help ..',
 'a cleaning worker in india gets about 170 per month. so this company would '
 'proably make money letting these people remote controlle the houshold robot.',
 "why do i feel like there's just going to become this slave class of people "
 "who's job is just driving these robots in people's houses?",
 'ai all indians.',
 'ai stands for an indian',
 'i think we can have robots teleoperated from india they will have job and we '
 'will have robots d',
 'imagine if companies started hiring people in low wage countries to remotely '
 'control robots. you d think you owned a robot at 

RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

### 5.c. FLAN T5-Large model for summarization

In [None]:
import gc
import torch

# Free GPU memory before loading the next model.
# NOTE(review): models still bound to notebook variables (e.g. `summarizer`,
# `model`) are not collected by this; `del` them first if the intent is to
# release their memory — confirm.
# Run Python garbage collection first so unreferenced objects are destroyed.
gc.collect()
# Ask PyTorch to release cached CUDA memory that is no longer in use.
torch.cuda.empty_cache()
# Collect CUDA memory that was shared through IPC and has since been released.
torch.cuda.ipc_collect()

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Checkpoint identifier shared by the FLAN-T5 tokenizer and model below.
# NOTE(review): this rebinds the global `model_name`, `tokenizer` and `model`
# that the Pegasus section also assigns.
model_name = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(model_name)

# Half precision when CUDA is available, full precision otherwise;
# device_map="auto" leaves device placement to the library.
# NOTE(review): the recorded output warns that `torch_dtype` is deprecated in
# favor of `dtype`.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
`torch_dtype` is deprecated! Use `dtype` instead!


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def format_comments_for_summary(comments):
    """
    Build the instruction prompt for the summarization model.

    Each comment has its double quotes removed and surrounding whitespace
    stripped, then becomes one "- " bullet line below a fixed instruction.

    Args:
        comments: Iterable of comment strings.

    Returns:
        The instruction text followed by the bullet list, as one string.
    """
    bullet_lines = []
    for comment in comments:
        without_quotes = comment.replace('"', '')
        bullet_lines.append("- " + without_quotes.strip())

    instruction = (
        "Summarize the main ideas and themes in these comments. "
        "Focus on shared opinions and recurring concerns:\n"
    )
    return instruction + "\n".join(bullet_lines)


def summarize_with_flan_large(comments, max_length=120):
    """
    Summarize comments with the notebook-level `tokenizer` and `model`.

    The prompt built by format_comments_for_summary() is printed, tokenized
    (truncated to 2048 tokens), moved to the model's device and decoded with
    beam search.

    Args:
        comments: Iterable of comment strings.
        max_length: Passed to `model.generate` as `max_length`.

    Returns:
        The decoded text of the first generated sequence.
    """
    prompt = format_comments_for_summary(comments)
    print(prompt)

    encoded = tokenizer(
        prompt,
        max_length=2048,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    generation_options = dict(
        max_length=max_length,
        min_length=40,
        num_beams=5,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    generated = model.generate(**encoded, **generation_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)




In [None]:
# Summarize the comments collected by the most recent cell that assigned
# `docs_for_topic`, then display the result.
summary = summarize_with_flan_large(docs_for_topic)
print(summary)

Summarize the main ideas and themes in these comments. Focus on shared opinions and recurring concerns:
- so, it's powered by ai, actually indian!!!
- so a random indian guy will watch you all the time.
- its just outsourcing slave labor to mumbai call centers
- can't wait for the minimum wage wfh jobs as a tele operator
- think about this. the only way you can make this robot affordable, would be to pay third world people to operate .the robots so we basically don t have to look at our help ..
- a cleaning worker in india gets about 170 per month. so this company would proably make money letting these people remote controlle the houshold robot.
- why do i feel like there's just going to become this slave class of people who's job is just driving these robots in people's houses?
- ai all indians.
- ai stands for an indian
- i think we can have robots teleoperated from india they will have job and we will have robots d
- imagine if companies started hiring people in low wage countries t

### 5.d. BART Large CNN SamSum model for summarization

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Checkpoint identifier shared by the BART-SamSum tokenizer and model below.
# NOTE(review): this rebinds the global `model_name`, `tokenizer` and `model`
# used by the earlier summarization sections.
model_name = "philschmid/bart-large-cnn-samsum"

tokenizer = BartTokenizer.from_pretrained(model_name)

# Half precision when CUDA is available, full precision otherwise;
# device_map="auto" leaves device placement to the library.
model = BartForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)


tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
def format_comments_as_chat(comments):
    """
    Render comments as a dialogue transcript, one speaker line per comment.

    Args:
        comments: Iterable of comment strings.

    Returns:
        Lines of the form "Commenter <k>: <comment>" (k counted from 1, comment
        stripped of surrounding whitespace) joined with newlines.
    """
    return "\n".join(
        f"Commenter {position}: {text.strip()}"
        for position, text in enumerate(comments, start=1)
    )

def summarize_with_bart_samsum(comments, max_length=120):
    """
    Summarize comments as a chat transcript with the notebook-level
    `tokenizer` and `model`.

    Args:
        comments: Iterable of comment strings.
        max_length: Passed to `model.generate` as `max_length`.

    Returns:
        The decoded summary string of the first generated sequence.
    """
    chat_text = "Summarize the following conversation:\n" + format_comments_as_chat(comments)

    # Never truncate to more tokens than the model has position embeddings for;
    # a longer input would index past the embedding table. 2048 remains the
    # upper bound for models that accept longer inputs.
    max_input_tokens = min(
        2048, getattr(model.config, "max_position_embeddings", 2048)
    )

    inputs = tokenizer(
        chat_text,
        max_length=max_input_tokens,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=5,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary


In [None]:
# Summarize the comments currently held in `docs_for_topic` with the
# chat-style helper and display the result.
summary = summarize_with_bart_samsum(docs_for_topic)
print(summary)

Summarize the following conversation:
Commenter 1: so, it's powered by ai, actually indian!!!
Commenter 2: so a random indian guy will watch you all the time.
Commenter 3: its just outsourcing slave labor to mumbai call centers
Commenter 4: can't wait for the minimum wage wfh jobs as a tele operator
Commenter 5: think about this. the only way you can make this robot affordable, would be to pay third world people to operate .the robots so we basically don t have to look at our help ..
Commenter 6: a cleaning worker in india gets about 170 per month. so this company would proably make money letting these people remote controlle the houshold robot.
Commenter 7: why do i feel like there's just going to become this slave class of people who's job is just driving these robots in people's houses?
Commenter 8: ai all indians.
Commenter 9: ai stands for an indian
Commenter 10: i think we can have robots teleoperated from india they will have job and we will have robots d
Commenter 11: imagine i

### Topic Title Generation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for title generation. It gets its own variable names, so it
# does not clash with the `tokenizer`/`model` globals of the summarization cells.
title_model_name = "google/flan-t5-base"
title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
# No device or dtype is requested here.
title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name)

def generate_title(summary, max_length=20):
    """
    Generate a short title for a summary with `title_tokenizer`/`title_model`.

    Args:
        summary: Text to generate a title for.
        max_length: Maximum number of newly generated tokens (passed to
            `generate` as `max_new_tokens`).

    Returns:
        The decoded title string of the first generated sequence.
    """
    prompt = f"Suggest a title for this text:\n\n{summary}\n\nTitle:"

    # Keep the inputs on the same device as the model, as the summarization
    # helpers do, so this still works if the model is moved to a GPU.
    inputs = title_tokenizer(prompt, return_tensors="pt").to(title_model.device)
    outputs = title_model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_beams=5,
        no_repeat_ngram_size=3
    )

    return title_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Generate a title from whichever summary was assigned to `summary` last.
title = generate_title(summary)
print("Generated Title:", title)

Generated Title: Indian tele operators to drive robots


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint identifier shared by the Mistral tokenizer and model below.
# NOTE(review): this rebinds the global `model_name`, `tokenizer` and `model`;
# the summarize_* helpers that read these globals would use this causal LM if
# called after this cell.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Always loads in half precision; device_map="auto" leaves device placement to
# the library.
# NOTE(review): the recorded output warns that `torch_dtype` is deprecated in
# favor of `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

