In [1]:
try:
  from google.colab import drive
  drive.mount('/content/drive')
except ModuleNotFoundError:
  print("Not running on colab...")

RUN_DASHBOARD = True

Mounted at /content/drive


### Install dependencies

In [2]:
!pip install -U gradio youtube-transcript-api google-api-python-client


Collecting gradio
  Downloading gradio-6.0.2-py3-none-any.whl.metadata (16 kB)
Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Collecting gradio-client==2.0.1 (from gradio)
  Downloading gradio_client-2.0.1-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-6.0.2-py3-none-any.whl (21.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.6/21.6 MB[0m [31m118.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gradio_client-2.0.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.1/485.1 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api, gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installa

In [3]:
import pandas as pd
from io import BytesIO
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload # Import needed for downloading and uploading
import re # Keep this import as it's used by clean_comment
from typing import List, Optional

def drive_to_df(folder_id: str, filename: str, column_names: Optional[List[str]] = None) -> pd.DataFrame:
    # Find the file ID of 'comments.csv' in the folder_id
    query = f"name='{filename}' and '{folder_id}' in parents"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = response.get('files', [])

    if files:
        file_id = files[0].get('id')
        print(f"Found '{filename}' with ID: {file_id}")

        # Download the file content
        request = service.files().get_media(fileId=file_id)
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print(f"Download {int(status.progress() * 100)}%.")

        fh.seek(0) # Reset buffer position to the beginning

        # Read the downloaded content into a pandas DataFrame
        if column_names:
            df = pd.read_csv(fh, header=None, names=column_names)
        else:
            df = pd.read_csv(fh)
        print("Comments loaded successfully into df.")
    else:
        print(f"Error: '{filename}' not found in folder {folder_id}. Please ensure the file is uploaded.")
        df = pd.DataFrame() # Initialize empty DataFrame to avoid errors later
    return df


def df_to_drive(df, folder_id, filename):
    """
    Uploads a pandas DataFrame as a CSV file to Google Drive.

    Args:
        df (pd.DataFrame): The DataFrame to upload.
        folder_id (str): The ID of the Google Drive folder where the file will be saved.
        filename (str): The desired filename for the CSV file in Google Drive.
    """
    # Convert DataFrame to CSV in-memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False, encoding='utf-8')
    csv_buffer.seek(0) # Rewind the buffer to the beginning

    # Prepare metadata for the new file
    file_metadata = {
        'name': filename,
        'parents': [folder_id]
    }

    # Create a MediaIoBaseUpload object from the BytesIO buffer
    media_body = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)

    # Upload the file
    try:
        uploaded_file = service.files().create(
            body=file_metadata,
            media_body=media_body,
            fields='id'
        ).execute()
        print(f"Uploaded '{filename}' to Drive with ID: {uploaded_file.get('id')}")
    except Exception as e:
        print(f"Error uploading '{filename}' to Drive: {e}")


def clean_comment(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"[^a-zA-Z0-9\s.,!?']", " ", text)  # keep punctuation
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text


In [4]:
DRIVE_FOLDER_ID = '1NEXXEvuBZLB1MFqRCkeGbJfYn4wbA2SX'
YOUTUBE_VIDEO_ID = 'zozEm4f_dlw'
YOUTUBE_API_KEY = "AIzaSyBpSCBmvcy9LAQz2Q3cVExHk_y_dbk6zss"

### Set up Drive

In [5]:
# Connect to Drive
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.colab import auth
auth.authenticate_user()

import google.auth
creds, _ = google.auth.default()
service = build('drive', 'v3', credentials=creds)


In [6]:
# Create directory with name: youtube_video_id if not already exists
def setup_drive(youtube_video_id, drive_folder_id):
    query = f"name='{youtube_video_id}' and mimeType='application/vnd.google-apps.folder' and '{drive_folder_id}' in parents"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = response.get('files', [])

    if files:
        # Folder already exists
        video_folder_id = files[0].get('id')
        print(f"Using existing folder with ID: {video_folder_id}")
    else:
        # Create new folder
        file_metadata = {
            'name': youtube_video_id,
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [drive_folder_id]
        }
        file = service.files().create(body=file_metadata, fields='id').execute()
        video_folder_id = file.get('id')
        print(f"Created new folder with ID: {video_folder_id}")
    return video_folder_id

if not RUN_DASHBOARD:
  VIDEO_FOLDER_ID = setup_drive(YOUTUBE_VIDEO_ID, DRIVE_FOLDER_ID)

### Extract video transcript and save to Drive


In [7]:
from typing import List, Dict, Optional
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
import pandas as pd # Import pandas for DataFrame creation


def fetch_english_transcript(video_id: str) -> Optional[List[Dict]]:
    """
    Fetches the English transcript for a YouTube video.

    Preference order:
        1. Manually uploaded English transcript
        2. Auto-generated English transcript
        3. None (if no transcript found or disabled)

    Args:
        video_id (str): The YouTube video ID (e.g., 'dQw4w9WgXcQ').

    Returns:
        list[dict] or None: List of transcript segments as dicts like:
            [
                {"text": "Hello world", "start": 0.0, "duration": 3.5},
                ...
            ]
        Returns None if no transcript is available.
    """
    try:
        # Initialize API
        ytt_api = YouTubeTranscriptApi()

        # Get all available transcripts for this video
        transcript_list = ytt_api.list(video_id)

        # Try manually created English transcript first
        try:
            transcript = transcript_list.find_manually_created_transcript(['en'])
        except NoTranscriptFound:
            # Fallback to auto-generated English transcript
            try:
                transcript = transcript_list.find_generated_transcript(['en'])
            except NoTranscriptFound:
                return None  # Neither manual nor auto-generated transcript found

        # Fetch the transcript data and return as list of dicts
        return transcript.fetch()

    except (TranscriptsDisabled, VideoUnavailable):
        # Transcripts disabled or video unavailable
        return None
    except Exception as e:
        # Handle unexpected errors safely
        print(f"Error fetching transcript for video {video_id}: {e}")
        return None

if not RUN_DASHBOARD:
    transcript_data = fetch_english_transcript(YOUTUBE_VIDEO_ID)

    if transcript_data:
        df_transcript = pd.DataFrame(transcript_data)
        print("Transcript loaded successfully into DataFrame.")

        # Upload to drive using the existing df_to_drive function
        df_to_drive(df_transcript, VIDEO_FOLDER_ID, 'transcript.csv')
    else:
        print(f"No English transcript found for video ID: {YOUTUBE_VIDEO_ID}")


### Extract video metadata and save to Drive


In [8]:
from googleapiclient.discovery import build
import pandas as pd

# Build the YouTube API service
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

def get_video_metadata(video_id: str) -> pd.DataFrame:
    """
    Fetches video metadata (title, description, published date, thumbnail URL, category ID)
    for a given YouTube video ID.

    Args:
        video_id (str): The YouTube video ID.

    Returns:
        pd.DataFrame: A DataFrame containing the video metadata.
    """
    request = youtube.videos().list(
        part="snippet",
        id=video_id
    )
    response = request.execute()

    if response['items']:
        snippet = response['items'][0]['snippet']
        metadata = {
            'video_id': video_id,
            'title': snippet.get('title'),
            'description': snippet.get('description'),
            'publishedAt': snippet.get('publishedAt'),
            'thumbnail_url': snippet['thumbnails']['default']['url'] if 'thumbnails' in snippet else None,
            'categoryId': snippet.get('categoryId')
        }
        return pd.DataFrame([metadata])
    else:
        print(f"No video found with ID: {video_id}")
        return pd.DataFrame()


if not RUN_DASHBOARD:
    # Get video metadata
    df_metadata = get_video_metadata(YOUTUBE_VIDEO_ID)

    if not df_metadata.empty:
        print("Video metadata loaded successfully into DataFrame.")
        # Upload to drive using the existing df_to_drive function
        df_to_drive(df_metadata, VIDEO_FOLDER_ID, 'video_metadata.csv')
    else:
        print(f"Failed to retrieve metadata for video ID: {YOUTUBE_VIDEO_ID}")


### Extract comments from a youtube video and save to Drive

In [9]:
import csv
import time
import requests
import pandas as pd # Import pandas for DataFrame creation

def save_youtube_comments_to_drive(
    video_id: str,
    api_key: str,
    service,
    drive_folder_id: str,
    max_retries: int = 5,
    backoff_base: float = 2.0,
):
    """
    Fetches YouTube comments for a given video ID and stores them in a pandas DataFrame,
    then uploads the DataFrame as a CSV file to a Google Drive folder using df_to_drive.

    Args:
        video_id (str): YouTube video ID
        api_key (str): YouTube Data API v3 key
        service: Authenticated Google Drive API service object
        drive_folder_id (str): Destination Google Drive folder ID
        max_retries (int): Max retries per failed request
        backoff_base (float): Base for exponential backoff (seconds)
    """
    base_url = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
        "textFormat": "plainText",
        "key": api_key,
    }

    session = requests.Session()
    next_page_token = None
    total_comments = 0
    all_comments_data = [] # List to store all comment rows

    while True:
        if next_page_token:
            params["pageToken"] = next_page_token
        else:
            params.pop("pageToken", None)

        for attempt in range(max_retries):
            try:
                resp = session.get(base_url, params=params, timeout=10)
                if resp.status_code == 200:
                    data = resp.json()
                    break
                else:
                    print(f"Warning: HTTP {resp.status_code}, retrying...")
            except requests.exceptions.RequestException as e:
                print(f"Network error: {e}, retrying...")

            sleep_time = backoff_base ** attempt
            print(f"Sleeping {sleep_time:.1f}s before retry...")
            time.sleep(sleep_time)
        else:
            raise RuntimeError("Max retries exceeded while fetching comments.")

        # Parse comments
        items = data.get("items", [])
        for item in items:
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            row = [
                snippet.get("authorDisplayName", ""),
                snippet.get("textDisplay", "").replace("\n", " "),
                snippet.get("likeCount", 0),
                snippet.get("publishedAt", ""),
            ]
            all_comments_data.append(row) # Append row to list
            total_comments += 1

        print(f"Fetched {len(items)} comments (total: {total_comments})")

        next_page_token = data.get("nextPageToken")
        if not next_page_token:
            break

        # Be polite — small delay to avoid rate limits
        time.sleep(0.3)

    # Create DataFrame from collected data with explicit column names
    df_comments = pd.DataFrame(all_comments_data, columns=['authorDisplayName', 'commentText', 'likeCount', 'publishedAt'])

    # Upload the DataFrame to Google Drive using df_to_drive
    print("Uploading to Google Drive...")
    df_to_drive(df_comments, drive_folder_id, "comments.csv")
    print(f"Total comments saved: {total_comments}")
    return df_comments


if not RUN_DASHBOARD:
    save_youtube_comments_to_drive(
        YOUTUBE_VIDEO_ID,
        YOUTUBE_API_KEY,
        service,
        VIDEO_FOLDER_ID
    )



### Data Preprocessing

In [10]:
import pandas as pd
import os

# Load comments CSV

def preprocess_comments(video_folder_id):
    df = drive_to_df(video_folder_id, 'comments.csv', column_names=['authorDisplayName', 'commentText', 'likeCount', 'publishedAt'])

    df['cleanedCommentText'] = df['commentText'].apply(clean_comment)

    # Remove exact duplicates
    df = df.drop_duplicates(subset='cleanedCommentText')

    # Remove very short comments (e.g. emojis only or less than 3 words)
    df = df[df['cleanedCommentText'].str.split().str.len() > 2]

    df_to_drive(df, video_folder_id, 'comments_preprocessed.csv')
    print(f"Cleaned comments saved to: {os.path.join(video_folder_id, 'comments_preprocessed.csv')}")

    return df

if not RUN_DASHBOARD:
    preprocess_comments(VIDEO_FOLDER_ID)


In [11]:
# For collecting data for multiple videos:

# VIDEO_ID_LIST = [
#     'teMKnxbd_O0',
#     'PssKpzB0Ah0',
#     'UiIRlg4Xr5w',
#     'YYQXk1t_JHM',
#     '7qriveAV7BY',
#     'fqyl5kbZ7Tw',
#     '7ARBJQn6QkM',
#     'hmtuvNfytjM'
# ]

# for YOUTUBE_VIDEO_ID in VIDEO_ID_LIST:
#     # Drive setup
#     VIDEO_FOLDER_ID = setup_drive(YOUTUBE_VIDEO_ID, DRIVE_FOLDER_ID)
#     # Transcript extraction
#     transcript_data = fetch_english_transcript(YOUTUBE_VIDEO_ID)
#     if transcript_data:
#         df_transcript = pd.DataFrame(transcript_data)
#         print("Transcript loaded successfully into DataFrame.")

#         # Upload to drive using the existing df_to_drive function
#         df_to_drive(df_transcript, VIDEO_FOLDER_ID, 'transcript.csv')
#     else:
#         print(f"No English transcript found for video ID: {YOUTUBE_VIDEO_ID}")
#     # Metadata extraction
#     df_metadata = get_video_metadata(YOUTUBE_VIDEO_ID)

#     if not df_metadata.empty:
#         print("Video metadata loaded successfully into DataFrame.")
#         # Upload to drive using the existing df_to_drive function
#         df_to_drive(df_metadata, VIDEO_FOLDER_ID, 'video_metadata.csv')
#     else:
#         print(f"Failed to retrieve metadata for video ID: {YOUTUBE_VIDEO_ID}")
#     # Comments Extraction
#     save_youtube_comments_to_drive(
#         YOUTUBE_VIDEO_ID,
#         YOUTUBE_API_KEY,
#         service,
#         VIDEO_FOLDER_ID
#     )
#     # Preprocess comments
#     preprocess_comments(VIDEO_FOLDER_ID)

Multilingual Detection

In [12]:
#install dependencies


def handle_multilingual_comments(df):
    from langdetect import detect
    from googletrans import Translator
    from langdetect import detect, LangDetectException
    from googletrans import Translator

    translator = Translator()

    # Drop rows with empty or invalid cleaned_comment
    df = df[df['cleaned_comment'].str.strip().str.len() > 3].copy()

    # Function to safely detect language
    def safe_detect(text):
        try:
            return detect(text)
        except LangDetectException:
            return "unknown"

    # Apply language detection
    df['language'] = df['cleaned_comment'].apply(safe_detect)

    # Translate if not English
    def translate_to_english(row):
        if row['language'] == 'en':
            return row['cleaned_comment']
        try:
            return translator.translate(row['cleaned_comment'], dest='en').text
        except Exception:
            return row['cleaned_comment']  # fallback to original if translation fails

    df['translated_comment'] = df.apply(translate_to_english, axis=1)

    # Optional: save to Drive
    df.to_csv("/content/drive/MyDrive/comments_translated.csv", index=False)

    print("✅ Translation complete. File saved to Drive.")
    print(df)

### Intent Classification (Praise, Question, Criticism, Humour, or Suggestion)

In [13]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m153.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F


load_path = "/content/drive/MyDrive/bertweet-intent-checkpoint-001"

intent_tokenizer = AutoTokenizer.from_pretrained(load_path, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(load_path)
model.to("cuda")
model.eval()  # Switch to evaluation mode


# Label mapping TODO: Update
id2label = {
    0: "criticism",
    1: "humour",
    2: "other",
    3: "praise",
    4: "question",
    5: "suggestion"
}

def label_comments_with_intent(cleaned_comments_df, threshold=0.55, fallback_label="other", batch_size=32):
    """
    Runs inference on cleanedCommentText column and labels comments based on intent.
    If model confidence < threshold, assigns fallback_label instead.
    """

    texts = cleaned_comments_df["cleanedCommentText"].tolist()

    all_preds = []
    all_probs = []

    device = "cuda"

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        encodings = intent_tokenizer(
            batch,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**encodings)

        probs = F.softmax(outputs.logits, dim=-1)

        pred_ids = torch.argmax(probs, dim=-1)

        all_preds.extend(pred_ids.cpu().numpy())
        all_probs.extend(probs.max(dim=-1).values.cpu().numpy())

    # Apply fallback threshold
    labelled_comments_df = cleaned_comments_df.copy()
    labelled_comments_df["predicted_prob"] = all_probs
    labelled_comments_df["raw_predicted_label"] = all_preds
    labelled_comments_df["raw_predicted_label_name"] = labelled_comments_df["raw_predicted_label"].map(id2label)

    labelled_comments_df["intent_label"] = labelled_comments_df.apply(
        lambda row: row["raw_predicted_label_name"]
        if row["predicted_prob"] >= threshold
        else fallback_label,
        axis=1
    )

    # Drop internal columns before returning
    labelled_comments_df = labelled_comments_df.drop(columns=["predicted_prob", "raw_predicted_label", "raw_predicted_label_name"])
    return labelled_comments_df


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


### Sentiment Classfication (Positive, Negative, Neutral)

In [15]:
#install dependences
!pip install transformers torch
#load cleaned data
import pandas as pd
#for graph visulaization
import matplotlib.pyplot as plt





load sentiment piple line -pretrained model

In [16]:
#load pretrained model
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification


model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Load tokenizer and model
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for sentiment analysis
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
    truncation=True,
    padding="max_length",
    max_length=128,
    device=0,
    top_k=1,
    batch_size=64       # effective GPU batch size
)


def label_comments_with_sentiment(cleaned_comments_df):
    # Drop empties
    df = cleaned_comments_df.dropna(subset=["cleanedCommentText"])
    df = df[df["cleanedCommentText"].str.strip() != ""]

    comments = df["cleanedCommentText"].tolist()

    print(f"\n⏳ Running inference on {len(comments)} comments...\n")
    import time
    start = time.time()
    results = sentiment_pipeline(comments, batch_size=100)   # FAST
    end = time.time()
    print(f"Completed inference in {end-start:.2f} seconds...")

    sentiments = []
    scores = []
    polarities = []

    label_map = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }

    for res in results:
        score_dict = {d["label"]: d["score"] for d in res}
        neg = score_dict.get("LABEL_0", 0)
        pos = score_dict.get("LABEL_2", 0)

        sentiments.append(label_map[max(score_dict, key=score_dict.get)])
        sentiment_score = pos - neg
        scores.append(sentiment_score)
        polarities.append(abs(sentiment_score))

    df["sentiment"] = sentiments
    df["sentiment_score"] = scores
    df["sentiment_polarity"] = polarities
    return df


if not RUN_DASHBOARD:
    # Update the path based on where your file is
    df = pd.read_csv("/content/drive/MyDrive/youtube_comments.csv")

    label_comments_with_sentiment(df)
    #df.to_csv("/content/drive/MyDrive/comments_with_sentiment.csv", index=False)
    print(" Sentiment-labeled file saved to Drive.")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cuda:0


### Topic Classification (BERTopic)


In [17]:
!pip install bertopic[flair,gensim,spacy,use] transformers sentencepiece



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Collecting bertopic[flair,gensim,spacy,use]
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
Collecting flair>=0.7 (from bertopic[flair,gensim,spacy,use])
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting gensim>=4.0.0 (from bertopic[flair,gensim,spacy,use])
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting boto3>=1.20.27 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading boto3-1.42.4-py3-none-any.whl.metadata (6.8 kB)
Collecting conllu<5.0.0,>=4.0 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting ftfy>=6.1.0 (from flair>=0.7->bertopic[flair,gensim,spacy,use])
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.

In [18]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan

# 1. MPNet Embedding model results better than MiniLM
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# 2. Vectorizer must include bigrams, ignore nouse
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=3
)

# 3. UMAP reduce dimensions
umap_model = UMAP(
    n_neighbors=30,
    n_components=12,
    min_dist=0.0,
    metric="cosine",
    random_state=42
)



def label_comments_with_topics(comments_df):
    min_cluster_size = max(len(comments_df.index) // 100, 10)  # Set min cluster size to 1% of total comments count or 10, which ever is more.

    # 4. HDBSCAN topic clustering
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=3,
        metric="euclidean",
        cluster_selection_method="leaf",
        prediction_data=True
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(comments_df['cleanedCommentText'])
    comments_df["topic_id"] = topics
    comments_df["topic_probability"] = probs.max(axis=1)
    return comments_df, topic_model


  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Summarization

In [25]:
import torch
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "philschmid/bart-large-cnn-samsum"

summary_tokenizer = BartTokenizer.from_pretrained(model_name)

summary_model = BartForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

title_model_name = "google/flan-t5-base"
title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name)


def get_representative_comments(
    topic_model, df, topic_id, intent_label=None, text_col="cleanedCommentText", n=15, max_len=300
):
    """
    Get N most representative comments for a topic based on topic probability.
    Filters out overly long comments to prevent skewed summarization.
    """
    # Rows belonging to this topic
    subset = df[df["topic_id"] == topic_id]

    if intent_label:
        if type(intent_label) is str:
            subset = subset[subset["intent_label"] == intent_label]
        elif type(intent_label) is list:
            subset = subset[subset["intent_label"].isin(intent_label)]

    # Filter out unusually long comments (optional but recommended)
    subset = subset[subset[text_col].str.len() <= max_len]

    # Sort by topic probability (requires probs from BERTopic)
    if "topic_probability" in subset.columns:
        subset = subset.sort_values("topic_probability", ascending=False)
    else:
        # fallback: force deterministic order
        subset = subset.sample(frac=1, random_state=42)

    # Return top N comments as list
    return subset[text_col].head(n).tolist()


def format_comments_as_chat(comments):
    chat = []
    for i, c in enumerate(comments, start=1):
        speaker = f"Commenter {i}"
        chat.append(f"{speaker}: {c.strip()}")
    return "\n".join(chat)



def summarize_topic(topic_model, comments_df, topic_id, text_col="cleanedCommentText", max_length=256):
    representative_comments = get_representative_comments(topic_model, comments_df, topic_id, text_col=text_col)
    chat_text = "Summarize the following conversation:\n" + format_comments_as_chat(representative_comments)

    inputs = summary_tokenizer(
        chat_text,
        max_length=4096,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = summary_model.generate(
        **inputs,
        max_length=max_length,
        num_beams=5,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    summary = summary_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    prompt = f"Suggest a title for this text:\n\n{summary}\n\nTitle:"

    inputs = title_tokenizer(prompt, return_tensors="pt")
    outputs = title_model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_beams=5,
        no_repeat_ngram_size=3
    )

    title = title_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return title, summary


def summarize_topic_questions(topic_model, comments_df, topic_id, text_col="cleanedCommentText", max_length=256):
    representative_comments = get_representative_comments(topic_model, comments_df, topic_id, text_col=text_col, intent_label="question")
    print("summarize_topic_questions")
    print(topic_id)
    print(representative_comments)
    if not representative_comments:
        return None
    chat_text = "What are the commenters asking:\n" + format_comments_as_chat(representative_comments)

    inputs = summary_tokenizer(
        chat_text,
        max_length=4096,
        padding="longest",

        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = summary_model.generate(
        **inputs,
        max_length=max_length,
        num_beams=5,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    summary = summary_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary


def summarize_actionable_insights(topic_model, comments_df, topic_id, text_col="cleanedCommentText", max_length=256):
    representative_comments = get_representative_comments(topic_model, comments_df, topic_id, text_col=text_col)
    if not representative_comments:
        return None
    chat_text = "Summarize actionable insights from the following comments:\n" + format_comments_as_chat(representative_comments)

    inputs = summary_tokenizer(
        chat_text,
        max_length=4096,
        padding="longest",

        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = summary_model.generate(
        **inputs,
        max_length=max_length,
        num_beams=5,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    summary = summary_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    prompt = f"Suggest an actionable title for this text:\n\n{summary}\n\nTitle:"

    inputs = title_tokenizer(prompt, return_tensors="pt")
    outputs = title_model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_beams=5,
        no_repeat_ngram_size=3
    )

    title = title_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return title, summary


def score_topics_by_actionability(actionability_df, actionability_topic_model, question_weight=1.0, suggestion_weight=1.5):
    """
    Score each topic by actionability based on counts of questions and suggestions.

    Args:
        actionability_df: DataFrame containing comments with topic_id and intent_label columns
        actionability_topic_model: The topic model fitted on actionability comments
        question_weight: Weight for question intent (default 1.0)
        suggestion_weight: Weight for suggestion intent (default 1.5)

    Returns:
        DataFrame with columns: topic_id, question_count, suggestion_count, actionability_score
        Sorted by actionability_score in descending order
    """
    if actionability_df is None or actionability_df.empty:
        return None

    if "topic_id" not in actionability_df.columns or "intent_label" not in actionability_df.columns:
        return None

    # Get topic info from model
    topics_df = actionability_topic_model.get_topic_info()
    topics_df = topics_df[topics_df["Topic"] != -1]  # remove outlier cluster

    # Count questions and suggestions per topic
    topic_scores = []

    for topic_id in topics_df["Topic"]:
        topic_comments = actionability_df[actionability_df["topic_id"] == topic_id]

        question_count = len(topic_comments[topic_comments["intent_label"] == "question"])
        suggestion_count = len(topic_comments[topic_comments["intent_label"] == "suggestion"])

        # Calculate weighted actionability score
        actionability_score = (question_count * question_weight) + (suggestion_count * suggestion_weight)

        topic_scores.append({
            "topic_id": topic_id,
            "question_count": question_count,
            "suggestion_count": suggestion_count,
            "total_count": question_count + suggestion_count,
            "actionability_score": actionability_score
        })

    # Create DataFrame and sort by score
    scores_df = pd.DataFrame(topic_scores)
    scores_df = scores_df.sort_values(by="actionability_score", ascending=False)

    return scores_df

### Dashboard

In [26]:
import gradio as gr
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns



print(gr.__version__)
import re

# --- Your helpers ---
MAX_TOPICS = 5
MAX_INSIGHTS = 5

def extract_video_id(url):
    pattern = r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})"
    match = re.search(pattern, url)
    return match.group(1) if match else None


import plotly.graph_objects as go

def get_topic_sentiment_distribution_plot(topic_id, comments_df):
    """
    Returns a stacked horizontal bar chart showing sentiment distribution
    across intents for a specific topic ID.

    Requirements:
      - comments_df must contain: topic_id, intent_label, sentiment columns
    """

    # --- Validate inputs ---
    if comments_df is None or comments_df.empty:
        return None

    if "topic_id" not in comments_df.columns:
        return None

    if "intent_label" not in comments_df.columns or "sentiment" not in comments_df.columns:
        return None

    # Filter comments for this topic
    df_topic = comments_df[comments_df["topic_id"] == topic_id]
    if df_topic.empty:
        return None

    # Colors matching your theme
    SENTIMENT_COLOR_MAP = {
        "Positive": "#2a9d8f",  # teal
        "Negative": "#e63946",  # red
        "Neutral":  "#f4a261",  # caramel
    }

    # Prepare pivot counts per intent x sentiment
    counts = (
        df_topic
        .groupby(["intent_label", "sentiment"])
        .size()
        .reset_index(name="count")
    )

    intents = counts["intent_label"].unique()
    sentiments = ["Positive", "Neutral", "Negative"]  # enforce order

    fig = go.Figure()

    # Build stacked bars
    for sentiment in sentiments:
        values = [
            counts[(counts.intent_label == intent) & (counts.sentiment == sentiment)]["count"].sum()
            for intent in intents
        ]

        fig.add_trace(
            go.Bar(
                y=intents,
                x=values,
                orientation="h",
                name=sentiment,
                marker_color=SENTIMENT_COLOR_MAP.get(sentiment, "#95a5a6"),
                hovertemplate=sentiment + ": %{x}<extra></extra>"
            )
        )

    # Styling
    fig.update_layout(
        barmode="stack",
        title={
            "text": f"Topic {topic_id} — Intent-Sentiment Breakdown",
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=18, color="white")
        },
        paper_bgcolor="#1e1e1e",
        plot_bgcolor="#1e1e1e",
        font=dict(color="white"),
        height=350,
        margin=dict(l=60, r=20, t=50, b=20),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=11)
        )
    )

    fig.update_traces(
        textposition="inside",
        texttemplate="%{x}",
        insidetextanchor="middle",
        textfont=dict(color="white", size=10)
    )

    return fig


# --------------------------------------------------------------------
# UI update helpers
# --------------------------------------------------------------------

def metadata_update(meta):
    """
    Returns UI update tuple for metadata components
    """
    def truncate(description):
        short_desc = "\n".join(description.split("\n")[:4] + ['[TRUNCATED]'])  # first 4 lines only
        return short_desc

    return (
        gr.update(value=meta["thumb"], visible=meta["thumb"] is not None),
        gr.update(value=meta["title"], visible=bool(meta["title"])),
        gr.update(value=meta["published"], visible=bool(meta["published"])),
        gr.update(value=truncate(meta["description"]), visible=bool(meta["description"])),
        gr.update(value=meta["category"], visible=bool(meta["category"])),
    )


def transcript_update(transcript_df):
    """
    Build transcript text output with formatted timestamps
    and return UI update tuple.
    """

    if transcript_df is None or transcript_df.empty:
        return (gr.update(value="", visible=False),)

    # Format timestamp + text lines
    formatted_lines = []
    for _, row in transcript_df.iterrows():
        minutes = int(row["start"] // 60)
        seconds = int(row["start"] % 60)
        ts = f"{minutes:02d}:{seconds:02d}"
        formatted_lines.append(f"[{ts}] {row['text']}")

    formatted_transcript = "\n".join(formatted_lines)

    return (gr.update(value=formatted_transcript, visible=True),)

def comment_count_update(comments_df):
    if comments_df is None or comments_df.empty:
        return (gr.update(value="", visible=False),)
    return (gr.update(value=f"### Total comments: **{len(comments_df)}**", visible=True),)

def overall_sentiment_distribution_panel_update(comments_df):
    visible = bool(comments_df is not None and not comments_df.empty and 'sentiment' in comments_df.columns)
    if not visible:
        return gr.update(visible=visible)

    SENTIMENT_COLOR_MAP = {
        "Positive": "#2a9d8f",  # teal green
        "Negative": "#e63946",  # strong coral red
        "Neutral":  "#f4a261",  # warm caramel
    }
    sentiment_counts = comments_df["sentiment"].value_counts()
    labels = sentiment_counts.index.tolist()
    values = sentiment_counts.values.tolist()
    print(sentiment_counts)
    fig = go.Figure(
        data=[
            go.Pie(
                labels=labels,
                values=values,
                hole=0.6,
                textinfo="percent+label",
                marker=dict(
                    colors=[SENTIMENT_COLOR_MAP[label] for label in labels]
                ),
            )
        ]
    )

    fig.update_layout(
        title={
            "text": "Sentiment Distribution",
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=20, color="white")
        },
        paper_bgcolor="#1e1e1e",
        plot_bgcolor="#1e1e1e",
        font=dict(color="white"),
        height=300,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    fig.update_traces(
        textfont=dict(color="white", size=14),
        hovertemplate="%{label}: %{percent} (%{value})<extra></extra>",
    )
    return gr.update(value=fig, visible=True)


def overall_intent_distribution_panel_update(comments_df):
    # Panel visibility condition
    visible = bool(
        comments_df is not None
        and not comments_df.empty
        and "intent_label" in comments_df.columns
    )

    if not visible:
        return gr.update(visible=False)

    # Intent color mapping dictionary (you can tweak colors per intent category)
    INTENT_COLOR_MAP = {
        "criticism": "#e63946",   # red
        "praise":     "#2a9d8f",  # green
        "humour": "#f4a261",  # amber
        "question":   "#5e60ce",  # indigo
        "suggestion":   "#4a90e2",  # blue
        "other":      "#95a5a6"   # gray
    }

    # Compute distribution
    intent_counts = comments_df["intent_label"].value_counts()
    print("intent_counts")
    print(intent_counts)
    labels = intent_counts.index.tolist()
    values = intent_counts.values.tolist()

    # Build color list respecting label ordering
    colors = [INTENT_COLOR_MAP.get(label, "#7f8c8d") for label in labels]

    # Build horizontal bar chart
    import plotly.express as px

    fig = px.bar(
        x=values,
        y=labels,
        orientation="h",
        text=values,
    )

    # Apply styling + custom colors
    fig.update_traces(
        marker_color=colors,
        textposition="outside",
    )

    fig.update_layout(
        title={
            "text": "Intent Distribution",
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=20, color="white")
        },
        paper_bgcolor="#1e1e1e",
        plot_bgcolor="#1e1e1e",
        font=dict(color="white"),
        height=300,
        margin=dict(l=40, r=20, t=40, b=20),
        yaxis=dict(categoryorder="total ascending")  # largest at top
    )

    fig.update_layout(
        xaxis_title=None,
        yaxis_title=None,
        xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
        yaxis=dict(showgrid=False, showticklabels=True, zeroline=False),  # keep labels only
    )

    return gr.update(value=fig, visible=True)


def intentwise_sentiment_distribution_panel_update(comments_df):
    visible = bool(
        comments_df is not None
        and not comments_df.empty
        and "intent_label" in comments_df.columns
        and "sentiment" in comments_df.columns
    )

    if not visible:
        return gr.update(visible=False)

    SENTIMENT_COLOR_MAP = {
        "Positive": "#2a9d8f",
        "Neutral": "#f4a261",
        "Negative": "#e63946",
    }

    # Compute counts
    grouped = comments_df.groupby(["intent_label", "sentiment"]).size().reset_index(name="count")

    intents = sorted(comments_df["intent_label"].unique())  # x-axis categories
    sentiments = ["Positive", "Neutral", "Negative"]

    fig = go.Figure()

    # Add grouped bars (one trace per sentiment)
    for sentiment in sentiments:
        subset = grouped[grouped["sentiment"] == sentiment]

        fig.add_trace(
            go.Bar(
                name=sentiment,
                x=subset["intent_label"],
                y=subset["count"],
                marker_color=SENTIMENT_COLOR_MAP[sentiment],
                text=subset["count"],
                textposition="outside",
            )
        )

    # Layout & theme alignment
    fig.update_layout(
        barmode="group",
        title={
            "text": "Intent-wise Sentiment Distribution",
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=20, color="white"),
        },
        paper_bgcolor="#1e1e1e",
        plot_bgcolor="#1e1e1e",
        font=dict(color="white"),
        height=400,
        margin=dict(l=20, r=20, t=50, b=50),
        xaxis=dict(
            title="Intent Label",
            tickangle=-25,
            titlefont=dict(color="white"),
            tickfont=dict(color="white"),
        ),
        yaxis=dict(
            title="Count",
            titlefont=dict(color="white"),
            tickfont=dict(color="white"),
            gridcolor="#444",
        ),
        legend=dict(
            title="Sentiment",
            font=dict(color="white"),
            bgcolor="#1e1e1e",
        ),
    )

    fig.update_traces(textposition="inside")

    return gr.update(value=fig, visible=True)

def topic_visualization_panel_update(comments_df, topic_model):
    visible = topic_model is not None
    if not visible:
        return (gr.update(visible=False), gr.update(visible=True))

    fig = topic_model.visualize_topics()

    fig.update_layout(
        paper_bgcolor="#1e1e1e",
        plot_bgcolor="#1e1e1e",
        font=dict(color="white"),
        title=dict(
            font=dict(color="white", size=22),
            x=0.5,
            xanchor="center"
        ),
        legend=dict(
            bgcolor="#1e1e1e",
            bordercolor="#1e1e1e",
            font=dict(color="white")
        )
    )

    # Improve axis styling
    fig.update_xaxes(
        showgrid=False,
        color="white"
    )
    fig.update_yaxes(
        showgrid=False,
        color="white"
    )

    # Bubble / node colors override
    fig.update_traces(
        marker=dict(
            line=dict(color="#2a9d8f", width=2),
            opacity=0.85
        ),
        textfont=dict(color="white")
    )
    return gr.update(visible=True), gr.update(value=fig, visible=True)


def summarize_topics(topic_model, comments_df, topic_summaries={}):
    if topic_model is None:
        return
    topics_df = topic_model.get_topic_info()
    topics_df = topics_df[topics_df["Topic"] != -1]   # remove outlier cluster
    topics_df = topics_df.sort_values(by="Count", ascending=False)
    topics_df = topics_df.head(MAX_TOPICS)

    for _, row in topics_df.iterrows():
        topic_id = row["Topic"]
        if topic_id not in topic_summaries:
            title, summary = summarize_topic(topic_model, comments_df, topic_id)
            topic_summaries[topic_id] = [title, summary, None]
            questions_summary = summarize_topic_questions(topic_model, comments_df, topic_id)
            if questions_summary:
                print("Questions Summary")
                print(f"{topic_id}: {questions_summary}")
                topic_summaries[topic_id][2] = questions_summary
            yield topic_id



def topic_updates(comments_df, topic_model, topic_summaries):
    visible = topic_model is not None
    if not visible:
        # title updates
        title_updates = (gr.update(visible=False),) * MAX_TOPICS
        accordion_updates = (gr.update(visible=False),) * MAX_TOPICS
        panel_updates = (gr.update(visible=False),) * MAX_TOPICS
        plot_updates = (gr.update(visible=False),) * MAX_TOPICS

        return (gr.update(visible=False),) + title_updates + accordion_updates + panel_updates + plot_updates

    topics_df = topic_model.get_topic_info()

    topics_df = topics_df[topics_df["Topic"] != -1]   # remove outlier cluster
    topics_df = topics_df.sort_values(by="Count", ascending=False)
    topics_df = topics_df.head(MAX_TOPICS)


    title_updates = []
    accordion_updates = []
    panel_updates = []
    plot_updates = []

    print("topic_summaries")
    print(topic_summaries)

    for _, row in topics_df.iterrows():
        topic_id = row["Topic"]
        topic_count = row["Count"]

        is_summarized = topic_id in topic_summaries
        title, summary, questions = topic_summaries.get(topic_id, ("", "", "")) if is_summarized else ("", "", "")

        # Extract keywords safely
        try:
            keywords = topic_model.get_topic(topic_id)
            if isinstance(keywords, list):
                keywords = ", ".join([w for w, _ in keywords[:6]])
            else:
                keywords = "(No keywords found)"
        except:
            keywords = "(No keywords available)"

        # --- Title update (inside accordion) ---
        title_value = f"{"🧠" if title else "⏳"} **Topic**: {title if title else topic_id} — {topic_count} comments"
        title_updates.append(
            gr.update(
                value=title_value,
                visible=True,
            )
        )

        # --- Accordion visibility ---
        accordion_updates.append(gr.update(visible=True))

        # --- Panel body update ---
        if not topic_id in topic_summaries:
            panel_body = f"**Top Keywords:** {keywords}"
            panel_body += "<br />⏳ **Summarizing...**"
        else:
            title, summary, questions_summary = topic_summaries[topic_id]
            panel_body = f"**Summary:**<br />{summary}<br />"
            if questions_summary:
                panel_body += f"**Unanswered questions:**<br />{questions_summary}"
        panel_updates.append(
            gr.update(
                value=panel_body,
                visible=True,
            )
        )

        plot_updates.append(
            gr.update(
                value=get_topic_sentiment_distribution_plot(topic_id, comments_df),
                visible=True
            )
        )

    # Fill remaining unused topic slots with invisible placeholders
    remaining = MAX_TOPICS - len(topics_df)
    if remaining > 0:
        title_updates += [gr.update(visible=False)] * remaining
        accordion_updates += [gr.update(visible=False)] * remaining
        panel_updates += [gr.update(visible=False)] * remaining
        plot_updates += [gr.update(visible=False)] * remaining

    print("title_updates")
    print(title_updates)

    updates = title_updates + accordion_updates + panel_updates + plot_updates
    updates = [gr.update(visible=True)] + updates
    return tuple(updates)






def metadata_panel_state(meta):
    """
    Makes metadata accordion visible only when metadata exists
    """
    has_data = any([
        meta["thumb"],
        meta["title"],
        meta["published"],
        meta["description"],
        meta["category"]
    ])
    return gr.update(visible=has_data)


def transcript_panel_state(transcript_df):
    return gr.update(visible=bool(transcript_df is not None and not transcript_df.empty))


def comment_count_panel_state(comments_df):
    return gr.update(visible=bool(comments_df is not None and not comments_df.empty))

def status_update(current_status):
    if current_status is None:
        # Status: Ready
        return gr.update(interactive=True, value="Analyze")
    else:
        return gr.update(interactive=False, value=current_status)


def actionability_update(actionable_insights):
    actionability_accordion_visible = gr.update(visible=bool(actionable_insights is not None))

    # Generate insight updates
    if actionable_insights is not None and len(actionable_insights) > 0:
        insight_title_updates = []
        insight_summary_updates = []

        for i in range(MAX_INSIGHTS):
            if i < len(actionable_insights):
                title, summary = actionable_insights[i]
                insight_title_updates.append(gr.update(label=title, visible=True))
                insight_summary_updates.append(gr.update(value=summary, visible=True))
            else:
                insight_title_updates.append(gr.update(visible=False))
                insight_summary_updates.append(gr.update(visible=False))
    else:
        insight_title_updates = [gr.update(visible=False)] * MAX_INSIGHTS
        insight_summary_updates = [gr.update(visible=False)] * MAX_INSIGHTS

    return actionability_accordion_visible, *insight_title_updates, *insight_summary_updates


# --------------------------------------------------------------------
# Main pipeline
# --------------------------------------------------------------------

def process_video_url(url):
    log = ""

    # metadata placeholder container
    meta = {
        "thumb": None,
        "title": "",
        "published": "",
        "description": "",
        "category": ""
    }

    df_transcript = None
    comments_df = None
    current_status = None
    topic_model = None
    topic_summaries = {}
    actionable_insights = None

    # wrapper to emit current UI state
    def emit():
        return (
            status_update(current_status),
            log,
            metadata_panel_state(meta),
            *metadata_update(meta),
            transcript_panel_state(df_transcript),
            *transcript_update(df_transcript),
            comment_count_panel_state(comments_df),
            *comment_count_update(comments_df),
            overall_sentiment_distribution_panel_update(comments_df),
            overall_intent_distribution_panel_update(comments_df),
            intentwise_sentiment_distribution_panel_update(comments_df),
            *topic_visualization_panel_update(comments_df, topic_model),
            *topic_updates(comments_df, topic_model, topic_summaries),
            *actionability_update(actionable_insights)
        )

    # --- Pipeline execution ---

    current_status = "🔍 Validating YouTube URL..."
    log += f"{current_status}\n"
    yield emit()

    video_id = extract_video_id(url)
    if not video_id:
        log += "❌ Invalid YouTube URL.\n"
        current_status = None
        yield emit()
        return

    current_status = "✅ Video ID extracted: {video_id}"
    log += f"{current_status}\n"
    yield emit()

    current_status = "📂 Creating Google Drive folder..."
    log += f"{current_status}\n"
    yield emit()

    # Your drive setup call here
    folder_id = setup_drive(video_id, DRIVE_FOLDER_ID)

    current_status = f"📁 Folder created: {folder_id}"
    log += f"{current_status}\n"
    yield emit()

    # Fetch transcript
    current_status = "📝 Fetching transcript..."
    log += f"{current_status}\n"
    yield emit()
    transcript = fetch_english_transcript(video_id)

    if transcript:
        df_transcript = pd.DataFrame(transcript)
        current_status = "✅ Transcript fetched successfully"
        log += f"{current_status}\n"
    else:
        df_transcript = None
        current_status = "❌ Transcript unavailable"
        log += f"{current_status}\n"
    yield emit()

    # Fetch metadata
    current_status = "📊 Fetching metadata..."
    log += f"{current_status}\n"
    yield emit()
    df_metadata = get_video_metadata(video_id)

    if not df_metadata.empty:
        row = df_metadata.iloc[0]
        meta["thumb"]       = row["thumbnail_url"]
        meta["title"]       = f"**{row['title']}**"
        meta["published"]   = f"**Published:** {row['publishedAt']}"
        meta["description"] = f"**Description:**\n{row['description']}"
        meta["category"]    = f"**Category ID:** {row['categoryId']}"

        current_status = "✅ Metadata fetched successfully"
        log += f"{current_status}\n"
    else:
        current_status = "❌ Failed to fetch metadata"
        log += f"{current_status}\n"
    yield emit()

    # Fetch comments
    current_status = "📥 Fetching comments..."
    log += f"\n{current_status}\n"
    yield emit()

    comments_df = save_youtube_comments_to_drive(video_id, YOUTUBE_API_KEY, service, folder_id)
    current_status = f"✅ {len(comments_df)} Comments fetched and saved to Drive: {folder_id}"
    log += f"{current_status}\n"
    yield emit()

    # Preprocess comments
    current_status = "🧹 Preprocessing comments..."
    log += f"{current_status}\n"
    yield emit()
    comments_df = preprocess_comments(folder_id)
    current_status = "✅ Comments preprocessed"
    log += f"{current_status}\n"
    yield emit()

    # Sentiment classification
    current_status = "[😄|😡] Labelling comments with sentiment..."
    log += f"\n{current_status}\n"
    yield emit()
    comments_df = label_comments_with_sentiment(comments_df)
    current_status = "✅ Comments labelled with sentiment"
    log += f"{current_status}\n"
    yield emit()

    # Intent classification
    current_status = "🤔💭 Labelling comments with intent..."
    log += f"\n{current_status}\n"
    yield emit()
    comments_df = label_comments_with_intent(comments_df)
    current_status = "✅ Comments labelled with intent"
    log += f"{current_status}\n"

    # Cluster comments into topics
    current_status = "🔍 Clustering comments into topics..."
    log += f"\n{current_status}\n"
    yield emit()
    comments_df, topic_model = label_comments_with_topics(comments_df)
    current_status = "✅ Comments clustered into topics"
    log += f"{current_status}\n"
    yield emit()

    # Summarize topics
    current_status = "📄 Summarizing topics..."
    log += f"\n{current_status}\n"
    yield emit()
    for topic in summarize_topics(topic_model, comments_df, topic_summaries):
        print("topic in summarize_topics")
        yield emit()

    # Create actionability dataframe
    current_status = "🎯 Creating actionability dataframe..."
    log += f"\n{current_status}\n"
    yield emit()
    actionability_df = comments_df[comments_df["intent_label"].isin(["question", "suggestion"])].copy()
    current_status = f"✅ Actionability dataframe created with {len(actionability_df)} comments"
    log += f"{current_status}\n"
    yield emit()

    # Cluster actionable comments into topics
    current_status = "🔍 Clustering actionable comments into topics..."
    log += f"\n{current_status}\n"
    yield emit()
    actionability_df, actionability_topic_model = label_comments_with_topics(actionability_df)
    current_status = "✅ Actionable comments clustered into topics"
    log += f"{current_status}\n"
    yield emit()

    # Get actionability scoring for each topic
    current_status = "📈 Calculating actionability scores..."
    log += f"\n{current_status}\n"
    yield emit()
    actionability_scored_df = score_topics_by_actionability(actionability_df, actionability_topic_model)
    current_status = "✅ Actionability scores calculated"
    log += f"{current_status}\n"

    actionable_topic_ids = actionability_scored_df.head(3)["topic_id"].unique()
    actionable_insights = []
    for topic_id in actionable_topic_ids:
        title, summary = summarize_actionable_insights(actionability_topic_model, actionability_df, topic_id)
        actionable_insights.append((title, summary,))

    current_status = None
    yield emit()
    return


# --------------------------------------------------------------------
# GRADIO UI
# --------------------------------------------------------------------

with gr.Blocks() as demo:
    gr.Markdown("## 🎥 YouTube Comments Insight Tool")

    gr.Markdown(
        "Enter a YouTube URL — system will create Drive folder, fetch transcript, "
        "and extract metadata. Future insights coming soon."
    )

    # URL entry
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Video URL",
                placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=1):
            pass

    # logs panel
    with gr.Accordion("📜 Logs", open=False):
        status_box = gr.Textbox(
            label="Execution Log",
            lines=10,
            autoscroll=True,
            interactive=False
        )

    # --- metadata accordion (hidden initially) ---
    metadata_accordion = gr.Accordion("📼 Video Metadata", open=False, visible=False)

    with metadata_accordion:
        with gr.Row():
            thumbnail = gr.Image(label="Thumbnail", visible=False, height=180)

            with gr.Column():
                title_md = gr.Markdown(visible=False)
                published_md = gr.Markdown(visible=False)
                category_md = gr.Markdown(visible=False)

        description_md = gr.Markdown(visible=False)

    # --- full transcript accordion (hidden initially) ---
    transcript_accordion = gr.Accordion("📝 Transcript", open=False, visible=False)

    with transcript_accordion:
        transcript_md = gr.Textbox(
            label="Transcript",
            container=False,
            lines=20,
            interactive=False
        )

    # --- Comments Count panel (hidden initially) ---
    with gr.Row():
        with gr.Column(scale=1):
            comments_accordion = gr.Accordion("💬 Overall Stats", open=True, visible=False)

            with comments_accordion:
                comment_count_md = gr.Markdown(visible=False)

                with gr.Row():
                    sentiment_chart_panel = gr.Plot(label="Sentiment Distribution", visible=True, scale=2)
                    intent_chart_panel    = gr.Plot(label="Intent Distribution", visible=True, scale=3)

                intentwise_sentiment_distribution_panel = gr.Plot(label="Intent-wise Sentiment Distribution", visible=True)

    with gr.Row():
        with gr.Column(scale=1):
            topics_accordion = gr.Accordion("🗣️ Recurring Themes", open=True, visible=False)
            with topics_accordion:
                with gr.Row():
                    topic_visualization_panel = gr.Plot(label="Topic Distance Map", visible=True, scale=1)
                    topic_container = gr.Column(scale=1)
                    topic_titles = []
                    topic_accordions = []
                    topic_panels = []
                    topic_distribution_plots = []

                    for i in range(MAX_TOPICS):
                        with topic_container:
                            topic_title = gr.Markdown(f"**Topic {i+1}**", visible=True)
                            topic_titles.append(topic_title)
                            topic_accordion = gr.Accordion(f"More Details...", open=False, visible=True)
                            topic_accordions.append(topic_accordion)
                            with topic_accordion:
                                topic_panels.append(gr.Markdown("", visible=True))
                                topic_distribution_plots.append(gr.Plot(label="Intent and Sentiment Distribution Plot", visible=True))

        actionability_accordion = gr.Accordion("🎯 Actionable Insights", open=True, visible=False)
        with actionability_accordion:
            insight_titles = []
            insight_summaries = []

            for i in range(MAX_INSIGHTS):
                insight_accordion = gr.Accordion(f"Insight {i+1}", open=False, visible=False)
                insight_titles.append(insight_accordion)
                with insight_accordion:
                    insight_summary = gr.Markdown("", visible=False)
                    insight_summaries.append(insight_summary)





    # bind outputs
    analyze_btn.click(
        fn=process_video_url,
        inputs=url_input,
        outputs=[
            analyze_btn,
            status_box,          # log text
            metadata_accordion,  # show/hide metadata accordion
            thumbnail,
            title_md,
            published_md,
            description_md,
            category_md,
            transcript_accordion,  # show/hide transcript accordion
            transcript_md,
            comments_accordion,  # show/hide comment count stats
            comment_count_md,
            sentiment_chart_panel,
            intent_chart_panel,
            intentwise_sentiment_distribution_panel,
            topics_accordion,
            topic_visualization_panel,
            topic_container,
            *topic_titles,
            *topic_accordions,
            *topic_panels,
            *topic_distribution_plots,
            actionability_accordion,
            *insight_titles,
            *insight_summaries,
        ]
    )


css = """
.scroll-text-box {
    max-height: 300px;
    overflow-y: auto;
    white-space: pre-wrap;
}

.generating {
  border: none;
}
"""

demo.launch(css=css, debug=True)


6.0.2
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b31997d684f1aa4d22.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Using existing folder with ID: 1EqZhaO7ch7KbYro39ts9Y2_GN04t-Bq0
Fetched 100 comments (total: 100)
Fetched 100 comments (total: 200)
Fetched 100 comments (total: 300)
Fetched 100 comments (total: 400)
Fetched 100 comments (total: 500)
Fetched 100 comments (total: 600)
Fetched 100 comments (total: 700)
Fetched 70 comments (total: 770)
Uploading to Google Drive...
Uploaded 'comments.csv' to Drive with ID: 1OSOht_-CCOVfYZWlc9HWRDPSg6U-DsY1
Total comments saved: 770
Found 'comments.csv' with ID: 1OSOht_-CCOVfYZWlc9HWRDPSg6U-DsY1
Download 100%.
Comments loaded successfully into df.
Uploaded 'comments_preprocessed.csv' to Drive with ID: 1aak9ZUZ0Hd5fwtTNgxUUXIS4pKuzuGk5
Cleaned comments saved to: 1EqZhaO7ch7KbYro39ts9Y2_GN04t-Bq0/comments_preprocessed.csv

⏳ Running inference on 726 comments...

Completed inference in 4.47 seconds...
sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype

2025-12-07 16:46:50,007 - BERTopic - Embedding - Transforming documents to embeddings.


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2025-12-07 16:46:52,449 - BERTopic - Embedding - Completed ✓
2025-12-07 16:46:52,449 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 16:46:55,266 - BERTopic - Dimensionality - Completed ✓
2025-12-07 16:46:55,268 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 16:46:55,336 - BERTopic - Cluster - Completed ✓
2025-12-07 16:46:55,339 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 16:46:55,388 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 1 — 34 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 2 — 28 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 3 — 27 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 4 — 27 comments', '__type__': 'update', 'visible': True}]
sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 c

2025-12-07 16:47:12,797 - BERTopic - Embedding - Transforming documents to embeddings.


title_updates
[{'value': '🧠 **Topic**: How to buy a mazda 3 gt sport tech in soul crystal red — 38 comments', '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Gran Turismo 7 — 34 comments', '__type__': 'update', 'visible': True}, {'value': "🧠 **Topic**: Japan's first car review video — 28 comments", '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Humming noise on Mazda 2021 turbo sedan — 27 comments', '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Video of a man making a video — 27 comments', '__type__': 'update', 'visible': True}]


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2025-12-07 16:47:14,010 - BERTopic - Embedding - Completed ✓
2025-12-07 16:47:14,011 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 16:47:14,581 - BERTopic - Dimensionality - Completed ✓
2025-12-07 16:47:14,582 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 16:47:14,598 - BERTopic - Cluster - Completed ✓
2025-12-07 16:47:14,600 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 16:47:14,621 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{0: ['How to buy a mazda 3 gt sport tech in soul crystal red', 'Commenter 1 bought a mazda 3 gt sport tech all black and red interior. Commenter 2 bought a 2024 na europe in soul crystal red. Commenters are impressed with the quality of the videos and reviews of the cars on youtube. ', 'Commenter 1 is considering buying a used Mazda as a first time driver and wants to know if he should get one of the newest versions in the base model with the latest features or get an older version but in a turbo trim. He is interested in the video.'], 1: ['Gran Turismo 7', 'Commenter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 15 are discussing the cars they like.   They like the design of Gran Turismo 7. ', 'Commenter 1 wonders why pe

2025-12-07 16:49:27,155 - BERTopic - Embedding - Transforming documents to embeddings.


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2025-12-07 16:49:29,632 - BERTopic - Embedding - Completed ✓
2025-12-07 16:49:29,633 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 16:49:32,453 - BERTopic - Dimensionality - Completed ✓
2025-12-07 16:49:32,454 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 16:49:32,520 - BERTopic - Cluster - Completed ✓
2025-12-07 16:49:32,523 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 16:49:32,571 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 1 — 34 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 2 — 28 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 3 — 27 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 4 — 27 comments', '__type__': 'update', 'visible': True}]
sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 c

2025-12-07 16:49:51,591 - BERTopic - Embedding - Transforming documents to embeddings.


topic_summaries
{0: ['How to buy a mazda 3 gt sport tech in soul crystal red', 'Commenter 1 bought a mazda 3 gt sport tech all black and red interior. Commenter 2 bought a 2024 na europe in soul crystal red. Commenters are impressed with the quality of the videos and reviews of the cars on youtube. ', 'Commenter 1 is considering buying a used Mazda as a first time driver and wants to know if he should get one of the newest versions in the base model with the latest features or get an older version but in a turbo trim. He is interested in the video.'], 1: ['Gran Turismo 7', 'Commenter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 15 are discussing the cars they like.   They like the design of Gran Turismo 7. ', 'Commenter 1 wonders why people talk about the yaris gr being sporty, while Honda Civic seems to be better balanced around the corners. Commenter 4 explains that the bose sound speakers are designed for this car.   Commenter 2 wonders why everyone seems to talk about how 

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2025-12-07 16:49:52,812 - BERTopic - Embedding - Completed ✓
2025-12-07 16:49:52,812 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 16:49:53,389 - BERTopic - Dimensionality - Completed ✓
2025-12-07 16:49:53,390 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 16:49:53,406 - BERTopic - Cluster - Completed ✓
2025-12-07 16:49:53,408 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 16:49:53,428 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{0: ['How to buy a mazda 3 gt sport tech in soul crystal red', 'Commenter 1 bought a mazda 3 gt sport tech all black and red interior. Commenter 2 bought a 2024 na europe in soul crystal red. Commenters are impressed with the quality of the videos and reviews of the cars on youtube. ', 'Commenter 1 is considering buying a used Mazda as a first time driver and wants to know if he should get one of the newest versions in the base model with the latest features or get an older version but in a turbo trim. He is interested in the video.'], 1: ['Gran Turismo 7', 'Commenter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 15 are discussing the cars they like.   They like the design of Gran Turismo 7. ', 'Commenter 1 wonders why pe

2025-12-07 17:00:10,457 - BERTopic - Embedding - Transforming documents to embeddings.


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2025-12-07 17:00:12,879 - BERTopic - Embedding - Completed ✓
2025-12-07 17:00:12,880 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 17:00:15,686 - BERTopic - Dimensionality - Completed ✓
2025-12-07 17:00:15,687 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 17:00:15,763 - BERTopic - Cluster - Completed ✓
2025-12-07 17:00:15,766 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 17:00:15,815 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 1 — 34 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 2 — 28 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 3 — 27 comments', '__type__': 'update', 'visible': True}, {'value': '⏳ **Topic**: 4 — 27 comments', '__type__': 'update', 'visible': True}]
sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{}
title_updates
[{'value': '⏳ **Topic**: 0 — 38 c

2025-12-07 17:00:33,003 - BERTopic - Embedding - Transforming documents to embeddings.


title_updates
[{'value': '🧠 **Topic**: How to buy a mazda 3 gt sport tech in soul crystal red — 38 comments', '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Gran Turismo 7 — 34 comments', '__type__': 'update', 'visible': True}, {'value': "🧠 **Topic**: Japan's first car review video — 28 comments", '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Humming noise on Mazda 2021 turbo sedan — 27 comments', '__type__': 'update', 'visible': True}, {'value': '🧠 **Topic**: Video of a man making a video — 27 comments', '__type__': 'update', 'visible': True}]


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2025-12-07 17:00:34,177 - BERTopic - Embedding - Completed ✓
2025-12-07 17:00:34,178 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 17:00:34,759 - BERTopic - Dimensionality - Completed ✓
2025-12-07 17:00:34,760 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 17:00:34,776 - BERTopic - Cluster - Completed ✓
2025-12-07 17:00:34,779 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 17:00:34,800 - BERTopic - Representation - Completed ✓


sentiment
Positive    480
Neutral     157
Negative     89
Name: count, dtype: int64
intent_counts
intent_label
praise        411
question       98
suggestion     91
criticism      89
humour         26
other          11
Name: count, dtype: int64
topic_summaries
{0: ['How to buy a mazda 3 gt sport tech in soul crystal red', 'Commenter 1 bought a mazda 3 gt sport tech all black and red interior. Commenter 2 bought a 2024 na europe in soul crystal red. Commenters are impressed with the quality of the videos and reviews of the cars on youtube. ', 'Commenter 1 is considering buying a used Mazda as a first time driver and wants to know if he should get one of the newest versions in the base model with the latest features or get an older version but in a turbo trim. He is interested in the video.'], 1: ['Gran Turismo 7', 'Commenter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 15 are discussing the cars they like.   They like the design of Gran Turismo 7. ', 'Commenter 1 wonders why pe



In [None]:
!export GRADIO_DISABLE_SSL_VERIFY=1


In [None]:
!python -c "import certifi; print(certifi.where())"


In [None]:
!!update-ca-certificates
