In [None]:
try:
  from google.colab import drive
  drive.mount('/content/drive')
except ModuleNotFoundError:
  print("Not running on colab...")

RUN_DASHBOARD = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Install dependencies

In [None]:
!pip install youtube-transcript-api google-api-python-client



In [None]:
import re # Keep this import as it's used by clean_comment
import unicodedata
from io import BytesIO
from typing import List, Optional

import pandas as pd
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload # Import needed for downloading and uploading

def drive_to_df(folder_id: str, filename: str, column_names: Optional[List[str]] = None) -> pd.DataFrame:
    """Download a CSV file from a Google Drive folder into a DataFrame.

    The lookup goes through the module-level ``service`` Drive client.

    Args:
        folder_id: ID of the Drive folder to search in.
        filename: Exact name of the CSV file inside that folder.
        column_names: When given, the file is read as header-less
            (``header=None``) and these names are applied to the columns.
            When omitted, the first CSV row is used as the header.

    Returns:
        The parsed DataFrame, or an empty DataFrame when no matching file
        exists in the folder.
    """
    def _quote(value: str) -> str:
        # Drive query strings need backslashes and single quotes escaped.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    # Ignore trashed files, and prefer the most recently modified match because
    # Drive allows several files with the same name in one folder.
    query = (
        f"name='{_quote(filename)}' and '{_quote(folder_id)}' in parents "
        "and trashed=false"
    )
    response = service.files().list(
        q=query,
        spaces='drive',
        fields='files(id, name)',
        orderBy='modifiedTime desc',
    ).execute()
    files = response.get('files', [])

    if not files:
        print(f"Error: '{filename}' not found in folder {folder_id}. Please ensure the file is uploaded.")
        return pd.DataFrame()  # Empty frame so callers can test `.empty`

    file_id = files[0].get('id')
    print(f"Found '{filename}' with ID: {file_id}")

    # Stream the file content into an in-memory buffer.
    request = service.files().get_media(fileId=file_id)
    fh = BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        if status:
            print(f"Download {int(status.progress() * 100)}%.")

    fh.seek(0)  # Rewind so pandas reads from the start

    if column_names:
        df = pd.read_csv(fh, header=None, names=column_names)
    else:
        df = pd.read_csv(fh)
    print("Comments loaded successfully into df.")
    return df


def df_to_drive(df, folder_id, filename):
    """
    Uploads a pandas DataFrame as a CSV file to Google Drive.

    If a non-trashed file called ``filename`` already exists in the folder,
    its content is replaced; otherwise a new file is created. This keeps
    re-runs from piling up several same-named files in the folder.

    Uses the module-level ``service`` Drive client. Upload errors are printed,
    not raised (best-effort upload).

    Args:
        df (pd.DataFrame): The DataFrame to upload.
        folder_id (str): The ID of the Google Drive folder where the file will be saved.
        filename (str): The desired filename for the CSV file in Google Drive.
    """
    def _quote(value):
        # Drive query strings need backslashes and single quotes escaped.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    # Serialise the DataFrame to CSV in memory (header row included).
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False, encoding='utf-8')
    csv_buffer.seek(0)  # Rewind so the upload starts at byte 0

    media_body = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)

    try:
        query = (
            f"name='{_quote(filename)}' and '{_quote(folder_id)}' in parents "
            "and trashed=false"
        )
        existing = service.files().list(
            q=query,
            spaces='drive',
            fields='files(id)',
            orderBy='modifiedTime desc',
        ).execute().get('files', [])

        if existing:
            # Overwrite the content of the most recent match in place.
            uploaded_file = service.files().update(
                fileId=existing[0]['id'],
                media_body=media_body,
                fields='id'
            ).execute()
        else:
            file_metadata = {
                'name': filename,
                'parents': [folder_id]
            }
            uploaded_file = service.files().create(
                body=file_metadata,
                media_body=media_body,
                fields='id'
            ).execute()
        print(f"Uploaded '{filename}' to Drive with ID: {uploaded_file.get('id')}")
    except Exception as e:
        print(f"Error uploading '{filename}' to Drive: {e}")


def clean_comment(text):
    """Normalise a raw comment for downstream NLP.

    Steps: lower-case, drop links and @mentions, replace every character that
    is not a letter, combining mark, digit, whitespace or one of ``. , ! ? '``
    with a space, then collapse runs of whitespace.

    Letters and marks of any script are kept, so non-English comments are not
    erased before language detection or translation. ASCII input gives the same
    result as the previous ASCII-only whitelist.

    Args:
        text: Raw comment; any value is accepted and converted with ``str()``.

    Returns:
        str: The cleaned, lower-cased comment (possibly empty).
    """
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(r"@\w+", "", text)     # remove mentions
    # Unicode categories: L* letters, M* combining marks, N* numbers.
    # Marks must be kept, otherwise words in scripts such as Devanagari are split.
    text = "".join(
        ch if (ch in ".,!?'" or ch.isspace() or unicodedata.category(ch)[0] in "LMN") else " "
        for ch in text
    )
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text


In [None]:
import os

DRIVE_FOLDER_ID = '1NEXXEvuBZLB1MFqRCkeGbJfYn4wbA2SX'
YOUTUBE_VIDEO_ID = 'zozEm4f_dlw'

# SECURITY: never hard-code the YouTube Data API key in the notebook.
# The key that used to be committed here is exposed and should be revoked and
# replaced. Supply the new key through the YOUTUBE_API_KEY environment variable
# or a Colab secret of the same name.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not YOUTUBE_API_KEY:
    try:
        from google.colab import userdata
        YOUTUBE_API_KEY = userdata.get("YOUTUBE_API_KEY") or ""
    except Exception as exc:  # not on Colab, or the secret is missing / not shared
        print(f"YOUTUBE_API_KEY is not configured ({type(exc).__name__}); YouTube Data API calls will fail.")

### Set up Drive

In [None]:
# Connect to Drive
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.colab import auth
auth.authenticate_user()

import google.auth
creds, _ = google.auth.default()
service = build('drive', 'v3', credentials=creds)


In [None]:
# Create directory with name: youtube_video_id if not already exists
def setup_drive(youtube_video_id, drive_folder_id):
    """Return the ID of the per-video Drive folder, creating it if needed.

    Looks for a non-trashed folder named ``youtube_video_id`` directly under
    ``drive_folder_id`` through the module-level ``service`` Drive client.

    Args:
        youtube_video_id (str): Video ID; used as the folder name.
        drive_folder_id (str): ID of the parent Drive folder.

    Returns:
        str: ID of the existing or newly created folder.
    """
    def _quote(value):
        # Drive query strings need backslashes and single quotes escaped.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    # `trashed=false` stops a deleted folder from being picked up and reused.
    query = (
        f"name='{_quote(youtube_video_id)}' "
        "and mimeType='application/vnd.google-apps.folder' "
        f"and '{_quote(drive_folder_id)}' in parents "
        "and trashed=false"
    )
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = response.get('files', [])

    if files:
        # Folder already exists
        video_folder_id = files[0].get('id')
        print(f"Using existing folder with ID: {video_folder_id}")
        return video_folder_id

    # Create new folder
    file_metadata = {
        'name': youtube_video_id,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [drive_folder_id]
    }
    file = service.files().create(body=file_metadata, fields='id').execute()
    video_folder_id = file.get('id')
    print(f"Created new folder with ID: {video_folder_id}")
    return video_folder_id

if not RUN_DASHBOARD:
  VIDEO_FOLDER_ID = setup_drive(YOUTUBE_VIDEO_ID, DRIVE_FOLDER_ID)

### Extract video transcript and save to Drive


In [None]:
from typing import List, Dict, Optional
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
import pandas as pd # Import pandas for DataFrame creation


def fetch_english_transcript(video_id: str) -> Optional[List[Dict]]:
    """
    Fetches the English transcript for a YouTube video.

    Preference order:
        1. Manually uploaded English transcript
        2. Auto-generated English transcript
        3. None (if no transcript found or disabled)

    Args:
        video_id (str): The YouTube video ID (e.g., 'dQw4w9WgXcQ').

    Returns:
        list[dict] or None: List of transcript segments as dicts like:
            [
                {"text": "Hello world", "start": 0.0, "duration": 3.5},
                ...
            ]
        Returns None if no transcript is available, transcripts are disabled,
        the video is unavailable, or any other error occurs (the error is
        printed).
    """
    try:
        # List every transcript track available for this video
        transcript_list = YouTubeTranscriptApi().list(video_id)

        # Try manually created English transcript first
        try:
            transcript = transcript_list.find_manually_created_transcript(['en'])
        except NoTranscriptFound:
            # Fallback to auto-generated English transcript
            try:
                transcript = transcript_list.find_generated_transcript(['en'])
            except NoTranscriptFound:
                return None  # Neither manual nor auto-generated transcript found

        fetched = transcript.fetch()
        # Newer library versions return a transcript object, not a list;
        # `to_raw_data()` gives the documented list of plain dicts.
        to_raw_data = getattr(fetched, "to_raw_data", None)
        if callable(to_raw_data):
            return to_raw_data()
        return list(fetched)

    except (TranscriptsDisabled, VideoUnavailable):
        # Transcripts disabled or video unavailable
        return None
    except Exception as e:
        # Handle unexpected errors safely
        print(f"Error fetching transcript for video {video_id}: {e}")
        return None

if not RUN_DASHBOARD:
    transcript_data = fetch_english_transcript(YOUTUBE_VIDEO_ID)

    if transcript_data:
        df_transcript = pd.DataFrame(transcript_data)
        print("Transcript loaded successfully into DataFrame.")

        # Upload to drive using the existing df_to_drive function
        df_to_drive(df_transcript, VIDEO_FOLDER_ID, 'transcript.csv')
    else:
        print(f"No English transcript found for video ID: {YOUTUBE_VIDEO_ID}")


### Extract video metadata and save to Drive


In [None]:
from googleapiclient.discovery import build
import pandas as pd

# Build the YouTube API service
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

def get_video_metadata(video_id: str) -> pd.DataFrame:
    """
    Fetches video metadata (title, description, published date, thumbnail URL, category ID)
    for a given YouTube video ID.

    Uses the module-level ``youtube`` API client.

    Args:
        video_id (str): The YouTube video ID.

    Returns:
        pd.DataFrame: A one-row DataFrame with columns ``video_id``, ``title``,
        ``description``, ``publishedAt``, ``thumbnail_url`` and ``categoryId``,
        or an empty DataFrame when the API returns no item for the ID.
        Missing fields are ``None``.
    """
    response = youtube.videos().list(
        part="snippet",
        id=video_id
    ).execute()

    items = response.get('items') or []
    if not items:
        print(f"No video found with ID: {video_id}")
        return pd.DataFrame()

    snippet = items[0].get('snippet') or {}

    # Prefer the 'default' thumbnail, but fall back to another size
    # instead of raising KeyError when it is absent.
    thumbnails = snippet.get('thumbnails') or {}
    thumbnail_url = None
    for size in ('default', 'medium', 'high', 'standard', 'maxres'):
        candidate = (thumbnails.get(size) or {}).get('url')
        if candidate:
            thumbnail_url = candidate
            break

    metadata = {
        'video_id': video_id,
        'title': snippet.get('title'),
        'description': snippet.get('description'),
        'publishedAt': snippet.get('publishedAt'),
        'thumbnail_url': thumbnail_url,
        'categoryId': snippet.get('categoryId')
    }
    return pd.DataFrame([metadata])


if not RUN_DASHBOARD:
    # Get video metadata
    df_metadata = get_video_metadata(YOUTUBE_VIDEO_ID)

    if not df_metadata.empty:
        print("Video metadata loaded successfully into DataFrame.")
        # Upload to drive using the existing df_to_drive function
        df_to_drive(df_metadata, VIDEO_FOLDER_ID, 'video_metadata.csv')
    else:
        print(f"Failed to retrieve metadata for video ID: {YOUTUBE_VIDEO_ID}")


### Extract comments from a youtube video and save to Drive

In [None]:
import csv
import time
import requests
import pandas as pd # Import pandas for DataFrame creation

def _youtube_error_reason(resp) -> str:
    """Best-effort extraction of the API error ``reason`` from a failed response."""
    try:
        errors = resp.json().get("error", {}).get("errors", [])
        return str(errors[0].get("reason", "")) if errors else ""
    except Exception:
        return ""


def save_youtube_comments_to_drive(
    video_id: str,
    api_key: str,
    service,
    drive_folder_id: str,
    max_retries: int = 5,
    backoff_base: float = 2.0,
):
    """
    Fetches the top-level YouTube comments for a video, page by page, and
    uploads them as ``comments.csv`` to a Google Drive folder via df_to_drive.

    Transient failures (network errors, HTTP 5xx, 408, 429 and rate-limit 403s)
    are retried with exponential backoff. Other 4xx responses, such as comments
    being disabled, cannot succeed on retry and fail immediately.

    Args:
        video_id (str): YouTube video ID
        api_key (str): YouTube Data API v3 key
        service: Drive API service object. Kept for interface compatibility;
            the upload goes through df_to_drive, which uses the module-level
            ``service``.
        drive_folder_id (str): Destination Google Drive folder ID
        max_retries (int): Max attempts per page request
        backoff_base (float): Base for exponential backoff (seconds)

    Returns:
        pd.DataFrame: Columns ``authorDisplayName``, ``commentText``,
        ``likeCount``, ``publishedAt``; one row per top-level comment.

    Raises:
        RuntimeError: If a page cannot be fetched (non-retryable HTTP error or
            retries exhausted).
    """
    base_url = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
        "textFormat": "plainText",
        "key": api_key,
    }

    def _redact(message) -> str:
        # Request errors can embed the full URL, which contains the API key.
        text = str(message)
        return text.replace(api_key, "***") if api_key else text

    session = requests.Session()
    next_page_token = None
    all_comments_data = []  # One row per comment

    while True:
        if next_page_token:
            params["pageToken"] = next_page_token
        else:
            params.pop("pageToken", None)

        for attempt in range(max_retries):
            try:
                resp = session.get(base_url, params=params, timeout=10)
                if resp.status_code == 200:
                    data = resp.json()
                    break
            except requests.exceptions.RequestException as e:
                print(f"Network error: {_redact(e)}, retrying...")
            else:
                status = resp.status_code
                reason = _youtube_error_reason(resp)
                transient = (
                    not 400 <= status < 500
                    or status in (408, 429)
                    or (status == 403 and "ratelimit" in reason.lower())
                )
                if not transient:
                    raise RuntimeError(
                        f"YouTube API returned HTTP {status} "
                        f"({reason or 'no reason given'}) for video {video_id}; not retrying."
                    )
                print(f"Warning: HTTP {resp.status_code}, retrying...")

            # No point sleeping after the final attempt.
            if attempt < max_retries - 1:
                sleep_time = backoff_base ** attempt
                print(f"Sleeping {sleep_time:.1f}s before retry...")
                time.sleep(sleep_time)
        else:
            raise RuntimeError("Max retries exceeded while fetching comments.")

        # Parse comments
        items = data.get("items", [])
        for item in items:
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            all_comments_data.append([
                snippet.get("authorDisplayName", ""),
                snippet.get("textDisplay", "").replace("\n", " "),
                snippet.get("likeCount", 0),
                snippet.get("publishedAt", ""),
            ])

        print(f"Fetched {len(items)} comments (total: {len(all_comments_data)})")

        next_page_token = data.get("nextPageToken")
        if not next_page_token:
            break

        # Be polite — small delay to avoid rate limits
        time.sleep(0.3)

    # Create DataFrame from collected data with explicit column names
    df_comments = pd.DataFrame(
        all_comments_data,
        columns=['authorDisplayName', 'commentText', 'likeCount', 'publishedAt'],
    )

    # Upload the DataFrame to Google Drive using df_to_drive
    print("Uploading to Google Drive...")
    df_to_drive(df_comments, drive_folder_id, "comments.csv")
    print(f"Total comments saved: {len(all_comments_data)}")
    return df_comments


if not RUN_DASHBOARD:
    save_youtube_comments_to_drive(
        YOUTUBE_VIDEO_ID,
        YOUTUBE_API_KEY,
        service,
        VIDEO_FOLDER_ID
    )



### Data Preprocessing

In [None]:
import pandas as pd
import os

# Load comments CSV

def preprocess_comments(video_folder_id):
    """Clean the comments stored in a video's Drive folder and save the result.

    Loads ``comments.csv`` from the folder, adds a ``cleanedCommentText``
    column (via ``clean_comment``), drops duplicate cleaned texts and comments
    of two words or fewer, then uploads ``comments_preprocessed.csv`` to the
    same folder.

    Args:
        video_folder_id (str): ID of the Drive folder holding ``comments.csv``.

    Returns:
        pd.DataFrame: The preprocessed comments. Empty (with the expected
        columns) when ``comments.csv`` is missing; nothing is uploaded then.
    """
    columns = ['authorDisplayName', 'commentText', 'likeCount', 'publishedAt']
    df = drive_to_df(video_folder_id, 'comments.csv', column_names=columns)

    if df.empty:
        # drive_to_df returns an empty frame when the file is not found.
        print("No comments available to preprocess.")
        return pd.DataFrame(columns=columns + ['cleanedCommentText'])

    # The file is read as header-less, but df_to_drive writes a header row.
    # Drop that row so it is not treated as a comment and does not force
    # likeCount to strings.
    is_header_row = (df[columns].astype(str) == columns).all(axis=1)
    df = df[~is_header_row].copy()
    df['likeCount'] = pd.to_numeric(df['likeCount'], errors='coerce').fillna(0).astype(int)

    df['cleanedCommentText'] = df['commentText'].apply(clean_comment)

    # Remove exact duplicates
    df = df.drop_duplicates(subset='cleanedCommentText')

    # Remove very short comments (e.g. emojis only or less than 3 words)
    df = df[df['cleanedCommentText'].str.split().str.len() > 2]

    df_to_drive(df, video_folder_id, 'comments_preprocessed.csv')
    print(f"Cleaned comments saved to: {os.path.join(video_folder_id, 'comments_preprocessed.csv')}")

    return df

if not RUN_DASHBOARD:
    preprocess_comments(VIDEO_FOLDER_ID)


In [None]:
VIDEO_ID_LIST = [
    'teMKnxbd_O0',
    'PssKpzB0Ah0',
    'UiIRlg4Xr5w',
    'YYQXk1t_JHM',
    '7qriveAV7BY',
    'fqyl5kbZ7Tw',
    '7ARBJQn6QkM',
    'hmtuvNfytjM'
]


def run_pipeline_for_video(video_id):
    """Run the full extraction pipeline for one video.

    Steps: create or find the Drive folder, upload the transcript (if any),
    upload the metadata (if any), fetch and upload the comments, then
    preprocess them.

    Args:
        video_id (str): YouTube video ID to process.

    Returns:
        str: ID of the Drive folder used for this video.
    """
    # Drive setup
    video_folder_id = setup_drive(video_id, DRIVE_FOLDER_ID)

    # Transcript extraction
    transcript_data = fetch_english_transcript(video_id)
    if transcript_data:
        df_transcript = pd.DataFrame(transcript_data)
        print("Transcript loaded successfully into DataFrame.")
        df_to_drive(df_transcript, video_folder_id, 'transcript.csv')
    else:
        print(f"No English transcript found for video ID: {video_id}")

    # Metadata extraction
    df_metadata = get_video_metadata(video_id)
    if not df_metadata.empty:
        print("Video metadata loaded successfully into DataFrame.")
        df_to_drive(df_metadata, video_folder_id, 'video_metadata.csv')
    else:
        print(f"Failed to retrieve metadata for video ID: {video_id}")

    # Comments extraction
    save_youtube_comments_to_drive(
        video_id,
        YOUTUBE_API_KEY,
        service,
        video_folder_id
    )

    # Preprocess comments
    preprocess_comments(video_folder_id)
    return video_folder_id


# Use a dedicated loop variable so the YOUTUBE_VIDEO_ID config constant is not
# overwritten, and keep going when one video fails (e.g. comments disabled).
for batch_video_id in VIDEO_ID_LIST:
    try:
        run_pipeline_for_video(batch_video_id)
    except Exception as exc:
        print(f"Skipping video {batch_video_id}: {exc}")

Created new folder with ID: 1Ow6TcABf2iDLiiNPgIAcU4Pey7Ub73RZ
Transcript loaded successfully into DataFrame.
Uploaded 'transcript.csv' to Drive with ID: 1vTANGtkIvf455XolZ9-ghO_FmOb3m1zO
Video metadata loaded successfully into DataFrame.
Uploaded 'video_metadata.csv' to Drive with ID: 1vxUc0_DcjKUdXzEfkLUuUgQEsI15-KTW
Fetched 100 comments (total: 100)
Fetched 100 comments (total: 200)
Fetched 100 comments (total: 300)
Fetched 100 comments (total: 400)
Fetched 100 comments (total: 500)
Fetched 100 comments (total: 600)
Fetched 100 comments (total: 700)
Fetched 100 comments (total: 800)
Fetched 100 comments (total: 900)
Fetched 100 comments (total: 1000)
Fetched 100 comments (total: 1100)
Fetched 100 comments (total: 1200)
Fetched 100 comments (total: 1300)
Fetched 100 comments (total: 1400)
Fetched 100 comments (total: 1500)
Fetched 100 comments (total: 1600)
Fetched 100 comments (total: 1700)
Fetched 100 comments (total: 1800)
Fetched 100 comments (total: 1900)
Fetched 100 comments (to

### Multilingual Detection

In [None]:
# Install dependencies (googletrans is pinned to a release candidate; langdetect is unpinned)

!pip install googletrans==4.0.0-rc1 langdetect
# NOTE(review): the imports below are duplicated; `detect` and `Translator` are each imported twice.
from langdetect import detect
from googletrans import Translator
from langdetect import detect, LangDetectException
from googletrans import Translator

translator = Translator()

# NOTE(review): this cell expects a DataFrame `df` with a `cleaned_comment` column to already
# exist. No earlier cell visible in this notebook defines `df`, and preprocess_comments()
# writes the cleaned text to `cleanedCommentText`, not `cleaned_comment` — confirm the
# intended input before running on a fresh kernel.

# Drop rows whose cleaned comment has 3 characters or fewer after stripping whitespace
df = df[df['cleaned_comment'].str.strip().str.len() > 3].copy()

# Detect the language of a text; returns "unknown" when langdetect raises LangDetectException
def safe_detect(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Apply language detection
df['language'] = df['cleaned_comment'].apply(safe_detect)

# Translate a row's comment to English unless it was detected as English.
# Any translation error falls back to the untranslated text.
def translate_to_english(row):
    if row['language'] == 'en':
        return row['cleaned_comment']
    try:
        return translator.translate(row['cleaned_comment'], dest='en').text
    except Exception:
        return row['cleaned_comment']  # fallback to original if translation fails

df['translated_comment'] = df.apply(translate_to_english, axis=1)

# Save to the mounted Drive (requires the drive.mount() cell to have run)
df.to_csv("/content/drive/MyDrive/comments_translated.csv", index=False)

print("✅ Translation complete. File saved to Drive.")
# NOTE(review): prints the whole DataFrame; consider df.head() to keep the output small.
print(df)

### Intent Classification (Praise, Question, Criticism, or Suggestion)

In [None]:
# TODO

### Sentiment Classification (Positive, Negative, Neutral)

In [None]:
# Install dependencies
!pip install transformers torch
# pandas is used to load the cleaned data
import pandas as pd
# matplotlib is used for graph visualization
import matplotlib.pyplot as plt





Load the sentiment pipeline (pretrained model)

In [None]:

# Update the path based on where your file is
df = pd.read_csv("/content/drive/MyDrive/youtube_comments.csv")

# Drop missing or empty comments just in case
df = df.dropna(subset=["cleaned_comment"])
df = df[df["cleaned_comment"].str.strip() != ""]
#load pretrained model
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for sentiment analysis
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
#run anaylsis on my own data(test-data)
def get_sentiment(text):
    """Classify one comment with the module-level ``tokenizer`` and ``model``.

    The input is truncated to 512 tokens. The arg-max class index is mapped to
    "Negative" (0), "Neutral" (1) or "Positive" (2).

    Args:
        text: Comment text to classify.

    Returns:
        str: The sentiment label, or "Unknown" if anything fails (the error is
        printed).
    """
    index_to_label = dict(enumerate(("Negative", "Neutral", "Positive")))
    try:
        # Truncate input text safely to 512 tokens max (roberta's limit)
        model_inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
        logits = model(**model_inputs).logits
        return index_to_label[logits.argmax(dim=1).item()]
    except Exception as e:
        print(f"Error processing: {text[:50]}... — {str(e)}")
        return "Unknown"
# You can test on a subset first
# `.copy()` makes the subset an independent frame, so adding a column below
# does not write into a slice of the filtered DataFrame.
df = df.head(100).copy()

df['sentiment'] = df['cleaned_comment'].apply(get_sentiment)
# Saving is disabled; uncomment to write the labelled comments to Drive.
#df.to_csv("/content/drive/MyDrive/comments_with_sentiment.csv", index=False)
print(" Sentiment labels computed (saving to Drive is currently disabled).")




Device set to use cpu


 Sentiment-labeled file saved to Drive.


**Visualize output**

In [None]:
# value_counts() orders slices by frequency, so a positional colour list would
# colour whichever sentiment is most common red. Bind colours to labels.
sentiment_colors = {"Negative": "red", "Neutral": "gray", "Positive": "green", "Unknown": "lightgray"}
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts = sentiment_counts.reindex(
    [label for label in sentiment_colors if label in sentiment_counts.index]
)
sentiment_counts.plot.pie(
    autopct='%1.1f%%',
    colors=[sentiment_colors[label] for label in sentiment_counts.index],
    title="Sentiment Distribution",
)
plt.ylabel("")
plt.show()

'so'

### Topic Classification (BERTopic)

1.   *List item*
2.   List item



In [None]:
# TODO

### Summarization

In [None]:
# TODO

### Dashboard

In [None]:
!pip install -U gradio

Collecting gradio
  Downloading gradio-6.0.2-py3-none-any.whl.metadata (16 kB)
Collecting gradio-client==2.0.1 (from gradio)
  Downloading gradio_client-2.0.1-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-6.0.2-py3-none-any.whl (21.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.6/21.6 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gradio_client-2.0.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installation: gradio_client 1.14.0
    Uninstalling gradio_client-1.14.0:
      Successfully uninstalled gradio_client-1.14.0
  Attempting uninstall: gradio
    Found existing installation: gradio 5.50.0
    Uninstalling gradio-5.50.0:
      Successfully uninstalled gradio-5.50.0
Successfully installed gradio-6.0.2 gradio-client-2.0.1


In [None]:
import gradio as gr

print(gr.__version__)
import re

# --- Your helpers ---

def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Supported forms: ``watch?v=ID`` (with ``v`` in any query position),
    ``youtu.be/ID``, ``/shorts/ID``, ``/embed/ID``, ``/live/ID`` and ``/v/ID``.

    Args:
        url: The URL text typed by the user; non-string values are rejected.

    Returns:
        str or None: The video ID, or None when no ID can be found.
    """
    if not isinstance(url, str):
        return None
    # `[?&]v=` requires `v` to be a real query parameter (not the tail of e.g. `rev=`).
    pattern = r"(?:[?&]v=|youtu\.be/|/shorts/|/embed/|/live/|/v/)([a-zA-Z0-9_-]{11})"
    match = re.search(pattern, url.strip())
    return match.group(1) if match else None


# --------------------------------------------------------------------
# UI update helpers
# --------------------------------------------------------------------

def metadata_update(meta):
    """
    Returns UI update tuple for metadata components

    Args:
        meta (dict): Must contain the keys ``thumb``, ``title``, ``published``,
            ``description`` and ``category``.

    Returns:
        tuple: Five ``gr.update(...)`` results, in the order thumbnail, title,
        published date, description, category. Each component is visible only
        when its value is set. The description is cut to its first 4 lines, and
        ``[TRUNCATED]`` is appended only when lines were actually dropped.
    """
    max_lines = 4

    def truncate(description):
        lines = (description or "").split("\n")
        if len(lines) <= max_lines:
            return "\n".join(lines)  # nothing was cut: no marker
        return "\n".join(lines[:max_lines] + ['[TRUNCATED]'])

    return (
        gr.update(value=meta["thumb"], visible=meta["thumb"] is not None),
        gr.update(value=meta["title"], visible=bool(meta["title"])),
        gr.update(value=meta["published"], visible=bool(meta["published"])),
        gr.update(value=truncate(meta["description"]), visible=bool(meta["description"])),
        gr.update(value=meta["category"], visible=bool(meta["category"])),
    )


def transcript_update(transcript_df):
    """
    Render the transcript as ``[MM:SS] text`` lines for the transcript textbox.

    Args:
        transcript_df: DataFrame with ``start`` (seconds) and ``text`` columns,
            or None.

    Returns:
        tuple: A one-element tuple holding the ``gr.update(...)`` result — hidden
        and empty when there is no transcript, otherwise visible with one line
        per row.
    """
    no_transcript = transcript_df is None or transcript_df.empty
    if no_transcript:
        return (gr.update(value="", visible=False),)

    def render(start, text):
        # Whole minutes and leftover whole seconds of the segment start time.
        whole_minutes, leftover = divmod(start, 60)
        return f"[{int(whole_minutes):02d}:{int(leftover):02d}] {text}"

    body = "\n".join(
        render(start, text)
        for start, text in zip(transcript_df["start"], transcript_df["text"])
    )
    return (gr.update(value=body, visible=True),)

def comment_count_update(comments_df):
    """Return a one-element tuple with the update for the comment-count label.

    The label is hidden and empty when ``comments_df`` is None or empty;
    otherwise it shows the number of rows in bold.
    """
    has_comments = comments_df is not None and not comments_df.empty
    if has_comments:
        label = f"Total comments: **{len(comments_df)}**"
        return (gr.update(value=label, visible=True),)
    return (gr.update(value="", visible=False),)


def metadata_panel_state(meta):
    """
    Show the metadata accordion only when at least one metadata field is set.

    Args:
        meta (dict): Must contain ``thumb``, ``title``, ``published``,
            ``description`` and ``category``.

    Returns:
        The ``gr.update(visible=...)`` result for the accordion.
    """
    field_names = ("thumb", "title", "published", "description", "category")
    field_values = [meta[name] for name in field_names]
    return gr.update(visible=any(field_values))


def transcript_panel_state(transcript_df):
    """Show the transcript accordion only when a non-empty transcript exists."""
    has_rows = False if transcript_df is None else not transcript_df.empty
    return gr.update(visible=bool(has_rows))


def comment_count_panel_state(comments_df):
    """Show the comments accordion only when a non-empty comments frame exists."""
    has_rows = False if comments_df is None else not comments_df.empty
    return gr.update(visible=bool(has_rows))


# --------------------------------------------------------------------
# Main pipeline
# --------------------------------------------------------------------

def process_video_url(url):
    """Run the pipeline for one URL, streaming UI updates to Gradio.

    This is a generator: after every step it yields the full tuple of output
    values (log text, metadata panel and fields, transcript panel and text,
    comment-count panel and label) so the dashboard refreshes progressively.

    Steps: validate the URL, create or find the Drive folder, fetch the
    transcript, fetch the metadata, fetch and save the comments, preprocess the
    comments. If a step raises, the error is appended to the log (with the API
    key redacted) and the generator ends instead of crashing the UI.

    Args:
        url (str): YouTube video URL entered by the user.
    """
    log = ""

    # metadata placeholder container
    meta = {
        "thumb": None,
        "title": "",
        "published": "",
        "description": "",
        "category": ""
    }

    df_transcript = None
    comments_df = None

    # wrapper to emit current UI state; reads the latest values of the
    # enclosing variables each time it is called
    def emit():
        return (
            log,
            metadata_panel_state(meta),
            *metadata_update(meta),
            transcript_panel_state(df_transcript),
            *transcript_update(df_transcript),
            comment_count_panel_state(comments_df),
            *comment_count_update(comments_df)
        )

    # --- Pipeline execution ---

    log += "🔍 Validating YouTube URL...\n"
    yield emit()

    video_id = extract_video_id(url)
    if not video_id:
        log += "❌ Invalid YouTube URL.\n"
        yield emit()
        return

    log += f"✅ Video ID extracted: {video_id}\n"
    yield emit()

    try:
        log += "📂 Creating Google Drive folder...\n"
        yield emit()

        folder_id = setup_drive(video_id, DRIVE_FOLDER_ID)

        log += f"📁 Folder created: {folder_id}\n"
        yield emit()

        # Fetch transcript
        log += "📝 Fetching transcript...\n"
        yield emit()
        transcript = fetch_english_transcript(video_id)

        if transcript:
            df_transcript = pd.DataFrame(transcript)
            log += "✔ Transcript fetched successfully.\n"
        else:
            df_transcript = None
            log += "❌ Transcript unavailable.\n"
        yield emit()

        # Fetch metadata
        log += "📊 Fetching metadata...\n"
        yield emit()
        df_metadata = get_video_metadata(video_id)

        if not df_metadata.empty:
            row = df_metadata.iloc[0]
            meta["thumb"]       = row["thumbnail_url"]
            meta["title"]       = f"**{row['title']}**"
            meta["published"]   = f"**Published:** {row['publishedAt']}"
            meta["description"] = f"**Description:**\n{row['description']}"
            meta["category"]    = f"**Category ID:** {row['categoryId']}"

            log += "✔ Metadata loaded successfully.\n"
        else:
            log += "❌ Failed to load metadata.\n"
        yield emit()

        # Fetch comments
        log += "\n📥 Fetching comments...\n"
        yield emit()
        comments_df = save_youtube_comments_to_drive(video_id, YOUTUBE_API_KEY, service, folder_id)
        log += f"✅ {len(comments_df)} comments fetched and saved to Drive.\n"
        yield emit()

        # Preprocess comments
        log += "🧹 Preprocessing comments...\n"
        yield emit()
        comments_df = preprocess_comments(folder_id)
        log += "✅ Comments preprocessed.\n"
        yield emit()
    except Exception as exc:
        # API client errors can embed the request URL, which may contain the
        # API key; redact it before showing the message in the shared UI.
        message = str(exc)
        if YOUTUBE_API_KEY:
            message = message.replace(YOUTUBE_API_KEY, "***")
        log += f"❌ Pipeline stopped: {message}\n"
        yield emit()
    return


# --------------------------------------------------------------------
# GRADIO UI
# --------------------------------------------------------------------

with gr.Blocks() as demo:
    gr.Markdown("## 🎥 YouTube Comments Insight Tool")

    gr.Markdown(
        "Enter a YouTube URL — system will create Drive folder, fetch transcript, "
        "and extract metadata. Future insights coming soon."
    )

    # URL entry
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Video URL",
                placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=1):
            pass

    # logs panel
    with gr.Accordion("📜 Logs", open=True):
        status_box = gr.Textbox(
            label="Execution Log",
            max_lines=10,
            autoscroll=True,
            interactive=False
        )

    # --- metadata accordion (hidden initially) ---
    metadata_accordion = gr.Accordion("📼 Video Metadata", open=False, visible=False)

    with metadata_accordion:
        with gr.Row():
            thumbnail = gr.Image(label="Thumbnail", visible=False, height=180)

            with gr.Column():
                title_md = gr.Markdown(visible=False)
                published_md = gr.Markdown(visible=False)
                category_md = gr.Markdown(visible=False)

        description_md = gr.Markdown(visible=False)

    # --- full transcript accordion (hidden initially) ---
    transcript_accordion = gr.Accordion("📝 Transcript", open=False, visible=False)

    with transcript_accordion:
        transcript_md = gr.Textbox(
            label="Transcript",
            container=False,
            lines=20,
            interactive=False
        )

    # --- Comments Count panel (hidden initially) ---
    with gr.Row():
        with gr.Column(scale=1):
            comment_count_accordion = gr.Accordion("💬 Comments", open=True, visible=False)
            with comment_count_accordion:
                comment_count_md = gr.Markdown(visible=False)
        with gr.Column(scale=2):
            pass

    # bind outputs
    analyze_btn.click(
        fn=process_video_url,
        inputs=url_input,
        outputs=[
            status_box,          # log text
            metadata_accordion,  # show/hide metadata accordion
            thumbnail,
            title_md,
            published_md,
            description_md,
            category_md,
            transcript_accordion,  # show/hide transcript accordion
            transcript_md,
            comment_count_accordion,  # show/hide comment count stats
            comment_count_md
        ]
    )


css = """
.scroll-text-box {
    max-height: 300px;
    overflow-y: auto;
    white-space: pre-wrap;
}
"""

demo.launch(css=css, debug=True)


6.0.2
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2a39428e16ea785c1b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Created new folder with ID: 1lGT009GqIBU0TN-M0-XcVszk-RHFW_JM
Fetched 83 comments (total: 83)
Uploading to Google Drive...
Uploaded 'comments.csv' to Drive with ID: 1fXEZ374_wJjDy0Ub2hf3qvbhPETDmPtb
Total comments saved: 83
Found 'comments.csv' with ID: 1fXEZ374_wJjDy0Ub2hf3qvbhPETDmPtb
Download 100%.
Comments loaded successfully into df.
Uploaded 'comments_preprocessed.csv' to Drive with ID: 1SvoXshXDMOgjpr0dMjVOYe1gQo7ehILi
Cleaned comments saved to: 1lGT009GqIBU0TN-M0-XcVszk-RHFW_JM/comments_preprocessed.csv
