In [None]:
import json
import os
import re
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
from google.colab import drive
from google.colab import files
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Importing collected transcripts

In [None]:
# Mount Google Drive at /gdrive, forcing a remount.
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
%cd # Directory location removed

In [None]:
# Load the combined video table; its "video_id" column drives the comment
# collection further down. The file path was redacted from this notebook, so
# pd.read_csv() must be given a path before this cell can run.
df_combined = pd.read_csv() # File location removed
df_combined.shape

(10366, 13)

In [None]:
# Load the comments table (path redacted; supply it before running).
# Note that later cells reassign df_comments.
df_comments = pd.read_csv() # File location removed
df_comments.shape

(1034050, 7)

# Collecting comments

In [None]:
# API Keys have been removed

In [None]:
# Build the YouTube Data API v3 client used by the collection functions below.
# api_key is not defined in the notebook as shown (the key cell was redacted)
# and must be set before this cell runs.
youtube = build("youtube", "v3", developerKey=api_key)

In [None]:
# Initial function for collection of comments and replies
def get_comments_and_replies(video_id):
    """Fetch every top-level comment and every reply for one video.

    Uses the module-level ``youtube`` API client. Comment threads are read
    page by page; for each thread that reports replies, the replies are read
    page by page as well, so threads with more than 100 replies are not
    truncated.

    Args:
        video_id: ID of the video whose comments are collected.

    Returns:
        A list of dicts with the keys ``video_id``, ``comment_id``,
        ``author``, ``comment``, ``published_at``, ``like_count`` and
        ``reply_to``. ``reply_to`` is ``None`` for a top-level comment and
        the parent thread's ID for a reply.

    Raises:
        Any exception raised by the API client propagates unchanged; this
        function performs no error handling of its own.
    """
    comments = []
    next_page_token = None

    while True:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            pageToken=next_page_token,
        ).execute()

        for item in response.get("items", []):
            top_comment = item["snippet"]["topLevelComment"]["snippet"]

            # Store top-level comment
            comment_id = item["id"]
            comments.append({
                "video_id": video_id,
                "comment_id": comment_id,
                "author": top_comment["authorDisplayName"],
                "comment": top_comment["textDisplay"],
                "published_at": top_comment["publishedAt"],
                "like_count": top_comment["likeCount"],
                "reply_to": None,  # Top-level comment
            })

            # Fetch replies if available. A single request returns at most
            # maxResults replies, so keep following nextPageToken until the
            # API stops returning one.
            if item["snippet"]["totalReplyCount"] > 0:
                reply_page_token = None
                while True:
                    replies_response = youtube.comments().list(
                        part="snippet",
                        parentId=comment_id,
                        maxResults=100,
                        pageToken=reply_page_token,
                    ).execute()

                    for reply in replies_response.get("items", []):
                        reply_snippet = reply["snippet"]
                        comments.append({
                            "video_id": video_id,
                            "comment_id": reply["id"],
                            "author": reply_snippet["authorDisplayName"],
                            "comment": reply_snippet["textDisplay"],
                            "published_at": reply_snippet["publishedAt"],
                            "like_count": reply_snippet["likeCount"],
                            "reply_to": comment_id,  # Reply to this comment
                        })

                    reply_page_token = replies_response.get("nextPageToken")
                    if not reply_page_token:
                        break

        # Handle pagination of the comment threads
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

        # Prevent hitting API limits
        time.sleep(1)

    return comments

In [None]:
# Collect comments for every video listed in df_combined["video_id"].

all_comments = []
for video_id in df_combined["video_id"]:
    print(f"Fetching comments for video: {video_id}")
    try:
        all_comments.extend(get_comments_and_replies(video_id))
    except HttpError as e:
        # The API reports videos with comments turned off as an error whose
        # message contains "commentsDisabled". Skip those so one such video
        # does not abort the whole run; every other API error still propagates.
        if "commentsDisabled" not in str(e):
            raise
        print(f" Skipping video {video_id}: Comments are disabled.")

df_comments = pd.DataFrame(all_comments)

In [None]:
# Number of distinct video IDs present in the collected comments
# (missing values, if any, are counted as one distinct value).
n = df_comments["video_id"].nunique(dropna=False)
n

1095

In [None]:
# Rebuild the comments frame from the in-memory all_comments list and show
# its (rows, columns) shape.
df_comments = pd.DataFrame(all_comments)
df_comments.shape

(103785, 7)

In [None]:
# Open the Colab file-upload prompt; whatever files.upload() returns is kept
# in `uploaded` (not used elsewhere in the visible cells).
uploaded = files.upload()

Saving youtube_comments_with_replies7.csv to youtube_comments_with_replies7 (1).csv


In [None]:
# Reload previously collected comments from the uploaded CSV.
# NOTE(review): the recorded upload output reports the file was stored as
# "youtube_comments_with_replies7 (1).csv", while this reads the un-suffixed
# name — confirm this is the intended file.
df_comments = pd.read_csv("youtube_comments_with_replies7.csv")

In [None]:
# TODO: also record videos whose comments are disabled, so they are not retried on every run

In [None]:
# Resume state for the incremental collection below: which videos are already
# covered by df_comments, and where an interrupted run stopped.

# Videos to process; df_combined must provide a "video_id" column.
df_videos = df_combined
processed_videos = set()

# Try to load the checkpoint file. A missing file means there is nothing to
# resume; an unreadable one (e.g. truncated because the run died while it was
# being written) is treated the same way instead of crashing the cell.
try:
    with open("checkpoint.json", "r") as f:
        checkpoint = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    checkpoint = {}
last_processed_video = checkpoint.get("video_id", None)
last_page_token = checkpoint.get("nextPageToken", None)

# Use the already collected comments to avoid fetching the same video twice.
# df_comments is an in-memory name, so its absence surfaces as NameError (not
# FileNotFoundError); in that case start from an empty table so the rest of
# the cell can run from scratch.
try:
    existing_df = df_comments
except NameError:
    existing_df = pd.DataFrame(columns=["video_id"])
    df_comments = existing_df
processed_videos.update(existing_df["video_id"].unique())

def get_comments_and_replies(video_id, last_page_token=None):
    """Fetch comments and replies for one video, optionally resuming mid-way.

    Uses the module-level ``youtube`` API client. Comment threads are read
    page by page starting at ``last_page_token``; replies of each thread are
    paginated too. After every successfully processed page, a checkpoint is
    written to ``checkpoint.json`` holding the video ID and the token of the
    page that was just fetched.

    Rows gathered by this call live only in the returned list: when starting
    from a non-``None`` ``last_page_token``, the result contains that page and
    the ones after it, not the earlier pages.

    Error handling for ``HttpError``:
      * message contains ``commentsDisabled``: the video is skipped and the
        rows gathered so far (normally none) are returned;
      * status 403 or 429: the current page is retried with exponential
        backoff (30 s, 60 s, ...), up to 5 retries per page; after that the
        video is abandoned and the rows gathered so far are returned;
      * anything else is re-raised.

    Args:
        video_id: ID of the video whose comments are collected.
        last_page_token: commentThreads page token to start from, or ``None``
            to start at the first page.

    Returns:
        A list of dicts with the keys ``video_id``, ``comment_id``,
        ``author``, ``comment``, ``published_at``, ``like_count`` and
        ``reply_to`` (``None`` for top-level comments, the parent thread's ID
        for replies).
    """
    max_retries = 5
    comments = []
    next_page_token = last_page_token
    retries = 0  # consecutive failed attempts for the current page

    while True:
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
            ).execute()

            # Buffer the page locally: if a replies request fails half-way,
            # the whole page is retried, and committing rows early would
            # duplicate them.
            page_comments = []
            for item in response.get("items", []):
                top_comment = item["snippet"]["topLevelComment"]["snippet"]

                # Store top-level comment
                comment_id = item["id"]
                page_comments.append({
                    "video_id": video_id,
                    "comment_id": comment_id,
                    "author": top_comment["authorDisplayName"],
                    "comment": top_comment["textDisplay"],
                    "published_at": top_comment["publishedAt"],
                    "like_count": top_comment["likeCount"],
                    "reply_to": None,  # Top-level comment
                })

                # Fetch replies if available, following nextPageToken so that
                # threads with more than maxResults replies are complete.
                if item["snippet"]["totalReplyCount"] > 0:
                    reply_page_token = None
                    while True:
                        replies_response = youtube.comments().list(
                            part="snippet",
                            parentId=comment_id,
                            maxResults=100,
                            pageToken=reply_page_token,
                        ).execute()

                        for reply in replies_response.get("items", []):
                            reply_snippet = reply["snippet"]
                            page_comments.append({
                                "video_id": video_id,
                                "comment_id": reply["id"],
                                "author": reply_snippet["authorDisplayName"],
                                "comment": reply_snippet["textDisplay"],
                                "published_at": reply_snippet["publishedAt"],
                                "like_count": reply_snippet["likeCount"],
                                "reply_to": comment_id,  # Reply to this comment
                            })

                        reply_page_token = replies_response.get("nextPageToken")
                        if not reply_page_token:
                            break
        except HttpError as e:
            error_message = str(e)
            if "commentsDisabled" in error_message:
                print(f" Skipping video {video_id}: Comments are disabled.")
                break
            if e.resp.status in [403, 429]:  # Quota exceeded or too many requests
                # Give up before sleeping, not after: a final wait that is
                # followed by no retry would only waste time.
                if retries >= max_retries:
                    print("Too many retries. Skipping this video.")
                    break
                wait_time = (2 ** retries) * 30  # Exponential backoff
                print(f" Rate limit reached. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                retries += 1
                continue
            raise

        # The page completed: commit its rows and reset the backoff.
        comments.extend(page_comments)
        retries = 0

        # Save progress (token of the page that was just fetched)
        with open("checkpoint.json", "w") as f:
            json.dump({"video_id": video_id, "nextPageToken": next_page_token}, f)

        # Handle pagination
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return comments

# Fetch comments for remaining videos
all_comments = df_comments.to_dict(orient="records")  # Start from the previously collected rows

for video_id in df_videos["video_id"]:
    if video_id in processed_videos:
        print(f"Skipping already processed video: {video_id}")
        continue

    print(f"Fetching comments for video: {video_id}")

    # Resume from last_page_token if this is the video named in the checkpoint.
    # NOTE(review): rows fetched for this video before the interruption were
    # never written to the CSV, so resuming at a later page yields only that
    # page onward for this video — confirm whether restarting the video from
    # its first page (page_token = None) is preferable.
    page_token = last_page_token if video_id == last_processed_video else None

    comments = get_comments_and_replies(video_id, page_token)
    all_comments.extend(comments)

    # Save progress: build the frame once and reuse it for both the in-memory
    # table and the CSV snapshot.
    df_comments = pd.DataFrame(all_comments)
    df_comments.to_csv("youtube_comments_with_replies.csv", index=False)
    print(f"Saved progress for video: {video_id}")

# Delete checkpoint when done
if os.path.exists("checkpoint.json"):
    os.remove("checkpoint.json")

print("All comments and replies saved successfully.")