In [None]:
# Load credentials from environment (.env) to avoid committing secrets
from dotenv import load_dotenv
import os
import time
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import json

# Populate the process environment from a local .env file, then read the key
# from there so the secret never has to appear in the notebook itself.
load_dotenv()
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
if YOUTUBE_API_KEY is None or YOUTUBE_API_KEY == "":
    # Fail fast: every later cell needs an authenticated client.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Copy .env.example to .env and add your key, or export it in your environment."
    )

# Module-level YouTube Data API v3 client used by the fetch functions in this notebook.
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# Function to search for videos based on a query
def search_videos(query, max_total_results=100):
    """Collect IDs of non-live videos matching *query*, newest first, US region.

    Pages through ``youtube.search().list`` (at most 50 results per request)
    until ``max_total_results`` non-live video IDs have been gathered, the API
    stops returning a next-page token, or an error occurs.

    Args:
        query: Free-text search string.
        max_total_results: Maximum number of video IDs to return.

    Returns:
        A list of video ID strings. On any error the IDs gathered so far are
        returned; the error is printed rather than raised.
    """
    collected_ids = []
    page_token = None

    print(f"Searching for videos with query: '{query}' (Region: US, Non-Live)")
    while len(collected_ids) < max_total_results:
        try:
            response = youtube.search().list(
                q=query,
                part='id,snippet',
                type='video',
                regionCode='US',
                order='date',
                maxResults=min(50, max_total_results - len(collected_ids)),
                pageToken=page_token,
            ).execute()

            for entry in response['items']:
                # Skip live and upcoming broadcasts; only finished uploads count.
                if entry['snippet'].get('liveBroadcastContent') != 'none':
                    continue
                collected_ids.append(entry['id']['videoId'])
                if len(collected_ids) >= max_total_results:
                    break

            page_token = response.get('nextPageToken')
            if not page_token:
                break
            # Short pause between pages to stay gentle on the API.
            time.sleep(0.1)

        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred while searching for '{query}': {e.content}")
            break
        except Exception as e:
            print(f"An unexpected error occurred while searching for '{query}': {e}")
            break

    print(f"Found {len(collected_ids)} non-live videos for query '{query}'.")
    return collected_ids

# Function to get video details including metadata and statistics
def get_video_details(video_ids):
    """Fetch metadata and statistics for videos, keeping those with >= 10 comments.

    IDs are sent to ``youtube.videos().list`` in batches of 50. A batch that
    fails is reported via ``print`` and skipped; remaining batches still run.

    Args:
        video_ids: Sequence of video ID strings (may be empty or ``None``).

    Returns:
        A list of dicts with keys ``videoId``, ``title``, ``description``,
        ``channelTitle``, ``videoPublishedAt``, ``tags`` (``;``-joined),
        ``viewCount``, ``videoLikeCount`` and ``videoCommentCount``.
    """
    videos_data = []
    if not video_ids:
        return videos_data

    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]
        try:
            response = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=','.join(batch),
            ).execute()

            for video in response['items']:
                snippet = video['snippet']
                stats = video['statistics']

                # A missing commentCount is treated as zero comments.
                if int(stats.get("commentCount", 0)) < 10:
                    continue

                videos_data.append({
                    "videoId": video['id'],
                    "title": snippet.get("title"),
                    "description": snippet.get("description"),
                    "channelTitle": snippet.get("channelTitle"),
                    "videoPublishedAt": snippet.get("publishedAt"),
                    "tags": ";".join(snippet.get("tags", [])),
                    "viewCount": stats.get("viewCount"),
                    "videoLikeCount": stats.get("likeCount"),
                    "videoCommentCount": stats.get("commentCount"),
                })
            time.sleep(0.1)
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred while fetching video details for IDs {batch}: {e.content}")
            # Back off before moving on to the next batch.
            time.sleep(5)
            continue
        except Exception as e:
            print(f"An unexpected error occurred while fetching video details for IDs {batch}: {e}")
            continue

    print(f"Fetched details for {len(videos_data)} videos (with >=10 comments).")
    return videos_data

# Function to get comments for a video, including top-level comments and their replies
def get_comments(video_id, max_comments_to_fetch=None):
    """Fetch top-level comments for a video, each with its replies nested inside.

    Pages through ``youtube.commentThreads().list`` (up to 100 threads per
    request). For every thread that reports replies, the replies are fetched
    via ``get_all_replies_for_comment`` and stored under the ``replies`` key.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments_to_fetch: Upper bound on top-level comments plus replies
            combined. ``None`` means no limit.

    Returns:
        A list of dicts, one per top-level comment, with keys ``commentId``,
        ``author``, ``text``, ``likeCount``, ``publishedAt``, ``parentId``
        (always ``None``), ``authorChannelId`` and ``replies``.

    Errors are printed, never raised; whatever was collected before the error
    is returned. A 403 stops immediately. Any other HTTP error is retried
    after a 5 second pause, but only a bounded number of times per page, so a
    persistent failure (for example a 404 for a removed video) cannot loop
    forever.
    """
    max_http_attempts_per_page = 3

    all_comments_for_video = []
    next_page_token_top_level = None
    total_comments_fetched_count = 0
    failed_attempts_for_page = 0

    print(f"  Fetching comments for video {video_id} (limit: {max_comments_to_fetch or 'None'})...")
    while True:
        if max_comments_to_fetch is not None and total_comments_fetched_count >= max_comments_to_fetch:
            break

        try:
            # The check above guarantees the remaining budget is positive here.
            if max_comments_to_fetch is None:
                api_max_results_top_level = 100
            else:
                api_max_results_top_level = min(100, max_comments_to_fetch - total_comments_fetched_count)

            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=api_max_results_top_level,
                textFormat="plainText",
                pageToken=next_page_token_top_level
            )
            response = request.execute()
            # The page was served, so the retry budget starts fresh for the next one.
            failed_attempts_for_page = 0

            for item in response['items']:
                top_level_comment_snippet = item['snippet']['topLevelComment']['snippet']
                top_level_comment_id = item['id']

                comment_data = {
                    'commentId': top_level_comment_id,
                    'author': top_level_comment_snippet['authorDisplayName'],
                    'text': top_level_comment_snippet['textDisplay'],
                    'likeCount': top_level_comment_snippet['likeCount'],
                    'publishedAt': top_level_comment_snippet['publishedAt'],
                    'parentId': None,
                    'authorChannelId': top_level_comment_snippet.get('authorChannelId', {}).get('value'),
                    'replies': []
                }
                all_comments_for_video.append(comment_data)
                total_comments_fetched_count += 1

                if max_comments_to_fetch is not None and total_comments_fetched_count >= max_comments_to_fetch:
                    break

                if item['snippet']['totalReplyCount'] > 0:
                    # Replies share the same overall budget as top-level comments.
                    if max_comments_to_fetch is None:
                        remaining_limit_for_replies = None
                    else:
                        remaining_limit_for_replies = max_comments_to_fetch - total_comments_fetched_count

                    replies_for_this_comment = get_all_replies_for_comment(top_level_comment_id, limit=remaining_limit_for_replies)
                    comment_data['replies'].extend(replies_for_this_comment)
                    total_comments_fetched_count += len(replies_for_this_comment)

                if max_comments_to_fetch is not None and total_comments_fetched_count >= max_comments_to_fetch:
                    break

            next_page_token_top_level = response.get('nextPageToken')
            if not next_page_token_top_level:
                break
            time.sleep(0.1)

        except HttpError as e:
            print(f"  HTTP Error fetching top-level comments for video {video_id}: {e}")
            if e.resp.status == 403:
                print(f"  Comments likely disabled or quota exceeded for video {video_id}. Skipping.")
                break
            failed_attempts_for_page += 1
            if failed_attempts_for_page >= max_http_attempts_per_page:
                # Without this cap a persistent error would retry the same page forever.
                print(f"  Giving up on video {video_id} after {failed_attempts_for_page} consecutive HTTP errors.")
                break
            time.sleep(5)
        except Exception as e:
            print(f"  Unexpected Error fetching top-level comments for video {video_id}: {e}")
            break
    print(f"  Fetched {total_comments_fetched_count} comments (top-level + replies) for video {video_id}.")
    return all_comments_for_video

# Function to get all replies for a given parent comment ID
def get_all_replies_for_comment(parent_comment_id, limit=None):
    """Fetch replies to one top-level comment via ``youtube.comments().list``.

    Args:
        parent_comment_id: ID of the comment whose replies are requested.
        limit: Maximum number of replies to return; ``None`` means no limit.

    Returns:
        A list of dicts with keys ``commentId``, ``text``, ``author``,
        ``authorChannelId``, ``publishedAt``, ``likeCount`` and ``parentId``.
        On any error the replies gathered so far are returned; the error is
        printed rather than raised.
    """
    collected = []
    page_token = None

    def budget_spent():
        # True once a finite limit has been met or exceeded.
        return limit is not None and len(collected) >= limit

    while not budget_spent():
        try:
            page_size = 100 if limit is None else min(100, limit - len(collected))
            if page_size <= 0:
                break

            page = youtube.comments().list(
                part="snippet",
                parentId=parent_comment_id,
                maxResults=page_size,
                textFormat="plainText",
                pageToken=page_token,
            ).execute()

            for entry in page['items']:
                snippet = entry['snippet']
                collected.append({
                    'commentId': entry['id'],
                    'text': snippet['textDisplay'],
                    'author': snippet.get('authorDisplayName'),
                    'authorChannelId': snippet.get('authorChannelId', {}).get('value'),
                    'publishedAt': snippet['publishedAt'],
                    'likeCount': snippet['likeCount'],
                    'parentId': snippet.get('parentId'),
                })
                if budget_spent():
                    break

            page_token = page.get('nextPageToken')
            if not page_token:
                break
            time.sleep(0.05)

        except HttpError as e:
            print(f"HTTP Error fetching replies for parent comment {parent_comment_id}: {e}")
            break
        except Exception as e:
            print(f"Unexpected Error fetching replies for parent comment {parent_comment_id}: {e}")
            break
    return collected

# --- Collection parameters ---
query = "Russia Ukraine war"
max_videos_to_fetch = 300
limited_comments_per_video = 1000  # budget covers top-level comments and replies together

print(f"Starting data collection for query: '{query}'")
print(f"Fetching up to {max_videos_to_fetch} non-live videos from US region.")
print(f"Attempting to fetch a maximum of {limited_comments_per_video} comments (top-level + replies) per video.")
print("WARNING: Fetching many videos and comments can still consume API quota quickly. Monitor your quota usage.")

# Step 1: find candidate videos, then keep those that pass the metadata filter.
video_ids = search_videos(query, max_total_results=max_videos_to_fetch)
filtered_videos_metadata = get_video_details(video_ids)

# Step 2: fetch the comment threads for every retained video, keyed by video ID.
all_comments_by_video_id = {}
for vid_meta in filtered_videos_metadata:
    video_id = vid_meta['videoId']
    vid_comments = get_comments(video_id, max_comments_to_fetch=limited_comments_per_video)
    all_comments_by_video_id[video_id] = vid_comments
    # Pause between videos to spread requests out.
    time.sleep(0.5)

# Step 3: summary. Each top-level comment counts once, plus one per nested reply.
print("\n--- Data Collection Complete ---")
print(f"Total videos with metadata: {len(filtered_videos_metadata)}")
total_comments_fetched_overall = sum(
    1 + len(c['replies'])
    for comments in all_comments_by_video_id.values()
    for c in comments
)
print(f"Total comments (top-level + replies) fetched across all videos: {total_comments_fetched_overall}")

Starting data collection for query: 'Russia Ukraine war'
Fetching up to 300 non-live videos from US region.
Attempting to fetch a maximum of 1000 comments (top-level + replies) per video.
Searching for videos with query: 'Russia Ukraine war' (Region: US, Non-Live)
An HTTP error 403 occurred while searching for 'Russia Ukraine war': b'{\n  "error": {\n    "code": 403,\n    "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n    "errors": [\n      {\n        "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n        "domain": "youtube.quota",\n        "reason": "quotaExceeded"\n      }\n    ]\n  }\n}\n'
Found 0 non-live videos for query 'Russia Ukraine war'.
Nested comment data (limited to 1000 per video) saved to ../Data/youtube_war_comments_nested_250_comments.json

--- 

In [None]:
# Persist the nested comment threads (video ID -> list of top-level comments with replies).
# NOTE: the filename mentions 250 comments, but the actual per-video limit is whatever
# limited_comments_per_video holds; the path is kept because the merge cell reads it verbatim.
json_output_filename = "../Data/youtube_war_comments_nested_250_comments.json"
serialized_comments = json.dumps(all_comments_by_video_id, indent=4, ensure_ascii=False)
with open(json_output_filename, "w", encoding='utf-8') as f:
    f.write(serialized_comments)
print(f"Nested comment data (limited to {limited_comments_per_video} per video) saved to {json_output_filename}")

In [None]:
# Persist the filtered video metadata list as pretty-printed UTF-8 JSON.
video_json_output_filename = "../Data/youtube_video_metadata.json"
serialized_metadata = json.dumps(filtered_videos_metadata, indent=4, ensure_ascii=False)
with open(video_json_output_filename, "w", encoding='utf-8') as f:
    f.write(serialized_metadata)
print(f"Video metadata saved to {video_json_output_filename}")

In [None]:
# Imported before first use so this cell also runs in a fresh kernel.
import json

# Reload both artifacts from disk rather than relying on variables from earlier cells.
with open('../Data/youtube_video_metadata.json', 'r', encoding='utf-8') as f:
    video_metadata = json.load(f)

with open('../Data/youtube_war_comments_nested_250_comments.json', 'r', encoding='utf-8') as f:
    comments_data = json.load(f)

# Attach each video's comment threads; videos without an entry get an empty list.
for video in video_metadata:
    video_id = video['videoId']
    video['comments'] = comments_data.get(video_id, [])

# ensure_ascii=False matches the other JSON dumps in this notebook, so non-ASCII
# titles and comments are stored as readable UTF-8 instead of \uXXXX escapes.
output_file_name = '../Data/merged_youtube_data.json'
with open(output_file_name, 'w', encoding='utf-8') as f:
    json.dump(video_metadata, f, indent=4, ensure_ascii=False)

print(f"Merged data saved to {output_file_name}")

In [None]:
# Summary of the merged dataset. Comments are stored nested (each top-level
# comment carries a 'replies' list), so replies must be added explicitly for
# the total to agree with the "top-level + replies" count reported at collection time.
total_videos = len(video_metadata)
total_top_level_comments = sum(len(video['comments']) for video in video_metadata)
total_replies = sum(
    len(comment.get('replies', []))
    for video in video_metadata
    for comment in video['comments']
)
total_comments = total_top_level_comments + total_replies
print(f"Total videos: {total_videos}")
print(f"Total comments across all videos: {total_comments}")
print(f"  (top-level: {total_top_level_comments}, replies: {total_replies})")