In [None]:
import csv
import json
import os
import re
import time

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# --- CONFIGURATION ---
# The YouTube Data API key is read from the YOUTUBE_API_KEY environment
# variable so the credential is never stored in (or committed with) this file.
# NOTE(review): a key was previously hard-coded here; if it was a real key it
# should be treated as leaked and revoked.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not YOUTUBE_API_KEY:
    # Fail early with a clear message instead of letting every API call fail later.
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to your YouTube Data API key."
    )

# Shared API client used by every fetch function below.
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# --- 1. Search Videos (with pagination and filters) ---
def search_videos(query, max_total_results=100):
    """
    Collect the IDs of non-live videos that match ``query``.

    Results are requested for the US region, ordered by date, and paged
    through until ``max_total_results`` IDs are gathered or the API reports
    no further page. Entries whose ``liveBroadcastContent`` is not ``'none'``
    are skipped. Any error stops the search and the IDs gathered so far are
    returned.
    """
    collected_ids = []
    page_token = None

    print(f"Searching for videos with query: '{query}' (Region: US, Non-Live)")
    while len(collected_ids) < max_total_results:
        try:
            # The search endpoint is asked for at most 50 results per page.
            page = youtube.search().list(
                q=query,
                part='id,snippet',
                type='video',
                regionCode='US',
                order='date',
                maxResults=min(50, max_total_results - len(collected_ids)),
                pageToken=page_token
            ).execute()

            for entry in page['items']:
                # Skip live and upcoming broadcasts.
                if entry['snippet'].get('liveBroadcastContent') != 'none':
                    continue
                collected_ids.append(entry['id']['videoId'])
                if len(collected_ids) >= max_total_results:
                    break

            page_token = page.get('nextPageToken')
            if not page_token:
                break  # Last page reached
            time.sleep(0.1)  # Brief pause between pages

        except HttpError as http_err:
            print(f"An HTTP error {http_err.resp.status} occurred while searching for '{query}': {http_err.content}")
            break
        except Exception as err:
            print(f"An unexpected error occurred while searching for '{query}': {err}")
            break

    print(f"Found {len(collected_ids)} non-live videos for query '{query}'.")
    return collected_ids

# --- 2. Get Video Details ---
def get_video_details(video_ids):
    """
    Look up snippet and statistics for the given video IDs.

    IDs are sent in batches of 50. Only videos whose ``commentCount`` is at
    least 10 are kept (a missing count is treated as 0). A failed batch is
    reported and skipped; the remaining batches are still processed.
    Returns a list of flat metadata dicts, one per kept video.
    """
    kept_videos = []
    if not video_ids:
        return kept_videos

    batch_size = 50  # videos().list is called with at most 50 IDs at a time
    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        try:
            response = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=','.join(batch)
            ).execute()

            for video in response['items']:
                snippet = video['snippet']
                stats = video['statistics']

                # Drop videos with fewer than 10 comments.
                if int(stats.get("commentCount", 0)) < 10:
                    continue

                kept_videos.append({
                    "videoId": video['id'],
                    "title": snippet.get("title"),
                    "description": snippet.get("description"),
                    "channelTitle": snippet.get("channelTitle"),
                    "videoPublishedAt": snippet.get("publishedAt"),
                    "tags": ";".join(snippet.get("tags", [])),  # Tags flattened to one string
                    "viewCount": stats.get("viewCount"),
                    "videoLikeCount": stats.get("likeCount"),
                    "videoCommentCount": stats.get("commentCount")
                })
            time.sleep(0.1)  # Brief pause between batches
        except HttpError as http_err:
            print(f"An HTTP error {http_err.resp.status} occurred while fetching video details for IDs {batch}: {http_err.content}")
            time.sleep(5)  # Back off longer after an HTTP failure
            continue
        except Exception as err:
            print(f"An unexpected error occurred while fetching video details for IDs {batch}: {err}")
            continue

    print(f"Fetched details for {len(kept_videos)} videos (with >=10 comments).")
    return kept_videos

# --- 3. Get All Replies for a Specific Comment (with limit) ---
def get_all_replies_for_comment(parent_comment_id, limit=None):
    """
    Page through ``comments().list`` for the replies to one parent comment.

    ``limit`` caps how many replies are returned; ``None`` means no cap.
    Any error ends the paging and the replies gathered so far are returned.
    """
    collected = []
    page_token = None
    capped = limit is not None

    def limit_reached():
        # True once a cap exists and enough replies have been gathered.
        return capped and len(collected) >= limit

    while not limit_reached():
        try:
            # Ask only for as many replies as still fit under the cap (max 100).
            page_size = min(100, limit - len(collected)) if capped else 100

            page = youtube.comments().list(
                part="snippet",
                parentId=parent_comment_id,
                maxResults=page_size,
                textFormat="plainText",
                pageToken=page_token
            ).execute()

            for entry in page['items']:
                body = entry['snippet']
                collected.append({
                    'commentId': entry['id'],
                    'text': body['textDisplay'],
                    'author': body.get('authorDisplayName'),
                    'authorChannelId': body.get('authorChannelId', {}).get('value'),
                    'publishedAt': body['publishedAt'],
                    'likeCount': body['likeCount'],
                    'parentId': body.get('parentId')
                })
                if limit_reached():
                    break

            page_token = page.get('nextPageToken')
            if not page_token:
                break
            time.sleep(0.05)  # Tiny pause between reply pages

        except HttpError as http_err:
            print(f"HTTP Error fetching replies for parent comment {parent_comment_id}: {http_err}")
            break
        except Exception as err:
            print(f"Unexpected Error fetching replies for parent comment {parent_comment_id}: {err}")
            break
    return collected

# --- 4. Get All Comments (Top-Level and Limited Replies) for a Video ---
def get_comments(video_id, max_comments_to_fetch=None, max_retries=3):
    """
    Fetch top-level comments and their replies for one video.

    Args:
        video_id: ID of the video whose comment threads are listed.
        max_comments_to_fetch: Cap on the combined number of top-level
            comments and replies; ``None`` means no cap.
        max_retries: Number of consecutive failed page requests (HTTP errors
            other than 403) that are retried before giving up on the video.
            Without this bound a persistent error would be retried forever.

    Returns:
        A list of top-level comment dicts with the keys ``commentId``,
        ``author``, ``text``, ``likeCount``, ``publishedAt``, ``parentId``
        (always ``None``), ``authorChannelId`` and ``replies`` (the list
        returned by ``get_all_replies_for_comment``). Comments gathered
        before an error are still returned.
    """
    all_comments_for_video = []
    next_page_token_top_level = None
    total_comments_fetched_count = 0
    consecutive_failures = 0
    has_limit = max_comments_to_fetch is not None

    print(f"  Fetching comments for video {video_id} (limit: {max_comments_to_fetch or 'None'})...")
    while not (has_limit and total_comments_fetched_count >= max_comments_to_fetch):
        try:
            # Request only as many threads as still fit under the cap (max 100 per page).
            if has_limit:
                page_size = min(100, max_comments_to_fetch - total_comments_fetched_count)
            else:
                page_size = 100

            # The 'replies' part is requested, but the replies themselves are
            # fetched separately via get_all_replies_for_comment below.
            response = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=page_size,
                textFormat="plainText",
                pageToken=next_page_token_top_level
            ).execute()
            consecutive_failures = 0  # A successful request resets the retry budget

            for item in response['items']:
                top_level_comment_snippet = item['snippet']['topLevelComment']['snippet']
                top_level_comment_id = item['id']

                comment_data = {
                    'commentId': top_level_comment_id,
                    'author': top_level_comment_snippet['authorDisplayName'],
                    'text': top_level_comment_snippet['textDisplay'],
                    'likeCount': top_level_comment_snippet['likeCount'],
                    'publishedAt': top_level_comment_snippet['publishedAt'],
                    'parentId': None,  # Top-level comments have no parent
                    'authorChannelId': top_level_comment_snippet.get('authorChannelId', {}).get('value'),
                    'replies': []  # Filled below when the thread has replies
                }
                all_comments_for_video.append(comment_data)
                total_comments_fetched_count += 1

                if has_limit and total_comments_fetched_count >= max_comments_to_fetch:
                    break  # Cap reached by this top-level comment

                if item['snippet']['totalReplyCount'] > 0:
                    # The cap is not yet reached here, so the remaining budget is positive.
                    if has_limit:
                        remaining_limit_for_replies = max_comments_to_fetch - total_comments_fetched_count
                    else:
                        remaining_limit_for_replies = None
                    replies_for_this_comment = get_all_replies_for_comment(
                        top_level_comment_id, limit=remaining_limit_for_replies
                    )
                    comment_data['replies'].extend(replies_for_this_comment)
                    total_comments_fetched_count += len(replies_for_this_comment)

                if has_limit and total_comments_fetched_count >= max_comments_to_fetch:
                    break  # Cap reached after adding replies

            next_page_token_top_level = response.get('nextPageToken')
            if not next_page_token_top_level:
                break  # No more pages of top-level comments
            time.sleep(0.1)  # Small delay between top-level comment pages

        except HttpError as e:
            print(f"  HTTP Error fetching top-level comments for video {video_id}: {e}")
            if e.resp.status == 403:  # Comments disabled or quota exceeded
                print(f"  Comments likely disabled or quota exceeded for video {video_id}. Skipping.")
                break
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Stop instead of retrying the same failing page indefinitely.
                print(f"  Giving up on video {video_id} after {max_retries} failed retries.")
                break
            time.sleep(5)  # Longer delay before retrying the same page
            continue
        except Exception as e:
            print(f"  Unexpected Error fetching top-level comments for video {video_id}: {e}")
            break
    print(f"  Fetched {total_comments_fetched_count} comments (top-level + replies) for video {video_id}.")
    return all_comments_for_video

# --- 5. Save Combined Data to CSV (Flattened Structure) ---
def save_combined_data_to_csv(videos_metadata, comments_data_dict, file_name="youtube_data_combined_limited.csv"):
    """
    Write video metadata and comments to one flat CSV file.

    Every top-level comment and every reply becomes its own row, prefixed
    with the metadata of the video it belongs to. A video with no comments
    in ``comments_data_dict`` still gets a single row whose comment columns
    are empty.
    """
    video_columns = [
        "videoId", "title", "description", "channelTitle", "videoPublishedAt", "tags",
        "viewCount", "videoLikeCount", "videoCommentCount",
    ]
    comment_columns = [
        "commentId", "text", "author", "authorChannelId", "publishedAt", "likeCount", "parentId",
    ]

    def flatten(video_fields, comment):
        # One CSV row: the video's fields plus the comment-level fields.
        merged = video_fields.copy()
        for column in comment_columns:
            merged[column] = comment[column]
        return merged

    flat_rows = []
    for video in videos_metadata:
        threads = comments_data_dict.get(video['videoId'], [])

        if not threads:
            # Keep the video visible in the output even without comments.
            placeholder = video.copy()
            placeholder.update(dict.fromkeys(comment_columns))
            flat_rows.append(placeholder)
            continue

        for thread in threads:
            flat_rows.append(flatten(video, thread))
            # Replies follow their parent and are linked to it through parentId.
            flat_rows.extend(flatten(video, reply) for reply in thread['replies'])

    with open(file_name, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=video_columns + comment_columns)
        writer.writeheader()
        writer.writerows(flat_rows)
    print(f"Combined video and comment data saved to {file_name}")

# --- MAIN EXECUTION ---
# Runs the whole pipeline: search -> video details -> comments -> JSON dump.
if __name__ == "__main__":
    query = "Russia Ukraine war"
    max_videos_to_fetch = 300  # Upper bound on non-live videos requested from search
    limited_comments_per_video = 1000  # Cap on top-level comments + replies per video

    print(f"Starting data collection for query: '{query}'")
    print(f"Fetching up to {max_videos_to_fetch} non-live videos from US region.")
    print(f"Attempting to fetch a maximum of {limited_comments_per_video} comments (top-level + replies) per video.")
    print("WARNING: Fetching many videos and comments can still consume API quota quickly. Monitor your quota usage.")

    video_ids = search_videos(query, max_total_results=max_videos_to_fetch)
    filtered_videos_metadata = get_video_details(video_ids)

    # Maps videoId -> list of top-level comment dicts (each carrying its replies).
    all_comments_by_video_id = {}
    for vid_meta in filtered_videos_metadata:
        video_id = vid_meta['videoId']
        vid_comments = get_comments(video_id, max_comments_to_fetch=limited_comments_per_video)
        all_comments_by_video_id[video_id] = vid_comments
        time.sleep(0.5)  # Small delay between processing different videos

    # Save comments as JSON (nested structure).
    # The file name still says "250" although the cap is limited_comments_per_video;
    # it is kept unchanged because a later cell loads this exact path.
    json_output_filename = "../Data/youtube_war_comments_nested_250_comments.json"
    if all_comments_by_video_id:
        with open(json_output_filename, "w", encoding='utf-8') as f:
            json.dump(all_comments_by_video_id, f, indent=4, ensure_ascii=False)
        print(f"Nested comment data (limited to {limited_comments_per_video} per video) saved to {json_output_filename}")
    else:
        # Nothing was collected (e.g. the search failed or quota was exhausted):
        # do not overwrite a previously saved dataset with an empty object.
        print(f"No comment data was collected; leaving {json_output_filename} untouched.")

    # Optional: save combined data to CSV (flattened structure)
    # csv_output_filename = "youtube_war_data_combined_250_comments.csv"
    # save_combined_data_to_csv(filtered_videos_metadata, all_comments_by_video_id, csv_output_filename)

    print("\n--- Data Collection Complete ---")
    print(f"Total videos with metadata: {len(filtered_videos_metadata)}")
    # Each video contributes its top-level comments plus all replies nested under them.
    total_comments_fetched_overall = sum(
        len(comments) + sum(len(c['replies']) for c in comments)
        for comments in all_comments_by_video_id.values()
    )
    print(f"Total comments (top-level + replies) fetched across all videos: {total_comments_fetched_overall}")

Starting data collection for query: 'Russia Ukraine war'
Fetching up to 300 non-live videos from US region.
Attempting to fetch a maximum of 1000 comments (top-level + replies) per video.
Searching for videos with query: 'Russia Ukraine war' (Region: US, Non-Live)
An HTTP error 403 occurred while searching for 'Russia Ukraine war': b'{\n  "error": {\n    "code": 403,\n    "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n    "errors": [\n      {\n        "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n        "domain": "youtube.quota",\n        "reason": "quotaExceeded"\n      }\n    ]\n  }\n}\n'
Found 0 non-live videos for query 'Russia Ukraine war'.
Nested comment data (limited to 1000 per video) saved to ../Data/youtube_war_comments_nested_250_comments.json

--- 

In [None]:
# --- Snippet to save video data to JSON ---
# Writes the filtered video metadata list as pretty-printed UTF-8 JSON.
video_json_output_filename = "../Data/youtube_video_metadata.json"
if filtered_videos_metadata:
    with open(video_json_output_filename, "w", encoding='utf-8') as f:
        json.dump(filtered_videos_metadata, f, indent=4, ensure_ascii=False)
    print(f"Video metadata saved to {video_json_output_filename}")
else:
    # An empty list means the collection run produced nothing; keep any
    # previously saved metadata file instead of replacing it with "[]".
    print(f"No video metadata was collected; leaving {video_json_output_filename} untouched.")
# --- End of snippet ---

In [None]:
# Imported before first use so this cell also runs on its own in a fresh kernel.
import json

# Load the video metadata
with open('../Data/youtube_video_metadata.json', 'r', encoding='utf-8') as f:
    video_metadata = json.load(f)

# Load the comments data (a dict looked up by videoId below)
with open('../Data/youtube_war_comments_nested_250_comments.json', 'r', encoding='utf-8') as f:
    comments_data = json.load(f)

# Merge the data: attach each video's comment list, or an empty list when
# the comments file has no entry for that video.
for video in video_metadata:
    video_id = video['videoId']
    video['comments'] = comments_data.get(video_id, [])

# Save the merged data to a new JSON file with UTF-8 encoding.
# ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes,
# matching how the input files were written.
output_file_name = '../Data/merged_youtube_data.json'
with open(output_file_name, 'w', encoding='utf-8') as f:
    json.dump(video_metadata, f, indent=4, ensure_ascii=False)

print(f"Merged data saved to {output_file_name}")

In [None]:
# Total count of videos and comments
total_videos = len(video_metadata)

# video['comments'] holds top-level comments only; replies are nested under
# each comment's 'replies' list and must be counted separately.
top_level_comment_count = sum(len(video['comments']) for video in video_metadata)
reply_count = sum(
    len(comment.get('replies', []))
    for video in video_metadata
    for comment in video['comments']
)
total_comments = top_level_comment_count + reply_count

print(f"Total videos: {total_videos}")
print(f"Total comments across all videos: {total_comments}")
print(f"  of which top-level: {top_level_comment_count}, replies: {reply_count}")