In [81]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from googleapiclient.errors import HttpError
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

In [82]:
load_dotenv()
class YouTubeAPIClient:
    def __init__(self):
        # Load and clean keys
        keys = os.getenv("YOUTUBE_API_KEYS")
        if not keys:
            raise ValueError("No API keys found in file.")
        self.api_keys = [key.strip() for key in keys.split(',')]
        self.index = 0
        self.youtube = self._build_client(self.api_keys[self.index])

    def _build_client(self, key):
        return build("youtube", "v3", developerKey=key)

    def _rotate_key(self):
        self.index += 1
        if self.index >= len(self.api_keys):
            raise Exception("All API keys exhausted.")
        print(f"Switching to next API key: {self.api_keys[self.index]}")
        self.youtube = self._build_client(self.api_keys[self.index])

    def execute(self, request):
        while True:
            try:
                return request.execute()
            except HttpError as e:
                error_res = e.resp.get('status')
                if error_res == 403 and 'quotaExceeded' in str(e):
                    print(f"Quota exhausted for API key {self.api_keys[self.index]}")
                    self._rotate_key()
                else:
                    raise e

    def get_client(self):
        return self.youtube

yt_client = YouTubeAPIClient()
youtube = yt_client.get_client()

In [113]:
# Step 1: Search for News Channels
def search_news_channels(max_results=100):
    try:
        request = youtube.search().list(
            part='snippet',
            type='channel',
            q='news',
            regionCode='US',
            maxResults=max_results
        )
        response = request.execute()
        return [(item['snippet']['channelId'], item['snippet']['channelTitle']) for item in response['items']]
    except Exception as e:
        print(f"Error searching news channels: {e}")
        return []

In [114]:
# Step 2: Get Channel Statistics
def get_channel_stats(channel_id):
    try:
        request = youtube.channels().list(
            part='snippet,statistics',
            id=channel_id
        )
        response = request.execute()
        data = response['items'][0]
        stats = data['statistics']
        return {
            'channel_id': channel_id,
            'channel_title': data['snippet']['title'],
            'subscriber_count': int(stats.get('subscriberCount', 0)),
            'total_views': int(stats.get('viewCount', 0)),
            'video_count': int(stats.get('videoCount', 0))
        }
    except Exception as e:
        print(f"Error getting channel stats for {channel_id}: {e}")
        return None

In [115]:
# Step 3: Get Uploads Playlist ID
def get_uploads_playlist_id(channel_id):
    try:
        request = youtube.channels().list(
            part='contentDetails',
            id=channel_id
        )
        response = request.execute()
        return response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    except Exception as e:
        print(f"Error getting uploads playlist for {channel_id}: {e}")
        return None

In [116]:
# Step 4: Get Latest Video IDs
def get_latest_video_ids(playlist_id, max_videos=5):
    video_ids = []
    try:
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=max_videos
        )
        response = request.execute()
        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])
    except Exception as e:
        print(f"Error getting video IDs from playlist {playlist_id}: {e}")
    return video_ids

In [117]:
# Step 5: Get Video Metadata
def get_video_stats(video_id):
    try:
        request = youtube.videos().list(
            part='snippet,statistics',
            id=video_id
        )
        response = request.execute()
        data = response['items'][0]
        stats = data['statistics']
        return {
            'video_id': video_id,
            'video_title': data['snippet']['title'],
            'video_published_at': data['snippet']['publishedAt'],
            'video_views': int(stats.get('viewCount', 0)),
            'video_likes': int(stats.get('likeCount', 0)),
            'video_comments': int(stats.get('commentCount', 0))
        }
    except Exception as e:
        print(f"Error getting stats for video {video_id}: {e}")
        return None

In [118]:
# === Run the full pipeline ===
def get_top_news_videos():
    final_data = []
    channels = search_news_channels(max_results=100)

    for channel_id, channel_title in channels:
        channel_info = get_channel_stats(channel_id)
        if not channel_info:
            continue

        uploads_playlist = get_uploads_playlist_id(channel_id)
        if not uploads_playlist:
            continue

        video_ids = get_latest_video_ids(uploads_playlist, max_videos=5)
        for vid in video_ids:
            video_info = get_video_stats(vid)
            if video_info:
                final_data.append({
                    **channel_info,
                    **video_info
                })

    return pd.DataFrame(final_data)

In [119]:
df = get_top_news_videos()
df.to_csv("top_news_videos.csv", index=False)
df

Error getting video IDs from playlist UUYfdidRxbB8Qhf0Nx7ioOYw: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=UUYfdidRxbB8Qhf0Nx7ioOYw&maxResults=5&key=AIzaSyA2fHh7U-Q2qlvBBaBi2D59qvgaxBM6yyY&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". Details: "[{'message': "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.", 'domain': 'youtube.playlistItem', 'reason': 'playlistNotFound', 'location': 'playlistId', 'locationType': 'parameter'}]">
Error getting video IDs from playlist UUEl0qh9X3kuL1RdFHng497Q: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=UUEl0qh9X3kuL1RdFHng497Q&maxResults=5&key=AIzaSyA2fHh7U-Q2qlvBBaBi2D59qvgaxBM6yyY&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". 

Unnamed: 0,channel_id,channel_title,subscriber_count,total_views,video_count,video_id,video_title,video_published_at,video_views,video_likes,video_comments
0,UCuTAXTexrhetbOe3zgskJBQ,日テレNEWS,2780000,4050252948,72293,gBBdYhfWzKI,米中関税協議で共同声明「１１５％引き下げ」合意の背景▽協議出席者の顔ぶれから見える双方の思惑...,2025-05-13T02:00:06Z,283,7,0
1,UCuTAXTexrhetbOe3zgskJBQ,日テレNEWS,2780000,4050252948,72293,ZjwyCwbjdjU,【最新 ニュースライブ】最新ニュースと生活情報（5月13日） ──THE LATEST NE...,2025-05-13T00:35:07Z,1482,11,0
2,UCuTAXTexrhetbOe3zgskJBQ,日テレNEWS,2780000,4050252948,72293,vDoUL0qyV-g,【国会中継】『参議院・財政金融委員会』チャットで語ろう！ ──政治ニュースライブ［2025年...,2025-05-12T09:56:19Z,1381,22,0
3,UCuTAXTexrhetbOe3zgskJBQ,日テレNEWS,2780000,4050252948,72293,MDFwyS8eyg0,【朝 ニュースライブ】最新ニュースと生活情報（5月13日） ──THE LATEST NEW...,2025-05-13T01:46:39Z,62472,144,0
4,UCuTAXTexrhetbOe3zgskJBQ,日テレNEWS,2780000,4050252948,72293,ey8gEkH3D3U,【コメ価格】18週ぶり下落 「備蓄米」効果は？ 社員にコメ配布の企業も,2025-05-12T23:14:19Z,938,15,6
...,...,...,...,...,...,...,...,...,...,...,...
216,UC69SwnuvoumrKy6giA6crnA,Bad News Reunion - Topic,44,19938,109,GDHq0xd8xQo,Young Girl Blues (Live),2020-03-12T15:28:00Z,13,0,0
217,UC69SwnuvoumrKy6giA6crnA,Bad News Reunion - Topic,44,19938,109,GqqS-5TOnwE,Coming Into Los Angeles (Live),2020-03-12T15:28:00Z,15,0,0
218,UC69SwnuvoumrKy6giA6crnA,Bad News Reunion - Topic,44,19938,109,r_nRGqPKw48,Ill Go Crazy (Live),2020-03-12T15:28:00Z,11,0,0
219,UC69SwnuvoumrKy6giA6crnA,Bad News Reunion - Topic,44,19938,109,CY0f0cVnH7A,The Thrill Is Gone (Live),2020-03-12T15:27:59Z,9,0,0


In [122]:
df['channel_title'].unique()

array(['日テレNEWS', 'にゅうちゅうぶ | NEWS official', 'Sky News', 'TVBS NEWS',
       'ABS-CBN News', '9 News Australia', 'Geo News', 'Fox News',
       'NBC News', 'ABC News', '無綫新聞 TVB NEWS Official', 'BBC News',
       'NEWS AM', 'News - Topic', 'Local News Legend - Topic',
       'Broken News - Topic', 'Good News Circle - Topic',
       'Vida News - Topic', 'Good News Team - Topic', 'Good News - Topic',
       'NEWS - Topic', 'The News Hinano - Topic', 'Great News - Topic',
       'News from Neptune - Topic', 'News From Nowhere - Topic',
       'Tierra Propia News - Topic', 'Hobo News - Topic',
       'Public Spreads The News - Topic', 'Phokat News', 'aoen News',
       'Violent News - Topic', 'Broadway News - Topic',
       'Good News Everyone - Topic', 'John News - Topic',
       'The News Channel - Topic', 'Shipping News - Topic',
       'The Good News - Topic', 'News PrimeTime 24',
       'Good News Music - Topic', 'TRUE LINE NEWS ',
       'Animal News - Topic', 'THE NEWS - Topic', 'Fu

In [45]:
# Print the channel info (you can modify this to display only the channel IDs and names)
for channel in news_channels['items']:
    channel_id = channel['snippet']['channelId']
    channel_title = channel['snippet']['channelTitle']
    print(f"{channel_title} - {channel_id}")

日テレNEWS - UCuTAXTexrhetbOe3zgskJBQ
にゅうちゅうぶ | NEWS official - UCj9f2amb1D0RuTNrBQ9GJoQ
Sky News - UCoMdktPbSTixAyNGwb-UYkQ
ABS-CBN News - UCE2606prvXQc_noEqKxVJXA
TVBS NEWS - UC5nwNW4KdC0SzrhF9BXEYOQ
9 News Australia - UCIYLOcEUX6TbBo7HQVF2PKA
News - UCYfdidRxbB8Qhf0Nx7ioOYw
Geo News - UC_vt34wimdCzdkrzVejwX9g
Fox News - UCXIJgqnII2ZOINSWNOGFThA
NBC News - UCeY0bbntWzzVIaj2z3QigXg
ABC News - UCBi2mrWuNuyYy4gbM6fU18Q
無綫新聞 TVB NEWS Official - UC_ifDTtFAcsj-wJ5JfM27CQ
BBC News - UC16niRr50-MSBwiO3YDb3RA
NEWS AM - UCDv-XtfgNGHXcpwu4ab0WnA
News - Topic - UCte8ywmzIMWUT3K6rH4H1XQ
Vida News - Topic - UCk6j5PWGPvpyYpKyYz7z91Q
Good News Circle - Topic - UC79umPRGQIo23TpAyfpv48w
Good News - Topic - UCrLbXfsTlSlU4eZYiAXNcsQ
NEWS - Topic - UCbYM7oBcZNzqQp98PiF5HKw
Great News - Topic - UCDAqPSrlnLSJJrjiI0eoHYw
Health News - UCn371zWk5jljg-ycIXkEUSA
News From Nowhere - Topic - UCvOJ4neMIuwsAvoNV82sCPA
Tierra Propia News - Topic - UCkYvApoUkQR4oNh_vCN3JLA
Hobo News - Topic - UC9Ik6IQd7ro2nhr3sI7wCNg
P

In [34]:
def get_channel_stats(channel_id):
    request = youtube.channels().list(
        part='snippet,statistics',
        id=channel_id
    )
    response = request.execute()
    stats = response['items'][0]['statistics']
    return {
        'channel_id': channel_id,
        'title': response['items'][0]['snippet']['title'],
        'subscribers': stats.get('subscriberCount', 'hidden'),
        'total_views': stats.get('viewCount'),
        'video_count': stats.get('videoCount')
    }

In [35]:
def get_uploads_playlist_id(channel_id):
    request = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    )
    response = request.execute()
    return response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

def get_video_ids_from_playlist(playlist_id, max_videos=5):
    video_ids = []
    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults=max_videos
    )
    response = request.execute()
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
    return video_ids


In [36]:
def get_video_stats(video_id):
    request = youtube.videos().list(
        part='snippet,statistics',
        id=video_id
    )
    response = request.execute()
    data = response['items'][0]
    return {
        'title': data['snippet']['title'],
        'published_at': data['snippet']['publishedAt'],
        'views': data['statistics'].get('viewCount'),
        'likes': data['statistics'].get('likeCount'),
        'comments': data['statistics'].get('commentCount')
    }


In [57]:
def get_video_comments(video_id, max_comments=100000):
    comments = []
    request = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=1000000,  # YouTube limits this per page
        textFormat='plainText'
    )
    response = request.execute()
    
    for item in response['items']:
        top_comment = item['snippet']['topLevelComment']['snippet']
        comments.append({
            'author': top_comment['authorDisplayName'],
            'profile_image': top_comment['authorProfileImageUrl'],
            'text': top_comment['textDisplay'],
            'like_count': top_comment['likeCount'],
            'published_at': top_comment['publishedAt']
        })

    return comments[:max_comments]


In [68]:
import pandas as pd

# Function to get replies for a specific comment
def get_replies(youtube, parent_id, video_id): 
    replies = []
    next_page_token = None

    while True:
        reply_request = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=next_page_token
        )
        reply_response = reply_request.execute()

        for item in reply_response['items']:
            comment = item['snippet']
            replies.append({
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': comment['textDisplay'],
                'Date': comment['updatedAt'] if 'updatedAt' in comment else comment['publishedAt'],
                'UserID': comment['authorChannelId']['value']
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies

# Function to get all comments (including replies) for a single video
def get_comments_for_video(youtube, video_id, max_comments=100):
    all_comments = []
    next_page_token = None
    comments_fetched = 0

    while True:
        comment_request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100
        )
        comment_response = comment_request.execute()

        for item in comment_response['items']:
            top_comment = item['snippet']['topLevelComment']['snippet']
            comment_data = {
                'VideoID': video_id,
                'VideoDate': top_comment['publishedAt'],  # Date of the comment (same as video post time for top comment)
                'Comment': top_comment['textDisplay'],
                'CommentTimestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'UserID': top_comment.get('authorChannelId', {}).get('value'),  # User's channel ID
                'Likes': top_comment.get('likeCount', 0),  # Likes on the comment
                'Dislikes': 0,  # Dislike data is not provided by the API, so we assume 0.
                'Replies': item['snippet']['totalReplyCount'],  # Number of replies to the comment
            }

            all_comments.append(comment_data)

            # Fetch replies if there are any
            if item['snippet']['totalReplyCount'] > 0:
                all_comments.extend(get_replies(youtube, item['snippet']['topLevelComment']['id'], video_id))

            comments_fetched += 1
            if comments_fetched >= max_comments:
                break

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token or comments_fetched >= max_comments:
            break

    # Convert the list of comments to a pandas DataFrame
    comments_df = pd.DataFrame(all_comments)
    return comments_df

# Test the function on the provided video ID
video_id = "cMe6JGJtAb8"
comments_df = get_comments_for_video(youtube, video_id, max_comments=1000)

# Display the first few rows of the DataFrame
comments_df.head()


Unnamed: 0,VideoID,VideoDate,Comment,CommentTimestamp,Username,UserID,Likes,Dislikes,Replies,Timestamp,Date
0,cMe6JGJtAb8,2025-05-12T23:46:38Z,Really!? What trade deal? You mean he's going ...,2025-05-12T23:46:38Z,@Elcidro123,UC04TjoS58Xpzg9yAjuY-HGw,0.0,0.0,0.0,,
1,cMe6JGJtAb8,2025-05-12T23:43:41Z,Trade deal my arse! He’s back peddling fast as...,2025-05-12T23:43:41Z,@scamwatchdog,UCLRIrszWF_bAM28DW9_SCbQ,0.0,0.0,0.0,,
2,cMe6JGJtAb8,2025-05-12T23:38:56Z,YA GREAT NEWS WE WENT BACK TO BEFORE THE TARIF...,2025-05-12T23:38:56Z,@ChrisVukadin,UCSwkbqwDZhQtL1y1hJM8CrA,0.0,0.0,0.0,,
3,cMe6JGJtAb8,2025-05-12T23:37:16Z,How many MAGA Republicans does it take to chan...,2025-05-12T23:37:16Z,@autodidactic-i2i,UCCbjpEd5vS_zEq4o7AglOLg,0.0,0.0,0.0,,
4,cMe6JGJtAb8,2025-05-12T23:36:13Z,Trump lies and Fox News swears to it. Fox News...,2025-05-12T23:36:13Z,@ctbigdog88,UCKn6jEfF1d9Z8BwcdA9yEwQ,0.0,0.0,0.0,,


In [43]:
comments = get_comments_for_video(video_ids[0], max_comments=5)
for c in comments:
    print(c['author'], "->", c['text'][:100])

@FoxNews -> Read more: https://www.foxnews.com/politics/democrat-hank-johnson-draws-holocaust-comparison-while-b
@rosesargent3098 -> I bet Democrats are on payroll follow the money
@JacobMyers-g8y -> Hillary is on his hat
@tompastian3447 -> You have to be mentally sick to support the democrat party today.
@mikemigueis548 -> THANK GOD FOR THE GREAT TRUMP. NOBODY LIKE TRUMP.
