In [1]:
from googleapiclient.discovery import build
from IPython.display import JSON
from pprint import pprint
import pandas as pd
import json

import os
os.chdir(os.path.expanduser("../"))

from dees_package.youtube_functions import *

print("Current working directory:", os.getcwd())

Current working directory: /Users/hanbinfeng/Desktop/LSE_Data_Science/ds105a-project-dees-nuts


## Functions

In [2]:
def youtube_search(any_youtube, max_results: int, query: str, searchtype: str, region: str, category: int):
    search_data = []
    video_ids = []

    next_page_token = None

    while True :
        youtube_search_request = any_youtube.search().list(
            part="snippet",
            maxResults=min(50, max_results),  # Maximum allowed value is 50
            q=query,
            type=searchtype,
            regionCode = region,
            videoCategoryId=category,
            order="viewCount",
            fields="items(id/videoId,snippet(channelId,channelTitle,description,title)),nextPageToken,pageInfo,prevPageToken,regionCode",
            pageToken=next_page_token
        )

        # Execute the request and get the response
        youtube_search_response = youtube_search_request.execute()
        print(youtube_search_response['pageInfo']['totalResults'])

        # iterate through each element in the nested dictionary to get the relevant values of each video
        for item in youtube_search_response.get('items', []):
            video_id = item['id']['videoId']
            title = item['snippet']['title']
            channel_id = item['snippet']['channelId']
            channel_title = item['snippet']['channelTitle']

            # append the relevant values to the data dictionary to save as a dataframe
            search_data.append({
                'video_id': video_id,
                'title': title,
                'channel_id': channel_id,
                'channel_title': channel_title,
            })

            video_ids.append(video_id)

        # Check if there are more pages
        next_page_token = youtube_search_response.get('nextPageToken')
        if not next_page_token or len(video_ids) >= max_results:
            break  # No more pages or reached the desired number of results

    # Return the collected data and video IDs
    return search_data, video_ids


def get_stats(any_youtube, videoId:list):

    # video_ids_str = ','.join(videoId) # pre-2024 code, video ID as comma separated strings; but apparently it's ok to just use a list now
    video_data= []


    # create the request object
    # from the above response, we already have the channelId, channelTitle, videoID, categoryID, 
    chunk_size = 50
    for i in range(0, 600, chunk_size):
        current_chunk = videoId[i:i+chunk_size-1]
        video_request = any_youtube.videos().list(
        part="statistics, id, topicDetails, contentDetails",
        id=",".join(current_chunk))
        print(video_request)
        print(i,i+chunk_size-1)
        video_response = video_request.execute()

        print(video_response['items'])

    # iterate through each element in the nested dictionary to get the relevant values
    for item in video_response['items']:
        view_count = item['statistics']['viewCount']
        comment_count = item['statistics']['commentCount']
        wikipedia_category = item['topicDetails']['topicCategories']
        duration = item['contentDetails']['duration']


        # append the relevant values to the data dictionary to save as a dataframe
        video_data.append ({
        'video_id': item['id'],
        'view_count': view_count,
        'comment_count': comment_count,
        'wikipedia_categories': wikipedia_category,
        'duration': duration
        })
        
    return video_data

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for i in range (0,len(video_ids),1):
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_ids[i]
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id':video_ids[i], 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)

        except Exception as e:
            print(f'Could not get comments for video {i}. Error: {e}')
        continue # Skip to the next iteration in case of an error
        
    return pd.DataFrame(all_comments)  

In [3]:
credentials_file_path = "./credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

In [4]:
# creating service object of the youtube version 3 API
service_youtube = build('youtube', 'v3', developerKey=credentials['youtube_api_key'])

### Getting list of music videos, carried out using .search().list() methods

In [5]:
youtube_search_data, video_id = youtube_search(service_youtube, 2000, "official music video", "video", "US", 10)

with open('./data/yt_search_data.json', 'w') as json_file:
    json.dump(youtube_search_data, json_file, indent=4)

yt_search_df = pd.DataFrame(youtube_search_data)
yt_search_df.to_csv('./data/search.csv')


1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000


### Getting statistics on each video, using video IDs from previous function as an input, carried out using .videos().list() methods

In [6]:
video_stats = get_stats(service_youtube, video_id)  # there is a limit on the number of video ids, can only run 50 at a time. Solution: create different lists with 50 IDs each.
video_stats_df = pd.DataFrame(video_stats)

<googleapiclient.http.HttpRequest object at 0x141a1aed0>
0 49
[{'kind': 'youtube#video', 'etag': 'bTKqWH5G1rmXXucm3ToW5p_4OE4', 'id': 'JGwWNGJdvx8', 'contentDetails': {'duration': 'PT4M24S', 'dimension': '2d', 'definition': 'hd', 'caption': 'false', 'licensedContent': True, 'contentRating': {}, 'projection': 'rectangular'}, 'statistics': {'viewCount': '6186385961', 'likeCount': '32532424', 'favoriteCount': '0', 'commentCount': '1150111'}, 'topicDetails': {'topicCategories': ['https://en.wikipedia.org/wiki/Music', 'https://en.wikipedia.org/wiki/Pop_music']}}, {'kind': 'youtube#video', 'etag': '4haTN2-cIsQeXOS1kI2zKAUrAUY', 'id': '09R8_2nJtjg', 'contentDetails': {'duration': 'PT5M2S', 'dimension': '2d', 'definition': 'hd', 'caption': 'true', 'licensedContent': True, 'contentRating': {}, 'projection': 'rectangular'}, 'statistics': {'viewCount': '3996173625', 'likeCount': '15981343', 'favoriteCount': '0', 'commentCount': '420729'}, 'topicDetails': {'topicCategories': ['https://en.wikipedia

### Merging dataframes


In [7]:
# merge the mv stats and search dataframes
merged_df = pd.merge(yt_search_df, video_stats_df, left_on='video_id', right_on='video_id')
merged_df.to_json('./data/merged.json')

merged_df.to_csv('./data/merged.csv')

### Getting comments, carried out using .commentThreads().list() methods

In [8]:
comments_df = get_comments_in_videos(service_youtube, video_id) # note that comments are disabled for some videos
comments_df.head()

Could not get comments for video 186. Error: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=ri5_fzndMBg&key=AIzaSyBMc9VT62N37Ek274blRYgSHUl2E7p8SSo&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
Could not get comments for video 193. Error: <HttpError 400 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=uEJuoEs1UxY&key=AIzaSyBMc9VT62N37Ek274blRYgSHUl2E7p8SSo&alt=json returned "The API server failed to successfully process the request. While this can be a transient err

Unnamed: 0,video_id,comments
0,JGwWNGJdvx8,[New song https://youtu.be/rvfZfCtPPqk?si=6R_V...
1,09R8_2nJtjg,"[Me\n😊, Me I see in 2024, Who's watching this ..."
2,pRpeEdMmmQ0,"[❤😂🎉😢😮😮😅😊❤😂🎉😢😢😮😮😅😊😊❤😂🎉😢😮😮😅❤🎉😂😢😮😮😅😅😊, ❤😂🎉😢😮😮😅😊❤..."
3,lp-EO5I60KA,[Y did this get so many views? It's not even c...
4,2Vv-BfVoq4g,"[Follow me for more 😍 i am here 2024 👍, Ed She..."


### Final merge of dataframes


In [9]:
final_youtube_df = pd.merge(merged_df, comments_df, left_on='video_id', right_on='video_id', sort = False)
final_youtube_df.to_csv('./data/final_youtube.csv')
final_youtube_df.head(100)

Unnamed: 0,video_id,title,channel_id,channel_title,view_count,comment_count,wikipedia_categories,duration,comments
0,vVLl0xI_qD0,Saweetie - IMMORTAL FREESTYLE (Official Music ...,UC2FCUwThC0RAzaQoI0TtpLA,Official Saweetie,242394,1808,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT2M11S,[Appreciate all the love…but what’s your favor...
1,vVLl0xI_qD0,Saweetie - IMMORTAL FREESTYLE (Official Music ...,UC2FCUwThC0RAzaQoI0TtpLA,Official Saweetie,242394,1808,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT2M11S,[Appreciate all the love…but what’s your favor...
2,vVLl0xI_qD0,Saweetie - IMMORTAL FREESTYLE (Official Music ...,UC2FCUwThC0RAzaQoI0TtpLA,Official Saweetie,242394,1808,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT2M11S,[Appreciate all the love…but what’s your favor...
3,vVLl0xI_qD0,Saweetie - IMMORTAL FREESTYLE (Official Music ...,UC2FCUwThC0RAzaQoI0TtpLA,Official Saweetie,242394,1808,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT2M11S,[Appreciate all the love…but what’s your favor...
4,SSLPIT5Hi5w,Aidonia - Bottles | Official Music Video,UCG2QFlHjY3yAPLXbzH3_vSQ,AidoniaVEVO,239280,1646,[https://en.wikipedia.org/wiki/Electronic_musi...,PT2M29S,"[Aidonia on street vybz in 2024 is crazy lol, ..."
5,1g_fHPyC974,Punsang Appun // Teaser Brishtisikha // Banmal...,UC_I_VcN-MM9TtMXepCYUqYg,STUDIO OBONORI,232401,64,"[https://en.wikipedia.org/wiki/Music, https://...",PT17S,"[❤❤❤❤❤❤❤❤❤, 😂😂😂😂, Nice, Mojaaaaa lagise, Hii, ..."
6,l0_OFC10hTE,Guns N&#39; Roses - The General (Official Musi...,UCIak6JLVOjqhStxrL1Lcytw,Guns N' Roses,227050,1860,"[https://en.wikipedia.org/wiki/Music, https://...",PT4M24S,"[No se ustedes pero a mi me gustó, What si thi..."
7,AnnRwkxi4mg,Joseph Attieh - Hadder Halak [Official Music V...,UCqEQp8OWk2p2fWBZjyscZUA,Joseph Attieh,226096,366,[https://en.wikipedia.org/wiki/Music],PT4M17S,"[😍😍😍😍😍, حلوه الاغنيه كتير بس ماعرفنا باي لهجة ..."
8,ysftXz2INGs,Soulja - Ronaldo (Official Music Video) | سولج...,UCU-3aVH2u__JEZFCXShiRTQ,SOULJA - سولجا,237983,900,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT2M7S,"[وحش ‼️⚔️🦅📍❤, من متابعينك ❤❤❤❤ومتمنى أكون الأو..."
9,Upi2cBoGwnM,শিকারী - Itz Kabbo (Official Music Video) 🔥 F...,UCE-8FgcXvnAsb2EG1FizSyg,Itz Kabbo,210916,4728,"[https://en.wikipedia.org/wiki/Hip_hop_music, ...",PT3M33S,[কি এইবার পছন্দ হওয়ার মতো গান হয়েছে তোহ ?? ভাঙ...
