In [15]:
from googleapiclient.discovery import build
from datetime import datetime, timedelta
import pandas as pd
import random

In [9]:
# global variables
# MAX_RESULTS is the default `max_results` of search_videos(); that function passes it
# to each of its two search requests separately.
MAX_RESULTS = 30
# MAX_COMMENTS is the default `max_comments` cap of the paginated get_video_comments().
MAX_COMMENTS = 1000

**API Key: Lorenzo**

In [3]:
# read the YouTube Data API key from its text file, discarding surrounding whitespace
with open("../authentication/YouTube_Data_API_Key.txt") as key_file:
    API_KEY = key_file.read().strip()

**API Key: Ishwarya**

In [None]:
# read the YouTube Data API key from a user-supplied text file
# NOTE(review): the path below is a placeholder and must be replaced before running;
# executing this cell rebinds API_KEY.
with open("YOUR PATH TO THE API KEY HERE") as key_file:
    API_KEY = key_file.read().strip()

In [4]:
# Search for videos matching a query. `published_after` and `published_before`, when given, must be datetime objects.
def _to_rfc3339_utc(moment):
    """Format a datetime as a 'YYYY-MM-DDTHH:MM:SSZ' string, or return None for None."""
    if moment is None:
        return None
    # The trailing 'Z' means UTC, so timezone-aware values are converted first.
    # Naive datetimes are formatted unchanged (i.e. they are taken to be UTC already).
    if moment.tzinfo is not None and moment.utcoffset() is not None:
        from datetime import timezone
        moment = moment.astimezone(timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%SZ')


def _search_video_ids(youtube, query, duration, max_results, published_after, published_before):
    """Run one search request restricted to `duration` and return the matching video IDs."""
    response = youtube.search().list(
        q=query,
        part="snippet",
        type="video",
        maxResults=max_results,
        order="viewCount",
        videoDuration=duration,
        relevanceLanguage="en",
        publishedAfter=published_after,
        publishedBefore=published_before,
    ).execute()
    return [item['id']['videoId'] for item in response.get('items', [])]


def _fetch_video_records(youtube, video_ids):
    """Look up snippet and statistics for `video_ids` and return one metadata dict per video."""
    # An empty search result must not be turned into a videos().list call with an empty `id`.
    if not video_ids:
        return []

    response = youtube.videos().list(
        part="snippet,statistics",
        id=",".join(video_ids),
    ).execute()

    records = []
    for item in response.get('items', []):
        snippet = item['snippet']
        statistics = item.get('statistics', {})
        records.append({
            'title': snippet['title'],
            'publish_date': snippet['publishedAt'],
            'description': snippet['description'],
            'video_id': item['id'],
            # counts that are absent from the response are recorded as 0
            'view_count': int(statistics.get('viewCount', 0)),
            'comment_count': int(statistics.get('commentCount', 0)),
        })
    return records


def search_videos(query, max_results=MAX_RESULTS, published_after=None, published_before=None):
    """Search YouTube for medium and long videos matching `query`, ordered by view count.

    Two separate searches are issued, one with videoDuration="medium" and one with
    videoDuration="long", so that short videos (such as YouTube Shorts) are never collected.
    The metadata of the medium-length results comes first in the returned frame, followed
    by the long results.

    Parameters
    ----------
    query : str
        Search term.
    max_results : int
        Maximum number of results requested from EACH of the two searches, so up to
        2 * max_results rows can be returned.
    published_after, published_before : datetime or None
        Optional bounds on the publication time. Naive datetimes are sent as-is with a
        'Z' suffix; timezone-aware datetimes are converted to UTC first.

    Returns
    -------
    pandas.DataFrame
        Columns: title, publish_date, description, video_id, view_count, comment_count.
        The columns are present even when nothing was found.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    published_after_string = _to_rfc3339_utc(published_after)
    published_before_string = _to_rfc3339_utc(published_before)

    # run both searches first, then both metadata lookups (same request order as before)
    ids_per_duration = [
        _search_video_ids(youtube, query, duration, max_results,
                          published_after_string, published_before_string)
        for duration in ("medium", "long")
    ]

    videos_data = []
    for video_ids in ids_per_duration:
        videos_data.extend(_fetch_video_records(youtube, video_ids))

    columns = ['title', 'publish_date', 'description', 'video_id', 'view_count', 'comment_count']
    return pd.DataFrame(videos_data, columns=columns)

In [5]:
# search window: from 1 January 2017 to 31 December 2017, both at midnight
# NOTE(review): end_date is 2017-12-31 00:00:00, so videos published later on
# 31 December are probably excluded — confirm this is intended.
start_date, end_date = datetime(2017, 1, 1), datetime(2017, 12, 31)
# run the search, passing the limit and the date bounds by keyword
videos_df = search_videos(
    "artificial intelligence",
    max_results=MAX_RESULTS,
    published_after=start_date,
    published_before=end_date,
)

In [7]:
# inspect the data: display at most the first 20 rows of the search results
videos_df.head(20)

Unnamed: 0,title,publish_date,description,video_id,view_count,comment_count
0,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26T03:57:12Z,Jimmy Fallon demos amazing new robots from all...,Bg_tJvCA8zw,34704805,23103
1,"But what is a neural network? | Chapter 1, Dee...",2017-10-05T15:11:25Z,"What are the neurons, why are there layers, an...",aircAruvnKk,16266953,7257
2,The Rise of the Machines – Why Automation is D...,2017-06-08T16:28:31Z,Automation in the Information Age is different...,WSKi8HfcxEk,14468899,30410
3,"How AIs, like ChatGPT, Learn",2017-12-18T14:39:33Z,"How do all the algorithms, like ChatGPT, aroun...",R9OHn5ZF4Uo,9914313,22915
4,Interview With The Lifelike Hot Robot Named So...,2017-10-25T16:32:19Z,"CNBC's Andrew Ross Sorkin interviews Sophia, a...",S5t6K9iwcdw,9891047,7668
5,"David, A.I - Prometheus & Covenant",2017-10-23T15:00:20Z,"David, Artificial Intelligence from Prometheus...",4Z-QCDyL2q4,6408321,4359
6,Artificial Intelligence: it will kill us | Jay...,2017-01-31T18:36:05Z,"For more information on Jay Tuck, please visit...",BrNs0M77Pd4,5271588,12443
7,The Real Reason to be Afraid of Artificial Int...,2017-12-15T15:55:54Z,"A robotics researcher afraid of robots, Peter ...",TRzBk_KuIaM,2428681,3951
8,MICRO DRONES KILLER ARMS ROBOTS - AUTONOMOUS A...,2017-11-17T05:44:03Z,"Killer drone arms, articial intelligence an in...",TlO2gcs1YvM,2112142,2376
9,With artificial Intelligence we're summoning t...,2017-08-31T19:39:08Z,"""Artificial intelligence is the future, not on...",xs_HhZrCBdg,1880121,4256


In [6]:
def get_video_comments(video_id, max_results=20):
    """Return the text of the top-level comments on the first result page of a video.

    A single commentThreads request is made for `video_id`, asking for at most
    `max_results` threads; no further pages are fetched. The result is a list with
    the `textDisplay` value of each thread's top-level comment, in response order.
    """
    client = build('youtube', 'v3', developerKey=API_KEY)

    # one request only: ask the API for the first page of comment threads
    request = client.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=max_results,
    )
    payload = request.execute()

    return [
        thread['snippet']['topLevelComment']['snippet']['textDisplay']
        for thread in payload['items']
    ]

In [7]:
# TEST: fetch comments for one specific video, using the function's default limit
comments = get_video_comments("Bg_tJvCA8zw")

# print every comment prefixed with a counter that starts at 1
for number, text in enumerate(comments, 1):
    print(f"comment {number}: {text}")

comment 1: I knew Jennifer Lawrence wasn&#39;t a real person
comment 2: Robot + feeling = love
comment 3: Beautiful
comment 4: Tu chizz badi hain mast 770 k dollers right
comment 5: Bluetooth
comment 6: DJs ❤
comment 7: she is the one that had remarks about the human race and if it would make it, she said ,no.
comment 8: <a href="https://www.youtube.com/watch?v=Bg_tJvCA8zw&amp;t=304">5:04</a> yep we’re doomed 💀
comment 9: 😮
comment 10: The interaction with Sophia is creepy and unsettling but I don’t think the robot noticed.
comment 11: Jimmy Fallon always looks, talks and gesters like a perv.
comment 12: Jimmy is so lovely 🤩
comment 13: Pourquoi n&#39;a-t-elle pas de cheveux ?
comment 14: Avengers: Age of Sophia
comment 15: “It’s my great start to dominate human race” jit ain’t jokin’
comment 16: Without a natural female human - society will never continue except via robot technology! That is truthful not fabricated deceit. God the true heavenly Good God our Heavenly Alnighty Lord God 

In [27]:
def get_video_comments(video_id, max_comments=MAX_COMMENTS, total_comment_count=0, text_format=None):
    """Fetch up to `max_comments` top-level comments of a video, following pagination.

    Pages of at most 100 comment threads are requested one after another. The
    `nextPageToken` of each response is passed to the next request, so every page is
    fetched exactly once, until `max_comments` comments are collected or no page is left.

    Parameters
    ----------
    video_id : str
        ID of the video whose comment threads are listed.
    max_comments : int
        Upper bound on the number of comments returned.
    total_comment_count : int
        Known comment count of the video. It is only used to print how many pages of
        100 comments that count corresponds to; it does not limit the download.
    text_format : str or None
        Optional value for the API's `textFormat` parameter (e.g. "plainText").
        When None, the parameter is omitted and the API default applies.

    Returns
    -------
    pandas.DataFrame
        One row per comment with the columns `comment` (the `textDisplay` of the
        top-level comment) and `reply_count` (the thread's `totalReplyCount`).
        The columns are present even when no comment was collected.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # informational only: number of 100-comment pages needed for total_comment_count
    total_pages = (total_comment_count + 99) // 100
    print(f"Total Pages: {total_pages}")

    comments = []
    page_token = None  # None requests the first page
    while len(comments) < max_comments:
        request_params = {
            'part': 'snippet',
            'videoId': video_id,
            # never ask for more than is still missing, nor more than 100 per page
            'maxResults': min(100, max_comments - len(comments)),
        }
        if page_token:
            request_params['pageToken'] = page_token
        if text_format is not None:
            request_params['textFormat'] = text_format

        response = youtube.commentThreads().list(**request_params).execute()

        for item in response.get('items', []):
            comments.append({
                'comment': item['snippet']['topLevelComment']['snippet']['textDisplay'],
                'reply_count': item['snippet']['totalReplyCount']
            })

        # carry the token into the next iteration; stop when there is no further page
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(comments, columns=['comment', 'reply_count'])

In [30]:
# fetch comments for one video; total_comment_count is the comment_count value
# shown for this video_id in the search results
video_comments_df = get_video_comments(
    video_id="R9OHn5ZF4Uo",
    total_comment_count=22915,
)

Total Pages: 230


In [32]:
# inspect the data: display at most the first 20 rows of the fetched comments
video_comments_df.head(20)

Unnamed: 0,comment,reply_count
0,Bot food,0
1,"Last time I watched it, I saw another tech-loo...",0
2,Best Youtuber,0
3,Watching this clip is like listening to neuron...,0
4,"comment, cause the algorithm is *watching*.",0
5,This is definitely my favorate Grey video. It&...,0
6,This video came out when I was 13 and I rememb...,0
7,Thanks!,0
8,😂,0
9,Thanks,0
