In [19]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timedelta
import pandas as pd
import time

In [20]:
# Global configuration constants.
# MAX_RESULTS is the default `max_results` of search_videos(). That function runs one search
# per duration class (medium and long), so a single call can return up to 2 * MAX_RESULTS videos.
MAX_RESULTS = 30
# NOTE(review): MAX_COMMENTS is not referenced anywhere in the visible part of this notebook;
# get_all_top_level_comments() keeps paging until the API reports no further page.
# Confirm whether a per-video cap was intended.
MAX_COMMENTS = 1000

**API Key: Lorenzo**

In [21]:
# Read the YouTube Data API key from a local text file; surrounding whitespace
# (including a trailing newline) is trimmed so the key can be passed to build() as-is.
with open("../authentication/YouTube_Data_API_Key_alt1.txt", "r") as key_file:
    API_KEY = key_file.read().strip()

**API Key: Ishwarya**

In [None]:
# Alternative key: replace the placeholder path below with the location of your own key file.
# Running this cell reassigns API_KEY, replacing any key loaded by an earlier cell.
with open("YOUR PATH TO THE API KEY HERE", "r") as key_file:
    API_KEY = key_file.read().strip()

## Collect YouTube Videos

In [4]:
# Column order of the DataFrame returned by search_videos().
_VIDEO_COLUMNS = [
    'video_id',
    'video_title',
    'channel_title',
    'channel_id',
    'video_publish_date',
    'view_count',
    'comment_count',
    'video_description',
    'tags',
    'category_id',
]


def _to_rfc3339(moment):
    """Format a datetime as 'YYYY-MM-DDTHH:MM:SSZ'; None (or any falsy value) becomes None."""
    # The trailing 'Z' only labels the value as UTC, no timezone conversion is performed,
    # so callers are expected to pass datetimes that already represent UTC.
    return moment.strftime('%Y-%m-%dT%H:%M:%SZ') if moment else None


def _video_record(item):
    """Flatten one item of a videos().list response into a flat metadata dict."""
    snippet = item['snippet']
    # 'statistics' or individual counters can be absent from a response item; count them as 0.
    statistics = item.get('statistics', {})
    return {
        'video_id': item['id'],
        'video_title': snippet['title'],
        'channel_title': snippet['channelTitle'],
        'channel_id': snippet['channelId'],
        'video_publish_date': snippet['publishedAt'],
        'view_count': int(statistics.get('viewCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'video_description': snippet['description'],
        'tags': snippet.get('tags', []),
        'category_id': snippet['categoryId'],
    }


def search_videos(query, max_results=MAX_RESULTS, published_after=None, published_before=None):
    """Search YouTube for videos matching `query` and return their metadata.

    Two searches are issued, one for videoDuration="medium" and one for "long", so that
    short-duration videos (which is where YouTube Shorts fall) are never collected.
    Both searches are ordered by view count and use relevanceLanguage="en".

    Args:
        query: Search term passed to the API as `q`.
        max_results: Maximum number of results requested *per duration class*, so the
            returned frame can hold up to 2 * max_results rows.
        published_after: Optional datetime lower bound on the publish date.
        published_before: Optional datetime upper bound on the publish date.

    Returns:
        pandas.DataFrame with one row per video and the columns listed in _VIDEO_COLUMNS
        (medium-length videos first, then long ones). If nothing is found, the frame is
        empty but still has those columns, with integer view/comment counts.

    Raises:
        googleapiclient.errors.HttpError: propagated unchanged from any API request.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    published_after_string = _to_rfc3339(published_after)
    published_before_string = _to_rfc3339(published_before)

    # Step 1: run both searches (medium first, then long) and keep the IDs per duration class.
    video_ids_by_duration = []
    for duration in ("medium", "long"):
        search_response = youtube.search().list(
            q=query,
            part="snippet",
            type="video",
            maxResults=max_results,
            order="viewCount",
            videoDuration=duration,
            relevanceLanguage="en",
            publishedAfter=published_after_string,
            publishedBefore=published_before_string).execute()
        video_ids_by_duration.append(
            [item['id']['videoId'] for item in search_response['items']]
        )

    # Step 2: fetch snippet + statistics for the videos found in each duration class.
    videos_data = []
    for video_ids in video_ids_by_duration:
        if not video_ids:
            # Nothing found for this duration class: skip the request instead of sending id="".
            continue
        video_response = youtube.videos().list(
            part="snippet,statistics",
            id=",".join(video_ids)).execute()
        videos_data.extend(_video_record(item) for item in video_response['items'])

    # Passing the column list keeps the schema stable even when no video was found;
    # the astype() keeps the count columns numeric in that empty case as well.
    videos_df = pd.DataFrame(videos_data, columns=_VIDEO_COLUMNS)
    return videos_df.astype({'view_count': 'int64', 'comment_count': 'int64'})

In [5]:
# # NOTE: Single shot test
# # # specify start and end dates
# year = 2021

# start_date = datetime(year, 1, 1)
# end_date = datetime(year, 12, 31)
# # call the search videos() function
# df_videos = search_videos("artificial intelligence", MAX_RESULTS, start_date, end_date)

# # preliminary filtering steps
# df_videos = (
#     df_videos[df_videos["comment_count"] > 500] # select only videos with more than 500 comments
#     .nlargest(20, "view_count")  # select the top 20 videos with the highest view_count
# )

# # save the DataFrame with the appropriate name
# df_videos.to_csv(f'../data/videos/df_videos_{year}.csv', index=False)

In [6]:
# Collect, filter and save one table of videos per calendar year (2017-2023).
MIN_COMMENT_COUNT = 500  # keep only videos with MORE than this many comments
TOP_N_VIDEOS = 20        # number of most-viewed videos kept per year

for year in range(2017, 2024):
    start_date = datetime(year, 1, 1)
    # Use the last second of the year as upper bound: datetime(year, 12, 31) is midnight at the
    # START of Dec 31 and would drop every video published during that day.
    end_date = datetime(year, 12, 31, 23, 59, 59)

    # call the search_videos() function
    df_videos = search_videos("artificial intelligence", MAX_RESULTS, start_date, end_date)

    # preliminary filtering steps
    df_videos = (
        df_videos[df_videos["comment_count"] > MIN_COMMENT_COUNT]  # enough comments to analyse
        .nlargest(TOP_N_VIDEOS, "view_count")  # the TOP_N_VIDEOS most viewed of those
    )

    # save the DataFrame with the appropriate name
    df_videos.to_csv(f'../data/videos/df_videos_{year}.csv', index=False)

In [112]:
# inspect the data
# NOTE(review): df_videos is reassigned on every iteration of the per-year collection loop,
# so after that loop it only holds the filtered videos of the last year processed (2023).
df_videos.head(20)

Unnamed: 0,video_id,video_title,channel_title,channel_id,video_publish_date,view_count,comment_count,video_description,tags,category_id
0,ICUVyDJPzwY,"LEGO, but AI controls what I BUILD...",TD BRICKS,UCUU3GdGuQshZFRGnxAPBf_w,2023-04-06T16:48:47Z,21686763,4757,I use ARTIFICIAL INTELLIGENCE to design severa...,"[lego, ai, artifical intelligence, lego meme, ...",24
30,Sqa8Zo2XWc4,Artificial Intelligence: Last Week Tonight wit...,LastWeekTonight,UC3XTzVzaHQEd30rQbuvCtTQ,2023-02-27T07:30:08Z,10262955,11186,Artificial intelligence is increasingly becomi...,[],24
31,bk-nQ7HF6k4,EMERGENCY EPISODE: Ex-Google Officer Finally S...,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,2023-06-01T07:00:21Z,9633767,34732,If You Enjoyed This Episode You Must Watch Thi...,"[The Diary Of A CEO, steven bartlett steve bar...",22
32,vJefOB8kec8,The Truth about Artificial Intelligence and Ch...,Dhruv Rathee,UC-CSyyi47VX1lD9zyeABW3w,2023-07-16T15:05:29Z,9600532,28000,🤖 Join my AI Course: https://academy.dhruvrath...,"[Dhruv Rathee, Dhruv, Rathee, indian youtuber,...",27
1,uiUPD-z9DTg,The Next Global Superpower Isn't Who You Think...,TED,UCAuUUnT6oDeKwE6v1NGQxug,2023-06-14T16:20:35Z,9209035,13448,Who runs the world? Political scientist Ian Br...,"[TEDTalk, TEDTalks, TED Talk, TED Talks, TED, ...",25
2,jPhJbKBuNnA,I tried using AI. It scared me.,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,2023-02-13T16:00:23Z,7379887,14322,I just wanted to fix my email. ■ AD: 👨‍💻 NordV...,[],27
33,zaB_20bkoA4,Elon Musk's BRUTALLY Honest Interview With Tuc...,Visionary,UC_8gQrYCIf9aG_9N4_T5fzg,2023-07-06T05:51:15Z,6180406,11401,Elon Musk's BRUTALLY Honest Interview With Tuc...,"[Smart Sense, Elon Musk, elon musk carlson tuc...",24
34,aZ5EsdnpLMI,Artificial Intelligence | 60 Minutes Full Epis...,60 Minutes,UCsN32BtMd0IoByjJRNF12cw,2023-12-30T13:00:02Z,5792902,6652,"From January 2019, Scott Pelley's interview wi...","[60 minutes, cbs news, artificial intelligence...",25
3,DCu9xawHJaw,Meet The 6'10 Ai Robot NBA Players Fear..,REBOUND,UCVepIJedg6BMQeSgjPY4e6A,2023-06-20T19:58:24Z,5232601,1345,"If you’re ever injured in an accident, you can...","[nba, rebound central, rebound]",17
4,2yd18z6iSyk,"Joe Rogan: ""I Wasn't Afraid of AI Until I Lear...",JRE Daily Clips,UCavUx5z-IziOIxbsK_Ah7ng,2023-12-19T22:46:30Z,5190605,8147,FREE Alpha Brain Trial ► https://onnit.sjv.io/...,"[JRE, UFC, Joe Rogan, Joe Rogan Experience, JR...",23


### Cleaning the Videos DataFrames

In [60]:
# ## NOTE: Probably not yet necessary, we can also do this at a later step.
# # Extracting year, month, and day from the datetime column
# # df_videos_cleaned['year'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.year
# # df_videos_cleaned['month'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.month
# # df_videos_cleaned['day'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.day
# df_videos_cleaned

## Collect YouTube Comments

In [22]:
# Run this to reset the notebook-level `comments` list to an empty list.
# The per-video collection loops further down also assign `comments = []` before each call.
comments = []

In [35]:
def get_all_top_level_comments(video_id, video_title, video_publish_date, video_category_id, comments=None, next_page_token=None):
    """Download the top-level comments of one video and return them as a DataFrame.

    Pages through commentThreads().list (100 threads per page, ordered by relevance)
    until the API stops returning a nextPageToken.

    Args:
        video_id: ID of the video whose comments are fetched.
        video_title, video_publish_date, video_category_id: Video metadata copied
            unchanged into every comment row.
        comments: Optional list to append the comment dicts to (it is mutated in place).
            A new list is created for each call when omitted.
        next_page_token: Optional page token to start from; None starts at the first page.

    Returns:
        pandas.DataFrame with one row per comment and the columns video_id, video_title,
        video_publish_date, video_category_id, comment_text, comment_id and
        comment_publish_date. If nothing was collected the frame is empty.

    Raises:
        googleapiclient.errors.HttpError: for any HTTP status other than 400 or 404.
            On 400/404 a message is printed and the comments collected so far are returned.
    """
    # A None default (instead of a shared `[]`) gives every call its own list, so comments
    # from one video can never leak into the result of the next call.
    if comments is None:
        comments = []

    # Build the API client once and reuse it for every page.
    youtube = build('youtube', 'v3', developerKey=API_KEY)
    page_token = next_page_token

    try:
        # Iterate over the pages instead of recursing once per page, so videos with very
        # many comment pages cannot hit Python's recursion limit.
        while True:
            response = youtube.commentThreads().list(
                part="id,snippet,replies",
                videoId=video_id,
                maxResults=100,
                order="relevance",
                pageToken=page_token).execute()

            for item in response["items"]:
                comment = item['snippet']['topLevelComment']
                comments.append({
                    "video_id": video_id,
                    "video_title": video_title,
                    "video_publish_date": video_publish_date,
                    "video_category_id": video_category_id,
                    "comment_text": comment['snippet']['textDisplay'],
                    "comment_id": item['id'],
                    "comment_publish_date": comment["snippet"]["publishedAt"]
                    })

            if "nextPageToken" not in response:
                break
            page_token = response["nextPageToken"]

    except HttpError as error:
        if error.resp.status == 400:
            print(f"HTTP 400 error for video ID {video_id}, with name '{video_title}'. Skipping this page.")
        elif error.resp.status == 404:
            print(f"HTTP 404 error for video ID {video_id}, with name '{video_title}', date {video_publish_date}, and category id {video_category_id}. Skipping this page.")
        else:
            raise  # Anything other than 400/404 is unexpected: let the caller see it

    # Reached after the last page, or after a 400/404: return what has been collected so far.
    return pd.DataFrame(comments)

In [48]:
# Load the cleaned per-year video tables into a list (one DataFrame per year, in year order).

# Years for which cleaned video CSV files exist
years = range(2017, 2023 + 1)

# One DataFrame per year, read from the matching cleaned CSV file
video_dataframes = [
    pd.read_csv(f"../data/videos/cleaned/df_videos_{year}_cleaned.csv")
    for year in years
]

# Optional check: print the video IDs that were loaded for each year
for year, year_videos in zip(years, video_dataframes):
    print(f"DataFrame for year {year}:")
    print("Video IDs")
    print(year_videos['video_id'].tolist())  # all values in the 'video_id' column


DataFrame for year 2017:
Video IDs
['Bg_tJvCA8zw', 'aircAruvnKk', 'WSKi8HfcxEk', 'R9OHn5ZF4Uo', 'S5t6K9iwcdw', 'BrNs0M77Pd4', 'TRzBk_KuIaM', '9kiEK4LrCgQ', 'TlO2gcs1YvM', 'xs_HhZrCBdg']
DataFrame for year 2018:
Video IDs
['Ml9v3wHLuWI', '-cN8sJz50Ng', '1y3XdwTa1cA', 'Ra3fv8gl6NE', 'Pls_q2aQzHg', '-JlxuQ7tPgQ', 'Ktg8E7i4nzw', 'YNLC0wJSHxI', 'gb4SshJ5WOY', '6tBZA2rygcM']
DataFrame for year 2019:
Video IDs
['UwsrzCVZAb8', 'NR32ULxbjYc', '5dZ_lvDgevk', 's0dMTAQM4cw', 'f3lUEnMaiAU', 'JMLsHI8aV0g', 'smK9dgdTl40', 'O5xeyoRL95U', 'Izd2qOgOGQI', '4svUKPeDa5A']
DataFrame for year 2020:
Video IDs
['WXuK6gekU1Y', 'Jky9I1ihAkg', 'XSgfE2vg-Kk', 'WATLfjRHySU', 'tPYj3fFJGjk', '-ePZ7OdY-Dw', 'R69JYEfCSeI', 'R3YFxF0n8n8', '60KJz1BVTyU', '-g0xOJYPjkQ']
DataFrame for year 2021:
Video IDs
['9jkRcrM6XKA', '5q87K1WaoFI', '63yr9dlI0cU', 'fmJ74774RO8', 'aManoLQAHQU', '0yCJMt9Mx9c', 'b8IYaY4aOV0', '5B3Wn6Wo5CU', 'z498dvAYyu0', 'rA5k2S8xPK8']
DataFrame for year 2022:
Video IDs
['GVsUOuSjvcg', 'J6Mdq3n6kgk', 'b2b

In [51]:
# Start with an empty frame that already has the comment columns; the per-video
# comment frames are appended to it by the collection cells.
comment_columns = [
    'video_id',
    'video_title',
    'video_publish_date',
    'video_category_id',
    'comment_text',
    'comment_id',
    'comment_publish_date',
]
df_all_comments = pd.DataFrame(columns=comment_columns)

In [52]:
# Split the list of per-year video tables into two batches so the comment download can be
# run in two separate cells ("for api reasons" in the original note, presumably quota limits).
half_point = len(video_dataframes) // 2
first_half = video_dataframes[:half_point]
second_half = video_dataframes[half_point:]

# Collect one comments frame per video and concatenate ONCE at the end: calling pd.concat
# inside the loop re-copies every previously collected row for each video.
collected_frames = [df_all_comments]
try:
    for dataframe in first_half:
        for _, row in dataframe.iterrows():
            collected_frames.append(
                get_all_top_level_comments(
                    row['video_id'],
                    row['video_title'],
                    row['video_publish_date'],
                    row['category_id'],
                    [],  # fresh list per video so comments never carry over between videos
                )
            )
finally:
    # Runs even if an API call fails part-way, so comments downloaded so far are kept.
    df_all_comments = pd.concat(collected_frames, ignore_index=True)

In [54]:
# Download the comments for the second batch of video tables and add them to df_all_comments.
# Frames are collected in a list and concatenated ONCE at the end: calling pd.concat inside
# the loop re-copies every previously collected row for each video.
collected_frames = [df_all_comments]
try:
    for dataframe in second_half:
        for _, row in dataframe.iterrows():
            collected_frames.append(
                get_all_top_level_comments(
                    row['video_id'],
                    row['video_title'],
                    row['video_publish_date'],
                    row['category_id'],
                    [],  # fresh list per video so comments never carry over between videos
                )
            )
finally:
    # Runs even if an API call fails part-way, so comments downloaded so far are kept.
    df_all_comments = pd.concat(collected_frames, ignore_index=True)

In [59]:
# Summarise the collected comments (columns, non-null counts, dtypes).
# NOTE(review): the head(50) result is never shown, because a cell only renders its LAST
# expression and info() follows it. Move head(50) to its own cell if a preview is wanted.
df_all_comments.head(50)
df_all_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67904 entries, 0 to 67903
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   video_id              67904 non-null  object
 1   video_title           67904 non-null  object
 2   video_publish_date    67904 non-null  object
 3   video_category_id     67904 non-null  object
 4   comment_text          67904 non-null  object
 5   comment_id            67904 non-null  object
 6   comment_publish_date  67904 non-null  object
dtypes: object(7)
memory usage: 3.6+ MB


In [61]:
# Persist all collected comments to disk as a pickle file.
df_all_comments.to_pickle("../data/comments/all_comments.pkl")

In [62]:
# Round-trip check: reload the pickle that was just written and print its summary.
# NOTE: only unpickle files you created yourself; loading a pickle can execute arbitrary code.
test = pd.read_pickle("../data/comments/all_comments.pkl")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67904 entries, 0 to 67903
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   video_id              67904 non-null  object
 1   video_title           67904 non-null  object
 2   video_publish_date    67904 non-null  object
 3   video_category_id     67904 non-null  object
 4   comment_text          67904 non-null  object
 5   comment_id            67904 non-null  object
 6   comment_publish_date  67904 non-null  object
dtypes: object(7)
memory usage: 3.6+ MB


### Alternative: Collect ALL YouTube comments (including replies)

In [31]:
# def get_all_top_level_comment_replies(video_id, video_publish_date, video_title, top_level_comment_id, replies, page_token):
#     youtube = build('youtube', 'v3', developerKey=API_KEY)

#     response = youtube.comments().list(
#         part="snippet",
#         parentId=top_level_comment_id,
#         maxResults=100,
#         pageToken=page_token).execute()
    
#     for item in response["items"]:
#         replies.append({
#             "video_id": video_id,
#             "video_title": video_title,
#             "video_publish_date": video_publish_date,
#             "text": item["snippet"]["textDisplay"],
#             "comment_published_at": item["snippet"]["publishedAt"],
#             "parent_comment_id": top_level_comment_id
#             })
    
#     if "nextPageToken" in replies:
#         return get_all_top_level_comment_replies(top_level_comment_id, replies, response["nextPageToken"])
#     else:
#         return replies

In [16]:
# def get_all_top_level_comments(video_id, video_title, video_publish_date, video_category_id, comments=[], next_page_token=None):
#     youtube = build('youtube', 'v3', developerKey=API_KEY)
    
#     response = youtube.commentThreads().list(
#         part="id,snippet,replies",
#         videoId=video_id,
#         maxResults=100,
#         order="relevance",
#         pageToken=next_page_token).execute()

#     # Stores the total reply count a top-level comment has.
#     #total_reply_count = 0
    
#     #replies = []

#     for item in response["items"]:
#         comment = item['snippet']['topLevelComment']
#         comment_text = comment['snippet']['textDisplay']

#         comments.append({
#             "video_id": video_id,
#             "video_title": video_title,
#             "video_publish_date": video_publish_date,
#             "video_category_id": video_category_id,
#             "comment_text": comment_text,
#             "comment_id": item['id'],
#             "comment_publish_date": comment["snippet"]["publishedAt"]
#             })

#         # get the total reply count
#         # total_reply_count = item['snippet']['totalReplyCount']

#         # if (total_reply_count > 0): 
#         #     replies = []
#         #     replies.extend(get_all_top_level_comment_replies(video_id, video_publish_date, video_title, comment['id'], [], None))
#         #     comments.extend(replies)
    
#     if "nextPageToken" in response:
#         return get_all_comments(video_id, video_title, video_publish_date, video_category_id, comments, response["nextPageToken"])
#     else:
#         return pd.DataFrame(comments)

In [17]:
# id = "mJeNghZXtMo"
# publish_date_str = "2017-01-30T20:19:44Z"
# title = "What is Artificial Intelligence"

# publish_date = datetime.strptime(publish_date_str, "%Y-%m-%dT%H:%M:%SZ")

# all_comments_df = get_all_top_level_comments(id, title, publish_date, 28)
# all_comments_df.head(50)