In [88]:
from googleapiclient.discovery import build
from datetime import datetime, timedelta
import pandas as pd
import time

In [7]:
# Notebook-wide limits, used as default argument values by the functions defined below.
MAX_RESULTS = 30      # default `max_results` of search_videos (passed to each search request)
MAX_COMMENTS = 1_000  # default `max_comments` of the paged get_video_comments

**API Key: Lorenzo**

In [8]:
# Read the YouTube Data API key from a text file; surrounding whitespace
# (including a trailing newline) is stripped before the key is stored in API_KEY.
with open("../authentication/YouTube_Data_API_Key_alt1.txt", "r") as file:
    API_KEY = file.read().strip() 

**API Key: Ishwarya**

In [None]:
# Alternative key: replace the placeholder string below with the path to your own key file.
# NOTE: this cell assigns API_KEY as well, so running it overwrites the key loaded by the
# previous cell; as written, open() is called with the placeholder string itself.
with open("YOUR PATH TO THE API KEY HERE", "r") as file:
    API_KEY = file.read().strip() 

In [63]:
def _video_record(item):
    """Flatten one item of a ``videos().list`` response into a metadata dictionary.

    Optional fields are read defensively: missing ``tags`` / ``topicCategories`` become
    an empty list, missing ``viewCount`` / ``commentCount`` become ``0``.
    """
    snippet = item['snippet']
    statistics = item.get('statistics', {})
    # an item may come back without a "topicDetails" part; indexing it directly raised KeyError
    topic_details = item.get('topicDetails', {})
    return {
        'title': snippet['title'],
        'publish_date': snippet['publishedAt'],
        'description': snippet['description'],
        'tags': snippet.get('tags', []),
        'topic_categories': topic_details.get('topicCategories', []),
        'video_id': item['id'],
        'view_count': int(statistics.get('viewCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
    }


def search_videos(query, max_results=MAX_RESULTS, published_after=None, published_before=None):
    """Search YouTube for videos matching ``query`` and return their metadata.

    Two searches are run, one for medium-length and one for long videos, so that
    YouTube Shorts (which we are not interested in) are not collected. Each search
    is ordered by view count and uses ``relevanceLanguage="en"``.

    Parameters
    ----------
    query : str
        Search term.
    max_results : int
        ``maxResults`` passed to EACH of the two searches, so the returned frame
        can hold up to ``2 * max_results`` rows.
    published_after, published_before : datetime or None
        Optional bounds of the publication window. They are formatted as
        ``%Y-%m-%dT%H:%M:%SZ`` (a literal ``Z`` is appended; no timezone conversion
        is performed). ``None`` leaves that side of the window open.

    Returns
    -------
    pandas.DataFrame
        One row per video (medium-length videos first, then long ones) with the
        columns ``title``, ``publish_date``, ``description``, ``tags``,
        ``topic_categories``, ``video_id``, ``view_count`` and ``comment_count``.
        The columns are present even when no video was found.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # convert datetime objects to ISO 8601 string format
    published_after_string = published_after.strftime('%Y-%m-%dT%H:%M:%SZ') if published_after else None
    published_before_string = published_before.strftime('%Y-%m-%dT%H:%M:%SZ') if published_before else None

    # each list item is a dictionary of metadata about one video
    videos_data = []
    for video_duration in ("medium", "long"):
        # search for the IDs of matching videos of this length class
        search_response = youtube.search().list(
            q=query,
            part="snippet",
            type="video",
            maxResults=max_results,
            order="viewCount",
            videoDuration=video_duration,
            relevanceLanguage="en",
            publishedAfter=published_after_string,
            publishedBefore=published_before_string).execute()
        video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]

        # nothing found for this length class: skip the lookup instead of sending an empty id list
        if not video_ids:
            continue

        # retrieve snippet, statistics and topic details for the videos that were found
        video_response = youtube.videos().list(
            part="snippet,statistics,topicDetails",
            id=",".join(video_ids)).execute()
        videos_data.extend(_video_record(item) for item in video_response.get('items', []))

    # fixed column list keeps the schema stable even when videos_data is empty
    columns = ['title', 'publish_date', 'description', 'tags', 'topic_categories',
               'video_id', 'view_count', 'comment_count']
    return pd.DataFrame(videos_data, columns=columns)

In [68]:
# Search window. Both bounds are midnight datetimes, so the upper bound sent to the API is
# 2017-12-31T00:00:00Z; videos published later on 31 December are outside the requested window.
start_date, end_date = datetime(2017, 1, 1), datetime(2017, 12, 31)

# run the search for the chosen query within that window
df_videos = search_videos(
    query="artificial intelligence",
    max_results=MAX_RESULTS,
    published_after=start_date,
    published_before=end_date,
)

In [69]:
# save the search results to a CSV file, then preview up to the first 20 rows
df_videos.to_csv("videos_2017.csv")
df_videos.head(20)

Unnamed: 0,title,publish_date,description,tags,video_id,view_count,comment_count,topic_categories
0,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26T03:57:12Z,Jimmy Fallon demos amazing new robots from all...,"[The Tonight Show, Jimmy Fallon, Tonight Showb...",Bg_tJvCA8zw,34706536,23102,"[https://en.wikipedia.org/wiki/Entertainment, ..."
1,"But what is a neural network? | Chapter 1, Dee...",2017-10-05T15:11:25Z,"What are the neurons, why are there layers, an...","[three brown one blue, 3 brown 1 blue, neural ...",aircAruvnKk,16283682,7257,[https://en.wikipedia.org/wiki/Knowledge]
2,The Rise of the Machines – Why Automation is D...,2017-06-08T16:28:31Z,Automation in the Information Age is different...,"[automation, universal basic income, new autom...",WSKi8HfcxEk,14470662,30411,[https://en.wikipedia.org/wiki/Knowledge]
3,"How AIs, like ChatGPT, Learn",2017-12-18T14:39:33Z,"How do all the algorithms, like ChatGPT, aroun...","[cgpgrey, education, hello internet, ai, chatg...",R9OHn5ZF4Uo,9926791,22915,[https://en.wikipedia.org/wiki/Knowledge]
4,Interview With The Lifelike Hot Robot Named So...,2017-10-25T16:32:19Z,"CNBC's Andrew Ross Sorkin interviews Sophia, a...","[CNBC, business news, finance stock, stock mar...",S5t6K9iwcdw,9892083,7660,"[https://en.wikipedia.org/wiki/Society, https:..."
5,Artificial Intelligence: it will kill us | Jay...,2017-01-31T18:36:05Z,"For more information on Jay Tuck, please visit...","[TEDxTalks, English, Germany, Technology, AI, ...",BrNs0M77Pd4,5271840,12446,[https://en.wikipedia.org/wiki/Knowledge]
6,The Real Reason to be Afraid of Artificial Int...,2017-12-15T15:55:54Z,"A robotics researcher afraid of robots, Peter ...","[TEDxTalks, English, Technology, Big Data, Cod...",TRzBk_KuIaM,2428765,3951,[https://en.wikipedia.org/wiki/Knowledge]
7,Sofia| First Robot as a Citizen of Saudi Arabi...,2017-10-27T09:25:22Z,#Sofia #Artificial Intelligence #FirstRobotCit...,[],9kiEK4LrCgQ,2382950,2473,[https://en.wikipedia.org/wiki/Society]
8,MICRO DRONES KILLER ARMS ROBOTS - AUTONOMOUS A...,2017-11-17T05:44:03Z,"Killer drone arms, articial intelligence an in...","[KILLER DRONE, INTELLIGENCE, DEFENCE, PRIVACY,...",TlO2gcs1YvM,2112313,2376,[https://en.wikipedia.org/wiki/Society]
9,"Meet Samantha, Beautiful AI Robot, People Are ...",2017-12-28T23:24:21Z,"Meet Samantha, Beautiful Female Robot, It's Cr...","[samantha, samantha robot, female robot, hyper...",-7EMxSxVs-Q,1966616,1027,[https://en.wikipedia.org/wiki/Lifestyle_(soci...


### Cleaning the Videos DataFrames

In [79]:
# Unused draft of a tag-based keyword filter, kept for reference:
# def check_keywords(tags):
#     keyword_list = [""]
#     for tag in tags:
#         for keyword in keyword_list:
#             if keyword in tag:
#                 return True
#     return False

# Keep videos with more than 500 comments, take the 10 most viewed of those, renumber the
# rows from 0, and add the publish date split into year / month / day columns.
## NOTE: The date columns are probably not yet necessary, we can also add them at a later step.
df_videos_cleaned = (
    df_videos
    .loc[lambda frame: frame["comment_count"] > 500]
    .nlargest(10, "view_count")
    .reset_index(drop=True)
    .assign(
        year=lambda frame: pd.to_datetime(frame["publish_date"]).dt.year,
        month=lambda frame: pd.to_datetime(frame["publish_date"]).dt.month,
        day=lambda frame: pd.to_datetime(frame["publish_date"]).dt.day,
    )
)
df_videos_cleaned

Unnamed: 0,title,publish_date,description,tags,video_id,view_count,comment_count,topic_categories,year,month,day
0,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26T03:57:12Z,Jimmy Fallon demos amazing new robots from all...,"[The Tonight Show, Jimmy Fallon, Tonight Showb...",Bg_tJvCA8zw,34706536,23102,"[https://en.wikipedia.org/wiki/Entertainment, ...",2017,4,26
1,"But what is a neural network? | Chapter 1, Dee...",2017-10-05T15:11:25Z,"What are the neurons, why are there layers, an...","[three brown one blue, 3 brown 1 blue, neural ...",aircAruvnKk,16283682,7257,[https://en.wikipedia.org/wiki/Knowledge],2017,10,5
2,The Rise of the Machines – Why Automation is D...,2017-06-08T16:28:31Z,Automation in the Information Age is different...,"[automation, universal basic income, new autom...",WSKi8HfcxEk,14470662,30411,[https://en.wikipedia.org/wiki/Knowledge],2017,6,8
3,"How AIs, like ChatGPT, Learn",2017-12-18T14:39:33Z,"How do all the algorithms, like ChatGPT, aroun...","[cgpgrey, education, hello internet, ai, chatg...",R9OHn5ZF4Uo,9926791,22915,[https://en.wikipedia.org/wiki/Knowledge],2017,12,18
4,Interview With The Lifelike Hot Robot Named So...,2017-10-25T16:32:19Z,"CNBC's Andrew Ross Sorkin interviews Sophia, a...","[CNBC, business news, finance stock, stock mar...",S5t6K9iwcdw,9892083,7660,"[https://en.wikipedia.org/wiki/Society, https:...",2017,10,25
5,Artificial Intelligence: it will kill us | Jay...,2017-01-31T18:36:05Z,"For more information on Jay Tuck, please visit...","[TEDxTalks, English, Germany, Technology, AI, ...",BrNs0M77Pd4,5271840,12446,[https://en.wikipedia.org/wiki/Knowledge],2017,1,31
6,Why AI Is The Most Dangerous Thing You Can Ima...,2017-12-23T18:04:43Z,First 500 people get a free 2 month trial of S...,"[artificial intelligence, ai, singularity, ai ...",R_5sAburz5Q,2484866,15179,,2017,12,23
7,The Real Reason to be Afraid of Artificial Int...,2017-12-15T15:55:54Z,"A robotics researcher afraid of robots, Peter ...","[TEDxTalks, English, Technology, Big Data, Cod...",TRzBk_KuIaM,2428765,3951,[https://en.wikipedia.org/wiki/Knowledge],2017,12,15
8,Sofia| First Robot as a Citizen of Saudi Arabi...,2017-10-27T09:25:22Z,#Sofia #Artificial Intelligence #FirstRobotCit...,[],9kiEK4LrCgQ,2382950,2473,[https://en.wikipedia.org/wiki/Society],2017,10,27
9,MICRO DRONES KILLER ARMS ROBOTS - AUTONOMOUS A...,2017-11-17T05:44:03Z,"Killer drone arms, articial intelligence an in...","[KILLER DRONE, INTELLIGENCE, DEFENCE, PRIVACY,...",TlO2gcs1YvM,2112313,2376,[https://en.wikipedia.org/wiki/Society],2017,11,17


In [89]:
# NOTE: Scratch cell. It checks that the cleaned DataFrame can be walked in two batches with
# a pause in between; for every video it only prints the ID and the publish date.
#
# Earlier single-loop draft, kept for reference:
# for index, row in df_videos_cleaned.iterrows():
#     video_id = row['video_id']
#     publish_date = row['publish_date']
#     print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")

# Split the DataFrame into two halves (the second half gets the extra row for odd lengths)
half_point = len(df_videos_cleaned) // 2
first_half, second_half = df_videos_cleaned.iloc[:half_point], df_videos_cleaned.iloc[half_point:]

for batch_number, batch in enumerate((first_half, second_half)):
    if batch_number > 0:
        # pause between the first and the second batch
        print(f"\n\n\nWaiting...\n\n\n")
        time.sleep(10)

    for index, row in batch.iterrows():
        video_id = row['video_id']
        publish_date = row['publish_date']
        print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")



--- 1 --- 
ID: Bg_tJvCA8zw
Publish Date: 2017-04-26T03:57:12Z
--- 2 --- 
ID: aircAruvnKk
Publish Date: 2017-10-05T15:11:25Z
--- 3 --- 
ID: WSKi8HfcxEk
Publish Date: 2017-06-08T16:28:31Z
--- 4 --- 
ID: R9OHn5ZF4Uo
Publish Date: 2017-12-18T14:39:33Z
--- 5 --- 
ID: S5t6K9iwcdw
Publish Date: 2017-10-25T16:32:19Z



Waiting...



--- 6 --- 
ID: BrNs0M77Pd4
Publish Date: 2017-01-31T18:36:05Z
--- 7 --- 
ID: R_5sAburz5Q
Publish Date: 2017-12-23T18:04:43Z
--- 8 --- 
ID: TRzBk_KuIaM
Publish Date: 2017-12-15T15:55:54Z
--- 9 --- 
ID: 9kiEK4LrCgQ
Publish Date: 2017-10-27T09:25:22Z
--- 10 --- 
ID: TlO2gcs1YvM
Publish Date: 2017-11-17T05:44:03Z


## Section 2: Retrieving Comments

In [6]:
def get_video_comments(video_id, max_results=20):
    """Return the texts of a video's top-level comments from a single API request.

    Only one ``commentThreads().list`` call is made (no paging), asking for at most
    ``max_results`` comment threads of ``video_id``. The result is a list of the
    ``textDisplay`` strings in the order the API returned them.
    """
    client = build('youtube', 'v3', developerKey=API_KEY)

    # single request for the first page of comment threads
    request = client.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=max_results
    )
    threads = request.execute()['items']

    return [thread['snippet']['topLevelComment']['snippet']['textDisplay'] for thread in threads]

In [7]:
comments = get_video_comments("Bg_tJvCA8zw") # TEST: fetch comments for one specific video (max_results defaults to 20)

# loop through the comments and print them, numbered from 1
for i, comment in enumerate(comments, start=1):
    print(f"comment {i}: {comment}")

comment 1: I knew Jennifer Lawrence wasn&#39;t a real person
comment 2: Robot + feeling = love
comment 3: Beautiful
comment 4: Tu chizz badi hain mast 770 k dollers right
comment 5: Bluetooth
comment 6: DJs ❤
comment 7: she is the one that had remarks about the human race and if it would make it, she said ,no.
comment 8: <a href="https://www.youtube.com/watch?v=Bg_tJvCA8zw&amp;t=304">5:04</a> yep we’re doomed 💀
comment 9: 😮
comment 10: The interaction with Sophia is creepy and unsettling but I don’t think the robot noticed.
comment 11: Jimmy Fallon always looks, talks and gesters like a perv.
comment 12: Jimmy is so lovely 🤩
comment 13: Pourquoi n&#39;a-t-elle pas de cheveux ?
comment 14: Avengers: Age of Sophia
comment 15: “It’s my great start to dominate human race” jit ain’t jokin’
comment 16: Without a natural female human - society will never continue except via robot technology! That is truthful not fabricated deceit. God the true heavenly Good God our Heavenly Alnighty Lord God 

In [27]:
def get_video_comments(video_id, max_comments=MAX_COMMENTS, total_comment_count=0):
    """Fetch up to ``max_comments`` top-level comments of a video, following result pages.

    NOTE: this definition reuses the name of the single-page ``get_video_comments``
    defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    video_id : str
        ID of the video whose comment threads are requested.
    max_comments : int
        Upper bound on the number of comments collected. Each request asks for
        ``min(100, remaining)`` threads, so the bound is never exceeded.
    total_comment_count : int
        Only used to print how many 100-comment pages that count corresponds to;
        it does not influence how many comments are fetched.

    Returns
    -------
    pandas.DataFrame
        Columns ``comment`` (``textDisplay`` of the top-level comment) and
        ``reply_count`` (``totalReplyCount`` of the thread), one row per comment.
        The columns are present even when no comment was collected.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # calculate the number of pages required to fetch all comments (informational only)
    total_pages = (total_comment_count + 99) // 100
    print(f"Total Pages: {total_pages}")

    comments = []
    page_token = None  # None requests the first page
    while len(comments) < max_comments:
        # The token of the previous response is passed on so each iteration fetches the
        # NEXT page; previously the first page was requested again on every iteration.
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=min(100, max_comments - len(comments)),
            pageToken=page_token
        ).execute()

        for item in response.get('items', []):
            comments.append({
                'comment': item['snippet']['topLevelComment']['snippet']['textDisplay'],
                'reply_count': item['snippet']['totalReplyCount']
            })

        # no token in the response means this was the last page
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return pd.DataFrame(comments, columns=['comment', 'reply_count'])

In [30]:
# Collect top-level comments for one video (max_comments keeps its default, MAX_COMMENTS);
# total_comment_count only feeds the "Total Pages" printout of get_video_comments.
video_comments_df = get_video_comments(video_id="R9OHn5ZF4Uo", total_comment_count=22915)

Total Pages: 230


In [32]:
# preview up to the first 20 collected comments
video_comments_df.head(20)

Unnamed: 0,comment,reply_count
0,Bot food,0
1,"Last time I watched it, I saw another tech-loo...",0
2,Best Youtuber,0
3,Watching this clip is like listening to neuron...,0
4,"comment, cause the algorithm is *watching*.",0
5,This is definitely my favorate Grey video. It&...,0
6,This video came out when I was 13 and I rememb...,0
7,Thanks!,0
8,😂,0
9,Thanks,0


### Retrieving ALL comments of a video

In [31]:
def get_all_top_level_comment_replies(video_id, video_publish_date, video_title, top_level_comment_id, replies, page_token):
    """Collect every reply to one top-level comment, following all result pages.

    Parameters
    ----------
    video_id, video_publish_date, video_title
        Copied unchanged into every reply record.
    top_level_comment_id : str
        ID of the parent comment; sent as ``parentId`` and stored as
        ``parent_comment_id`` in every record.
    replies : list
        List the reply records are appended to (it is modified in place).
    page_token : str or None
        Token of the first page to request; ``None`` starts at the first page.

    Returns
    -------
    list
        The same ``replies`` list, extended with one dictionary per reply holding
        ``video_id``, ``video_title``, ``video_publish_date``, ``text``,
        ``comment_published_at`` and ``parent_comment_id``.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    while True:
        response = youtube.comments().list(
            part="snippet",
            parentId=top_level_comment_id,
            maxResults=100,
            pageToken=page_token).execute()

        for item in response.get("items", []):
            replies.append({
                "video_id": video_id,
                "video_title": video_title,
                "video_publish_date": video_publish_date,
                "text": item["snippet"]["textDisplay"],
                "comment_published_at": item["snippet"]["publishedAt"],
                "parent_comment_id": top_level_comment_id
                })

        # The token lives in the response, not in the collected replies; the earlier check
        # looked it up in `replies`, so pages after the first one were never requested.
        page_token = response.get("nextPageToken")
        if not page_token:
            return replies

In [32]:
def get_all_comments(video_id, video_publish_date, video_title, comments=None, next_page_token=None):
    """Collect all top-level comments of a video, following every result page.

    Comment threads are requested 100 at a time with ``order="relevance"``. Replies
    are not collected here, so ``parent_comment_id`` is ``None`` in every record
    this function adds.

    Parameters
    ----------
    video_id : str
        ID of the video whose comment threads are requested.
    video_publish_date, video_title
        Copied unchanged into every comment record.
    comments : list or None
        Optional list to append the records to (it is modified in place). When
        omitted, a fresh list is created on every call, so results of earlier
        calls never leak into later ones.
    next_page_token : str or None
        Token of the first page to request; ``None`` starts at the first page.

    Returns
    -------
    pandas.DataFrame
        One row per record in ``comments`` with the keys ``video_id``,
        ``video_title``, ``video_publish_date``, ``text``,
        ``comment_published_at`` and ``parent_comment_id``.
    """
    # a mutable default ([]) is shared between calls; create the list per call instead
    if comments is None:
        comments = []

    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # iterate over the pages instead of recursing once per page
    while True:
        response = youtube.commentThreads().list(
            part="id,snippet,replies",
            videoId=video_id,
            maxResults=100,
            order="relevance",
            pageToken=next_page_token).execute()

        for item in response.get("items", []):
            comment = item['snippet']['topLevelComment']
            comment_text = comment['snippet']['textDisplay']

            comments.append({
                "video_id": video_id,
                "video_title": video_title,
                "video_publish_date": video_publish_date,
                "text": comment_text,
                "comment_published_at": comment["snippet"]["publishedAt"],
                "parent_comment_id": None
                })

            # Disabled draft for also collecting the replies of each thread:
            # total_reply_count = item['snippet']['totalReplyCount']
            # if (total_reply_count > 0):
            #     replies = []
            #     replies.extend(get_all_top_level_comment_replies(video_id, video_publish_date, video_title, comment['id'], [], None))
            #     comments.extend(replies)

        # no token in the response means this was the last page
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            return pd.DataFrame(comments)

In [37]:
# Single-video trial run of get_all_comments.
# NOTE: the name `id` shadows Python's built-in id() for the rest of the session.
id = "mJeNghZXtMo"
title = "What is Artificial Intelligence"
publish_date_str = "2017-01-30T20:19:44Z"

# parse the timestamp string into a datetime before handing it to get_all_comments
publish_date = datetime.strptime(publish_date_str, "%Y-%m-%dT%H:%M:%SZ")

all_comments_df = get_all_comments(
    video_id=id,
    video_publish_date=publish_date,
    video_title=title,
)

In [None]:
# Save the collected comments as CSV (the pickle export below is left disabled).
#all_comments_df.to_pickle("all_comments.pkl")
all_comments_df.to_csv("all_comments_2.csv")

In [40]:
# store the dtype of every column of the comments frame
datatypes = all_comments_df.dtypes
# NOTE: a bare expression that is not the last line of a cell is not displayed
datatypes

# print the frame summary, then preview up to the first 50 rows
all_comments_df.info()
all_comments_df.head(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   video_id              498 non-null    object
 1   video_title           498 non-null    object
 2   video_publish_date    498 non-null    object
 3   text                  498 non-null    object
 4   comment_published_at  498 non-null    object
 5   parent_comment_id     170 non-null    object
dtypes: object(6)
memory usage: 23.5+ KB


Unnamed: 0,video_id,video_title,video_publish_date,text,comment_published_at,parent_comment_id
0,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Download HubSpot&#39;s State of AI in Marketin...,2023-02-02T21:03:19Z,
1,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,pov: your IT teacher told you to watch this,2020-11-08T12:58:12Z,
2,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Wel it was my &#39;Data mining and machine lea...,2020-11-10T08:23:07Z,UgybrlkjPEbM1g8GGwt4AaABAg
3,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,hol my name is pol and u r a pro gamer clearly...,2020-11-16T21:28:24Z,UgybrlkjPEbM1g8GGwt4AaABAg
4,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Well turns out ur correct :(,2021-01-29T09:08:16Z,UgybrlkjPEbM1g8GGwt4AaABAg
5,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,hola andy,2021-02-02T12:01:12Z,UgybrlkjPEbM1g8GGwt4AaABAg
6,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,My mechatronics teacher told me too so your so...,2021-02-08T21:37:30Z,UgybrlkjPEbM1g8GGwt4AaABAg
7,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,"No, I’m only curious",2021-02-17T02:13:58Z,UgybrlkjPEbM1g8GGwt4AaABAg
8,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,My english teacher xD,2021-03-02T10:31:28Z,UgybrlkjPEbM1g8GGwt4AaABAg
9,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,@@tobix8687 relatable,2021-04-06T16:08:56Z,UgybrlkjPEbM1g8GGwt4AaABAg
