In [20]:
from googleapiclient.discovery import build
from datetime import datetime, timedelta
import pandas as pd
import time

In [2]:
# global variables
MAX_RESULTS = 30
MAX_COMMENTS = 1000

**API Key: Lorenzo**

In [3]:
# open the text file containing the API key
with open("../authentication/YouTube_Data_API_Key_alt1.txt", "r") as file:
    API_KEY = file.read().strip() 

**API Key: Ishwarya**

In [None]:
# open the text file containing the API key
with open("YOUR PATH TO THE API KEY HERE", "r") as file:
    API_KEY = file.read().strip() 

In [4]:
# function to search for videos given a specific query. Requires max_results, published_after, and published_before both in datetime format
def search_videos(query, max_results=MAX_RESULTS, published_after=None, published_before=None):
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # convert datetime objects to ISO 8601 string format
    published_after_string = published_after.strftime('%Y-%m-%dT%H:%M:%SZ') if published_after else None
    published_before_string = published_before.strftime('%Y-%m-%dT%H:%M:%SZ') if published_before else None

    ## Two requests are created, to separately search for medium and long lenght videos.
    ## This ensures that we don't collect and YouTube Shorts videos, which we are not interested in.

    # construct request for medium lenght videos
    search_request_medium = youtube.search().list(
        q=query, 
        part="snippet", 
        type="video", 
        maxResults=max_results, 
        order="viewCount",
        videoDuration="medium",
        relevanceLanguage="en", 
        publishedAfter=published_after_string, 
        publishedBefore=published_before_string)
    
    # construct request for long videos
    search_request_long = youtube.search().list(
        q=query, 
        part="snippet", 
        type="video", 
        maxResults=max_results, 
        order="viewCount",
        videoDuration="long", 
        relevanceLanguage="en",
        publishedAfter=published_after_string, 
        publishedBefore=published_before_string)
    
    # execute both search requests and store the response
    search_response_medium = search_request_medium.execute()
    search_response_long = search_request_long.execute()

    # save video IDs into lists
    video_ids_medium = [item['id']['videoId'] for item in search_response_medium['items']]
    video_ids_long = [item['id']['videoId'] for item in search_response_long['items']]

    # construct request to retrieve data about found videos
    video_request_medium = youtube.videos().list(part="snippet,statistics", id=",".join(video_ids_medium))
    video_request_long = youtube.videos().list(part="snippet,statistics", id=",".join(video_ids_long))

    # execute both video requests and store the response
    video_response_medium = video_request_medium.execute()
    video_response_long = video_request_long.execute()

    # create empty list videos_data, in which each list item is a dictionary of metadata about one video
    videos_data = []
    # loop through the medium length videos and append to videos_data
    for item in video_response_medium['items']:
        tags_list = item["snippet"].get("tags", []) 
        videos_data.append({
            'video_id': item['id'],
            'video_title': item['snippet']['title'],
            'channel_title': item['snippet']['channelTitle'],
            'channel_id': item['snippet']['channelId'],
            'video_publish_date': item['snippet']['publishedAt'],
            'view_count': int(item['statistics']['viewCount']) if 'viewCount' in item['statistics'] else 0,
            'comment_count': int(item['statistics']['commentCount']) if 'commentCount' in item['statistics'] else 0,
            'video_description': item['snippet']['description'],
            'tags': tags_list,
            'category_id': item['snippet']['categoryId']
        })
    # loop through the long videos and append to videos_data
    for item in video_response_long['items']:
        tags_list = item["snippet"].get("tags", []) 
        videos_data.append({
            'video_id': item['id'],
            'video_title': item['snippet']['title'],
            'channel_title': item['snippet']['channelTitle'],
            'channel_id': item['snippet']['channelId'],
            'video_publish_date': item['snippet']['publishedAt'],
            'view_count': int(item['statistics']['viewCount']) if 'viewCount' in item['statistics'] else 0,
            'comment_count': int(item['statistics']['commentCount']) if 'commentCount' in item['statistics'] else 0,
            'video_description': item['snippet']['description'],
            'tags': tags_list,
            'category_id': item['snippet']['categoryId']
        })
    # convert the list of dictionaries to a dataframe and return
    return pd.DataFrame(videos_data)

In [5]:
# # NOTE: Single shot test
# # # specify start and end dates
# year = 2021

# start_date = datetime(year, 1, 1)
# end_date = datetime(year, 12, 31)
# # call the search videos() function
# df_videos = search_videos("artificial intelligence", MAX_RESULTS, start_date, end_date)

# # preliminary filtering steps
# df_videos = (
#     df_videos[df_videos["comment_count"] > 500] # select only videos with at least 500 comments
#     .nlargest(20, "view_count")  # select the top 10 videos with the highest view_count
# )

# # save the DataFrame with the appropriate name
# df_videos.to_csv(f'../data/df_videos_{year}.csv', index=False)

In [6]:
for year in range(2017, 2024):
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)

    # call the search_videos() function
    df_videos = search_videos("artificial intelligence", MAX_RESULTS, start_date, end_date)

    # preliminary filtering steps
    df_videos = (
        df_videos[df_videos["comment_count"] > 500] # select only videos with at least 500 comments
        .nlargest(20, "view_count")  # select the top 10 videos with the highest view_count
    )

    # save the DataFrame with the appropriate name
    df_videos.to_csv(f'../data/df_videos_{year}.csv', index=False)

In [112]:
# inspect the data
df_videos.head(20)

Unnamed: 0,video_id,video_title,channel_title,channel_id,video_publish_date,view_count,comment_count,video_description,tags,category_id
0,ICUVyDJPzwY,"LEGO, but AI controls what I BUILD...",TD BRICKS,UCUU3GdGuQshZFRGnxAPBf_w,2023-04-06T16:48:47Z,21686763,4757,I use ARTIFICIAL INTELLIGENCE to design severa...,"[lego, ai, artifical intelligence, lego meme, ...",24
30,Sqa8Zo2XWc4,Artificial Intelligence: Last Week Tonight wit...,LastWeekTonight,UC3XTzVzaHQEd30rQbuvCtTQ,2023-02-27T07:30:08Z,10262955,11186,Artificial intelligence is increasingly becomi...,[],24
31,bk-nQ7HF6k4,EMERGENCY EPISODE: Ex-Google Officer Finally S...,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,2023-06-01T07:00:21Z,9633767,34732,If You Enjoyed This Episode You Must Watch Thi...,"[The Diary Of A CEO, steven bartlett steve bar...",22
32,vJefOB8kec8,The Truth about Artificial Intelligence and Ch...,Dhruv Rathee,UC-CSyyi47VX1lD9zyeABW3w,2023-07-16T15:05:29Z,9600532,28000,🤖 Join my AI Course: https://academy.dhruvrath...,"[Dhruv Rathee, Dhruv, Rathee, indian youtuber,...",27
1,uiUPD-z9DTg,The Next Global Superpower Isn't Who You Think...,TED,UCAuUUnT6oDeKwE6v1NGQxug,2023-06-14T16:20:35Z,9209035,13448,Who runs the world? Political scientist Ian Br...,"[TEDTalk, TEDTalks, TED Talk, TED Talks, TED, ...",25
2,jPhJbKBuNnA,I tried using AI. It scared me.,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,2023-02-13T16:00:23Z,7379887,14322,I just wanted to fix my email. ■ AD: 👨‍💻 NordV...,[],27
33,zaB_20bkoA4,Elon Musk's BRUTALLY Honest Interview With Tuc...,Visionary,UC_8gQrYCIf9aG_9N4_T5fzg,2023-07-06T05:51:15Z,6180406,11401,Elon Musk's BRUTALLY Honest Interview With Tuc...,"[Smart Sense, Elon Musk, elon musk carlson tuc...",24
34,aZ5EsdnpLMI,Artificial Intelligence | 60 Minutes Full Epis...,60 Minutes,UCsN32BtMd0IoByjJRNF12cw,2023-12-30T13:00:02Z,5792902,6652,"From January 2019, Scott Pelley's interview wi...","[60 minutes, cbs news, artificial intelligence...",25
3,DCu9xawHJaw,Meet The 6'10 Ai Robot NBA Players Fear..,REBOUND,UCVepIJedg6BMQeSgjPY4e6A,2023-06-20T19:58:24Z,5232601,1345,"If you’re ever injured in an accident, you can...","[nba, rebound central, rebound]",17
4,2yd18z6iSyk,"Joe Rogan: ""I Wasn't Afraid of AI Until I Lear...",JRE Daily Clips,UCavUx5z-IziOIxbsK_Ah7ng,2023-12-19T22:46:30Z,5190605,8147,FREE Alpha Brain Trial ► https://onnit.sjv.io/...,"[JRE, UFC, Joe Rogan, Joe Rogan Experience, JR...",23


### Cleaning the Videos DataFrames

In [104]:
# df_videos_cleaned = (
#     df_videos[df_videos["comment_count"] > 500] # select only videos with at least 500 comments
#     .nlargest(20, "view_count")  # select the top 10 videos with the highest view_count
#     .reset_index(drop=True)  # reset the index without bringing the old indices to the new DataFrame
# )

# ## NOTE: Probably not yet necessary, we can also do this at a later step.
# # Extracting year, month, and day from the datetime column
# # df_videos_cleaned['year'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.year
# # df_videos_cleaned['month'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.month
# # df_videos_cleaned['day'] = pd.to_datetime(df_videos_cleaned['publish_date']).dt.day
# df_videos_cleaned

Unnamed: 0,video_id,video_title,video_publish_date,view_count,comment_count,video_description,tags,category_id
0,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26T03:57:12Z,34706883,23100,Jimmy Fallon demos amazing new robots from all...,"[The Tonight Show, Jimmy Fallon, Tonight Showb...",23
1,aircAruvnKk,"But what is a neural network? | Chapter 1, Dee...",2017-10-05T15:11:25Z,16286746,7257,"What are the neurons, why are there layers, an...","[three brown one blue, 3 brown 1 blue, neural ...",27
2,WSKi8HfcxEk,The Rise of the Machines – Why Automation is D...,2017-06-08T16:28:31Z,14470932,30411,Automation in the Information Age is different...,"[automation, universal basic income, new autom...",27
3,R9OHn5ZF4Uo,"How AIs, like ChatGPT, Learn",2017-12-18T14:39:33Z,9929083,22915,"How do all the algorithms, like ChatGPT, aroun...","[cgpgrey, education, hello internet, ai, chatg...",27
4,S5t6K9iwcdw,Interview With The Lifelike Hot Robot Named So...,2017-10-25T16:32:19Z,9892253,7660,"CNBC's Andrew Ross Sorkin interviews Sophia, a...","[CNBC, business news, finance stock, stock mar...",25
5,BrNs0M77Pd4,Artificial Intelligence: it will kill us | Jay...,2017-01-31T18:36:05Z,5271878,12446,"For more information on Jay Tuck, please visit...","[TEDxTalks, English, Germany, Technology, AI, ...",29
6,TRzBk_KuIaM,The Real Reason to be Afraid of Artificial Int...,2017-12-15T15:55:54Z,2428778,3951,"A robotics researcher afraid of robots, Peter ...","[TEDxTalks, English, Technology, Big Data, Cod...",29
7,9kiEK4LrCgQ,Sofia| First Robot as a Citizen of Saudi Arabi...,2017-10-27T09:25:22Z,2382951,2473,#Sofia #Artificial Intelligence #FirstRobotCit...,[],19
8,TlO2gcs1YvM,MICRO DRONES KILLER ARMS ROBOTS - AUTONOMOUS A...,2017-11-17T05:44:03Z,2112357,2376,"Killer drone arms, articial intelligence an in...","[KILLER DRONE, INTELLIGENCE, DEFENCE, PRIVACY,...",22
9,xs_HhZrCBdg,With artificial Intelligence we're summoning t...,2017-08-31T19:39:08Z,1880123,4256,"""Artificial intelligence is the future, not on...","[robots, artificial intelligence, Elon Musk, S...",22


In [89]:
# NOTE: This is just a test to see if I can nicely loop over the DF
# Loop through the DataFrame and pass values as parameters to the function
for index, row in df_videos_cleaned.iterrows():
    # for each row get the 4 relevant columns
    video_id = row['video_id']
    video_title = row['video_title']
    video_publish_date = row['video_publish_date']
    video_category_id = row['category_id']

    
    # Call the function and pass the values as parameters

    print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")



# Split the DataFrame into two halves
half_point = len(df_videos_cleaned) // 2
first_half = df_videos_cleaned.iloc[:half_point]
second_half = df_videos_cleaned.iloc[half_point:]

# Loop through the first half of the DataFrame and pass values as parameters to the function
for index, row in first_half.iterrows():
    video_id = row['video_id']
    publish_date = row['publish_date']
    # Call the function and pass the values as parameters
    print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")

print(f"\n\n\nWaiting...\n\n\n")
time.sleep(10)

# At a later time, loop through the second half of the DataFrame
for index, row in second_half.iterrows():
    video_id = row['video_id']
    publish_date = row['publish_date']
    print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")



--- 1 --- 
ID: Bg_tJvCA8zw
Publish Date: 2017-04-26T03:57:12Z
--- 2 --- 
ID: aircAruvnKk
Publish Date: 2017-10-05T15:11:25Z
--- 3 --- 
ID: WSKi8HfcxEk
Publish Date: 2017-06-08T16:28:31Z
--- 4 --- 
ID: R9OHn5ZF4Uo
Publish Date: 2017-12-18T14:39:33Z
--- 5 --- 
ID: S5t6K9iwcdw
Publish Date: 2017-10-25T16:32:19Z



Waiting...



--- 6 --- 
ID: BrNs0M77Pd4
Publish Date: 2017-01-31T18:36:05Z
--- 7 --- 
ID: R_5sAburz5Q
Publish Date: 2017-12-23T18:04:43Z
--- 8 --- 
ID: TRzBk_KuIaM
Publish Date: 2017-12-15T15:55:54Z
--- 9 --- 
ID: 9kiEK4LrCgQ
Publish Date: 2017-10-27T09:25:22Z
--- 10 --- 
ID: TlO2gcs1YvM
Publish Date: 2017-11-17T05:44:03Z


## Section 2: Retrieving Comments

In [None]:
# This function gets all the top level comments and saves them to a dataframe
def get_all_top_level_comments(video_id, video_title, video_publish_date, video_category_id, comments=[], next_page_token=None):
    youtube = build('youtube', 'v3', developerKey=API_KEY)
    
    response = youtube.commentThreads().list(
        part="id,snippet,replies",
        videoId=video_id,
        maxResults=100,
        order="relevance",
        pageToken=next_page_token).execute()

    for item in response["items"]:
        comment = item['snippet']['topLevelComment']
        comment_text = comment['snippet']['textDisplay']

        comments.append({
            "video_id": video_id,
            "video_title": video_title,
            "video_publish_date": video_publish_date,
            "video_category_id": video_category_id,
            "comment_text": comment_text,
            "comment_id": item['id'],
            "comment_publish_date": comment["snippet"]["publishedAt"]
            })

    if "nextPageToken" in response:
        return get_all_comments(video_id, video_title, video_publish_date, video_category_id, comments, response["nextPageToken"])
    else:
        return pd.DataFrame(comments)

In [25]:
# First we have to read all df_videos.csv files and save them to a list 

# List to store all the cleaned video dataframes
video_dataframes = []

# Years for which video CSV files exist
years = range(2017, 2023 + 1)

# Loop through the years, load the corresponding CSV file, and append it to the list
for year in years:
    file_path_and_name = f"../data/cleaned/df_videos_{year}_cleaned.csv"
    df = pd.read_csv(file_path_and_name)
    video_dataframes.append(df)

# Optional: To verify, print the first few rows of each dataframe
# for i, df in enumerate(video_dataframes):
#     print(f"DataFrame for year {years[i]}:")
#     print(df.head())


In [31]:
df_videos_2017 = video_dataframes[0]
df_videos_2018 = video_dataframes[1]
df_videos_2019 = video_dataframes[2]
df_videos_2020 = video_dataframes[3]
df_videos_2021 = video_dataframes[4]
df_videos_2022 = video_dataframes[5]
df_videos_2023 = video_dataframes[6]

df_videos_2020.head()
# for index, row in df_videos_cleaned.iterrows():
#     # for each row get the 4 relevant columns
#     video_id = row['video_id']
#     video_title = row['video_title']
#     video_publish_date = row['video_publish_date']
#     video_category_id = row['category_id']

    
#     # Call the function and pass the values as parameters

#     print(f"--- {index + 1} --- \nID: {video_id}\nPublish Date: {publish_date}")

Unnamed: 0,video_id,video_title,channel_title,channel_id,video_publish_date,view_count,comment_count,video_description,tags,category_id
0,WXuK6gekU1Y,AlphaGo - The Movie | Full award-winning docum...,Google DeepMind,UCP7jMXSY2xbc3KCAE0MHQ-A,2020-03-13T14:04:51Z,35241545,21378,With more board configurations than there are ...,"['DeepMind', 'Deep Mind', 'Google DeepMind', '...",28
1,Jky9I1ihAkg,9 Most Advanced AI Robots - Humanoid & Industr...,TerkRecoms - Tech TV,UCtbo7Mcf52Lbd-XZDUzTBNw,2020-03-02T16:01:23Z,6093406,4987,"A list of most advanced Humanoid, Industrial a...","['humanoid robots', 'surena 4 robot', 'hrp-5p'...",28
2,XSgfE2vg-Kk,Top 5 Female Humanoid Robots 2023 - Artificial...,TFlex Tech,UCX3IL9F5nnSltv3x-qezITw,2020-04-24T16:30:13Z,4648148,3007,Top 5 Female Humanoid Robots 2023 - Artificial...,"['tflex tech', 't flex tech', 'Tflextech02', '...",28
3,WATLfjRHySU,India Welcomes Robot Sophia For The First-Ever...,Republic World,UCwqusr8YDwM-3mEYTDeJHzw,2020-02-19T09:51:32Z,3060083,1727,Republic TV is India's no.1 English news chann...,"['republic', 'republic tv', 'republic tv live'...",25
4,tPYj3fFJGjk,TensorFlow 2.0 Complete Course - Python Neural...,freeCodeCamp.org,UC8butISFwT-Wl7EV0hUK0BQ,2020-03-03T15:44:00Z,2990743,1870,Learn how to use TensorFlow 2.0 in this full t...,[],27


In [6]:
def get_video_comments(video_id, max_results=20):
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # Call the API to retrieve comments
    response = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=max_results
    ).execute()

    comments = []
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
        comments.append(comment)

    return comments

In [7]:
comments = get_video_comments("Bg_tJvCA8zw") # TEST: get 20 comments for one specific video

# loop through the comments and print them
for i, comment in enumerate(comments, start=1):
    print(f"comment {i}: {comment}")

comment 1: I knew Jennifer Lawrence wasn&#39;t a real person
comment 2: Robot + feeling = love
comment 3: Beautiful
comment 4: Tu chizz badi hain mast 770 k dollers right
comment 5: Bluetooth
comment 6: DJs ❤
comment 7: she is the one that had remarks about the human race and if it would make it, she said ,no.
comment 8: <a href="https://www.youtube.com/watch?v=Bg_tJvCA8zw&amp;t=304">5:04</a> yep we’re doomed 💀
comment 9: 😮
comment 10: The interaction with Sophia is creepy and unsettling but I don’t think the robot noticed.
comment 11: Jimmy Fallon always looks, talks and gesters like a perv.
comment 12: Jimmy is so lovely 🤩
comment 13: Pourquoi n&#39;a-t-elle pas de cheveux ?
comment 14: Avengers: Age of Sophia
comment 15: “It’s my great start to dominate human race” jit ain’t jokin’
comment 16: Without a natural female human - society will never continue except via robot technology! That is truthful not fabricated deceit. God the true heavenly Good God our Heavenly Alnighty Lord God 

In [27]:
def get_video_comments(video_id, max_comments=MAX_COMMENTS, total_comment_count=0):
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # calculate the number of pages required to fetch all comments
    total_pages = (total_comment_count + 99) // 100
    print(f"Total Pages: {total_pages}")
    
    comments = []
    while len(comments) < max_comments:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=min(100, max_comments - len(comments))
        ).execute()

        for item in response['items']:
            comments.append({
                'comment': item['snippet']['topLevelComment']['snippet']['textDisplay'],
                'reply_count': item['snippet']['totalReplyCount']
            })

        if 'nextPageToken' in response and len(comments) < max_comments:
            response = youtube.commentThreads().list(
                    part = 'snippet,replies',
                    videoId = video_id,
                    pageToken = response['nextPageToken']
                ).execute()
        else:
            break
    return pd.DataFrame(comments)

In [30]:
video_comments_df = get_video_comments(video_id="R9OHn5ZF4Uo", total_comment_count=22915)

Total Pages: 230


In [32]:
video_comments_df.head(20)

Unnamed: 0,comment,reply_count
0,Bot food,0
1,"Last time I watched it, I saw another tech-loo...",0
2,Best Youtuber,0
3,Watching this clip is like listening to neuron...,0
4,"comment, cause the algorithm is *watching*.",0
5,This is definitely my favorate Grey video. It&...,0
6,This video came out when I was 13 and I rememb...,0
7,Thanks!,0
8,😂,0
9,Thanks,0


### Retreiving ALL comments of a video

In [31]:
def get_all_top_level_comment_replies(video_id, video_publish_date, video_title, top_level_comment_id, replies, page_token):
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    response = youtube.comments().list(
        part="snippet",
        parentId=top_level_comment_id,
        maxResults=100,
        pageToken=page_token).execute()
    
    for item in response["items"]:
        replies.append({
            "video_id": video_id,
            "video_title": video_title,
            "video_publish_date": video_publish_date,
            "text": item["snippet"]["textDisplay"],
            "comment_published_at": item["snippet"]["publishedAt"],
            "parent_comment_id": top_level_comment_id
            })
    
    if "nextPageToken" in replies:
        return get_all_top_level_comment_replies(top_level_comment_id, replies, response["nextPageToken"])
    else:
        return replies

In [16]:
def get_all_top_level_comments(video_id, video_title, video_publish_date, video_category_id, comments=[], next_page_token=None):
    youtube = build('youtube', 'v3', developerKey=API_KEY)
    
    response = youtube.commentThreads().list(
        part="id,snippet,replies",
        videoId=video_id,
        maxResults=100,
        order="relevance",
        pageToken=next_page_token).execute()

    # Stores the total reply count a top level commnet has.
    #total_reply_count = 0
    
    #replies = []

    for item in response["items"]:
        comment = item['snippet']['topLevelComment']
        comment_text = comment['snippet']['textDisplay']

        comments.append({
            "video_id": video_id,
            "video_title": video_title,
            "video_publish_date": video_publish_date,
            "video_category_id": video_category_id,
            "comment_text": comment_text,
            "comment_id": item['id'],
            "comment_publish_date": comment["snippet"]["publishedAt"]
            })

        # get the total reply count
        # total_reply_count = item['snippet']['totalReplyCount']

        # if (total_reply_count > 0): 
        #     replies = []
        #     replies.extend(get_all_top_level_comment_replies(video_id, video_publish_date, video_title, comment['id'], [], None))
        #     comments.extend(replies)
    
    if "nextPageToken" in response:
        return get_all_comments(video_id, video_title, video_publish_date, video_category_id, comments, response["nextPageToken"])
    else:
        return pd.DataFrame(comments)

In [17]:
id = "mJeNghZXtMo"
publish_date_str = "2017-01-30T20:19:44Z"
title = "What is Artificial Intelligence"

publish_date = datetime.strptime(publish_date_str, "%Y-%m-%dT%H:%M:%SZ")

all_comments_df = get_all_top_level_comments(id, title, publish_date, 28)

In [18]:
all_comments_df.head(50)

Unnamed: 0,video_id,video_title,video_publish_date,video_category_id,comment_text,comment_id,comment_publish_date,comment_published_at
0,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,Download HubSpot&#39;s State of AI in Marketin...,Ugzx0oSDt_Pfbz8E5sp4AaABAg,2023-02-02T21:03:19Z,
1,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,pov: your IT teacher told you to watch this,UgybrlkjPEbM1g8GGwt4AaABAg,2020-11-08T12:58:12Z,
2,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,"&#39;&#39;its not machine vs us, it&#39;s Mach...",UgiVAjDUI4j6BngCoAEC,2017-07-11T06:10:18Z,
3,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,"In basic terms, AI is a broad area of computer...",UgxZptK7SryKySrdZFF4AaABAg,2020-11-01T17:54:42Z,
4,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,Thanks HubSpot! You helped me out a lot. Pleas...,UgznIyDtDMkdObHYwKl4AaABAg,2018-08-20T09:01:13Z,
5,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,This video sounds like its made by an intellig...,UgwEPtQ40RfTGG-joGR4AaABAg,2018-01-03T02:58:26Z,
6,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,how does a computer analyze the sound waves of...,UgzSyOf_jvG-SsxwBsB4AaABAg,2018-09-21T02:39:21Z,
7,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,Totally unrelated question - what software are...,Ugzpqymd8CKSeiiDn1F4AaABAg,2019-10-03T15:54:14Z,
8,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,Best Explanation... after searching for long i...,Ugz0jWbM-ALew_jzE0Z4AaABAg,2017-11-10T11:03:15Z,
9,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30 20:19:44,28,"Great video, what do you guys use for the anim...",Ugjc9ijhGnT62XgCoAEC,2017-02-22T05:35:54Z,


In [None]:
#all_comments_df.to_pickle("all_comments.pkl")
all_comments_df.to_csv("all_comments_2.csv")

In [40]:
datatypes = all_comments_df.dtypes
datatypes

all_comments_df.info()
all_comments_df.head(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   video_id              498 non-null    object
 1   video_title           498 non-null    object
 2   video_publish_date    498 non-null    object
 3   text                  498 non-null    object
 4   comment_published_at  498 non-null    object
 5   parent_comment_id     170 non-null    object
dtypes: object(6)
memory usage: 23.5+ KB


Unnamed: 0,video_id,video_title,video_publish_date,text,comment_published_at,parent_comment_id
0,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Download HubSpot&#39;s State of AI in Marketin...,2023-02-02T21:03:19Z,
1,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,pov: your IT teacher told you to watch this,2020-11-08T12:58:12Z,
2,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Wel it was my &#39;Data mining and machine lea...,2020-11-10T08:23:07Z,UgybrlkjPEbM1g8GGwt4AaABAg
3,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,hol my name is pol and u r a pro gamer clearly...,2020-11-16T21:28:24Z,UgybrlkjPEbM1g8GGwt4AaABAg
4,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,Well turns out ur correct :(,2021-01-29T09:08:16Z,UgybrlkjPEbM1g8GGwt4AaABAg
5,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,hola andy,2021-02-02T12:01:12Z,UgybrlkjPEbM1g8GGwt4AaABAg
6,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,My mechatronics teacher told me too so your so...,2021-02-08T21:37:30Z,UgybrlkjPEbM1g8GGwt4AaABAg
7,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,"No, I’m only curious",2021-02-17T02:13:58Z,UgybrlkjPEbM1g8GGwt4AaABAg
8,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,My english teacher xD,2021-03-02T10:31:28Z,UgybrlkjPEbM1g8GGwt4AaABAg
9,mJeNghZXtMo,What is Artificial Intelligence,2017-01-30T20:19:44Z,@@tobix8687 relatable,2021-04-06T16:08:56Z,UgybrlkjPEbM1g8GGwt4AaABAg
