# YouTube Comment Collection

For this project, I want to collect all the comments from all of a user's videos for analysis.  The comments will be collected using the YouTube Data API v3 and then stored in a Google BigQuery data warehouse.  Note, though, that the API enforces a daily quota (10,000 units per day by default), which limits the amount of data that can be collected each day.  A possible future improvement for this project would be to record time/date markers for things like likes on comments, to see how they change over time.

In [19]:
# The first function gets a list of all the videos uploaded by a channel given the channelId

def get_all_videos_from_channel(channelId, API_KEY, youtube=None):
    """Return a DataFrame listing every video in a channel's uploads playlist.

    The channel's "uploads" playlist ID is read from ``channels().list`` and
    the playlist is then read page by page through ``playlistItems().list``
    until the response no longer carries a ``nextPageToken``.

    Args:
        channelId: ID of the YouTube channel whose uploads are listed.
        API_KEY: YouTube Data API v3 developer key. Only used to build a
            client when ``youtube`` is not supplied.
        youtube: Optional, already-built API client exposing ``channels()``
            and ``playlistItems()``. Lets callers reuse one client across
            calls; when omitted a client is built from ``API_KEY``.

    Returns:
        pandas.DataFrame with one row per playlist item, in the order the
        API returned them, and the columns ``videoId``, ``title``,
        ``channelId``, ``channelTitle`` and ``uploadTime`` (the item's
        ``publishedAt`` value).

    Raises:
        KeyError: If the API response contains no channel for ``channelId``.
    """
    import pandas as pd

    columns = ['videoId', 'title', 'channelId', 'channelTitle', 'uploadTime']

    if youtube is None:
        # Imported lazily so a caller-supplied client does not need the
        # library to be importable here.
        import googleapiclient.discovery
        youtube = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=API_KEY)

    # Look up the playlist that holds all of the channel's uploads.
    channel_response = youtube.channels().list(
        part="contentDetails", id=channelId).execute()
    channels = channel_response.get('items')
    if not channels:
        # Fail with an explicit message instead of an opaque lookup error.
        raise KeyError(
            'No channel found for channelId {!r}'.format(channelId))
    uploads_playlist = (
        channels[0]['contentDetails']['relatedPlaylists']['uploads'])

    # Request the largest page the endpoint allows so that far fewer calls
    # (and therefore less of the daily quota) are needed per channel.
    params = {
        'part': 'snippet',
        'playlistId': uploads_playlist,
        'maxResults': 50,
    }

    rows = []
    while True:
        response = youtube.playlistItems().list(**params).execute()

        for item in response.get('items', []):
            snippet = item['snippet']
            rows.append((
                snippet['resourceId']['videoId'],
                snippet['title'],
                snippet['channelId'],
                snippet['channelTitle'],
                snippet['publishedAt'],
            ))

        # The last page has no nextPageToken; stop there.
        next_token = response.get('nextPageToken')
        if not next_token:
            break
        params['pageToken'] = next_token

    return pd.DataFrame(rows, columns=columns)

# The second function collects all the comments given a dataframe df in which one of
# the columns is explicitly named *videoId* (which should line up with the output of the previous function).
# The function loops through every video in the list and pulls the comments from each video.

def get_parent_comments(df, API_KEY, youtube=None):
    """Collect the top-level comments of every video listed in ``df``.

    For each value in ``df['videoId']`` the comment threads are read page by
    page through ``commentThreads().list`` (100 threads per page, ordered by
    time, plain text) until the response carries no ``nextPageToken``.

    A video whose request fails with an HTTP error (e.g. presumably when
    comments are disabled on it) is reported and skipped, so one bad video
    no longer aborts the run and discards everything collected so far.
    Comments already read from that video's earlier pages are kept.
    NOTE(review): an exhausted daily quota would also surface as an HTTP
    error on every remaining video, so a long run of "Skipping" messages
    most likely means the quota ran out.

    Args:
        df: DataFrame with a column named ``videoId``.
        API_KEY: YouTube Data API v3 developer key. Only used to build a
            client when ``youtube`` is not supplied.
        youtube: Optional, already-built API client exposing
            ``commentThreads()``. Lets callers reuse one client across
            calls; when omitted a client is built from ``API_KEY``.

    Returns:
        pandas.DataFrame with one row per top-level comment and the columns
        ``comments``, ``commentsId``, ``repliesCount``, ``likesCount`` and
        ``videoId``.
    """
    import pandas as pd

    columns = ['comments', 'commentsId', 'repliesCount', 'likesCount',
               'videoId']

    if youtube is None:
        # Imported lazily so a caller-supplied client does not need the
        # library to be importable here.
        import googleapiclient.discovery
        youtube = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=API_KEY)

    try:
        from googleapiclient.errors import HttpError
        api_errors = (HttpError,)
    except ImportError:
        # Only reachable with a caller-supplied client and no Google client
        # library installed; an empty tuple makes the handler catch nothing.
        api_errors = ()

    # Total number of videos, used only for the progress messages.
    video_num = len(df)
    rows = []

    for i, video in enumerate(df['videoId'], start=1):
        print('Starting VideoID:', video, '- Number', i, 'of', video_num)

        # Fresh parameters per video so a page token never leaks from one
        # video's pagination into the next.
        params = {
            'part': 'snippet',
            'maxResults': 100,
            'order': 'time',
            'textFormat': 'plainText',
            'videoId': video,
            'prettyPrint': True,
        }

        try:
            while True:
                response = youtube.commentThreads().list(**params).execute()

                for item in response.get('items', []):
                    thread = item['snippet']
                    top_comment = thread['topLevelComment']
                    rows.append((
                        top_comment['snippet']['textDisplay'],
                        top_comment['id'],
                        thread['totalReplyCount'],
                        top_comment['snippet']['likeCount'],
                        video,
                    ))

                # The last page has no nextPageToken; stop there.
                next_token = response.get('nextPageToken')
                if not next_token:
                    break
                params['pageToken'] = next_token
        except api_errors as err:
            print('Skipping VideoID:', video, '-', err)

    return pd.DataFrame(rows, columns=columns)

In [20]:
# For Ludwig's channel as an example, pull all the videos
import os

channelId = 'UCrPseYLGpNygVi34QpGNqpA'  # Example for Ludwig's Channel

# Read the YouTube API key from the YOUTUBE_API_KEY environment variable so
# the real key never has to be saved inside the notebook. If the variable is
# not set, fall back to the placeholder, which must then be replaced by hand.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'input_your_API_key_here')

video_list = get_all_videos_from_channel(channelId, API_KEY)

In [56]:
# Cache the video list on disk so the API calls above need not be repeated.
video_list.to_csv(
    'videoInfo.csv',
    encoding='utf-8-sig',
    index=False,
)

In [42]:
# With every video of the channel listed, fetch the top-level comments of
# each one.
parent_comments_list = get_parent_comments(df=video_list, API_KEY=API_KEY)

Starting VideoID: IgrxQRlDVi0 - Number 1 of 716
Starting VideoID: lHgaT0i-TA0 - Number 2 of 716
Starting VideoID: DqmSovzS8HY - Number 3 of 716
Starting VideoID: hzgDK90GA1Q - Number 4 of 716
Starting VideoID: FiSRIuuIgyE - Number 5 of 716
Starting VideoID: Y-G8pTrOWus - Number 6 of 716
Starting VideoID: QblSVioteW8 - Number 7 of 716
Starting VideoID: ixO-_v2fB4Y - Number 8 of 716
Starting VideoID: cUJ3B7QcZ9U - Number 9 of 716
Starting VideoID: JSH8VLIqEDs - Number 10 of 716
Starting VideoID: d_x_khfUVTA - Number 11 of 716
Starting VideoID: uLZz8vH7t-U - Number 12 of 716
Starting VideoID: 8LnJNywSVew - Number 13 of 716
Starting VideoID: O8H95tpKzuc - Number 14 of 716
Starting VideoID: 0KGvcyy5z18 - Number 15 of 716
Starting VideoID: 3rGBisAobPE - Number 16 of 716
Starting VideoID: 4nD5OFuUM7I - Number 17 of 716
Starting VideoID: _mrKvqkd2DE - Number 18 of 716
Starting VideoID: qmRQspiECu4 - Number 19 of 716
Starting VideoID: -TA8UsrDl0A - Number 20 of 716
Starting VideoID: wL0c1Zp3iD0

In [43]:
# Persist the collected comments to disk.
parent_comments_list.to_csv(
    'parentComments.csv',
    encoding='utf-8-sig',
    index=False,
)