# Get YouTube Data

Utilizes Google's YouTube Data API to retrieve information about the top 50 YouTube videos, plus 25 related videos for each of the top videos.

### Import Python libraries

In [1]:
import re
import datetime as dt
import pandas as pd
import numpy as np
import math

from google.oauth2 import service_account
from googleapiclient.discovery import build

### Define helper functions

In [2]:
def my_recursive_print_json(j, level = -1):
    "From W205 sample code: Given a json object print it"
    
    level += 1
    
    spaces = "    "
    
    if type(j) is dict:
        dict_2_list = list(j.keys())
        for k in dict_2_list:
            print(spaces * level + k)
            my_recursive_print_json(j[k], level)
            
    elif type(j) is list:
        for (i, l) in enumerate(j):
            print(spaces * level + "[" + str(i) + "]")
            my_recursive_print_json(l, level)
                  
    else:
        print(spaces * level + "value:", str(j))
        
        
def get_top_yt_videos(apiClient):
    """Make repeated calls to the YouTube Data API to retrieve 
    the top 50 videos in the given region"""
    
    video_list = []
    pageToken = ""
    
    while True:
        response = apiClient.videos().list(
            part='contentDetails,snippet,statistics',
            chart='mostPopular',
            pageToken=pageToken
        ).execute()
        
        video_list = np.append(video_list, response['items'])
        
        if len(video_list) >= 50:
            video_list = video_list[:50]
            break
            
        if 'nextPageToken' in response:
            pageToken = response['nextPageToken']
        else:
            break
            
    return video_list


def get_yt_videos_by_id(apiClient, video_id_list):
    """Given a list of video IDs call the YouTube Data API to
    get details about the videos"""

    video_list = []
    
    num_videos = len(video_id_list)
    chunks = math.ceil(num_videos / 50)
    
    start_idx = 0
    for chunk_num in range(chunks):
        end_idx = min(num_videos, (chunk_num + 1) * 50)
        video_id_str = ','.join(video_id_list[start_idx:end_idx])

        response = apiClient.videos().list(
            part='contentDetails,snippet,statistics',
            id=video_id_str
        ).execute()
        
        video_list = np.append(video_list, response['items'])
        
        start_idx = end_idx
    
    return video_list

    
def get_related_yt_video_ids(apiClient, videoId):
    """Given the ID of a video call the YouTube Data API to 
    get the IDs of 25 other videos related to it"""

    response = apiClient.search().list(
                 part='snippet',
                 relatedToVideoId=videoId,
                 type='video',
                 maxResults=25
               ).execute()

    video_id_list = []
    for result in response['items']:
        video_id_list.append(result['id']['videoId'])

    return video_id_list


def get_yt_channels(apiClient, channel_id_list):
    """Give a list of channel IDs, call the YouTube Data API
    to get statistics about the channels"""
    
    channel_list = []
    
    num_channels = len(channel_id_list)
    chunks = math.ceil(num_channels / 50)
    
    start_idx = 0
    for chunk_num in range(chunks):
        end_idx = min(num_channels, (chunk_num + 1) * 50)
        channel_id_str = ','.join(channel_id_list[start_idx:end_idx])

        response = apiClient.channels().list(
            id=channel_id_str,
            part='snippet,statistics',
            maxResults=50
        ).execute()
        
        channel_list = np.append(channel_list, response['items'])
        
        start_idx = end_idx
        
    return channel_list


def parse_yt_video_data(videos):
    "Convert a list of YouTube video JSON elements into a dataframe"
    
    # regular expressions to parse the duration string
    hms_regex = re.compile('PT(\d+)H(\d+)M(\d+)S')
    hm_regex = re.compile('PT(\d+)H(\d+)M')
    hs_regex = re.compile('PT(\d+)H(\d+)S')
    ms_regex = re.compile('PT(\d+)M(\d+)S')
    m_regex = re.compile('PT(\d+)M')
    s_regex = re.compile('PT(\d+)S')

    now = dt.datetime.now(dt.timezone.utc)

    channel_id_list = []
    video_list = []

    for video in videos:

        attributes = {}

        attributes['video_id'] = video['id']
        
        snippet = video['snippet']
        attributes['title'] = snippet['title']
        attributes['description'] = snippet['description']

        attributes['video_published_at'] = dt.datetime.fromisoformat(snippet['publishedAt'][:-1] + '+00:00')
        attributes['hours_published_video'] = (now - attributes['video_published_at']).total_seconds() / 60 / 60

        if 'tags' in snippet:
            attributes['num_tags'] = len(snippet['tags'])
        else:
            attributes['num_tags'] = 0

        attributes['channel'] = snippet['channelTitle']
        attributes['channel_id'] = snippet['channelId']
        channel_id_list.append(attributes['channel_id'])

        thumbnail = snippet['thumbnails']['default']
        attributes['thumbnail_url'] = thumbnail['url']
        attributes['thumbnail_width'] = thumbnail['width']
        attributes['thumbnail_height'] = thumbnail['height']

        contentDetails = video['contentDetails']

        hours = 0
        minutes = 0
        seconds = 0
        duration = contentDetails['duration']
        regex_match = hms_regex.match(duration)
        if regex_match is not None:
            (hours, minutes, seconds) = regex_match.group(1, 2, 3)
        else:
            regex_match = hm_regex.match(duration)
            if regex_match is not None:
                (hours, minutes) = regex_match.group(1, 2)
            else:
                regex_match = hs_regex.match(duration)
                if regex_match is not None:
                    (hours, seconds) = regex_match.group(1, 2)
                else:
                    regex_match = ms_regex.match(duration)
                    if regex_match is not None:
                        (minutes, seconds) = regex_match.group(1, 2)
                    else:
                        regex_match = m_regex.match(duration)
                        if regex_match is not None:
                            minutes = regex_match.group(1)
                        else:
                            regex_match = s_regex.match(duration)
                            if regex_match is not None:
                                seconds = regex_match.group(1)

        attributes['duration'] = (int(hours)*60 + int(minutes))* 60 + int(seconds)
        attributes['definition'] = contentDetails['definition']
        attributes['caption'] = contentDetails['caption']

        stats = video['statistics']
        attributes['views'] = None
        attributes['likes'] = None
        attributes['favorites'] = None
        attributes['comments'] = None
        
        if 'viewCount' in stats:
            attributes['views'] = stats['viewCount']
            
        if 'likeCount' in stats:
            attributes['likes'] = stats['likeCount']
            
        if 'favoriteCount' in stats:
            attributes['favorites'] = stats['favoriteCount']

        if 'commentCount' in stats:
            attributes['comments'] = stats['commentCount']

        video_list.append(attributes)

    df = pd.DataFrame.from_records(video_list)
    unique_channel_id_list = list(set(channel_id_list))
    
    return (df, unique_channel_id_list)


def parse_yt_channel_data(channels):
    "Convert a list of YouTube channel JSON elements into a dataframe"
    
    date_regex = re.compile("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")
    date_with_microsec_regex = re.compile("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z")
    
    now = dt.datetime.now(dt.timezone.utc)

    channel_list = []
    
    for channel in channels:
       
        attributes = {}
        
        attributes['channel_id'] = channel['id']
        
        snippet = channel['snippet']
        attributes['channel_description'] = snippet['description']
        
        published_date_str = snippet['publishedAt']
        if date_with_microsec_regex.match(published_date_str) is not None:
            microsec_idx = published_date_str.find(".") + 1
            published_date_str = published_date_str[:microsec_idx]
            
        attributes['channel_published_at'] = dt.datetime.fromisoformat(published_date_str[:-1] + '+00:00')
        attributes['days_published_channel'] = (now - attributes['channel_published_at']).total_seconds() / 60 / 60 / 24
        
        stats = channel['statistics']
        attributes['channel_views'] = stats['viewCount']
        attributes['channel_videos'] = stats['videoCount']

        if stats['hiddenSubscriberCount'] == False:
            attributes['channel_subscribers'] = stats['subscriberCount']
        else:
            attributes['channel_subscribers'] = None

        channel_list.append(attributes)
    
    return pd.DataFrame.from_records(channel_list)


### Initialize the API client

In [3]:
# set API credentials
credentials = service_account.Credentials.from_service_account_file(
    'credentials.json',
    scopes=['https://www.googleapis.com/auth/youtube.readonly'])

# create a YouTube API client
youtube = build('youtube', 'v3', credentials=credentials)


### Get data about the top 50 videos from YouTube's "Most Popular" chart

In [4]:
# call the API to get top videos
video_list = get_top_yt_videos(apiClient=youtube)

# Output 1 video JSON element
my_recursive_print_json(video_list[0])


kind
    value: youtube#video
etag
    value: Z9Igjrnk1Hi7tLtpDrbiAnsq0rE
id
    value: vS3_72Gb-bI
snippet
    publishedAt
        value: 2023-04-03T15:39:24Z
    channelId
        value: UCiifkYAs_bq1pt_zbNAzYGg
    title
        value: Blue Beetle – Official Trailer
    description
        value: He’s A Superhero, Whether He Likes It Or Not #BlueBeetle - Only in Theaters August 18

From Warner Bros. Pictures comes the feature film “Blue Beetle,” marking the DC Super Hero’s first time on the big screen. The film, directed by Angel Manuel Soto, stars Xolo Maridueña in the title role as well as his alter ego, Jaime Reyes.
Recent college grad Jaime Reyes returns home full of aspirations for his future, only to find that home is not quite as he left it. As he searches to find his purpose in the world, fate intervenes when Jaime unexpectedly finds himself in possession of an ancient relic of alien biotechnology: the Scarab. When the Scarab suddenly chooses Jaime to be its symbiotic host, 

### Get data about 25 videos related to each of the top videos

In [5]:
# call the API to get videos related to each of the top videos
related_video_id_list = []
top_video_id_list = [video['id'] for video in video_list]

for top_video_id in top_video_id_list:
    related_videos = get_related_yt_video_ids(apiClient=youtube, videoId=top_video_id)
    related_video_id_list = np.append(related_video_id_list, related_videos)

related_video_list = get_yt_videos_by_id(apiClient=youtube, video_id_list=related_video_id_list)

video_list = np.append(video_list, related_video_list)

# Output 1 video JSON element from related_video_list
my_recursive_print_json(related_video_list[0])


kind
    value: youtube#video
etag
    value: C9COT9jCR05gm4wouCf7EkLrpdk
id
    value: mO0OuR26IZM
snippet
    publishedAt
        value: 2023-04-03T13:30:00Z
    channelId
        value: UCWOA1ZGywLbqmigxE4Qlvuw
    title
        value: EXTRACTION 2 | Official Teaser Trailer | Netflix
    description
        value: Chris Hemsworth returns as Tyler Rake in EXTRACTION 2 - only on Netflix, June 16.

After barely surviving the events of the first movie, Rake is back as the Australian black ops mercenary, tasked with another deadly mission: rescuing the battered family of a ruthless Georgian gangster from the prison where they are being held. Hemsworth reunites with director Sam Hargrave, with Joe and Anthony Russo's AGBO producing and Joe Russo writing. Golshifteh Farahani reprises her role from the first film, with Daniel Bernhardt and Tinatin Dalakishvili also co-starring.

This is a sequel to the first film that was based on the graphic novel 'Ciudad' by Ande Parks, from a story by An

### Parse the JSON video data into a Dataframe

In [6]:
# Parse the video JSON elements into a dataframe & list of unique channel IDs
(video_df, channel_id_list) = parse_yt_video_data(video_list)
video_df


Unnamed: 0,video_id,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,thumbnail_height,duration,definition,caption,views,likes,favorites,comments
0,vS3_72Gb-bI,Blue Beetle – Official Trailer,"He’s A Superhero, Whether He Likes It Or Not #...",2023-04-03 15:39:24+00:00,13.105019,23,DC,UCiifkYAs_bq1pt_zbNAzYGg,https://i.ytimg.com/vi/vS3_72Gb-bI/default.jpg,120,90,171,hd,true,3773283,172899,0,12763
1,Tp_YZNqNBhw,Marvel Studios’ Secret Invasion | Official Tra...,Who do you trust?\n\nMarvel Studios’ Secret In...,2023-04-03 01:02:26+00:00,27.721130,2,Marvel Entertainment,UCvC4D8onUfXzvjTOM-dBfEA,https://i.ytimg.com/vi/Tp_YZNqNBhw/default.jpg,120,90,121,hd,true,10700320,289198,0,12007
2,1WEAJ-DFkHE,"$1 vs $500,000 Plane Ticket!",Check out ALL of MrBeast’s awesome jobs or dis...,2023-04-01 20:00:04+00:00,56.760574,0,MrBeast,UCX6OQ3DkcsbYNE6H8uQQuVA,https://i.ytimg.com/vi/1WEAJ-DFkHE/default.jpg,120,90,740,hd,true,56645681,2986960,0,92407
3,l4w6808wJcU,DOES YOUR FLAG FAIL? Grey Grades The State Fl...,"- Thank you, Bonnie Bees, for making this vide...",2023-04-02 19:30:08+00:00,33.259463,2,CGP Grey,UC2C_jShtL725hvbm1arSV9w,https://i.ytimg.com/vi/l4w6808wJcU/default.jpg,120,90,1133,hd,true,1873899,149811,0,227
4,4ZX9T0kWb4Y,I rode the world's fastest train.,I thought maglev trains were a dead-end techno...,2023-04-03 15:00:05+00:00,13.760297,2,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,https://i.ytimg.com/vi/4ZX9T0kWb4Y/default.jpg,120,90,593,hd,true,1327950,99436,0,3717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,VngGJyDHe6A,Matt Riddle makes a surprise return to “Bro” a...,Already frustrated by all the painful surprise...,2023-04-04 03:25:00+00:00,1.345019,29,WWE,UCJ5v_MCY6GNUBTO8-D3XoAg,https://i.ytimg.com/vi/VngGJyDHe6A/default.jpg,120,90,190,hd,false,70152,3342,0,333
1268,EiCo2_n64fY,Cody Rhodes Full Entrance Live WrestleMania 39...,,2023-04-03 03:17:56+00:00,25.462797,0,MrBeatlematt,UCFpF2hfFXEXjJo3ABiZYWKg,https://i.ytimg.com/vi/EiCo2_n64fY/default.jpg,120,90,281,hd,false,11400,297,0,31
1269,NSh6dHxLEME,Omos vs Brock Lesnar Full Match - Wrestlemania 39,Brock Lesnar vs Omos\n\nSocial Media Profiles:...,2023-04-03 00:33:10+00:00,28.208908,0,Matt Kempke,UCfV8O1xhlB8O9YGz9ywmn8A,https://i.ytimg.com/vi/NSh6dHxLEME/default.jpg,120,90,274,hd,false,1682759,32306,0,1451
1270,_x9_oomdrF4,Kevin Owens & Sami Zayn vs. Street Profits: Ra...,Kevin Owens and Sami Zayn face The Street Prof...,2023-04-04 02:42:33+00:00,2.052519,31,WWE,UCJ5v_MCY6GNUBTO8-D3XoAg,https://i.ytimg.com/vi/_x9_oomdrF4/default.jpg,120,90,191,hd,false,40750,2252,0,109


### Get data about the channels that produced the videos

In [7]:
# Call the API to get data on the channels
channel_list = get_yt_channels(apiClient=youtube, channel_id_list=channel_id_list)

# Output 1 channel JSON element
my_recursive_print_json(channel_list[0])


kind
    value: youtube#channel
etag
    value: eZ60RbrgTY2fxilhJw7FDzTi1Vg
id
    value: UCvRhblKpR8SH7VHho0KnT0g
snippet
    title
        value: More Mia Maples
    description
        value: Come here for extra Mia Maples content that didn't make it to the main channel! 

    customUrl
        value: @moremiamaples
    publishedAt
        value: 2019-09-06T21:34:13Z
    thumbnails
        default
            url
                value: https://yt3.ggpht.com/TLMTVs-T5JeH0EBVotSctOq6feP7OKD98zxV6HVm50eEB3fMtgvdyYgrIKv1J5aTF3tmRbFIMQ=s88-c-k-c0x00ffffff-no-rj
            width
                value: 88
            height
                value: 88
        medium
            url
                value: https://yt3.ggpht.com/TLMTVs-T5JeH0EBVotSctOq6feP7OKD98zxV6HVm50eEB3fMtgvdyYgrIKv1J5aTF3tmRbFIMQ=s240-c-k-c0x00ffffff-no-rj
            width
                value: 240
            height
                value: 240
        high
            url
                value: https://yt3.ggpht.com/TL

### Parse the JSON channel data into a Dataframe

In [8]:
# Parse the channel JSON elements into a dataframe
channel_df = parse_yt_channel_data(channel_list)
channel_df


Unnamed: 0,channel_id,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
0,UCvRhblKpR8SH7VHho0KnT0g,Come here for extra Mia Maples content that di...,2019-09-06 21:34:13+00:00,1305.299659,10118753,45,336000
1,UCnmGIkw-KdI0W5siakKPKog,just a guy\nInstagram: @ryantrahan\nBusiness i...,2013-10-27 02:06:20+00:00,3446.110689,1846695627,273,12200000
2,UCcE0jj8MTpl5WFE2qMhjZEQ,"Welcome To Story Recap, This Channel Explains ...",2022-08-30 23:09:21+00:00,216.233594,12708227,93,34700
3,UCeeSIt_BnLMuJeGDryZGDWg,I'm a degreed meteorologist and climatologist ...,2021-05-14 14:31:19+00:00,689.593340,1538864,10,106000
4,UCTkXRDQl0luXxVQrRQvWS6w,"Minecraft stuff, yes, my IGN is Dream",2014-02-08 18:20:05+00:00,3341.434474,2905886930,116,31500000
...,...,...,...,...,...,...,...
586,UCa0aTySoJBHXpHigv5tsxbQ,끼룩이와 함께 떠나는 생생한 음식 여행 :)\nYummy Journey with a...,2020-08-13 17:18:34+00:00,963.477194,339107330,597,468000
587,UCo_IB5145EVNcf8hw1Kku7w,Hello Internet! Welcome to GAME THEORY! If you...,2009-08-22 18:01:46+00:00,4972.447194,3615951960,612,17300000
588,UC4U_Y_6rTnHUhMuYTGK2wdg,,2022-08-19 13:28:15+00:00,227.637136,19640,33,140
589,UCPEBKNGRCqljva73ITncJxw,Just an accountant who loves Pokemon! Pokemon ...,2021-07-25 03:14:02+00:00,618.063675,7506756,39,51600


### Join the video data with the channel data for the final Dataframe

In [13]:
# Join the two dataframes into 1
final_df = pd.merge(
                     video_df,
                     channel_df,
                     how="inner",
                     on="channel_id",
                     copy=True,
                  )
final_df

today = dt.date.today()
filename = 'youtube_video_data_' + today.strftime('%Y%m%d') + '.csv'

print("Writing data to file: " + filename)
final_df.to_csv(filename)

Writing data to file: youtube_video_data_20230403.csv


### Describe the final Dataframe

In [10]:
final_df.describe(include='all')


  final_df.describe(include='all')
  final_df.describe(include='all')


Unnamed: 0,video_id,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,...,views,likes,favorites,comments,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
count,1272,1272,1272.0,1272,1272.0,1272.0,1272,1272,1272,1272.0,...,1272.0,1266.0,1272.0,1270.0,1272.0,1272,1272.0,1272.0,1272.0,1272.0
unique,1147,1146,1118.0,1147,,,591,591,1147,,...,1148.0,1161.0,1.0,982.0,545.0,591,,591.0,451.0,539.0
top,4ZX9T0kWb4Y,"$1 vs $500,000 Plane Ticket!",,2023-04-03 15:00:05+00:00,,,MrBeast,UCX6OQ3DkcsbYNE6H8uQQuVA,https://i.ytimg.com/vi/4ZX9T0kWb4Y/default.jpg,,...,56645681.0,974.0,0.0,0.0,,2012-02-20 00:43:50+00:00,,24193410562.0,738.0,140000000.0
freq,7,7,24.0,7,,,43,43,7,,...,7.0,5.0,1272.0,28.0,83.0,43,,43.0,43.0,43.0
first,,,,2009-10-03 04:52:20+00:00,,,,,,,...,,,,,,2005-06-16 12:09:27+00:00,,,,
last,,,,2023-04-04 04:38:22+00:00,,,,,,,...,,,,,,2023-03-18 14:41:23+00:00,,,,
mean,,,,,6849.166799,16.398585,,,,120.0,...,,,,,,,3431.62433,,,
std,,,,,13471.833189,14.024226,,,,0.0,...,,,,,,,1805.505956,,,
min,,,,,0.122241,0.0,,,,120.0,...,,,,,,,16.586349,,,
25%,,,,,55.452588,2.0,,,,120.0,...,,,,,,,1997.906028,,,
