# Get YouTube Data

Uses Google's YouTube Data API to retrieve information about the 50 videos at the top of YouTube's "Most Popular" chart, plus up to 25 related videos for each of those top videos.

### Import Python libraries

In [1]:
import re
import datetime as dt
import pandas as pd
import numpy as np
import math

from google.oauth2 import service_account
from googleapiclient.discovery import build

### Define helper functions

In [7]:
def my_recursive_print_json(j, level=-1):
    """From W205 sample code: print a parsed JSON value as an indented tree.

    Dict keys and list positions (rendered as ``[i]``) are printed one per
    line. Every leaf is printed as ``value: <leaf>`` one level deeper than
    the key or index that holds it. Each level is indented by four spaces.

    Args:
        j: The parsed JSON value to print (dict, list, or any scalar).
        level: Nesting depth of the caller. The default of -1 makes the
            outermost value print at depth 0, i.e. without indentation.
    """
    level += 1
    indent = "    " * level

    # isinstance (not an exact type comparison) so dict/list subclasses such
    # as OrderedDict are walked instead of being printed as a single leaf.
    if isinstance(j, dict):
        for key, value in j.items():
            # str() so a non-string key cannot raise on concatenation
            print(indent + str(key))
            my_recursive_print_json(value, level)

    elif isinstance(j, list):
        for i, item in enumerate(j):
            print(indent + "[" + str(i) + "]")
            my_recursive_print_json(item, level)

    else:
        print(indent + "value:", str(j))
        
        
def get_top_yt_videos(apiClient, region_code=None, max_videos=50):
    """Retrieve the videos at the top of YouTube's "Most Popular" chart.

    Pages through ``videos().list(chart='mostPopular')`` until
    ``max_videos`` videos have been collected or the API reports that
    there is no further page.

    Args:
        apiClient: YouTube Data API client object (exposes ``videos()``).
        region_code: Optional region code forwarded to the API as
            ``regionCode`` to select that region's chart. When left as
            None the request carries no ``regionCode`` at all.
        max_videos: Maximum number of videos to return (default 50).

    Returns:
        A 1-D NumPy array holding the video resources (the dicts from each
        response's ``items``), at most ``max_videos`` long.
    """
    # Parameters shared by every page request.
    request_params = {
        'part': 'contentDetails,snippet,statistics',
        'chart': 'mostPopular',
        # Request the largest useful page (the API accepts 1-50) so the
        # default case needs a single call instead of many small pages.
        'maxResults': max(1, min(50, max_videos)),
    }
    if region_code is not None:
        request_params['regionCode'] = region_code

    collected = []
    page_token = None

    while True:
        # Only send pageToken once the API has actually handed one out.
        if page_token:
            request_params['pageToken'] = page_token

        response = apiClient.videos().list(**request_params).execute()
        collected.extend(response['items'])

        if len(collected) >= max_videos:
            break

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    # Convert once at the end; appending to an array inside the loop would
    # copy the whole array on every page.
    return np.append([], collected[:max(0, max_videos)])


def get_yt_videos_by_id(apiClient, video_id_list):
    """Look up the details of the videos named in ``video_id_list``.

    The IDs are sent to the API's ``videos().list`` method in consecutive
    batches of at most 50, joined with commas, and the ``items`` of every
    response are accumulated in request order.

    Args:
        apiClient: YouTube Data API client object (exposes ``videos()``).
        video_id_list: Sliceable sequence of video ID strings.

    Returns:
        The accumulated video resources: a NumPy array once at least one
        batch has been requested, or an empty list when ``video_id_list``
        is empty.
    """
    batch_size = 50
    fetched = []

    # Step through the IDs one batch at a time; slicing past the end of the
    # sequence simply yields the shorter final batch.
    for batch_start in range(0, len(video_id_list), batch_size):
        batch_ids = video_id_list[batch_start:batch_start + batch_size]

        request = apiClient.videos().list(
            part='contentDetails,snippet,statistics',
            id=','.join(batch_ids)
        )
        fetched = np.append(fetched, request.execute()['items'])

    return fetched

    
def get_related_yt_video_ids(apiClient, videoId):
    """Fetch the IDs of videos the API lists as related to ``videoId``.

    Issues one ``search().list`` request restricted to ``type='video'``,
    with ``relatedToVideoId`` set to the given ID and ``maxResults`` of 25.

    Args:
        apiClient: YouTube Data API client object (exposes ``search()``).
        videoId: ID of the video whose related videos are wanted.

    Returns:
        A list with the ``id.videoId`` of every item in the response, in
        response order.
    """
    # NOTE(review): Google's API revision history reports relatedToVideoId as
    # retired in August 2023 - confirm this request is still accepted.
    search_request = apiClient.search().list(
        part='snippet',
        relatedToVideoId=videoId,
        type='video',
        maxResults=25
    )
    found_items = search_request.execute()['items']

    return [item['id']['videoId'] for item in found_items]


def get_yt_channels(apiClient, channel_id_list):
    """Fetch snippet and statistics data for the given channel IDs.

    The IDs are split into consecutive groups of at most 50; each group is
    sent, comma-joined, to the API's ``channels().list`` method and the
    ``items`` of every response are accumulated in request order.

    Args:
        apiClient: YouTube Data API client object (exposes ``channels()``).
        channel_id_list: Sliceable sequence of channel ID strings.

    Returns:
        The accumulated channel resources: a NumPy array once at least one
        group has been requested, or an empty list when
        ``channel_id_list`` is empty.
    """
    ids_per_request = 50

    # Cut the ID sequence into request-sized groups up front.
    id_groups = [
        channel_id_list[offset:offset + ids_per_request]
        for offset in range(0, len(channel_id_list), ids_per_request)
    ]

    channel_list = []
    for id_group in id_groups:
        response = apiClient.channels().list(
            id=','.join(id_group),
            part='snippet,statistics',
            maxResults=50
        ).execute()

        channel_list = np.append(channel_list, response['items'])

    return channel_list


# ISO 8601 duration as used in contentDetails.duration, e.g. "PT7M1S",
# "PT1H", "PT1H2M3S" or "P1DT2H3M4S". Every component is optional, so a
# single pattern covers all combinations (including hours-only and
# day-long values).
_YT_DURATION_REGEX = re.compile(
    r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)


def _yt_duration_to_seconds(duration):
    """Convert a YouTube ISO 8601 duration string to whole seconds.

    Components that are absent count as zero. A string that does not start
    with a recognisable duration yields 0.
    """
    match = _YT_DURATION_REGEX.match(duration)
    if match is None:
        return 0

    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return ((days * 24 + hours) * 60 + minutes) * 60 + seconds


def parse_yt_video_data(videos):
    """Convert a list of YouTube video JSON elements into a dataframe.

    Args:
        videos: Iterable of video resources (dicts) that each carry the
            ``id``, ``snippet``, ``contentDetails`` and ``statistics``
            parts.

    Returns:
        A tuple ``(df, unique_channel_id_list)``:

        * ``df`` has one row per input video with the columns video_id,
          title, description, video_published_at,
          hours_published_video, num_tags, channel, channel_id,
          thumbnail_url, thumbnail_width, thumbnail_height, duration (in
          seconds), definition, caption, views, likes, favorites and
          comments. The four count columns hold the values from
          ``statistics`` unchanged, or None when the key is absent.
        * ``unique_channel_id_list`` lists every channel ID once, in the
          order first encountered.

    Raises:
        KeyError: If a video lacks one of the required fields.
    """
    # One reference time for all rows so the ages are mutually consistent.
    now = dt.datetime.now(dt.timezone.utc)

    channel_id_list = []
    video_list = []

    for video in videos:
        snippet = video['snippet']
        thumbnail = snippet['thumbnails']['default']
        contentDetails = video['contentDetails']
        stats = video['statistics']

        # publishedAt ends in "Z"; swap it for an explicit UTC offset so the
        # result is a timezone-aware datetime that can be subtracted from now.
        published_at = dt.datetime.fromisoformat(snippet['publishedAt'][:-1] + '+00:00')

        # Key order here defines the column order of the dataframe.
        attributes = {
            'video_id': video['id'],
            'title': snippet['title'],
            'description': snippet['description'],
            'video_published_at': published_at,
            'hours_published_video': (now - published_at).total_seconds() / 60 / 60,
            # the tags list is omitted entirely for untagged videos
            'num_tags': len(snippet.get('tags', [])),
            'channel': snippet['channelTitle'],
            'channel_id': snippet['channelId'],
            'thumbnail_url': thumbnail['url'],
            'thumbnail_width': thumbnail['width'],
            'thumbnail_height': thumbnail['height'],
            'duration': _yt_duration_to_seconds(contentDetails['duration']),
            'definition': contentDetails['definition'],
            'caption': contentDetails['caption'],
            # individual statistics may be missing; record None for those
            'views': stats.get('viewCount'),
            'likes': stats.get('likeCount'),
            'favorites': stats.get('favoriteCount'),
            'comments': stats.get('commentCount'),
        }

        channel_id_list.append(attributes['channel_id'])
        video_list.append(attributes)

    df = pd.DataFrame.from_records(video_list)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible from run to run (a set would not be).
    unique_channel_id_list = list(dict.fromkeys(channel_id_list))

    return (df, unique_channel_id_list)


def parse_yt_channel_data(channels):
    """Convert a list of YouTube channel JSON elements into a dataframe.

    Args:
        channels: Iterable of channel resources (dicts) that each carry the
            ``id``, ``snippet`` and ``statistics`` parts.

    Returns:
        A dataframe with one row per input channel and the columns
        channel_id, channel_description, channel_published_at,
        days_published_channel, channel_views, channel_videos and
        channel_subscribers. The view and video counts hold the values
        from ``statistics`` unchanged. channel_subscribers is None when
        the channel hides its subscriber count or does not report one.

    Raises:
        KeyError: If a channel lacks one of the required fields.
    """
    # Fractional seconds directly in front of the trailing "Z", e.g. the
    # ".678901" in "2020-01-02T03:04:05.678901Z".
    fractional_seconds_regex = re.compile(r'\.\d+(?=Z$)')

    # One reference time for all rows so the ages are mutually consistent.
    now = dt.datetime.now(dt.timezone.utc)

    channel_list = []

    for channel in channels:
        snippet = channel['snippet']
        stats = channel['statistics']

        # Timestamps are truncated to whole seconds so parsing does not
        # depend on how many fractional digits are present; the trailing "Z"
        # is then swapped for an explicit UTC offset to get an aware datetime.
        published_date_str = fractional_seconds_regex.sub('', snippet['publishedAt'])
        published_at = dt.datetime.fromisoformat(published_date_str[:-1] + '+00:00')

        # A hidden subscriber count is recorded as None; .get() keeps a
        # response without these optional keys from aborting the whole parse.
        if stats.get('hiddenSubscriberCount'):
            subscribers = None
        else:
            subscribers = stats.get('subscriberCount')

        # Key order here defines the column order of the dataframe.
        channel_list.append({
            'channel_id': channel['id'],
            'channel_description': snippet['description'],
            'channel_published_at': published_at,
            'days_published_channel': (now - published_at).total_seconds() / 60 / 60 / 24,
            'channel_views': stats['viewCount'],
            'channel_videos': stats['videoCount'],
            'channel_subscribers': subscribers,
        })

    return pd.DataFrame.from_records(channel_list)


### Initialize the API client

In [3]:
# Load the service-account key from the local file 'credentials.json' and
# request only the read-only YouTube scope.
credentials = service_account.Credentials.from_service_account_file(
    'credentials.json',
    scopes=['https://www.googleapis.com/auth/youtube.readonly'])

# Build the client for version 'v3' of the 'youtube' service; the cells below
# pass this object to the helper functions as `apiClient`.
youtube = build('youtube', 'v3', credentials=credentials)


### Get data about the top 50 videos from YouTube's "Most Popular" chart

In [4]:
# Fetch the "Most Popular" chart videos through the API client created above
video_list = get_top_yt_videos(apiClient=youtube)

# Print the first returned video resource as an indented tree
my_recursive_print_json(video_list[0])


kind
    value: youtube#video
etag
    value: X-zycuDth6N1KofpkU69yP2AMmo
id
    value: x_t53a5Ons0
snippet
    publishedAt
        value: 2023-03-01T15:00:43Z
    channelId
        value: UCbulh9WdLtEXiooRcYK7SWw
    title
        value: Metallica: If Darkness Had a Son (Official Music Video)
    description
        value: Metallica's official music video for “If Darkness Had a Son,” from the album “72 Seasons” available April 14th, 2023.

Listen to “If Darkness Had a Son”: https://metallica.lnk.to/IfDarknessHadASon
Pre-order. Pre-save. Pre-add “72 Seasons”: https://metallica.lnk.to/72Seasons 
Subscribe for more videos: https://tallica.lnk.to/subscribe

Directed by Tim Saccenti
Filmed in Los Angeles, CA, on January 9, 2023

Video Premiere Date: March 1, 2023

LYRICS

Temptation
Temptation
Temptation
Temptation

The beast still shouts for what it’s yearning
He stokes the fire, desire burning
The never-ending quenchless craving
The unforgiving misbehaving

If darkness had a son, here I 

### Get data about 25 videos related to each of the top videos

In [5]:
# Collect the IDs of the videos related to each top video (one search request
# per top video), accumulated in the order of the top videos
related_video_id_list = []
top_video_id_list = [video['id'] for video in video_list]

for top_video_id in top_video_id_list:
    related_videos = get_related_yt_video_ids(apiClient=youtube, videoId=top_video_id)
    related_video_id_list = np.append(related_video_id_list, related_videos)

# Fetch the full video resources for all collected related-video IDs
related_video_list = get_yt_videos_by_id(apiClient=youtube, video_id_list=related_video_id_list)

# Append the related videos to the top videos. NOTE: this rebinds video_list,
# so re-running this cell on its own appends the related videos a second time.
video_list = np.append(video_list, related_video_list)

# Output 1 video JSON element from related_video_list
my_recursive_print_json(related_video_list[0])


kind
    value: youtube#video
etag
    value: Jol4ZMDDQl0uS4kG8m00sofESZU
id
    value: WM8bTdBs-cw
snippet
    publishedAt
        value: 2009-10-27T01:53:30Z
    channelId
        value: UCbulh9WdLtEXiooRcYK7SWw
    title
        value: Metallica: One (Official Music Video)
    description
        value: Metallica's official music video for “One,” from the album “...And Justice for All.” Subscribe for more videos: https://metallica.lnk.to/subscribe

Listen to Metallica: https://metallica.lnk.to/listen

Directed by Bill Pope and Michael Salomon
Filmed in December 1988 in Long Beach, CA

Video Premiere Date: January 22, 1989

Follow Metallica:
Website & Store: http://www.metallica.com
Official Live Recordings: http://www.livemetallica.com
Instagram: http://www.instagram.com/metallica
Facebook: http://www.facebook.com/metallica
Twitter: http://www.twitter.com/metallica

© 1989 Blackened Recordings

#Metallica #AndJusticeForAll
    thumbnails
        default
            url
             

### Parse the JSON video data into a Dataframe

In [8]:
# Flatten the video JSON elements (top + related) into a dataframe and collect
# the unique channel IDs needed for the channel lookup further down
(video_df, channel_id_list) = parse_yt_video_data(video_list)
# Bare expression as the last statement so the notebook renders the dataframe
video_df


Unnamed: 0,video_id,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,thumbnail_height,duration,definition,caption,views,likes,favorites,comments
0,x_t53a5Ons0,Metallica: If Darkness Had a Son (Official Mus...,Metallica's official music video for “If Darkn...,2023-03-01 15:00:43+00:00,14.864821,8,Metallica,UCbulh9WdLtEXiooRcYK7SWw,https://i.ytimg.com/vi/x_t53a5Ons0/default.jpg,120,90,421,hd,false,1190584,104266,0,11027
1,6GFwqT5bNNA,"I Spent 10,000,000 Robux",We love Roblox!\nSpecial thanks to all the dev...,2023-02-28 20:27:23+00:00,33.420376,2,LazarBeam,UCw1SQ6QRRtfAhrN_cjkrOgA,https://i.ytimg.com/vi/6GFwqT5bNNA/default.jpg,120,90,660,hd,false,1379976,86574,0,5468
2,zDzz8Tv0pN8,Color Fest Season is Here! Clash of Clans Offi...,Prepare for an overload of color - Color Fest ...,2023-03-01 08:00:24+00:00,21.870099,26,Clash of Clans,UCD1Em4q90ZUK2R5HKesszJg,https://i.ytimg.com/vi/zDzz8Tv0pN8/default.jpg,120,90,58,hd,false,2565491,68205,0,2905
3,0nLoRgi1f78,I Got Exclusive Access to NBA All-Star Weekend!,Check out exclusive content on the NBA app! ht...,2023-02-28 22:05:00+00:00,31.793432,16,Jesser,UCQIUhhcmXsu6cN6n3y9-Pww,https://i.ytimg.com/vi/0nLoRgi1f78/default.jpg,120,90,1296,hd,false,939121,33435,0,1424
4,412--23qbb4,Jake Paul Holds Back Tears On Show After Loss ...,Tune into the post-fight special as Jake Paul ...,2023-03-01 16:00:02+00:00,13.876210,10,BS w/ Jake Paul,UCbEn1kwHr0TPmC9VPWIA_Mw,https://i.ytimg.com/vi/412--23qbb4/default.jpg,120,90,1172,hd,false,399804,11697,0,2975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,QD570yDn98s,ELDEN RING | Top 10 Secrets and Discoveries!,#eldenring #darksouls #gaming \nOne of my favo...,2022-10-08 15:00:26+00:00,3470.869543,31,Ziostorm,UCMjd7Sh1Dwf7mywMEDH4OqA,https://i.ytimg.com/vi/QD570yDn98s/default.jpg,120,90,799,hd,false,120744,4352,0,280
1270,iDZzGooj3DA,The WEIRDEST Location in Elden Ring | LORE,#eldenring #darksouls #gaming \nElden Ring has...,2023-02-21 16:15:04+00:00,205.625654,33,Ziostorm,UCMjd7Sh1Dwf7mywMEDH4OqA,https://i.ytimg.com/vi/iDZzGooj3DA/default.jpg,120,90,795,hd,false,76941,3529,0,447
1271,hX7xcxpxHhU,Can I Beat Elden Ring by 1 Shotting the Bosses?,Enjoy!\n\n------------------------------------...,2022-05-16 14:00:18+00:00,6951.871765,1,Bushy,UCF5RrlbsxJjAVLWgOCoNHMg,https://i.ytimg.com/vi/hX7xcxpxHhU/default.jpg,120,90,2929,hd,false,5840506,124835,0,3366
1272,3MtHTKpH9IM,The ULTIMATE Elden Ring Location Tier List,"In this Elden Ring video, all of the locations...",2022-07-15 13:24:32+00:00,5512.467876,0,Dark Tark,UCRqTUdxEg-5cYK7MbCxU4Dw,https://i.ytimg.com/vi/3MtHTKpH9IM/default.jpg,120,90,1789,hd,false,403969,16203,0,646


### Get data about the channels that produced the videos

In [9]:
# Fetch the channel resources for the unique channel IDs gathered while
# parsing the videos
channel_list = get_yt_channels(apiClient=youtube, channel_id_list=channel_id_list)

# Print the first returned channel resource as an indented tree
my_recursive_print_json(channel_list[0])


kind
    value: youtube#channel
etag
    value: dzXDRMnHcZCK7d3l5-so9xzgDSc
id
    value: UCGl0sx3LNOUffvonae7HohA
snippet
    title
        value: Bandas Mix 2022
    description
        value: Bandas Mix 2021 es un canal especializado en la producción de productos de música latina, siempre trabajamos duro para crear buena música, queremos recibir contribuciones de la audiencia a los productos que creamos.
    customUrl
        value: @user-ro5ki9kh6h
    publishedAt
        value: 2012-10-29T00:37:13Z
    thumbnails
        default
            url
                value: https://yt3.ggpht.com/Lqwh7SQoSQygItGXMD_YtCpytqEgaxDMUz7afKpetNjUoLqQ5_k5EAicPhW28y687oFFlhZDyg=s88-c-k-c0x00ffffff-no-rj
            width
                value: 88
            height
                value: 88
        medium
            url
                value: https://yt3.ggpht.com/Lqwh7SQoSQygItGXMD_YtCpytqEgaxDMUz7afKpetNjUoLqQ5_k5EAicPhW28y687oFFlhZDyg=s240-c-k-c0x00ffffff-no-rj
            width
             

### Parse the JSON channel data into a Dataframe

In [10]:
# Flatten the channel JSON elements into a dataframe with one row per channel
channel_df = parse_yt_channel_data(channel_list)
# Bare expression as the last statement so the notebook renders the dataframe
channel_df


Unnamed: 0,channel_id,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
0,UCGl0sx3LNOUffvonae7HohA,Bandas Mix 2021 es un canal especializado en l...,2012-10-29 00:37:13+00:00,3776.219239,51789249,225,250000
1,UC0bCUnP5RrkJZUtd3bBz6Kw,"WPTV NewsChannel 5, the NBC affiliate in West ...",2010-05-07 20:23:30+00:00,4681.395431,301690991,112462,320000
2,UCVpankR4HtoAVtYnFDUieYA,Hi. Please subscribe to this channel and I wil...,2006-07-22 18:14:13+00:00,6066.485211,643321445,211,3890000
3,UCJquYOG5EL82sKTfH9aMA9Q,Everything Music,2006-05-23 21:13:57+00:00,6126.360396,550721325,1147,3370000
4,UCXZDcPu7YbWbxVTtY80xrww,lightly salted\n,2015-10-06 00:39:13+00:00,2704.217850,174423,8,217
...,...,...,...,...,...,...,...
629,UC47GDLUPXADaa69O2AqJQ9A,"Welcome to Hawkshaw, a channel for going in de...",2015-10-28 14:32:24+00:00,2681.639250,8515004,28,78700
630,UC176GAQozKKjhz62H8u9vQQ,An exploration of the world's life science sub...,2019-06-19 22:01:55+00:00,1351.327086,87107800,66,1160000
631,UC94lW_-Hr_uA7RcJ3D-WPOg,,2014-03-06 18:29:16+00:00,3282.474759,1643558613,439,7000000
632,UCg40OxZ1GYh3u3jBntB6DLg,,2017-06-28 19:03:36+00:00,2072.450917,1617372771,49111,1540000


### Join the video data with the channel data for the final Dataframe

In [11]:
# Attach each video's channel columns by joining on channel_id. The inner join
# keeps only videos whose channel_id also appears in channel_df.
# (The former `copy=True` argument only restated the default and the keyword
# is deprecated in recent pandas, so it is no longer passed.)
final_df = pd.merge(
    video_df,
    channel_df,
    how="inner",
    on="channel_id",
)

# Persist the combined data set. With the default settings the dataframe index
# is written as the first, unnamed column of the CSV file.
final_df.to_csv('youtube_video_data.csv')

### Describe the final Dataframe

In [12]:
# Summarise every column of the joined dataframe; include='all' covers the
# non-numeric columns as well as the numeric ones
final_df.describe(include='all')


  final_df.describe(include='all')
  final_df.describe(include='all')


Unnamed: 0,video_id,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,...,views,likes,favorites,comments,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
count,1274,1274,1274.0,1274,1274.0,1274.0,1274,1274,1274,1274.0,...,1274.0,1271.0,1274.0,1266.0,1274.0,1274,1274.0,1274.0,1274.0,1274.0
unique,1189,1188,1145.0,1184,,,634,634,1189,,...,1192.0,1184.0,1.0,1009.0,556.0,634,,634.0,470.0,551.0
top,GPmtG9TPNpg,FBI Won’t Share Proof of Covid Lab Leak | MTG ...,,2023-03-02 01:31:35+00:00,,,UFC - Ultimate Fighting Championship,UCvgfXK4nTYKudb0rFR6noLA,https://i.ytimg.com/vi/GPmtG9TPNpg/default.jpg,,...,444855.0,4785.0,0.0,0.0,,2006-03-04 14:15:07+00:00,,6145964625.0,13495.0,19000000.0
freq,5,5,26.0,5,,,29,29,5,,...,5.0,4.0,1274.0,21.0,121.0,29,,29.0,29.0,30.0
first,,,,2009-10-27 01:53:30+00:00,,,,,,,...,,,,,,2005-09-16 19:34:05+00:00,,,,
last,,,,2023-03-02 05:00:06+00:00,,,,,,,...,,,,,,2023-02-21 03:00:34+00:00,,,,
mean,,,,,6068.00925,16.653846,,,,120.0,...,,,,,,,3132.719073,,,
std,,,,,12981.101093,13.54398,,,,0.0,...,,,,,,,1806.968106,,,
min,,,,,0.875099,0.0,,,,120.0,...,,,,,,,9.11969,,,
25%,,,,,73.789265,3.0,,,,120.0,...,,,,,,,1644.44903,,,
