In [1]:
import re
import datetime as dt
import pandas as pd
import math

from google.oauth2 import service_account
from googleapiclient.discovery import build

In [2]:
def my_recursive_print_json(j, level = -1):
    "From W205 sample code: Given a json object print it"
    
    level += 1
    
    spaces = "    "
    
    if type(j) is dict:
        dict_2_list = list(j.keys())
        for k in dict_2_list:
            print(spaces * level + k)
            my_recursive_print_json(j[k], level)
            
    elif type(j) is list:
        for (i, l) in enumerate(j):
            print(spaces * level + "[" + str(i) + "]")
            my_recursive_print_json(l, level)
                  
    else:
        print(spaces * level + "value:", str(j))
        
        
def get_top_youtube_videos(apiClient, region = 'US'):
    """Make repeated calls to the YouTube Data API to retrieve the top count
    videos in the given region"""
    
    video_list = []
    pageToken = ""
    
    while True:
        response = apiClient.videos().list(
            part='contentDetails,recordingDetails,snippet,statistics',
            chart='mostPopular',
            regionCode=region,
            pageToken=pageToken
        ).execute()
        
        video_list.extend(response['items'])
        
        if 'nextPageToken' in response:
            pageToken = response['nextPageToken']
        else:
            break
            
    return video_list


def get_youtube_channels(apiClient, channel_id_list):
    
    channel_list = []
    
    num_channels = len(channel_id_list)
    chunks = math.ceil(num_channels / 50)
    
    start_idx = 0
    for chunk_num in range(chunks):
        end_idx = min(num_channels, (chunk_num + 1) * 50)
        channel_id_str = ','.join(channel_id_list[start_idx:end_idx])

        response = youtube.channels().list(
            id=channel_id_str,
            part='snippet,statistics',
            maxResults=50
        ).execute()
        
        channel_list.extend(response['items'])
        
        start_idx = end_idx
        
    return channel_list


def parse_youtube_video_data(videos):
    "Convert a list of YouTube video JSON elements into a dataframe"
    
    # regular expressions to parse the duration string
    hms_regex = re.compile('PT(\d+)H(\d+)M(\d+)S')
    hm_regex = re.compile('PT(\d+)H(\d+)M')
    hs_regex = re.compile('PT(\d+)H(\d+)S')
    ms_regex = re.compile('PT(\d+)M(\d+)S')
    m_regex = re.compile('PT(\d+)M')
    s_regex = re.compile('PT(\d+)S')

    now = dt.datetime.now(dt.timezone.utc)

    channel_id_list = []
    video_list = []

    for video in videos:

        attributes = {}

        snippet = video['snippet']
        attributes['title'] = snippet['title']
        attributes['description'] = snippet['description']

        attributes['video_published_at'] = dt.datetime.fromisoformat(snippet['publishedAt'][:-1] + '+00:00')
        attributes['hours_published_video'] = (now - attributes['video_published_at']).total_seconds() / 60 / 60

        if 'tags' in snippet:
            attributes['num_tags'] = len(snippet['tags'])
        else:
            attributes['num_tags'] = 0

        attributes['channel'] = snippet['channelTitle']
        attributes['channel_id'] = snippet['channelId']
        channel_id_list.append(attributes['channel_id'])

        thumbnail = snippet['thumbnails']['default']
        attributes['thumbnail_url'] = thumbnail['url']
        attributes['thumbnail_width'] = thumbnail['width']
        attributes['thumbnail_height'] = thumbnail['height']

        contentDetails = video['contentDetails']

        hours = 0
        minutes = 0
        seconds = 0
        duration = contentDetails['duration']
        regex_match = hms_regex.match(duration)
        if regex_match is not None:
            (hours, minutes, seconds) = regex_match.group(1, 2, 3)
        else:
            regex_match = hm_regex.match(duration)
            if regex_match is not None:
                (hours, minutes) = regex_match.group(1, 2)
            else:
                regex_match = hs_regex.match(duration)
                if regex_match is not None:
                    (hours, seconds) = regex_match.group(1, 2)
                else:
                    regex_match = ms_regex.match(duration)
                    if regex_match is not None:
                        (minutes, seconds) = regex_match.group(1, 2)
                    else:
                        regex_match = m_regex.match(duration)
                        if regex_match is not None:
                            minutes = regex_match.group(1)
                        else:
                            regex_match = s_regex.match(duration)
                            if regex_match is not None:
                                seconds = regex_match.group(1)

        attributes['duration'] = (int(hours)*60 + int(minutes))* 60 + int(seconds)
        attributes['definition'] = contentDetails['definition']
        attributes['caption'] = contentDetails['caption']

        stats = video['statistics']
        attributes['views'] = stats['viewCount']
        attributes['likes'] = stats['likeCount']
        attributes['favorites'] = stats['favoriteCount']
        if 'commentCount' in stats:
            attributes['comments'] = stats['commentCount']
        else:
            attributes['comments'] = None

        video_list.append(attributes)

    df = pd.DataFrame.from_records(video_list)
    unique_channel_id_list = list(set(channel_id_list))
    
    return (df, unique_channel_id_list)


def parse_youtube_channel_data(channels):
    "Convert a list of YouTube channel JSON elements into a dataframe"
    
    date_regex = re.compile("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")
    date_with_microsec_regex = re.compile("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z")
    
    now = dt.datetime.now(dt.timezone.utc)

    channel_list = []
    
    for channel in channels:
       
        attributes = {}
        
        attributes['channel_id'] = channel['id']
        
        snippet = channel['snippet']
        attributes['channel_description'] = snippet['description']
        
        published_date_str = snippet['publishedAt']
        if date_with_microsec_regex.match(published_date_str) is not None:
            microsec_idx = published_date_str.find(".") + 1
            published_date_str = published_date_str[:microsec_idx]
            
        attributes['channel_published_at'] = dt.datetime.fromisoformat(published_date_str[:-1] + '+00:00')
        attributes['days_published_channel'] = (now - attributes['channel_published_at']).total_seconds() / 60 / 60 / 24
        
        stats = channel['statistics']
        attributes['channel_views'] = stats['viewCount']
        attributes['channel_videos'] = stats['videoCount']

        if stats['hiddenSubscriberCount'] == False:
            attributes['channel_subscribers'] = stats['subscriberCount']
        else:
            attributes['channel_subscribers'] = None

        channel_list.append(attributes)
    
    return pd.DataFrame.from_records(channel_list)


In [3]:
# Set your API credentials
credentials = service_account.Credentials.from_service_account_file(
    'credentials.json',
    scopes=['https://www.googleapis.com/auth/youtube.readonly'])

# Create a YouTube API client
youtube = build('youtube', 'v3', credentials=credentials)

# Call the API to get top videos
video_list = get_top_youtube_videos(apiClient=youtube)

# Output 1 video JSON element
my_recursive_print_json(video_list[0])

kind
    value: youtube#video
etag
    value: PVGudiwP7bhqbhsfDMSEjcj-BZM
id
    value: GVT3WUa-48Y
snippet
    publishedAt
        value: 2023-02-26T17:00:11Z
    channelId
        value: UCsn6cjffsvyOZCZxvGoJxGg
    title
        value: ANIME ROCK, PAPER, SCISSORS
    description
        value: ANYONE can make a cartoon with this groundbreaking technique. Want to learn how? We made a ONE-HOUR, CLICK-BY-CLICK TUTORIAL on http://www.corridordigital.com/ 

With Your Support, We Can Make More! ►
This project exists because of the amazing members of CorridorDigital, our INDEPENDENT STREAMING PLATFORM. All memberships begin with a 14-Day Free Trial and you can Cancel Anytime. Consider becoming a member yourself! http://www.corridordigital.com/learn-more

Anime Rock, Paper, Scissors Shirt?! ►
Available only until March 6th, we have a limited-edition Anime Rock Paper Scissors t-shirt and longsleeve design to celebrate this release. Check it out here! http://corridordigital.store/

WE OWE IT 

In [4]:
# Parse the video JSON elements into a dataframe & list of unique channel IDs
(video_df, channel_id_list) = parse_youtube_video_data(video_list)
video_df


Unnamed: 0,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,thumbnail_height,duration,definition,caption,views,likes,favorites,comments
0,"ANIME ROCK, PAPER, SCISSORS",ANYONE can make a cartoon with this groundbrea...,2023-02-26 17:00:11+00:00,12.903881,9,Corridor,UCsn6cjffsvyOZCZxvGoJxGg,https://i.ytimg.com/vi/GVT3WUa-48Y/default.jpg,120,90,442,hd,false,540525,85643,0,4364
1,Hotel Stereotypes,"Hotel Stereotypes. Love em' or hate em', we al...",2023-02-25 15:00:02+00:00,38.906381,37,Dude Perfect,UCRijo3ddMTht_IHyNSNXpNQ,https://i.ytimg.com/vi/zPlsnWq5X50/default.jpg,120,90,733,hd,false,4248791,173435,0,6135
2,He Made Me a Gorilla Tag Pro.,Could you do it?\n\nTTTPig: @TTTPig \nElliot: ...,2023-02-26 16:00:24+00:00,13.900270,13,jmancurly,UCRaehQPWXnJ72WmE8nvvKKw,https://i.ytimg.com/vi/pGCT0jhPd0g/default.jpg,120,90,483,hd,false,466680,15481,0,3893
3,"KAROL G, Shakira - TQG (Official Video)","KAROL G, Shakira - TQG (Official Video)\nEscuc...",2023-02-24 05:00:09+00:00,72.904436,29,KarolGVEVO,UCz9yS18zJGQObwUL_K-ICnw,https://i.ytimg.com/vi/jZGpkLElSu8/default.jpg,120,90,217,hd,true,84804771,4695375,0,178972
4,BUSTING 100 MYTHS IN 24 HOURS!,Salish would have never let me me out of the s...,2023-02-25 15:00:06+00:00,38.905270,21,Jordan Matter,UCKaCalz5N5ienIbfPzEbYuA,https://i.ytimg.com/vi/0NXvJ2rLmbo/default.jpg,120,90,767,hd,true,3848866,93026,0,14199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,BOAT TOUR: 40ft Rapido Trimaran (& our Future ...,To get a 1 year supply of vitamin D3+K2 + 10 i...,2023-02-20 21:59:36+00:00,151.913603,0,Sailing La Vagabonde,UCZdQjaSoLjIzFnWsDQOv4ww,https://i.ytimg.com/vi/BW8BeBjTQ_g/default.jpg,120,90,1500,hd,false,577077,29441,0,1356
196,I Survived 100 Days as HADES in Minecraft.. He...,I Survived 100 Days as HADES in Minecraft.. He...,2023-02-20 01:00:16+00:00,172.902492,16,Forrestbono,UCT4oXolSaPqL5T5TXqmPQ2A,https://i.ytimg.com/vi/XXIswRPnlkI/default.jpg,120,90,2189,hd,false,1768341,31851,0,2043
197,I Worked at a Maid Café for a Day,**Try out some free sample packs with FREE shi...,2023-02-20 18:48:39+00:00,155.096103,0,Emirichu,UCFeqAfEuKm7lIg2ddQzh61A,https://i.ytimg.com/vi/yqHLW60BSlA/default.jpg,120,90,916,hd,false,577073,51820,0,2505
198,PUTTING Brooklyn QUEEN on The HOT SEAT 🔥🥵,PUTTING Brooklyn QUEEN on The HOT SEAT 🔥🥵,2023-02-21 22:15:10+00:00,127.654159,11,Chris and Debo Mafia,UCbJM7J0RVZ6AsQmKdCrnZtw,https://i.ytimg.com/vi/8Z-R-cNqGQU/default.jpg,120,90,613,hd,false,140593,7409,0,378


In [5]:
# Call the API to get data on the channels
channel_list = get_youtube_channels(apiClient=youtube, channel_id_list=channel_id_list)

# Output 1 channel JSON element
my_recursive_print_json(channel_list[0])


kind
    value: youtube#channel
etag
    value: 3pg5UZ50-Asi76JFbDB34i5iKNU
id
    value: UCJEER74X9kBenMT_x9iK9Mw
snippet
    title
        value: FIFTY FIFTY Official
    description
        value: Stream Fifty Fifty   https://linktr.ee/we_fiftyfifty

FIFTY FIFTY OFFICIAL 
 🟣 www.instagram.com/we_fiftyfifty
 🔵 www.twitter.com/we_fiftyfifty
 ⚫ www.tiktok.com/@we_fiftyfifty
 🔴 www.youtube.com/@we_fiftyfifty
 🟧 https://cafe.daum.net/wefiftyfifty
 🟦 https://www.facebook.com/we.fiftyfifty



    customUrl
        value: @we_fiftyfifty
    publishedAt
        value: 2022-10-31T07:00:22.530659Z
    thumbnails
        default
            url
                value: https://yt3.ggpht.com/wGBHKpnA5txxcCDiFpN6pgebeYvkNFqhoxnqoI0Nuk4Zpo5XmWeyZ9EvhEkAvnN4MDff-HpmYQ=s88-c-k-c0x00ffffff-no-nd-rj
            width
                value: 88
            height
                value: 88
        medium
            url
                value: https://yt3.ggpht.com/wGBHKpnA5txxcCDiFpN6pgebeYvkNFqhoxnqoI0Nuk

In [6]:
# Parse the channel JSON elements into a dataframe
channel_df = parse_youtube_channel_data(channel_list)
channel_df


Unnamed: 0,channel_id,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
0,UCJEER74X9kBenMT_x9iK9Mw,Stream Fifty Fifty https://linktr.ee/we_fift...,2022-10-31 07:00:22+00:00,118.954207,8403555,52,117000
1,UCHCph-_jLba_9atyCZJPLQQ,Welcome to How it Should Have Ended! A place ...,2007-03-05 21:41:41+00:00,5837.342181,3193132215,442,10400000
2,UCpi8TJfiA4lKGkaXs__YdBA,a new era. \n\nepisodes every wednesday and sa...,2018-05-23 02:05:02+00:00,1741.159299,2334446057,522,8080000
3,UCY6Ij8zOds0WJEeqCLOnqOQ,I make cartoons and sometimes video essays abo...,2015-07-09 08:02:42+00:00,2789.910920,635487905,313,3330000
4,UCQznUf1SjfDqx65hX3zRDiA,"Drake on Vevo - Official Music Videos, Live Pe...",2009-08-17 17:13:33+00:00,4941.528385,9273945840,116,9230000
...,...,...,...,...,...,...,...
180,UCSpFnDQr88xCZ80N-X7t0nQ,"We run a production studio based on ingenuity,...",2011-02-18 20:27:23+00:00,4391.393778,1423181528,1084,6040000
181,UCHRTfR2r0Ss3UjFyw7gSA-A,SUBSCRIBE: https://www.youtube.com/subscriptio...,2009-04-20 00:34:14+00:00,5061.222355,1071220547,1778,2480000
182,UCr47E3UYaFoKU7Mroevt4aQ,,2017-01-18 04:10:31+00:00,2231.072158,181964975,560,1050000
183,UCo_IB5145EVNcf8hw1Kku7w,Hello Internet! Welcome to GAME THEORY! If you...,2009-08-22 18:01:46+00:00,4936.494901,3544122128,601,16700000


In [7]:
# Join the two dataframes into 1
final_df = pd.merge(
                     video_df,
                     channel_df,
                     how="inner",
                     on="channel_id",
                     copy=True,
                   )
final_df

Unnamed: 0,title,description,video_published_at,hours_published_video,num_tags,channel,channel_id,thumbnail_url,thumbnail_width,thumbnail_height,...,views,likes,favorites,comments,channel_description,channel_published_at,days_published_channel,channel_views,channel_videos,channel_subscribers
0,"ANIME ROCK, PAPER, SCISSORS",ANYONE can make a cartoon with this groundbrea...,2023-02-26 17:00:11+00:00,12.903881,9,Corridor,UCsn6cjffsvyOZCZxvGoJxGg,https://i.ytimg.com/vi/GVT3WUa-48Y/default.jpg,120,90,...,540525,85643,0,4364,"Videos you didn't think were possible, created...",2010-05-17 21:36:10+00:00,4668.346012,1941737020,206,9760000
1,Hotel Stereotypes,"Hotel Stereotypes. Love em' or hate em', we al...",2023-02-25 15:00:02+00:00,38.906381,37,Dude Perfect,UCRijo3ddMTht_IHyNSNXpNQ,https://i.ytimg.com/vi/zPlsnWq5X50/default.jpg,120,90,...,4248791,173435,0,6135,5 best buds just kickin' it.\nIf you like Spor...,2009-03-17 05:44:36+00:00,5095.006822,15561308942,369,58900000
2,He Made Me a Gorilla Tag Pro.,Could you do it?\n\nTTTPig: @TTTPig \nElliot: ...,2023-02-26 16:00:24+00:00,13.900270,13,jmancurly,UCRaehQPWXnJ72WmE8nvvKKw,https://i.ytimg.com/vi/pGCT0jhPd0g/default.jpg,120,90,...,466680,15481,0,3893,"Hi, I'm Julian and making videos is the only t...",2013-08-30 11:13:50+00:00,3467.778188,231556209,348,1190000
3,"KAROL G, Shakira - TQG (Official Video)","KAROL G, Shakira - TQG (Official Video)\nEscuc...",2023-02-24 05:00:09+00:00,72.904436,29,KarolGVEVO,UCz9yS18zJGQObwUL_K-ICnw,https://i.ytimg.com/vi/jZGpkLElSu8/default.jpg,120,90,...,84804771,4695375,0,178972,,2016-01-11 18:06:10+00:00,2603.491846,13664353317,79,3740000
4,"KAROL G, Carla Morrison - Mañana Será Bonito (...",Escucha / Stream “MSB” on your favorite platfo...,2023-02-24 05:00:32+00:00,72.898048,25,KarolGVEVO,UCz9yS18zJGQObwUL_K-ICnw,https://i.ytimg.com/vi/orNpRKOfjzY/default.jpg,120,90,...,5769925,290472,0,4925,,2016-01-11 18:06:10+00:00,2603.491846,13664353317,79,3740000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,BOAT TOUR: 40ft Rapido Trimaran (& our Future ...,To get a 1 year supply of vitamin D3+K2 + 10 i...,2023-02-20 21:59:36+00:00,151.913603,0,Sailing La Vagabonde,UCZdQjaSoLjIzFnWsDQOv4ww,https://i.ytimg.com/vi/BW8BeBjTQ_g/default.jpg,120,90,...,577077,29441,0,1356,We are an Australian couple with the dream of ...,2014-10-25 14:53:01+00:00,3046.625978,387478585,523,1820000
196,I Survived 100 Days as HADES in Minecraft.. He...,I Survived 100 Days as HADES in Minecraft.. He...,2023-02-20 01:00:16+00:00,172.902492,16,Forrestbono,UCT4oXolSaPqL5T5TXqmPQ2A,https://i.ytimg.com/vi/XXIswRPnlkI/default.jpg,120,90,...,1768341,31851,0,2043,Welcome to the Forrestbono YouTube channel! I ...,2013-07-03 03:38:24+00:00,3526.094461,197394150,33,1940000
197,I Worked at a Maid Café for a Day,**Try out some free sample packs with FREE shi...,2023-02-20 18:48:39+00:00,155.096103,0,Emirichu,UCFeqAfEuKm7lIg2ddQzh61A,https://i.ytimg.com/vi/yqHLW60BSlA/default.jpg,120,90,...,577073,51820,0,2505,"Hi, I like drawing and I'm using this channel ...",2011-06-12 06:36:19+00:00,4277.970908,338126806,78,3330000
198,PUTTING Brooklyn QUEEN on The HOT SEAT 🔥🥵,PUTTING Brooklyn QUEEN on The HOT SEAT 🔥🥵,2023-02-21 22:15:10+00:00,127.654159,11,Chris and Debo Mafia,UCbJM7J0RVZ6AsQmKdCrnZtw,https://i.ytimg.com/vi/8Z-R-cNqGQU/default.jpg,120,90,...,140593,7409,0,378,\n,2017-05-08 03:21:48+00:00,2121.105989,55499259,121,623000


In [8]:
final_df.describe()

Unnamed: 0,hours_published_video,num_tags,thumbnail_width,thumbnail_height,duration,days_published_channel
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,84.134681,15.59,120.0,90.0,881.985,3232.456987
std,44.58916,11.810986,0.0,0.0,727.97242,1764.703902
min,12.898325,0.0,120.0,90.0,29.0,5.520584
25%,54.774298,5.0,120.0,90.0,227.75,1956.346061
50%,79.619714,14.0,120.0,90.0,701.0,3182.852818
75%,111.09395,24.0,120.0,90.0,1248.0,4576.874169
max,337.93527,55.0,120.0,90.0,3067.0,6366.114577
