# Data Sampling

## Setup

In [1]:
import sys

# Install through the interpreter that runs this kernel (sys.executable) so the
# packages land in the kernel's environment rather than whichever pip is on PATH.
# pandas is held to the 1.x series; everything else comes from requirements.txt.
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd
import os

## Data Collection

In [3]:
from dcollect import plugins

# Transport plugins handed to every API object created below; only the HTTP
# backend is configured here.
modules = dict(http = plugins.fasthttp())
# No custom request headers are supplied.
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Resolution order for the API key:
#   1. the YOUTUBE_API_KEY environment variable
#   2. api_key_testing (for local testing ONLY -- never commit or publish a real key here)
#   3. the YOUTUBE_EXPLORER_API_KEY environment variable (also sets api_experiment)
#   4. an interactive prompt
api_experiment = False
api_key_testing = None
api_key = os.environ.get('YOUTUBE_API_KEY') or api_key_testing

if not api_key:
    api_key = os.environ.get('YOUTUBE_EXPLORER_API_KEY')
    if api_key:
        api_experiment = True
    else:
        # Read the key with getpass instead of input(): input() echoes the typed
        # value into the cell output, which is then saved inside the notebook file.
        import getpass

        api_key = getpass.getpass('YouTube Data API Key: ').strip()
        # Accept 'y' as well as 'Y', and ignore surrounding whitespace.
        api_experiment = (input('Is this an explorer key? [Y/N]: ').strip().upper() == 'Y')

YouTube Data API Key: [REDACTED]
Is this an explorer key? [Y/N]: Y


#### Trending

In [5]:
# Number of trending videos to request; the SAMPLE_SIZE environment variable
# overrides the default of 200.
count = int(os.getenv('SAMPLE_SIZE', 200))

##### STEP 1  API Object Creation

In [6]:
# Options that both API objects receive.
shared_api_options = dict(modules = modules, headers = headers)

# YouTube Data API object: additionally needs the key and the experiment flag.
youtube_o = youtube.api(**shared_api_options, key = api_key, experiment = api_experiment)

# YouTube Internals API object: constructed without a key.
youtubei_o = youtubei.api(**shared_api_options)

##### STEP 2  Data Collection

In [7]:
def df_trending_gen(*args, **kwargs):
    """Collect trending videos together with their channel and ad-placement data.

    The trending list is fetched first. The ad placements and the channel
    details both depend on it, so they are fetched afterwards, concurrently,
    in a thread pool.

    Args:
        *args, **kwargs: forwarded unchanged to ``youtube_o.video.trending``.

    Returns:
        A ``(df_trending, df_channels, df_ads)`` tuple; each element is the
        result of ``df_from_json`` applied to the corresponding API response.

    Raises:
        Any exception raised by one of the lookups. Failures inside the
        background workers are re-raised here instead of being dropped
        (previously the affected frame silently came back as ``None``).
    """
    import concurrent.futures

    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)

    # The trending list must exist before the dependent lookups can start.
    df_trending = df_from_json(
        youtube_o.video.trending(*args, **kwargs)
    )

    def fetch_ads():
        # Ad placements for every trending video id.
        return df_from_json(
            youtubei_o.ad.placements(
                id = df_trending['id'],
                throttle_size = 50
            )
        )

    def fetch_channels():
        # Channel details for the creator of every trending video.
        return df_from_json(
            youtube_o.channel.info(
                id = df_trending['creator.id']
            )
        )

    # Leaving the `with` block already waits for the pool to finish, so no
    # explicit shutdown() call is needed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_ads = executor.submit(fetch_ads)
        future_channels = executor.submit(fetch_channels)
        # result() blocks until the worker is done and re-raises its exception.
        df_ads = future_ads.result()
        df_channels = future_channels.result()

    return df_trending, df_channels, df_ads

In [8]:
# Run the collection. `count` is passed through df_trending_gen to
# youtube_o.video.trending; the three returned frames are cleaned in STEP 3.
df_trending, df_channels, df_ads = df_trending_gen(count = count)

##### STEP 3  Data Cleaning


In [9]:
# Index every frame by its identifier.
# Each step is guarded on the current index name so that re-running this cell
# is a no-op. Without the guard a second run raises KeyError (the key column is
# gone once it has become the index) and would stack a second 'creator.' prefix
# onto the channel columns.
# - trending
if df_trending.index.name != 'id':
    df_trending = df_trending.set_index(['id'])
# - channels (columns are prefixed with 'creator.' before indexing)
if df_channels.index.name != 'creator.id':
    df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])
# - ads
if df_ads.index.name != 'id':
    df_ads = df_ads.set_index(['id'])

In [10]:
# - trending (with details)
# Start from a copy of the trending frame and attach the ad placements to it.
# 'id' names the index of the left frame and is matched against the index of
# df_ads. merge() defaults to an inner join, so a video with no row in df_ads
# does not appear in the result.
df_trending_details = df_trending.copy().merge(
    df_ads,
    left_on = 'id',
    right_index = True,
    copy = False,
)

##### STEP 4  Data Inspection


In [11]:
# take a brief look at our data:
# first the trending videos merged with their ad placements, then the channels
df_report(df_trending_details, name = 'Trending')
df_report(df_channels, name = 'Trending (Channels)')

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cT7wOSOZVoc,Minecraft Survivor VS 3 Hitmen...,Minecraft Survivor VS 3 Hitmen... This was inc...,2021-03-11 08:00:15+00:00,0 days 00:42:25,"[Dream Minecraft, dream Minecraft youtube, min...",Gaming,UCTkXRDQl0luXxVQrRQvWS6w,764910,6773,62106.0,5637503,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
AeaR5QbXgpM,Rod Wave - Street Runner (Official Video),#rodwave #streetrunner #soulfly\n\nPre-save th...,2021-03-10 05:00:13+00:00,0 days 00:04:10,"[rod wave, hunger games, hunger games 3, ptsd,...",Music,UCenjunBhBhvKjfDAESnoppw,171619,2978,13974.0,3147242,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
8P8sAhYF3hc,Dream - Minecraft Hitmen Extra Scenes (Hitmen 2),This is extra scenes from Dream's channel of t...,2021-03-11 08:44:57+00:00,0 days 00:09:40,,People & Blogs,UChU3JRloULzdFX3aCu7BiSA,85585,288,7798.0,525041,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
OyREzWcCPDo,Russell Brand Reacts To Meghan & Harry Interview,Reaction to Meghan Markle and Prince Harry's P...,2021-03-10 20:39:36+00:00,0 days 00:09:40,"[Russell Brand, Brand Russell, BrandThe, Russe...",Education,UCswH8ovgUp5Bdg-0_JTYFNw,26422,1838,7625.0,810820,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
sG9rX6Ifzhw,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,UCq18eeL7D9Vd8DhjMcLh9QQ,72657,13055,,7293762,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,190.0,200.0
mean,0 days 00:10:36.775000,135450.8,2339.035,11511.336842,2619507.0
std,0 days 00:10:46.507769368,256173.5,3658.268512,23137.173021,4021882.0
min,0 days 00:00:12,529.0,79.0,106.0,120510.0
25%,0 days 00:03:18.250000,18969.0,418.5,2195.5,592733.5
50%,0 days 00:08:39,49934.5,968.5,4120.0,1208350.0
75%,0 days 00:14:05.250000,121112.5,2406.25,10488.0,2980317.0
max,0 days 01:25:01,1788405.0,23309.0,191504.0,30037510.0


---

---

# Results - Trending (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCaRH3rDr3K3CEfhVqu5mgUQ,Team Edge,This is a channel where we do nothing but comp...,2015-09-08 17:08:52+00:00,7100000.0,2911796569,662
UCvK4bOhULCpmLabd2pDMtnA,Yes Theory,We believe that life's greatest moments and de...,2014-01-13 16:59:30+00:00,6300000.0,719980544,324
UCzJIliq68IHSn-Kwgjeg2AQ,Jackie Aina,"Changing the standard of beauty, one video at ...",2006-03-12 15:10:29+00:00,3590000.0,363716908,927
UC3xZYc4SZUGfRERIvDRGqDQ,Skip the Tutorial,Skip the Tutorial - Subscribe or GAME OVER,2017-07-17 16:54:32+00:00,1810000.0,164695651,141
UCVhibwHk4WKw4leUt6JfRLg,SPORTSNET,Canada's #1 Sports Network\n\n----------------...,2010-03-24 19:51:17+00:00,531000.0,911465219,26554


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,116.0,120.0,120.0
mean,5194152.0,1403179000.0,4257.766667
std,10568580.0,2963746000.0,10397.86294
min,16400.0,10249150.0,26.0
25%,1060000.0,203529900.0,142.0
50%,2670000.0,414885500.0,577.5
75%,4760000.0,1293130000.0,1833.0
max,58200000.0,16245740000.0,49586.0


---

##### STEP 5  Data Archiving (Cumulative)


In [12]:
# Pickle protocol handed to every dataset.update call below.
pickle_proto = 3

dataset = eda_utils.dataset('dsamples/youtube_trending.dataset')

# Archive file name -> frame to store, written in this order.
archive_frames = {
    'youtube_trending.pkl': df_trending,
    'youtube_trending_ads.pkl': df_ads,
    'youtube_trending_channels.pkl': df_channels,
    'youtube_trending_details.pkl': df_trending_details,
}
for archive_name, archive_frame in archive_frames.items():
    dataset.update(archive_name, archive_frame, proto = pickle_proto)

# Reload two of the archives and report on them to check what was stored.
# NOTE(review): the reloaded channel archive can show the same creator.id on
# several rows -- confirm whether dataset.update is expected to de-duplicate.
df_report(dataset.load('youtube_trending_details.pkl'), name = 'Trending (Verification)')
df_report(dataset.load('youtube_trending_channels.pkl'), name = 'Trending (Channels) (Verification)')

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-aToDFwlM1Y,CARMEN RUSHED TO THE HOSPITAL BECAUSE OF MISCA...,Carmen rushed to the hospital signs of miscarr...,2021-03-08 22:31:08+00:00,0 days 00:13:20,"[Carmen and Corey, Carmen’s pregnant, Pregnanc...",People & Blogs,UC2rWFYjCHFcnjNnWhnGvv7Q,33575.0,451.0,4741.0,424684.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
-hAztPfxZEk,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,UCtj45MepAoKxZoyR_Mnt86Q,29000.0,590.0,1699.0,486158.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
0L8kRN_bPXM,Big Sean - Deep Reverence ft. Nipsey Hussle,Stream/Download “Detroit 2 https://bigsean.lnk...,2021-03-05 20:00:12+00:00,0 days 00:04:00,"[Big, Sean, Deep, Reverence, Getting, Out, Our...",Music,UC7zOpx9wgvGBCDEjujnAPQA,127902.0,954.0,5635.0,1952919.0,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
0OPKk1Hwk9s,Armor Plates Rule in Rainbow Six Siege (Animat...,"You almost did it, Rook... Almost...\n\n• Use ...",2021-03-10 15:15:00+00:00,0 days 00:00:34,"[rainbow six, siege, r6, rainbow six cartoon, ...",Gaming,UCT4ITc4BhHL4CXHYt4Bs9jg,31937.0,98.0,1194.0,270030.0,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
0PzWnljCpL0,What Is International Women's Day?,International Women’s Day is celebrated every ...,2021-02-09 05:22:52+00:00,0 days 00:06:00,"[international women's day, what is internatio...",Education,UCvmofFg-oZc4jvBUIfZbjzg,2147.0,207.0,664.0,168160.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,219,219.0,219.0,209.0,219.0
mean,0 days 00:10:22.388127853,136133.5,2339.191781,11710.655502,2661245.0
std,0 days 00:10:36.855607074,253660.8,3615.780104,23349.540607,4127891.0
min,0 days 00:00:12,529.0,66.0,106.0,120510.0
25%,0 days 00:03:13,18944.0,378.0,2102.0,585778.5
50%,0 days 00:08:08,47905.0,958.0,4101.0,1205509.0
75%,0 days 00:14:02.500000,121197.0,2410.5,10479.0,2982000.0
max,0 days 01:25:01,1788405.0,23309.0,191504.0,30037510.0


---

---

# Results - Trending (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UC-SJ6nODDmufqBzPBwCvYvQ,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1900000.0,1474178000.0,29403.0
UC-SJ6nODDmufqBzPBwCvYvQ,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1900000.0,1474178000.0,29403.0
UC-SJ6nODDmufqBzPBwCvYvQ,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1900000.0,1474178000.0,29403.0
UC-SJ6nODDmufqBzPBwCvYvQ,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1900000.0,1474178000.0,29403.0
UC-yW8iz7ICKv_bhuxLtcJaw,TateMcRaeVEVO,,2019-02-05 17:08:22+00:00,8170.0,139472600.0,28.0


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,276.0,280.0,280.0
mean,4831331.0,1424369000.0,5084.042857
std,8089439.0,2956389000.0,13661.100995
min,8170.0,7119183.0,14.0
25%,1060000.0,203529900.0,99.0
50%,2170000.0,524263800.0,327.0
75%,5170000.0,1356193000.0,2031.0
max,58200000.0,17809000000.0,90424.0


---