# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd
import os

## Data Collection

In [3]:
from dcollect import plugins

# Plugin registry handed to every API object created later in this notebook;
# its 'http' slot is filled with the fasthttp plugin.
modules = dict(http = plugins.fasthttp())
# No custom request headers are supplied to the API objects.
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
import getpass

from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_experiment = False
api_key_testing = None

# Key resolution order:
#   1. YOUTUBE_API_KEY environment variable     (regular key)
#   2. api_key_testing above                    (regular key)
#   3. YOUTUBE_EXPLORER_API_KEY environment var (explorer key)
#   4. interactive prompt
api_key = os.environ.get('YOUTUBE_API_KEY') or api_key_testing

if not api_key:
    api_key = os.environ.get('YOUTUBE_EXPLORER_API_KEY')
    if api_key:
        api_experiment = True
    else:
        # getpass keeps the secret out of the saved cell output;
        # input() would echo the key into the notebook file.
        api_key = getpass.getpass('YouTube Data API Key: ').strip()
        # Accept 'y' / 'Y' and tolerate surrounding whitespace.
        api_experiment = (input('Is this an explorer key? [Y/N]: ').strip().upper() == 'Y')

YouTube Data API Key: <REDACTED>
Is this an explorer key? [Y/N]: Y


#### Search

In [5]:
# Sample size forwarded to the trending query below; override it with the
# SAMPLE_SIZE environment variable (defaults to 200).
count = int(os.getenv('SAMPLE_SIZE', 200))

##### STEP 1  API Object Creation

In [6]:
# YouTube Data API object: receives the resolved key and the explorer flag
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key, experiment = api_experiment)

# YouTube Internals API object: constructed without a key
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [7]:
def df_trending_gen(*args, **kwargs):
    """Collect trending videos together with their channel info and ad placements.

    The trending list is fetched first because both follow-up queries are
    keyed on its columns; the two follow-up queries then run concurrently.

    Args:
        *args, **kwargs: forwarded unchanged to ``youtube_o.video.trending``.

    Returns:
        A ``(df_trending, df_channels, df_ads)`` tuple, each element being the
        result of ``df_from_json`` applied to the corresponding API response.

    Raises:
        Any exception raised by one of the queries. Worker failures are
        re-raised here instead of being silently dropped, so the caller never
        receives ``None`` in place of a frame.
    """
    import concurrent.futures

    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)

    df_trending = df_from_json(
        youtube_o.video.trending(
            *args, **kwargs
        )
    )

    def fetch_ads():
        # ad placements looked up by the 'id' column of the trending frame
        return df_from_json(
            youtubei_o.ad.placements(
                id = df_trending['id'],
                throttle_size = 100
            )
        )

    def fetch_channels():
        # channel info looked up by the 'creator.id' column of the trending frame
        return df_from_json(
            youtube_o.channel.info(
                id = df_trending['creator.id']
            )
        )

    # Leaving the `with` block already waits for outstanding work, so no
    # explicit shutdown() call is needed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_ads = executor.submit(fetch_ads)
        future_channels = executor.submit(fetch_channels)
        # Future.result() re-raises whatever the worker raised.
        df_ads = future_ads.result()
        df_channels = future_channels.result()

    return df_trending, df_channels, df_ads

In [8]:
# Run the collection; `count` is forwarded to the trending query inside df_trending_gen.
df_trending, df_channels, df_ads = df_trending_gen(count = count)

##### STEP 3  Data Cleaning


In [9]:
# Each step is guarded on the index name so the cell can be re-run safely:
# without the guards a second run raises KeyError (the key column has already
# moved into the index) and prefixes the channel columns a second time.
# - trending
if df_trending.index.name != 'id':
    df_trending = df_trending.set_index(['id'])
# - channels
if df_channels.index.name != 'creator.id':
    df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])
# - ads
if df_ads.index.name != 'id':
    df_ads = df_ads.set_index(['id'])

In [10]:
# - trending (with details)
# - ads: merge the ad columns onto a copy of the trending frame, matching the
#   'id' index level on the left against the index of df_ads
df_trending_details = df_trending.copy().merge(
    df_ads,
    left_on = 'id',
    right_index = True,
    copy = False
)

##### STEP 4  Data Inspection


In [23]:
# take a brief look at our data
# (the rendered report below shows a "Data Preview" table and a "Stats" table per frame)
df_report(df_trending_details, name = 'Trending')
df_report(df_channels, name = 'Trending (Channels)')

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AeaR5QbXgpM,Rod Wave - Street Runner (Official Video),#rodwave #streetrunner #soulfly\n\nPre-save th...,2021-03-10 05:00:13+00:00,0 days 00:04:10,"[rod wave, hunger games, hunger games 3, ptsd,...",Music,UCenjunBhBhvKjfDAESnoppw,153393,2042,12179.0,2213924,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
sG9rX6Ifzhw,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,UCq18eeL7D9Vd8DhjMcLh9QQ,66340,11658,,6528574,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
ir1KIlOioSo,I Troll BadBoyHalo With The Morph Mod,Skeppy trolls BadBoyHalo with a Minecraft Morp...,2021-03-09 23:56:29+00:00,0 days 00:17:34,"[minecraft, mine craft, minecraft youtuber, pg...",Gaming,UCzMjRlKVO9XIqH_crIFpi6w,98271,1801,6061.0,1721208,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
jJdlgKzVsnI,Doja Cat - Streets (Official Video),Doja Cat // Streets (Official Video)\nHot Pink...,2021-03-09 05:00:12+00:00,0 days 00:04:34,"[doja cat, streets, doja, doja cat streets, st...",Music,UCpTaAz_BxtkUB1qc8JTU_7g,1078021,13043,65259.0,10842551,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
pIQIKIDZJjc,David Dobrik & I Bought Markell A Car | Charli...,hi everyone. i wanted to organize this with @D...,2021-03-09 18:00:12+00:00,0 days 00:11:26,"[charli, charlie, charli d'amelio, charli dame...",People & Blogs,UCi3OE-aN09WOcN9d2stCvPg,168144,9894,13237.0,2330292,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,194.0,200.0
mean,0 days 00:09:59.770000,139230.5,2300.57,11772.082474,2685819.0
std,0 days 00:09:18.136937772,254535.3,3570.47973,23675.89249,4111286.0
min,0 days 00:00:12,527.0,66.0,105.0,113365.0
25%,0 days 00:03:13,22682.0,356.0,2051.0,567808.5
50%,0 days 00:07:40,53950.0,901.5,4050.0,1177566.0
75%,0 days 00:14:05.250000,127322.8,2323.75,10539.5,3023414.0
max,0 days 00:54:22,1764723.0,22739.0,190843.0,28028940.0


---

---

# Results - Trending (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCFbZ2e9IrPejOdp8wsKUxvA,QuackiTwo,,2014-03-16 23:04:36+00:00,1750000,102012742,73
UCqISR0F9-nCth-V2r4Qy75Q,REVOLT TV,Unapologetically Hip Hop. \n\nLaunched by Sea...,2006-05-29 13:23:51+00:00,1050000,252468863,4318
UCenjunBhBhvKjfDAESnoppw,RodWave,,2016-05-30 02:45:34+00:00,2810000,2046114411,110
UCVpankR4HtoAVtYnFDUieYA,zefrank1,Hi. Please subscribe to this channel and I wil...,2006-07-22 18:14:13+00:00,3320000,534101181,191
UCIPPMRA040LQr5QPyJEbmXA,MrBeast Gaming,MrBeast Gaming - SUBSCRIBE OR ELSE,2020-04-07 18:46:13.800720+00:00,15200000,1738968009,96


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,128.0,128.0,128.0
mean,4501608.0,1659328000.0,6979.25
std,5723910.0,3268005000.0,17296.949385
min,8170.0,7119183.0,14.0
25%,1042500.0,233154600.0,90.75
50%,2035000.0,704233100.0,224.0
75%,6132500.0,1641790000.0,3685.0
max,24800000.0,17809000000.0,90424.0


---

##### STEP 5  Data Archiving (Cumulative)


In [39]:
# pickle protocol passed to every dataset.update() call
pickle_proto = 3

dataset = eda_utils.dataset('dsamples/youtube_trending.dataset')

# store each frame under its own file name inside the dataset
for archive_name, archive_frame in (
    ('youtube_trending.pkl', df_trending),
    ('youtube_trending_ads.pkl', df_ads),
    ('youtube_trending_channels.pkl', df_channels),
    ('youtube_trending_details.pkl', df_trending_details),
):
    dataset.update(archive_name, archive_frame, proto = pickle_proto)

# verify that we saved the correct data
for archive_name, report_title in (
    ('youtube_trending_details.pkl', 'Trending (Verification)'),
    ('youtube_trending_channels.pkl', 'Trending (Channels) (Verification)'),
):
    df_report(dataset.load(archive_name), name = report_title)

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AeaR5QbXgpM,Rod Wave - Street Runner (Official Video),#rodwave #streetrunner #soulfly\n\nPre-save th...,2021-03-10 05:00:13+00:00,0 days 00:04:10,"[rod wave, hunger games, hunger games 3, ptsd,...",Music,UCenjunBhBhvKjfDAESnoppw,153393,2042,12179.0,2213924,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
sG9rX6Ifzhw,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,UCq18eeL7D9Vd8DhjMcLh9QQ,66340,11658,,6528574,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
ir1KIlOioSo,I Troll BadBoyHalo With The Morph Mod,Skeppy trolls BadBoyHalo with a Minecraft Morp...,2021-03-09 23:56:29+00:00,0 days 00:17:34,"[minecraft, mine craft, minecraft youtuber, pg...",Gaming,UCzMjRlKVO9XIqH_crIFpi6w,98271,1801,6061.0,1721208,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
jJdlgKzVsnI,Doja Cat - Streets (Official Video),Doja Cat // Streets (Official Video)\nHot Pink...,2021-03-09 05:00:12+00:00,0 days 00:04:34,"[doja cat, streets, doja, doja cat streets, st...",Music,UCpTaAz_BxtkUB1qc8JTU_7g,1078021,13043,65259.0,10842551,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
pIQIKIDZJjc,David Dobrik & I Bought Markell A Car | Charli...,hi everyone. i wanted to organize this with @D...,2021-03-09 18:00:12+00:00,0 days 00:11:26,"[charli, charlie, charli d'amelio, charli dame...",People & Blogs,UCi3OE-aN09WOcN9d2stCvPg,168144,9894,13237.0,2330292,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,194.0,200.0
mean,0 days 00:09:59.770000,139230.5,2300.57,11772.082474,2685819.0
std,0 days 00:09:18.136937772,254535.3,3570.47973,23675.89249,4111286.0
min,0 days 00:00:12,527.0,66.0,105.0,113365.0
25%,0 days 00:03:13,22682.0,356.0,2051.0,567808.5
50%,0 days 00:07:40,53950.0,901.5,4050.0,1177566.0
75%,0 days 00:14:05.250000,127322.8,2323.75,10539.5,3023414.0
max,0 days 00:54:22,1764723.0,22739.0,190843.0,28028940.0


---

---

# Results - Trending (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCFbZ2e9IrPejOdp8wsKUxvA,QuackiTwo,,2014-03-16 23:04:36+00:00,1750000,102012742,73
UCqISR0F9-nCth-V2r4Qy75Q,REVOLT TV,Unapologetically Hip Hop. \n\nLaunched by Sea...,2006-05-29 13:23:51+00:00,1050000,252468863,4318
UCenjunBhBhvKjfDAESnoppw,RodWave,,2016-05-30 02:45:34+00:00,2810000,2046114411,110
UCVpankR4HtoAVtYnFDUieYA,zefrank1,Hi. Please subscribe to this channel and I wil...,2006-07-22 18:14:13+00:00,3320000,534101181,191
UCIPPMRA040LQr5QPyJEbmXA,MrBeast Gaming,MrBeast Gaming - SUBSCRIBE OR ELSE,2020-04-07 18:46:13.800720+00:00,15200000,1738968009,96


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,128.0,128.0,128.0
mean,4501608.0,1659328000.0,6979.25
std,5723910.0,3268005000.0,17296.949385
min,8170.0,7119183.0,14.0
25%,1042500.0,233154600.0,90.75
50%,2035000.0,704233100.0,224.0
75%,6132500.0,1641790000.0,3685.0
max,24800000.0,17809000000.0,90424.0


---