# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade ipykernel
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd
import os
import logging

## Data Collection

In [3]:
from dcollect import plugins

modules = {'http': plugins.fasthttp()}
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_experiment = False
api_key_testing = None
api_key = os.environ.get('YOUTUBE_API_KEY') or api_key_testing

if not api_key:
    api_key = os.environ.get('YOUTUBE_EXPLORER_API_KEY')
    if api_key: 
        api_experiment = True
    else: 
        api_key = input('YouTube Data API Key: ')
        api_experiment = (input('Is this an explorer key? [Y/N]: ') == 'Y')

dataset_id = os.environ.get('DATASET_NAME')
if dataset_id == None:
    dataset_id = input('Dataset Name: ')
    
sample_size_per_query_default = 1000000
sample_size_per_query = os.environ.get('SAMPLE_SIZE_PER_QUERY')    
if sample_size_per_query == None:
    sample_size_per_query = input('Sample size per query: ') or sample_size_per_query_default
    
sample_size_per_query = int(sample_size_per_query)

YouTube Data API Key: AIzaSyAa8yy0GdcGPHdtD083HiGGx_S0vMPScDM
Is this an explorer key? [Y/N]: Y
Dataset Name: random_extended_ascii
Sample size per query: 50


#### Search

##### STEP 1  Data Collection

In [5]:
# create a YouTube API object
youtube_o = youtube.api(
    modules = modules,
    headers = headers,
    key = api_key,
    experiment = api_experiment
)

# create a YouTube Internals API object
youtubei_o = youtubei.api(
    modules = modules,
    headers = headers
)

pickle_proto = 3
dataset = eda_utils.dataset(f'dsamples/youtube_search_{dataset_id}.dataset')

In [6]:
def df_search_gen(*args, **kwargs):
    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)
    import concurrent.futures

    df_search = None
    df_info = None
    df_channels = None
    df_ads = None
    
    def worker_df_search(*args, **kwargs):
        nonlocal df_search
        df_search = df_from_json(
            youtube_o.video.search(
                *args, **kwargs
            )
        )
        
    def worker_df_info():
        nonlocal df_info
        df_info = df_from_json(
            youtube_o.video.info(
                id = df_search['id']
            )
        )
            
    def worker_df_ads():
        nonlocal df_ads
        df_ads = df_from_json(
            youtubei_o.ad.placements(
                id = df_search['id'],
                throttle_size = 10
            )
        )
            
    def worker_df_channels():
        nonlocal df_channels
        df_channels = df_from_json(
            youtube_o.channel.info(
                id = df_search['creator.id']
            )
        )
            
    # - search
    worker_df_search(*args, **kwargs)
    
    workers = [worker_df_info, worker_df_ads, worker_df_channels]
    with concurrent.futures.ThreadPoolExecutor(max_workers = len(workers)) as executor:
        for worker in workers:
            executor.submit(worker)
                
    return df_search, df_info, df_channels, df_ads

In [7]:
def df_search_gen_bulk(paramlist: list):
    import concurrent.futures
    
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers = len(paramlist)) as executor:
        futures = [executor.submit(df_search_gen, **param) for param in paramlist]
        
    return [f.result() for f in futures]

In [8]:
import string

param_default = {
    'count': sample_size_per_query
}

paramlist = []
for c in string.ascii_lowercase:
    param = dict(param_default)
    param.update({
        'keyword': c
    })
    paramlist.append(param)
    
df_search = pd.DataFrame()
df_info = pd.DataFrame()
df_channels = pd.DataFrame()
df_ads = pd.DataFrame()

results = df_search_gen_bulk(paramlist)

In [9]:
def transpose(l):
    return list(map(list, zip(*l)))

df_search_res, df_info_res, df_channels_res, df_ads_res = transpose(results)

df_search = pd.concat(df_search_res, copy = False)
df_info = pd.concat(df_info_res, copy = False)
df_channels = pd.concat(df_channels_res, copy = False)
df_ads = pd.concat(df_ads_res, copy = False)

dataset.update('youtube_search.pkl', df_search, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_info.pkl', df_info, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_ads.pkl', df_ads, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_channels.pkl', df_channels, overwrite = True, proto = pickle_proto)

df_report(df_search, name = 'Search Result (Original)')
df_report(df_info, name = 'Info (Original)')
df_report(df_channels, name = 'Channels (Original)')
df_report(df_ads, name = 'Ad Placements (Original)')

  df.describe()


---

# Results - Search Result (Original)

## Data Preview

Unnamed: 0,id,title,description,time,tags,creator.id
0,mwuL0UmFPQk,NiKO got a HAiRCUT ✂️ ADLEY has a new HAiRST...,Best Park Day Ever 1186 Today's Best Day Ever ...,2021-03-09 16:00:30+00:00,,UCoK5NOxkZBLfI_5eqf8Es4Q
1,YNeKQhPIpt8,KiDS Leprechaun TRAP!! Adley &amp; Niko make ...,The ultimate plan to catch a Leprechaun LET'...,2021-03-08 16:00:44+00:00,,UCBJuxfqZuiibvcwSc8ViGsQ
2,IqYVcxs4Qr0,🔴 Live: Baby&#39;s First Steps - Wolfoo Preten...,Live: Baby's First Steps - Wolfoo Pretends to ...,2021-03-11 07:16:45+00:00,,UCWGVQIspqW2j9M3-qLQ0HDg
3,Be-WVy8P4M0,【CUPHEAD #02】A GRAND SLAYING AND THEN SOME!!! ...,"What is up, humans?! ♡ Calliope Mori（森 カリオペ）he...",2021-03-11 03:33:29+00:00,,UCL_qhgtOy0dy1Agp8vkySQg
4,pMTRTl2GcSk,Surprising My Twin Best Friend With A Bad Hair...,Do you think new our twin Karen haircuts are t...,2021-03-10 17:09:40+00:00,,UCfw8x3VR-ElcaWW2Tg_jgSA


## Stats

Unnamed: 0,id,title,description,time,tags,creator.id
count,1300,1300,1281,1300,0.0,1300
unique,1277,1271,1223,1272,0.0,871
top,-F7_IcGvxfo,🔴Live สด! 🏆ศึก 𝐏𝐆𝐈.𝐒 รอบ 𝗪𝗲𝗲𝗸𝗹𝘆 𝗦𝘂𝗿𝘃𝗶𝘃𝗮𝗹 l สัป...,ABOUT FGTEEV: FGTeeV is a Gaming Channel that ...,2021-03-11 13:03:28+00:00,,UC0-swBG9Ne0Vh4OuoJ2bjbA
freq,2,4,6,2,,35
first,,,,2007-10-09 20:54:01+00:00,,
last,,,,2021-03-11 13:13:21+00:00,,


---

---

# Results - Info (Original)

## Data Preview

Unnamed: 0,id,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality
0,6o8gGKJ8Oeg,TROLLANDO A LULUCA COM MUITOS SUSTOS NA REALID...,TROLLANDO A LULUCA COM MUITOS SUSTOS NA REALID...,2021-03-10 21:01:05+00:00,0 days 00:17:23,"[luluca, crescendo com luluca, TROLLANDO A LUL...",People & Blogs,UCnrGkE2AmBAv6Vxj6kjf1jg,23333.0,283.0,,125719,HD
1,h4u6o515nXI,Adley is the BOSS 🍦 SWEET SHAKE SHOP!! New Sto...,Adley and Dad's Sweet Shake Shop is now open f...,2021-03-01 16:00:17+00:00,0 days 00:21:57,"[adley, shonduras, school, neighbor, best day ...",Entertainment,UCBJuxfqZuiibvcwSc8ViGsQ,11090.0,2027.0,0.0,1696410,HD
2,mNqBxWS5e_s,RONALDO 'IS A JUVENTUS FLOP'.,The Irish Guy examines the Italian press decid...,2021-03-11 07:00:05+00:00,0 days 00:10:22,"[sport, football, hitc sport, HITCsport, goals...",Sports,UC3UFSVP6ormiRWUg_rmH2zA,5763.0,135.0,1082.0,53614,HD
3,S1X0jVwVjDw,ME PAREZCO A LA HERMANA DE MESSI? | Memes Redd...,Like por las gafas gamer de Willyrex\n\nDiscor...,2021-03-10 18:32:46+00:00,0 days 00:09:40,"[elrubius, rubius, reddit, meme, memes, memes ...",Entertainment,UCcjIvuxmWlS5IEQ0JdPV4Ng,287005.0,1512.0,6393.0,2098640,HD
4,AlnM6FBUch4,Playing as a BABY in Roblox!,We're the most stoopy babies in Roblox...\n► S...,2021-03-09 20:00:04+00:00,0 days 00:21:51,"[itsfunneh, funneh, funny moments, funny, krew...",Gaming,UCUk7VggtJdo9XYTy3Z5QVAw,39349.0,606.0,4916.0,1086742,HD


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1300,1278.0,1278.0,1172.0,1300.0
mean,0 days 00:14:47.382307692,375170.5,46277.6,28080.89,62242230.0
std,0 days 00:41:36.761616353,1477479.0,235555.0,226994.2,312848600.0
min,0 days 00:00:00,3.0,0.0,0.0,0.0
25%,0 days 00:03:32,4037.5,128.25,113.0,126445.8
50%,0 days 00:08:02,23654.5,1284.5,1062.0,1565260.0
75%,0 days 00:14:44.250000,144978.2,10111.75,6681.75,16726100.0
max,0 days 15:13:24,21113580.0,4142776.0,5198500.0,4418250000.0


---

---

# Results - Channels (Original)

## Data Preview

Unnamed: 0,id,title,description,time,stats.follower,stats.view,stats.post
0,UCNyv46EnCgc9hT7EWUXBuPg,Pop Teen TV,,2017-02-26 00:49:56+00:00,1310000.0,123941034,96
1,UC2C9XiZX-kFqFybRn6RrBJw,ARTDINK公式チャンネル,株式会社 アートディンクの公式動画チャンネルです。\nさまざまなムービーをご紹介してまいります。,2013-03-29 06:07:45+00:00,6540.0,2725162,80
2,UC0LlPl_mWhIFqVxcH1l3xpQ,WebTVBrasileira,"***INSCREVA-SE! Venha INTERAGIR, FOFOCAR e se ...",2011-12-17 21:52:18+00:00,1810000.0,590500113,3861
3,UCnrGkE2AmBAv6Vxj6kjf1jg,Crescendo com Luluca,"Olá... Eu sou a Luíza, mas todos me chamam d...",2015-08-30 15:47:06+00:00,10100000.0,2775545540,886
4,UCzYfz8uibvnB7Yc1LjePi4g,Aphmau,Welcome to my Gaming Youtube Channel! \n\nMy ...,2012-08-13 20:32:51+00:00,8600000.0,6411410392,3456


## Stats

Unnamed: 0,stats.follower,stats.view,stats.post
count,941.0,987.0,987.0
mean,5581791.0,3096332000.0,3670.189463
std,13495110.0,10282850000.0,14438.112141
min,57.0,15630.0,1.0
25%,269000.0,75357660.0,128.0
50%,1300000.0,389141000.0,447.0
75%,4910000.0,1935600000.0,1292.0
max,175000000.0,146273200000.0,155526.0


---

---

# Results - Ad Placements (Original)

## Data Preview

Unnamed: 0,id,ads
0,S1X0jVwVjDw,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
1,6o8gGKJ8Oeg,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
2,h4u6o515nXI,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
3,UxtWNSnhe9c,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4,mNqBxWS5e_s,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,id,ads
count,1300,1176
unique,1277,560
top,astISOttCQ0,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
freq,2,229


---

##### STEP 2  Data Cleaning

In [10]:
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    return df.drop(columns = df.columns & df_other.columns, *args, **kwargs)

# - search
df_search.set_index(['id'], inplace = True)
# - info
df_info.set_index(['id'], inplace = True)
# - channels
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
df_ads.set_index(['id'], inplace = True)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
drop_common(df_search, df_info, inplace = True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)

##### STEP 3  Data Inspection

In [12]:
# take a brief look at our data
df_report(df_search_details, name = 'Search Result')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mwuL0UmFPQk,NiKO got a HAiRCUT ✂️ ADLEY has a new HAiRST...,The Kids Haircut Surprise for Dad!!\nand check...,2021-03-09 16:00:30+00:00,0 days 00:14:21,"[Shonduras, Best Day Ever, NiKO got a HAiRCUT ...",Entertainment,UCoK5NOxkZBLfI_5eqf8Es4Q,5276.0,507.0,488.0,702645,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
YNeKQhPIpt8,KiDS Leprechaun TRAP!! Adley & Niko make a St...,The ultimate plan to catch a Leprechaun🍀\n\nLE...,2021-03-08 16:00:44+00:00,0 days 00:25:47,"[adley, shonduras, hidden, presents, hide n se...",Entertainment,UCBJuxfqZuiibvcwSc8ViGsQ,10883.0,2028.0,0.0,1546035,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
IqYVcxs4Qr0,🔴 Live: Baby's First Steps - Wolfoo Pretends t...,🔴 Live: Baby's First Steps - Wolfoo Pretends t...,2021-03-11 07:16:45+00:00,0 days 00:00:00,,Film & Animation,UCWGVQIspqW2j9M3-qLQ0HDg,2020.0,1014.0,0.0,202916,SD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
Be-WVy8P4M0,【CUPHEAD #02】A GRAND SLAYING AND THEN SOME!!! ...,"What is up, humans?! ♡ Calliope Mori（森 カリオペ）he...",2021-03-11 03:33:29+00:00,0 days 00:00:00,,Entertainment,UCL_qhgtOy0dy1Agp8vkySQg,1310.0,3.0,0.0,1,SD,
pMTRTl2GcSk,Surprising My Twin Best Friend With A Bad Hair...,Do you think new our twin Karen haircuts are t...,2021-03-10 17:09:40+00:00,0 days 00:22:06,"[suprising, best, friend, worst, hair cut, wor...",Entertainment,UCfw8x3VR-ElcaWW2Tg_jgSA,45642.0,1568.0,12224.0,1273320,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1438,1416.0,1416.0,1298.0,1438.0
mean,0 days 00:14:45.856050069,468441.1,64699.14,51942.75,84750220.0
std,0 days 00:39:52.402657753,1978060.0,311658.0,412385.9,418615600.0
min,0 days 00:00:00,3.0,0.0,0.0,0.0
25%,0 days 00:03:35,3849.5,127.5,101.25,120192.8
50%,0 days 00:08:20.500000,26562.5,1327.5,1162.5,1589346.0
75%,0 days 00:15:16,168239.8,10712.0,7307.75,17629130.0
max,0 days 15:13:24,21113580.0,4142776.0,5198500.0,4418250000.0


---

##### STEP 4  Data Archiving

In [13]:
dataset.update('youtube_search_details.pkl', df_search_details, proto = pickle_proto)
# verify that we saved the correct data
df_report(dataset.load('youtube_search_details.pkl'), name = 'Search Result (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mwuL0UmFPQk,NiKO got a HAiRCUT ✂️ ADLEY has a new HAiRST...,The Kids Haircut Surprise for Dad!!\nand check...,2021-03-09 16:00:30+00:00,0 days 00:14:21,"[Shonduras, Best Day Ever, NiKO got a HAiRCUT ...",Entertainment,UCoK5NOxkZBLfI_5eqf8Es4Q,5276.0,507.0,488.0,702645,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
YNeKQhPIpt8,KiDS Leprechaun TRAP!! Adley & Niko make a St...,The ultimate plan to catch a Leprechaun🍀\n\nLE...,2021-03-08 16:00:44+00:00,0 days 00:25:47,"[adley, shonduras, hidden, presents, hide n se...",Entertainment,UCBJuxfqZuiibvcwSc8ViGsQ,10883.0,2028.0,0.0,1546035,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
IqYVcxs4Qr0,🔴 Live: Baby's First Steps - Wolfoo Pretends t...,🔴 Live: Baby's First Steps - Wolfoo Pretends t...,2021-03-11 07:16:45+00:00,0 days 00:00:00,,Film & Animation,UCWGVQIspqW2j9M3-qLQ0HDg,2020.0,1014.0,0.0,202916,SD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
Be-WVy8P4M0,【CUPHEAD #02】A GRAND SLAYING AND THEN SOME!!! ...,"What is up, humans?! ♡ Calliope Mori（森 カリオペ）he...",2021-03-11 03:33:29+00:00,0 days 00:00:00,,Entertainment,UCL_qhgtOy0dy1Agp8vkySQg,1310.0,3.0,0.0,1,SD,
pMTRTl2GcSk,Surprising My Twin Best Friend With A Bad Hair...,Do you think new our twin Karen haircuts are t...,2021-03-10 17:09:40+00:00,0 days 00:22:06,"[suprising, best, friend, worst, hair cut, wor...",Entertainment,UCfw8x3VR-ElcaWW2Tg_jgSA,45642.0,1568.0,12224.0,1273320,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1438,1416.0,1416.0,1298.0,1438.0
mean,0 days 00:14:45.856050069,468441.1,64699.14,51942.75,84750220.0
std,0 days 00:39:52.402657753,1978060.0,311658.0,412385.9,418615600.0
min,0 days 00:00:00,3.0,0.0,0.0,0.0
25%,0 days 00:03:35,3849.5,127.5,101.25,120192.8
50%,0 days 00:08:20.500000,26562.5,1327.5,1162.5,1589346.0
75%,0 days 00:15:16,168239.8,10712.0,7307.75,17629130.0
max,0 days 15:13:24,21113580.0,4142776.0,5198500.0,4418250000.0


---