# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd
import os
import logging

## Data Collection

In [3]:
from dcollect import plugins

modules = {'http': plugins.fasthttp()}
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_experiment = False
api_key_testing = None
api_key = os.environ.get('YOUTUBE_API_KEY') or api_key_testing

if not api_key:
    api_key = os.environ.get('YOUTUBE_EXPLORER_API_KEY')
    if api_key: 
        api_experiment = True
    else: 
        api_key = input('YouTube Data API Key: ')
        api_experiment = (input('Is this an explorer key? [Y/N]: ') == 'Y')

dataset_id = os.environ.get('DATASET_NAME')
if dataset_id == None:
    dataset_id = input('Dataset Name: ')
    
sample_size_per_query_default = 1000000
sample_size_per_query = os.environ.get('SAMPLE_SIZE_PER_QUERY')    
if sample_size_per_query == None:
    sample_size_per_query = input('Sample size per query: ') or sample_size_per_query_default
    
sample_size_per_query = int(sample_size_per_query)

YouTube Data API Key: AIzaSyAa8yy0GdcGPHdtD083HiGGx_S0vMPScDM
Is this an explorer key? [Y/N]: Y
Dataset Name: random_extended_ascii
Sample size per query: 200


#### Search

##### STEP 1  Data Collection

In [5]:
# create a YouTube API object
youtube_o = youtube.api(
    modules = modules,
    headers = headers,
    key = api_key,
    experiment = api_experiment
)

# create a YouTube Internals API object
youtubei_o = youtubei.api(
    modules = modules,
    headers = headers
)

pickle_proto = 3
dataset = eda_utils.dataset(f'dsamples/youtube_search_{dataset_id}.dataset')

In [6]:
def df_search_gen(*args, **kwargs):
    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)
    import concurrent.futures

    df_search = None
    df_info = None
    df_channels = None
    df_ads = None
    
    def worker_df_search(*args, **kwargs):
        nonlocal df_search
        df_search = df_from_json(
            youtube_o.video.search(
                *args, **kwargs
            )
        )
        
    def worker_df_info():
        nonlocal df_info
        df_info = df_from_json(
            youtube_o.video.info(
                id = df_search['id']
            )
        )
            
    def worker_df_ads():
        nonlocal df_ads
        df_ads = df_from_json(
            youtubei_o.ad.placements(
                id = df_search['id'],
                throttle_size = 10
            )
        )
            
    def worker_df_channels():
        nonlocal df_channels
        df_channels = df_from_json(
            youtube_o.channel.info(
                id = df_search['creator.id']
            )
        )
            
    # - search
    worker_df_search(*args, **kwargs)
    
    max_workers = 1 # len(workers)
    workers = [worker_df_info, worker_df_ads, worker_df_channels]
    with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
        for worker in workers:
            executor.submit(worker)
                
    return df_search, df_info, df_channels, df_ads

In [7]:
def df_search_gen_bulk(paramlist: list):
    import concurrent.futures
    
    futures = []
    max_workers = 1 # len(paramlist)
    with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
        futures = [executor.submit(df_search_gen, **param) for param in paramlist]
        
    return [f.result() for f in futures]

In [8]:
import string

param_default = {
    'count': sample_size_per_query
}

paramlist = []
for c in string.ascii_lowercase:
    param = dict(param_default)
    param.update({
        'keyword': c
    })
    paramlist.append(param)
    
df_search = pd.DataFrame()
df_info = pd.DataFrame()
df_channels = pd.DataFrame()
df_ads = pd.DataFrame()

results = df_search_gen_bulk(paramlist)

In [9]:
def transpose(l):
    return list(map(list, zip(*l)))

df_search_res, df_info_res, df_channels_res, df_ads_res = transpose(results)

df_search = pd.concat(df_search_res, copy = False)
df_info = pd.concat(df_info_res, copy = False)
df_channels = pd.concat(df_channels_res, copy = False)
df_ads = pd.concat(df_ads_res, copy = False)

dataset.update('youtube_search.pkl', df_search, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_info.pkl', df_info, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_ads.pkl', df_ads, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_channels.pkl', df_channels, overwrite = True, proto = pickle_proto)

df_report(df_search, name = 'Search Result (Original)')
df_report(df_info, name = 'Info (Original)')
df_report(df_channels, name = 'Channels (Original)')
df_report(df_ads, name = 'Ad Placements (Original)')

  df.describe()


---

# Results - Search Result (Original)

## Data Preview

Unnamed: 0,id,title,description,time,tags,creator.id
0,mwuL0UmFPQk,NiKO got a HAiRCUT ✂️ ADLEY has a new HAiRST...,Best Park Day Ever 1186 Today's Best Day Ever ...,2021-03-09 16:00:30+00:00,,UCoK5NOxkZBLfI_5eqf8Es4Q
1,LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE After Chad Wild Clay ...,2021-03-11 16:03:05+00:00,,UCmRY4NSGK52lP_Lz11CjdYw
2,IqYVcxs4Qr0,🔴 Live: Baby&#39;s First Steps - Wolfoo Preten...,Live: Baby's First Steps - Wolfoo Pretends to ...,2021-03-11 07:16:45+00:00,,UCWGVQIspqW2j9M3-qLQ0HDg
3,YNeKQhPIpt8,KiDS Leprechaun TRAP!! Adley &amp; Niko make ...,The ultimate plan to catch a Leprechaun LET'...,2021-03-08 16:00:44+00:00,,UCBJuxfqZuiibvcwSc8ViGsQ
4,8ni8kdhGTc0,A LOST MERMAiD!! 5 year old Mermaid learns ho...,ummmm.... are Adley & Mom Super Heroes?! LET'S...,2021-02-26 16:01:14+00:00,,UCBJuxfqZuiibvcwSc8ViGsQ


## Stats

Unnamed: 0,id,title,description,time,tags,creator.id
count,5200,5200,5138,5200,0.0,5200
unique,5022,5014,4649,4941,0.0,3013
top,qT0TdxXB1K0,The Gummy Bear Song - Long English Version,#MeControTe #TeamTrote #Adv #LuieSofi © ME CON...,2021-03-10 15:00:15+00:00,,UCLqCmbd6bgcLaBVz3aA-68A
freq,3,3,16,4,,63
first,,,,2007-04-22 18:07:26+00:00,,
last,,,,2021-03-11 19:03:22+00:00,,


---

---

# Results - Info (Original)

## Data Preview

Unnamed: 0,id,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality
0,lNwE2DIRaL8,BIANKINHA EM: A BATALHA DO TIK TOK !! AS RIVAI...,Me Sigam no Instagram: @BIANKINHAHA\n\nDiretor...,2021-01-01 22:15:04+00:00,0 days 00:08:06,"[Biankinha, Rivais, TIK Tok, Batalha, Amigos, ...",Entertainment,UCF2OF9f3PxoFNhZbbmcamHg,95482.0,6260.0,,3943357.0,HD
1,tnCjvg1IaBo,CREMOSINHO ESCOLHEU A MÃE,SITE DE VENDAS DA GROWTH\nhttp://www.gsuplemen...,2021-03-06 14:00:16+00:00,0 days 00:09:56,"[TOGURO, CREMOSINHO, MANSAO MAROMBA]",Entertainment,UCEI44xNfQmAukxMf1kW8d5g,169969.0,1122.0,3759.0,1386953.0,HD
2,9O-H3JvWyg0,O FACE GUARDIÃO ATACOU A GENTE PELA PRIMEIRA V...,E aí galera ! A produção preparou algo assusta...,2021-03-08 21:00:18+00:00,0 days 00:14:22,"[rafaella, baltar, luiz, phellipe, K.FUN]",Entertainment,UC6paX0kLA8D81aYogceMufw,85282.0,1607.0,18771.0,1004492.0,HD
3,kH_xVuhgMVw,IL MIO SERPENTE HA MANGIATO TROPPO!! (3 Giochi...,NUOVO NEGOZIO: https://gabby16bitshop.com\nLib...,2021-03-01 20:55:55+00:00,0 days 00:14:48,"[Gabby, Gabby16bit, Gabby16, 16, bit, 16bit, G...",Gaming,UC3VEGA6pJKPsPAHyB3n5BNQ,21431.0,442.0,855.0,585095.0,HD
4,ugRc5jx80yg,Testing if Sharks Can Smell a Drop of Blood,Scientific proof Pixar sits on a throne of lie...,2019-07-28 16:17:19+00:00,0 days 00:15:35,"[sharkweek, shark, discovery, bose, mark rober...",Science & Technology,UCY1kMZp36IQSyNx_9h4mpCg,1520686.0,69709.0,66719.0,83331814.0,HD


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,5000,4932.0,4932.0,4688.0,4996.0
mean,0 days 00:17:09.134400,288307.2,25567.46,15047.283276,38692350.0
std,0 days 00:46:44.947979369,1106660.0,131536.0,65397.505885,185715700.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:34,4019.0,86.0,146.0,118651.0
50%,0 days 00:06:32.500000,24595.0,685.0,1130.0,1004492.0
75%,0 days 00:14:41,114696.0,4762.0,4658.5,7800434.0
max,0 days 10:57:53,18401540.0,3205456.0,799226.0,2996023000.0


---

---

# Results - Channels (Original)

## Data Preview

Unnamed: 0,id,title,description,time,stats.follower,stats.view,stats.post
0,UCvH6u_Qzn5RQdz9W198umDw,한국고전영화 Korean Classic Film,한국영상자료원 유튜브 채널에 오신 것을 환영합니다. 이 곳에서 무료로 190여편의 ...,2011-09-09 00:34:03+00:00,618000.0,250794475,200
1,UCFBzTm13T9xls5nSc8ik5Ag,No Matinho,"Eai galerinha, somos Allana e Israel, moramos ...",2016-10-23 21:30:19+00:00,2750000.0,785655381,263
2,UCJElRTCNEmLemgirqvsW63Q,A Spor,"Galatasaray ,Fenerbahçe, Beşiktaş,Trabzonspor ...",2014-08-22 13:47:54+00:00,289000.0,150993596,13038
3,UCqoZcZ5SDuxexeKhCvmqcqg,Ducky Extra,Just Gaming\n\nBe sure to subscribe if you are...,2017-06-29 10:34:36+00:00,1360000.0,122554159,132
4,UCH2958aySlHIuMzpUE8Xe9A,Tex HS,"Olá pessoal, eu sou o Tex HS. \n\nEu posto víd...",2016-06-15 12:19:53+00:00,3270000.0,1092270410,1358


## Stats

Unnamed: 0,stats.follower,stats.view,stats.post
count,2206.0,2278.0,2278.0
mean,4460099.0,2577245000.0,3077.564091
std,11165490.0,8881584000.0,15005.200171
min,125.0,10192.0,1.0
25%,179000.0,50456960.0,104.0
50%,974000.0,274794100.0,379.0
75%,3130000.0,1342517000.0,1264.0
max,106000000.0,95872600000.0,249671.0


---

---

# Results - Ad Placements (Original)

## Data Preview

Unnamed: 0,id,ads
0,tL8xI__LOTY,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
1,t_wm0hcCG5k,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
2,zMEyPQFjlUg,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
3,YNeKQhPIpt8,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4,IDCki2E-8TI,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,id,ads
count,5150,4606
unique,5022,12
top,sCbbMZ-q4-I,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
freq,3,2495


---

##### STEP 2  Data Cleaning

In [10]:
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    return df.drop(columns = df.columns & df_other.columns, *args, **kwargs)

# - search
df_search.set_index(['id'], inplace = True)
# - info
df_info.set_index(['id'], inplace = True)
# - channels
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
df_ads.set_index(['id'], inplace = True)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
drop_common(df_search, df_info, inplace = True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)

##### STEP 3  Data Inspection

In [12]:
# take a brief look at our data
df_report(df_search_details, name = 'Search Result')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE\n\nAfter Chad Wild Cl...,2021-03-11 16:03:05+00:00,0 days 00:19:12,"[spy ninjas, spy ninja, chad wild clay, cwc, v...",Howto & Style,UCmRY4NSGK52lP_Lz11CjdYw,41687.0,921.0,24990.0,383337.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE\n\nAfter Chad Wild Cl...,2021-03-11 16:03:05+00:00,0 days 00:19:12,"[spy ninjas, spy ninja, chad wild clay, cwc, v...",Howto & Style,UCmRY4NSGK52lP_Lz11CjdYw,41687.0,921.0,24990.0,383337.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE\n\nAfter Chad Wild Cl...,2021-03-11 16:03:05+00:00,0 days 00:19:12,"[spy ninjas, spy ninja, chad wild clay, cwc, v...",Howto & Style,UCmRY4NSGK52lP_Lz11CjdYw,41687.0,921.0,24990.0,383337.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE\n\nAfter Chad Wild Cl...,2021-03-11 16:03:05+00:00,0 days 00:19:12,"[spy ninjas, spy ninja, chad wild clay, cwc, v...",Howto & Style,UCmRY4NSGK52lP_Lz11CjdYw,41687.0,921.0,24990.0,383337.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
LY1-3fHx84c,IS THE STALKER A GHOST? Trapped in a Haunted H...,ESCAPE THE HAUNTED HOUSE\n\nAfter Chad Wild Cl...,2021-03-11 16:03:05+00:00,0 days 00:19:12,"[spy ninjas, spy ninja, chad wild clay, cwc, v...",Howto & Style,UCmRY4NSGK52lP_Lz11CjdYw,41687.0,921.0,24990.0,383337.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,6024,5956.0,5956.0,5640.0,6020.0
mean,0 days 00:18:30.394422310,329611.0,31994.54,17861.699291,41797490.0
std,0 days 00:58:59.762825460,1176976.0,135221.0,76214.666372,176887200.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:33,4484.0,94.0,128.0,152175.0
50%,0 days 00:06:17,29006.0,921.0,1159.5,1038170.0
75%,0 days 00:14:22,138238.0,5323.0,5068.0,9635872.0
max,0 days 10:57:53,18401540.0,3205456.0,799226.0,2996023000.0


---

##### STEP 4  Data Archiving

In [13]:
dataset.update('youtube_search_details.pkl', df_search_details, proto = pickle_proto)
# verify that we saved the correct data
df_report(dataset.load('youtube_search_details.pkl'), name = 'Search Result (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-1K_yQO8P_0,J / Feel Your Blaze,J LIVE and LET RIDE\n@SHIBUYA O-EAST 20081231\...,2017-02-09 15:12:20+00:00,0 days 00:05:46,"[LUNA SEA, J LUNA SEA, Jun Onose, wumf, j feel...",Music,UCSJUUn_KQw1T3gff5mRjblg,291.0,3.0,24.0,48009.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
-1K_yQO8P_0,J / Feel Your Blaze,J LIVE and LET RIDE\n@SHIBUYA O-EAST 20081231\...,2017-02-09 15:12:20+00:00,0 days 00:05:46,"[LUNA SEA, J LUNA SEA, Jun Onose, wumf, j feel...",Music,UCSJUUn_KQw1T3gff5mRjblg,291.0,3.0,24.0,48009.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
-1K_yQO8P_0,J / Feel Your Blaze,J LIVE and LET RIDE\n@SHIBUYA O-EAST 20081231\...,2017-02-09 15:12:20+00:00,0 days 00:05:46,"[LUNA SEA, J LUNA SEA, Jun Onose, wumf, j feel...",Music,UCSJUUn_KQw1T3gff5mRjblg,291.0,3.0,24.0,48009.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
-1K_yQO8P_0,J / Feel Your Blaze,J LIVE and LET RIDE\n@SHIBUYA O-EAST 20081231\...,2017-02-09 15:12:20+00:00,0 days 00:05:46,"[LUNA SEA, J LUNA SEA, Jun Onose, wumf, j feel...",Music,UCSJUUn_KQw1T3gff5mRjblg,291.0,3.0,24.0,48009.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
-49fGh_Hbhw,My Pet Fish are DYING In My Backyard Pond! Res...,🔹Buy 𝙂𝙊𝙊𝙂𝘼𝙉 𝘽𝘼𝙄𝙏𝙎 -- https://googansquad.com/...,2021-03-09 00:19:41+00:00,0 days 00:24:57,"[fishing, survial, camping, outdoors, cooking,...",Howto & Style,UCI6gB6eZS0c3ZrjcJq7xAtA,5187.0,93.0,648.0,132494.0,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,8065,7981.0,7981.0,7359.0,8061.0
mean,0 days 00:18:13.702665840,352231.6,50718.59,23141.19,53799650.0
std,0 days 00:55:43.814600751,1289249.0,195740.7,185280.1,232993200.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:43,4355.0,92.0,78.0,143465.0
50%,0 days 00:09:13,33519.0,1174.0,1006.0,1038568.0
75%,0 days 00:14:22,178171.0,9120.0,5854.0,14442570.0
max,0 days 15:13:24,21113580.0,4142776.0,5198500.0,4418250000.0


---