# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade ipykernel
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd
import os

## Data Collection

In [3]:
from dcollect import plugins

modules = {'http': plugins.fasthttp()}
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_experiment = False
api_key_testing = None
api_key = os.environ.get('YOUTUBE_API_KEY') or api_key_testing

if not api_key:
    api_key = os.environ.get('YOUTUBE_EXPLORER_API_KEY')
    if api_key: 
        api_experiment = True
    else: 
        api_key = input('YouTube Data API Key: ')
        api_experiment = (input('Is this an explorer key? [Y/N]: ') == 'Y')

dataset_id = os.environ.get('DATASET_NAME')
if dataset_id == None:
    dataset_id = input('Dataset Name: ')
    
sample_size_per_query_default = 1000000
sample_size_per_query = os.environ.get('SAMPLE_SIZE_PER_QUERY')    
if sample_size_per_query == None:
    sample_size_per_query = input('Sample size per query: ') or sample_size_per_query_default
    
sample_size_per_query = int(sample_size_per_query)

YouTube Data API Key: AIzaSyAa8yy0GdcGPHdtD083HiGGx_S0vMPScDM
Is this an explorer key? [Y/N]: Y
Dataset Name: random_ascii
Sample size per query: 5


#### Search

##### STEP 1  Data Collection

In [5]:
# create a YouTube API object
youtube_o = youtube.api(
    modules = modules,
    headers = headers,
    key = api_key,
    experiment = api_experiment
)

# create a YouTube Internals API object
youtubei_o = youtubei.api(
    modules = modules,
    headers = headers
)

pickle_proto = 3
dataset = eda_utils.dataset(f'dsamples/youtube_search_{dataset_id}.dataset')

In [6]:
def df_search_gen(*args, **kwargs):
    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)
    import concurrent.futures

    df_search = None
    df_info = None
    df_channels = None
    df_ads = None
    
    def worker_df_search(*args, **kwargs):
        nonlocal df_search
        df_search = df_from_json(
            youtube_o.video.search(
                *args, **kwargs
            )
        )
        
    def worker_df_info():
        nonlocal df_info
        df_info = df_from_json(
            youtube_o.video.info(
                id = df_search['id']
            )
        )
            
    def worker_df_ads():
        nonlocal df_ads
        df_ads = df_from_json(
            youtubei_o.ad.placements(
                id = df_search['id'],
                throttle_size = None
            )
        )
            
    def worker_df_channels():
        nonlocal df_channels
        df_channels = df_from_json(
            youtube_o.channel.info(
                id = df_search['creator.id']
            )
        )
            
    # - search
    worker_df_search(*args, **kwargs)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for worker in worker_df_info, worker_df_ads, worker_df_channels:
            executor.submit(worker)
        executor.shutdown(wait = True)
                
    return df_search, df_info, df_channels, df_ads

In [7]:
def df_search_gen_bulk(paramlist: list):
    import concurrent.futures
    
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(df_search_gen, **param) for param in paramlist]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]
        
    return results

In [8]:
import string

param_default = {
    'count': sample_size_per_query
}

paramlist = []
for c in string.ascii_lowercase:
    param = dict(param_default)
    param.update({
        'keyword': c
    })
    paramlist.append(param)
    
df_search = pd.DataFrame()
df_info = pd.DataFrame()
df_channels = pd.DataFrame()
df_ads = pd.DataFrame()

results = df_search_gen_bulk(paramlist)

In [9]:
def transpose(l):
    return list(map(list, zip(*l)))

df_search_res, df_info_res, df_channels_res, df_ads_res = transpose(results)

df_search = pd.concat(df_search_res, copy = False)
df_info = pd.concat(df_info_res, copy = False)
df_channels = pd.concat(df_channels_res, copy = False)
df_ads = pd.concat(df_ads_res, copy = False)

dataset.update('youtube_search.pkl', df_search, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_info.pkl', df_info, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_ads.pkl', df_ads, overwrite = True, proto = pickle_proto)
dataset.update('youtube_search_channels.pkl', df_channels, overwrite = True, proto = pickle_proto)

df_report(df_search, name = 'Search Result (Original)')
df_report(df_info, name = 'Info (Original)')
df_report(df_channels, name = 'Channels (Original)')
df_report(df_ads, name = 'Ad Placements (Original)')

  df.describe()


---

# Results - Search Result (Original)

## Data Preview

Unnamed: 0,id,title,description,time,tags,creator.id
0,W_BIrUdLRmw,I like ya cut G - How painful is it!?,ilikeyacutG #pain #Slapped The Pain Rankers ha...,2020-12-08 21:00:05+00:00,,UC108p_oMnVNmMlxpcNr43sQ
1,1N1QML9-tGI,I like ya cut g part 1,I like ya cut G shoutout to Bloxy for being an...,2020-09-29 14:22:27+00:00,,UCeTGkH3uDHsS0458NyYGFGQ
2,4vKYldeJUZk,Tik Toks that like Ya Cut G 🤣✂️,Thanks for watching the tik tok compilation. I...,2021-03-10 18:00:03+00:00,,UCOGljV-FSzt4rcHzalkPdvQ
3,3VoJ6AIJ8nk,y g vy g xv g,,2021-03-03 08:43:36+00:00,,UCu5M55UfQB8aLGGrOePJlVg
4,-fdYVVzo-bI,Benny G - Pazza d&#39;amore (Ufficiale 2020),#bennyg #pazzadamore #seamusica.,2020-06-06 09:30:12+00:00,,UCSvnev6rpHJvGc5Mf2em4GQ


## Stats

Unnamed: 0,id,title,description,time,tags,creator.id
count,130,130,128,130,0.0,130
unique,129,128,126,129,0.0,109
top,yff2MBEF9b0,Aplastando Cosas Crujientes y Suaves! Coca Col...,"Hola Amigos, En este EXPERIMENTO veremos Aplas...",2021-03-10 22:00:13+00:00,,UCVcQH8A634mauPrGbWs7QlQ
freq,2,2,2,2,,8
first,,,,2009-06-17 04:30:53+00:00,,
last,,,,2021-03-11 08:56:42+00:00,,


---

---

# Results - Info (Original)

## Data Preview

Unnamed: 0,id,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality
0,-fdYVVzo-bI,Benny G - Pazza d'amore (Ufficiale 2020),Benny G - Pazza d'amore (Ufficiale 2020)\nPuoi...,2020-06-06 09:30:12+00:00,0 days 00:03:52,"[Benny G, Pop, Italo, Seamusica, pazza d'amore...",Music,UCSvnev6rpHJvGc5Mf2em4GQ,40471.0,9215.0,,3490157,HD
1,1N1QML9-tGI,I like ya cut g part 1,I like ya cut G shoutout to Bloxy for being an...,2020-09-29 14:22:27+00:00,0 days 00:08:37,,Comedy,UCeTGkH3uDHsS0458NyYGFGQ,328113.0,6840.0,14238.0,15606962,HD
2,W_BIrUdLRmw,I like ya cut G - How painful is it!?,#ilikeyacutG #pain #Slapped\n\nThe Pain Ranker...,2020-12-08 21:00:05+00:00,0 days 00:09:07,"[i like ya cut g, i like ya cut g original, I ...",Entertainment,UC108p_oMnVNmMlxpcNr43sQ,53214.0,1631.0,6592.0,2052444,HD
3,4vKYldeJUZk,Tik Toks that like Ya Cut G 🤣✂️,Thanks for watching the tik tok compilation. I...,2021-03-10 18:00:03+00:00,0 days 00:23:20,"[tik tok, tiktok, tik tok compilation, tiktok ...",Entertainment,UCOGljV-FSzt4rcHzalkPdvQ,1819.0,28.0,92.0,76024,HD
4,3VoJ6AIJ8nk,y g vy g xv g,,2021-03-03 08:43:36+00:00,0 days 00:02:52,,People & Blogs,UCu5M55UfQB8aLGGrOePJlVg,93.0,37.0,11.0,25845,HD


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,130,129.0,129.0,117.0,130.0
mean,0 days 00:09:45.784615384,522763.3,74463.09,31760.51,73196490.0
std,0 days 00:10:08.071994215,1675836.0,343767.9,132076.2,316135600.0
min,0 days 00:00:00,82.0,0.0,0.0,0.0
25%,0 days 00:03:11.250000,4593.0,232.0,23.0,330900.2
50%,0 days 00:05:10,53214.0,2611.0,974.0,3340429.0
75%,0 days 00:14:19.250000,215961.0,21376.0,10244.0,23410650.0
max,0 days 01:00:24,10930770.0,3205025.0,1221216.0,2696372000.0


---

---

# Results - Channels (Original)

## Data Preview

Unnamed: 0,id,title,description,time,stats.follower,stats.view,stats.post
0,UCOGljV-FSzt4rcHzalkPdvQ,TikTok Dispenser,Hoping to brighten your day with a daily tikto...,2020-10-12 00:39:34.448645+00:00,28300.0,11417731,136
1,UCeTGkH3uDHsS0458NyYGFGQ,Zedex ZA,"If you wanna watch my content, sub. If not, do...",2017-11-12 07:21:30+00:00,72100.0,20891123,4
2,UC108p_oMnVNmMlxpcNr43sQ,Pain Rankers,"We rank Pain for Science, Entertainment and Cu...",2019-07-26 15:22:33+00:00,382000.0,25409567,92
3,UCSvnev6rpHJvGc5Mf2em4GQ,OfficialSeamusica,PRODUZIONE E DISTRIBUZIONI DISCOGRAFICHE.\nDIG...,2010-10-04 12:05:24+00:00,1870000.0,1673077109,14458
4,UCu5M55UfQB8aLGGrOePJlVg,Bean Hmong,,2020-01-05 11:06:33.046649+00:00,20600.0,7710354,84


## Stats

Unnamed: 0,stats.follower,stats.view,stats.post
count,116.0,119.0,119.0
mean,7543658.0,4088536000.0,1639.890756
std,14375750.0,10737920000.0,5792.336802
min,394.0,194179.0,2.0
25%,744750.0,220552200.0,125.0
50%,2070000.0,892454900.0,334.0
75%,8947500.0,3398575000.0,1144.0
max,106000000.0,95872600000.0,54848.0


---

---

# Results - Ad Placements (Original)

## Data Preview

Unnamed: 0,id,ads
0,3VoJ6AIJ8nk,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
1,1N1QML9-tGI,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
2,-fdYVVzo-bI,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
3,4vKYldeJUZk,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4,W_BIrUdLRmw,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,id,ads
count,130,116
unique,129,61
top,yff2MBEF9b0,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
freq,2,15


---

##### STEP 2  Data Cleaning

In [10]:
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    return df.drop(columns = df.columns & df_other.columns, *args, **kwargs)

# - search
df_search.set_index(['id'], inplace = True)
# - info
df_info.set_index(['id'], inplace = True)
# - channels
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
df_ads.set_index(['id'], inplace = True)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
drop_common(df_search, df_info, inplace = True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)

##### STEP 3  Data Inspection

In [12]:
# take a brief look at our data
df_report(df_search_details, name = 'Search Result')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
W_BIrUdLRmw,I like ya cut G - How painful is it!?,#ilikeyacutG #pain #Slapped\n\nThe Pain Ranker...,2020-12-08 21:00:05+00:00,0 days 00:09:07,"[i like ya cut g, i like ya cut g original, I ...",Entertainment,UC108p_oMnVNmMlxpcNr43sQ,53214.0,1631.0,6592.0,2052444,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
1N1QML9-tGI,I like ya cut g part 1,I like ya cut G shoutout to Bloxy for being an...,2020-09-29 14:22:27+00:00,0 days 00:08:37,,Comedy,UCeTGkH3uDHsS0458NyYGFGQ,328113.0,6840.0,14238.0,15606962,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4vKYldeJUZk,Tik Toks that like Ya Cut G 🤣✂️,Thanks for watching the tik tok compilation. I...,2021-03-10 18:00:03+00:00,0 days 00:23:20,"[tik tok, tiktok, tik tok compilation, tiktok ...",Entertainment,UCOGljV-FSzt4rcHzalkPdvQ,1819.0,28.0,92.0,76024,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
3VoJ6AIJ8nk,y g vy g xv g,,2021-03-03 08:43:36+00:00,0 days 00:02:52,,People & Blogs,UCu5M55UfQB8aLGGrOePJlVg,93.0,37.0,11.0,25845,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
-fdYVVzo-bI,Benny G - Pazza d'amore (Ufficiale 2020),Benny G - Pazza d'amore (Ufficiale 2020)\nPuoi...,2020-06-06 09:30:12+00:00,0 days 00:03:52,"[Benny G, Pop, Italo, Seamusica, pazza d'amore...",Music,UCSvnev6rpHJvGc5Mf2em4GQ,40471.0,9215.0,,3490157,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,136,135.0,135.0,123.0,136.0
mean,0 days 00:10:02.029411764,500518.1,71198.29,30211.22,69978970.0
std,0 days 00:09:59.231334061,1641157.0,336326.9,128970.5,309395900.0
min,0 days 00:00:00,82.0,0.0,0.0,0.0
25%,0 days 00:03:12,5138.0,332.0,12.0,272216.5
50%,0 days 00:05:18.500000,36214.0,2378.0,688.0,3024439.0
75%,0 days 00:14:52.500000,190605.0,15662.0,8495.0,22793000.0
max,0 days 01:00:24,10930770.0,3205025.0,1221216.0,2696372000.0


---

##### STEP 4  Data Archiving

In [13]:
dataset.update('youtube_search_details.pkl', df_search_details, proto = pickle_proto)
# verify that we saved the correct data
df_report(dataset.load('youtube_search_details.pkl'), name = 'Search Result (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,ads
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
W_BIrUdLRmw,I like ya cut G - How painful is it!?,#ilikeyacutG #pain #Slapped\n\nThe Pain Ranker...,2020-12-08 21:00:05+00:00,0 days 00:09:07,"[i like ya cut g, i like ya cut g original, I ...",Entertainment,UC108p_oMnVNmMlxpcNr43sQ,53214.0,1631.0,6592.0,2052444,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
1N1QML9-tGI,I like ya cut g part 1,I like ya cut G shoutout to Bloxy for being an...,2020-09-29 14:22:27+00:00,0 days 00:08:37,,Comedy,UCeTGkH3uDHsS0458NyYGFGQ,328113.0,6840.0,14238.0,15606962,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4vKYldeJUZk,Tik Toks that like Ya Cut G 🤣✂️,Thanks for watching the tik tok compilation. I...,2021-03-10 18:00:03+00:00,0 days 00:23:20,"[tik tok, tiktok, tik tok compilation, tiktok ...",Entertainment,UCOGljV-FSzt4rcHzalkPdvQ,1819.0,28.0,92.0,76024,HD,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
3VoJ6AIJ8nk,y g vy g xv g,,2021-03-03 08:43:36+00:00,0 days 00:02:52,,People & Blogs,UCu5M55UfQB8aLGGrOePJlVg,93.0,37.0,11.0,25845,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."
-fdYVVzo-bI,Benny G - Pazza d'amore (Ufficiale 2020),Benny G - Pazza d'amore (Ufficiale 2020)\nPuoi...,2020-06-06 09:30:12+00:00,0 days 00:03:52,"[Benny G, Pop, Italo, Seamusica, pazza d'amore...",Music,UCSvnev6rpHJvGc5Mf2em4GQ,40471.0,9215.0,,3490157,HD,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,136,135.0,135.0,123.0,136.0
mean,0 days 00:10:02.029411764,500518.1,71198.29,30211.22,69978970.0
std,0 days 00:09:59.231334061,1641157.0,336326.9,128970.5,309395900.0
min,0 days 00:00:00,82.0,0.0,0.0,0.0
25%,0 days 00:03:12,5138.0,332.0,12.0,272216.5
50%,0 days 00:05:18.500000,36214.0,2378.0,688.0,3024439.0
75%,0 days 00:14:52.500000,190605.0,15662.0,8495.0,22793000.0
max,0 days 01:00:24,10930770.0,3205025.0,1221216.0,2696372000.0


---