# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd

## Data Collection

In [3]:
from dcollect import plugins

modules = {'http': plugins.fasthttp()}
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
api_key = api_key_testing or input('YouTube Data API Key: ')
api_experiment = True

dataset_id_testing = ''
dataset_id = input('Dataset Name: ') or dataset_id_testing

YouTube Data API Key: AIzaSyAa8yy0GdcGPHdtD083HiGGx_S0vMPScDM
Dataset Name: random_ascii


#### Search

##### STEP 1  Data Collection

In [5]:
youtube_o = youtube.api(
    modules = modules,
    headers = headers,
    key = api_key,
    experiment = api_experiment
)
youtubei_o = youtubei.api(
    modules = modules,
    headers = headers
)

def df_search_gen(*args, **kwargs):
    from dcollect.utils.log import log
    log.enable(level = log.levels.WARNING)
    import concurrent.futures

    df_search = None
    df_info = None
    df_channels = None
    df_ads = None
    
    def worker_df_search(*args, **kwargs):
        nonlocal df_search
        df_search = df_from_json(
            youtube_o.video.search(
                *args, **kwargs
            )
        )
        
    def worker_df_info():
        nonlocal df_info
        df_info = df_from_json(
            youtube_o.video.info(
                id = df_search['id']
            )
        )
            
    def worker_df_ads():
        nonlocal df_ads
        df_ads = df_from_json(
            youtubei_o.ad.placements(
                id = df_search['id'],
                throttle_size = 100
            )
        )
            
    def worker_df_channels():
        nonlocal df_channels
        df_channels = df_from_json(
            youtube_o.channel.info(
                id = df_search['creator.id']
            )
        )
            
    # - search
    worker_df_search(*args, **kwargs)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for worker in worker_df_info, worker_df_ads, worker_df_channels:
            executor.submit(worker)
        executor.shutdown(wait = True)
                
    return df_search, df_info, df_channels, df_ads

In [6]:
def df_search_gen_bulk(paramlist: list):
    import concurrent.futures
    
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(df_search_gen, **param) for param in paramlist]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]
        
    return results

In [7]:
import string

param_default = {
    'count': 50
}

paramlist = []
for c in string.ascii_lowercase:
    param = dict(param_default)
    param.update({
        'keyword': c
    })
    paramlist.append(param)
    
df_search = pd.DataFrame()
df_info = pd.DataFrame()
df_channels = pd.DataFrame()
df_ads = pd.DataFrame()

results = df_search_gen_bulk(paramlist)

In [8]:
def transpose(l):
    return list(map(list, zip(*l)))

df_search_res, df_info_res, df_channels_res, df_ads_res = transpose(results)

df_search = pd.concat(df_search_res, copy = False)
df_info = pd.concat(df_info_res, copy = False)
df_channels = pd.concat(df_channels_res, copy = False)
df_ads = pd.concat(df_ads_res, copy = False)

df_report([df_search, df_info, df_channels, df_ads])

  df.describe()


---

# Results - 

## Data Preview

Unnamed: 0,id,title,description,time,tags,creator.id
0,bsN96zE8FuE,QAnon 101: The Search for Q,To understand what Q is and why it has been so...,2021-01-25 20:00:06+00:00,,UCn8zNIfYAQNdrFRrr8oibKw
1,u8Gd9MJsnnE,QAnon: The conspiracy theory spreading fake ne...,QAnon believers have speculated that this figh...,2020-07-23 23:19:01+00:00,,UC6o-wWU-v2ClFMwougmK7dA
2,Rhum_0F4weU,[NU’EST] Ha99y 9th anniversary ‘Q is.spoiler -...,뉴이스트 9주년 기념 아홉 가지 Q&A! 9에 진심인 질문과 답변들이 궁금하다면?!...,2021-03-10 06:15:00+00:00,,UCUuyrV8JDv5UAMW2StsL-NA
3,VGrfN3v5JL8,Sitting Down with QAnon Conspiracy Theorists -...,Jim gathers a group of QAnon conspiracy theori...,2018-08-22 01:18:14+00:00,,UCUsN5ZwHx2kILm84-jPDeXw
4,w1kebU_p0CA,Julian Assange on Q (AUDIO ONLY),"In a Canadian broadcast exclusive, controversi...",2014-09-29 14:12:57+00:00,,UC1nw_szfrEsDWcwD32wHE_w


## Stats

Unnamed: 0,id,title,description,time,tags,creator.id
count,1300,1300,1286,1300,0.0,1300
unique,1282,1277,1226,1279,0.0,867
top,N5xrKaFhjHc,Ovi x Natanael Cano x Junior H x Herencia De P...,http://www.pink.rs/ - Posetite najbrži portal ...,2012-07-15 07:46:32+00:00,,UC0-swBG9Ne0Vh4OuoJ2bjbA
freq,2,2,16,2,,37
first,,,,2007-10-09 20:54:01+00:00,,
last,,,,2021-03-10 16:30:12+00:00,,


---

---

# Results - 

## Data Preview

Unnamed: 0,id,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality
0,IRQMZADWoPI,【公式】ダイジェスト これまでの『ヱヴァンゲリヲン新劇場版：Ｑ』,ダイジェスト\n『ヱヴァンゲリヲン新劇場版：Ｑ』\nEVANGELION:3.0 YOU C...,2019-11-01 02:00:02+00:00,0 days 00:05:03,"[ヱヴァンゲリヲン新劇場版, エヴァンゲリオン, エヴァ, eva]",Film & Animation,UCM4FwhxLYLHlOSPnfxmxSQw,51814.0,1053.0,11183.0,5588327,HD
1,LcgG_E9gQJM,SKYFALL | Bond meets Q,Bond (Daniel Craig) meets his new quartermaste...,2020-07-26 09:00:08+00:00,0 days 00:02:49,,Film & Animation,UCwTkM6CvIsYFaFiMKIKCqHw,26180.0,523.0,1326.0,2111109,HD
2,cTFYejM96Co,Morning News With Mallanna 09-03-2021 || #Teen...,#LIVE భువనగిరి లో పట్టభద్రుల గర్జన...https://y...,2021-03-09 03:33:11+00:00,0 days 01:03:36,"[q news mallanna, q news live, q news mallanna...",News & Politics,UCI-7hequY2IuQjpuj6g9BlA,5781.0,214.0,387.0,153771,HD
3,u8Gd9MJsnnE,QAnon: The conspiracy theory spreading fake ne...,Twitter has become the first social media outl...,2020-07-23 23:19:01+00:00,0 days 00:08:59,"[QAnon, conspiracy theory, Donald trump, what ...",News & Politics,UC6o-wWU-v2ClFMwougmK7dA,9614.0,6911.0,11210.0,927526,HD
4,I8NVVgg9v-U,QAnon: Wer steckt hinter den bizarren Verschwö...,Auf Corona-Demos taucht immer häufiger der Buc...,2020-09-17 10:30:01+00:00,0 days 00:07:02,"[querdenken, galileo selbstexperiment, jumbo s...",Entertainment,UC1XrG1M_hw8103zO2x-oivg,2282.0,1955.0,1636.0,121913,HD


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1300,1280.0,1280.0,1173.0,1300.0
mean,0 days 00:16:17.021538461,386106.3,46323.91,27243.1,64172030.0
std,0 days 00:46:33.146425745,1548443.0,236212.1,226127.2,321369300.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:35,3833.5,112.25,118.0,113233.0
50%,0 days 00:08:02.500000,19743.0,1041.5,974.0,1274824.0
75%,0 days 00:15:08.250000,129284.0,8849.75,5725.0,14784800.0
max,0 days 15:13:24,21105080.0,4142134.0,5197883.0,4417995000.0


---

---

# Results - 

## Data Preview

Unnamed: 0,id,title,description,time,stats.follower,stats.view,stats.post
0,UC-3jIAlnQmbbVMV6gR7K8aQ,The Majority Report w/ Sam Seder,"The Majority Report is a daily, political talk...",2010-05-25 02:04:46+00:00,1020000.0,445288867,16495
1,UCZdGJgHbmqQcVZaJCkqDRwg,The Q,Science videos and more :)\n\nour facebook: ht...,2017-01-15 12:53:11+00:00,11300000.0,3294595758,217
2,UCTrQ7HXWRRxr7OsOtodr2_w,Channel 4 News,News that's committed to challenging expectati...,2006-07-05 23:18:56+00:00,1600000.0,830361269,11657
3,UCeqKIgPQfNInOswGRWt48kQ,ZDFheute Nachrichten,Willkommen auf dem offiziellen ZDFheute Nachri...,2019-08-05 12:42:10+00:00,322000.0,147531376,2147
4,UCX2M7xn-jMmq4KfX25TCTCA,HBO Brasil,Acesse um mundo inusitado de entretenimento. D...,2012-02-28 11:53:36+00:00,892000.0,155978465,2051


## Stats

Unnamed: 0,stats.follower,stats.view,stats.post
count,945.0,985.0,985.0
mean,6136526.0,3542005000.0,3878.405076
std,16506780.0,12960390000.0,16160.901048
min,57.0,41924.0,1.0
25%,269000.0,81891780.0,133.0
50%,1270000.0,402397300.0,448.0
75%,5120000.0,2065849000.0,1460.0
max,175000000.0,146140400000.0,249442.0


---

---

# Results - 

## Data Preview

Unnamed: 0,id,ads
0,IRQMZADWoPI,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
1,LcgG_E9gQJM,
2,cTFYejM96Co,
3,u8Gd9MJsnnE,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
4,I8NVVgg9v-U,"[{'kind': 'AD_PLACEMENT_KIND_START', 'offset':..."


## Stats

Unnamed: 0,id,ads
count,1300,1170
unique,1282,568
top,Xj82lgw_wtU,[{'kind': 'AD_PLACEMENT_KIND_COMMAND_TRIGGERED...
freq,2,117


---

##### STEP 2  Data Cleaning

In [9]:
# - ads (filter)
def filter_has_ad(ads):
    return not ads == None 
def filter_has_ad_beginning(ads):
    if ads == None:
        return False
    for ad in ads:
        if ad['kind'] == youtubei.resource.ad.kinds.START:
            return True
    return False
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    return df.drop(columns = df.columns & df_other.columns, *args, **kwargs)

# - search
df_search.set_index(['id'], inplace = True)
# - info
df_info.set_index(['id'], inplace = True)
# - channels
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
df_ads.set_index(['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = \
        df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop('ads', axis = 'columns', inplace = True)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
drop_common(df_search, df_info, inplace = True)

  del sys.path[0]


In [10]:
# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - channels
df_search_details_channels = df_channels

##### STEP 3  Data Inspection

In [11]:
# take a brief look at our data
df_report(df_search_details, name = 'Search Result')
df_report(df_search_details_channels, name = 'Search Result (Channels)')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
bsN96zE8FuE,QAnon 101: The Search for Q,To understand what Q is and why it has been so...,2021-01-25 20:00:06+00:00,0 days 00:46:30,"[vice_videos:premiere, politics, internet, mov...",Entertainment,UCn8zNIfYAQNdrFRrr8oibKw,29754.0,5197.0,15663.0,1653957,HD,False,False
u8Gd9MJsnnE,QAnon: The conspiracy theory spreading fake ne...,Twitter has become the first social media outl...,2020-07-23 23:19:01+00:00,0 days 00:08:59,"[QAnon, conspiracy theory, Donald trump, what ...",News & Politics,UC6o-wWU-v2ClFMwougmK7dA,9614.0,6911.0,11210.0,927526,HD,True,True
Rhum_0F4weU,[NU’EST] Ha99y 9th anniversary ‘Q is.spoiler -...,뉴이스트 9주년 기념 아홉 가지 Q&A!\n9에 진심인 질문과 답변들이 궁금하다면?...,2021-03-10 06:15:00+00:00,0 days 00:01:02,"[뉴이스트, 뉴이스트W, NU'EST, NU'EST W, JR, REN, ARON,...",Music,UCUuyrV8JDv5UAMW2StsL-NA,4350.0,26.0,475.0,14761,HD,True,True
VGrfN3v5JL8,Sitting Down with QAnon Conspiracy Theorists -...,Jim gathers a group of QAnon conspiracy theori...,2018-08-22 01:18:14+00:00,0 days 00:06:21,"[The Jim Jefferies Show, Jim Jefferies, Jim Je...",Comedy,UCUsN5ZwHx2kILm84-jPDeXw,24923.0,9415.0,16787.0,2116763,HD,False,False
w1kebU_p0CA,Julian Assange on Q (AUDIO ONLY),"In a Canadian broadcast exclusive, controversi...",2014-09-29 14:12:57+00:00,0 days 00:35:07,"[cbc, jian ghomeshi, interview, QTV, Julian As...",Entertainment,UC1nw_szfrEsDWcwD32wHE_w,435.0,26.0,144.0,28644,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1408,1388.0,1388.0,1269.0,1408.0
mean,0 days 00:15:47.441761363,484207.1,66026.15,51968.11,88193310.0
std,0 days 00:44:48.054139789,2045981.0,315156.8,416627.5,428589000.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:37,4099.75,132.5,125.0,129427.0
50%,0 days 00:08:12.500000,21832.5,1287.0,1038.0,1361942.0
75%,0 days 00:15:02.750000,138149.0,9807.75,6707.0,15811990.0
max,0 days 15:13:24,21105080.0,4142134.0,5197883.0,4417995000.0


---

---

# Results - Search Result (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UC-3jIAlnQmbbVMV6gR7K8aQ,The Majority Report w/ Sam Seder,"The Majority Report is a daily, political talk...",2010-05-25 02:04:46+00:00,1020000.0,445288867,16495
UCZdGJgHbmqQcVZaJCkqDRwg,The Q,Science videos and more :)\n\nour facebook: ht...,2017-01-15 12:53:11+00:00,11300000.0,3294595758,217
UCTrQ7HXWRRxr7OsOtodr2_w,Channel 4 News,News that's committed to challenging expectati...,2006-07-05 23:18:56+00:00,1600000.0,830361269,11657
UCeqKIgPQfNInOswGRWt48kQ,ZDFheute Nachrichten,Willkommen auf dem offiziellen ZDFheute Nachri...,2019-08-05 12:42:10+00:00,322000.0,147531376,2147
UCX2M7xn-jMmq4KfX25TCTCA,HBO Brasil,Acesse um mundo inusitado de entretenimento. D...,2012-02-28 11:53:36+00:00,892000.0,155978465,2051


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,945.0,985.0,985.0
mean,6136526.0,3542005000.0,3878.405076
std,16506780.0,12960390000.0,16160.901048
min,57.0,41924.0,1.0
25%,269000.0,81891780.0,133.0
50%,1270000.0,402397300.0,448.0
75%,5120000.0,2065849000.0,1460.0
max,175000000.0,146140400000.0,249442.0


---

##### STEP 4  Data Archiving

In [12]:
pickle_proto = 3

pickle_fname = f'dsamples/youtube_search_{dataset_id}.pkl'
pickle_fname_channels = f'dsamples/youtube_search_channels_{dataset_id}.pkl'

df_update_pickle(df_search_details, pickle_fname, proto = pickle_proto)
df_update_pickle(df_search_details_channels, pickle_fname_channels, proto = pickle_proto)

# verify that we saved the correct data
df_search_details_verify = pd.read_pickle(pickle_fname)
df_report(df_search_details_verify, name = 'Search Result (Verification)')
df_search_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(df_search_details_channels_verify, name = 'Search Result (Channels) (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,title,description,time,length,tags,category,creator.id,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
bsN96zE8FuE,QAnon 101: The Search for Q,To understand what Q is and why it has been so...,2021-01-25 20:00:06+00:00,0 days 00:46:30,"[vice_videos:premiere, politics, internet, mov...",Entertainment,UCn8zNIfYAQNdrFRrr8oibKw,29754.0,5197.0,15663.0,1653957,HD,False,False
u8Gd9MJsnnE,QAnon: The conspiracy theory spreading fake ne...,Twitter has become the first social media outl...,2020-07-23 23:19:01+00:00,0 days 00:08:59,"[QAnon, conspiracy theory, Donald trump, what ...",News & Politics,UC6o-wWU-v2ClFMwougmK7dA,9614.0,6911.0,11210.0,927526,HD,True,True
Rhum_0F4weU,[NU’EST] Ha99y 9th anniversary ‘Q is.spoiler -...,뉴이스트 9주년 기념 아홉 가지 Q&A!\n9에 진심인 질문과 답변들이 궁금하다면?...,2021-03-10 06:15:00+00:00,0 days 00:01:02,"[뉴이스트, 뉴이스트W, NU'EST, NU'EST W, JR, REN, ARON,...",Music,UCUuyrV8JDv5UAMW2StsL-NA,4350.0,26.0,475.0,14761,HD,True,True
VGrfN3v5JL8,Sitting Down with QAnon Conspiracy Theorists -...,Jim gathers a group of QAnon conspiracy theori...,2018-08-22 01:18:14+00:00,0 days 00:06:21,"[The Jim Jefferies Show, Jim Jefferies, Jim Je...",Comedy,UCUsN5ZwHx2kILm84-jPDeXw,24923.0,9415.0,16787.0,2116763,HD,False,False
w1kebU_p0CA,Julian Assange on Q (AUDIO ONLY),"In a Canadian broadcast exclusive, controversi...",2014-09-29 14:12:57+00:00,0 days 00:35:07,"[cbc, jian ghomeshi, interview, QTV, Julian As...",Entertainment,UC1nw_szfrEsDWcwD32wHE_w,435.0,26.0,144.0,28644,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,1408,1388.0,1388.0,1269.0,1408.0
mean,0 days 00:15:47.441761363,484207.1,66026.15,51968.11,88193310.0
std,0 days 00:44:48.054139789,2045981.0,315156.8,416627.5,428589000.0
min,0 days 00:00:00,1.0,0.0,0.0,0.0
25%,0 days 00:03:37,4099.75,132.5,125.0,129427.0
50%,0 days 00:08:12.500000,21832.5,1287.0,1038.0,1361942.0
75%,0 days 00:15:02.750000,138149.0,9807.75,6707.0,15811990.0
max,0 days 15:13:24,21105080.0,4142134.0,5197883.0,4417995000.0


---

---

# Results - Search Result (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UC-3jIAlnQmbbVMV6gR7K8aQ,The Majority Report w/ Sam Seder,"The Majority Report is a daily, political talk...",2010-05-25 02:04:46+00:00,1020000.0,445288867,16495
UCZdGJgHbmqQcVZaJCkqDRwg,The Q,Science videos and more :)\n\nour facebook: ht...,2017-01-15 12:53:11+00:00,11300000.0,3294595758,217
UCTrQ7HXWRRxr7OsOtodr2_w,Channel 4 News,News that's committed to challenging expectati...,2006-07-05 23:18:56+00:00,1600000.0,830361269,11657
UCeqKIgPQfNInOswGRWt48kQ,ZDFheute Nachrichten,Willkommen auf dem offiziellen ZDFheute Nachri...,2019-08-05 12:42:10+00:00,322000.0,147531376,2147
UCX2M7xn-jMmq4KfX25TCTCA,HBO Brasil,Acesse um mundo inusitado de entretenimento. D...,2012-02-28 11:53:36+00:00,892000.0,155978465,2051


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,945.0,985.0,985.0
mean,6136526.0,3542005000.0,3878.405076
std,16506780.0,12960390000.0,16160.901048
min,57.0,41924.0,1.0
25%,269000.0,81891780.0,133.0
50%,1270000.0,402397300.0,448.0
75%,5120000.0,2065849000.0,1460.0
max,175000000.0,146140400000.0,249442.0


---