# Data Sampling

## Setup

In [15]:
import sys

# Install through the interpreter that runs this kernel (`sys.executable`), so
# the packages land where the notebook can import them.
# pandas is held to the 1.x series; the remaining dependencies are taken from
# requirements.txt.
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [16]:
from goodies import *
import pandas as pd

## Data Collection

In [17]:
from dcollect import plugins

# Shared configuration: both API objects created further down receive these
# two values as their `modules` and `headers` arguments.
# The 'http' entry is whatever `plugins.fasthttp()` returns
# (presumably the HTTP backend used for requests - confirm in dcollect).
modules = {'http': plugins.fasthttp()}
# Passed through unchanged as `headers`; presumably None means "use the
# library defaults" - TODO confirm.
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [18]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Imported here so that this cell stays runnable on its own.
from getpass import getpass

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
# Prompt with `getpass` rather than `input`: the typed key is not echoed into
# the cell output, so it cannot end up saved inside the notebook file.
# A non-empty `api_key_testing` takes precedence and skips the prompt.
api_key = api_key_testing or getpass('YouTube Data API Key: ')

# The dataset id becomes part of the archive file names in STEP 5.
# Unlike the key above, the typed value wins here; `dataset_id_testing` is
# only the fallback for an empty answer.
dataset_id_testing = ''
dataset_id = input('Dataset ID for collision avoidance: ') or dataset_id_testing

YouTube Data API Key: [REDACTED]
Dataset ID for collision avoidance: 


#### Search

In [19]:
# Search parameters; both are handed to `youtube_o.video.search` in STEP 2.
# `count` is forwarded as the `count` argument
# (presumably the number of results requested - confirm against dcollect).
count = 50
# Search keyword (empty here). Besides the search call, it is embedded in the
# archive file names and stored in the result's `attrs` in STEP 5.
keyword = ''

##### STEP 1  API Object Creation

In [20]:
# YouTube Data API object - the only one of the two that is given the API key
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)

# YouTube Internals API object - same modules and headers, no key
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [21]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# only report warnings and above while collecting
log.enable(level = log.levels.WARNING)


# the search runs first: every follow-up request is keyed on its result
df_search = df_from_json(
    youtube_o.video.search(
        count = count,
        keyword = keyword,
        safesearch = youtube.resource.safesearch.NONE
    )
)

# placeholders; each worker below rebinds one of them through `globals()`
df_info = df_channels = df_ads = None

# one thread per follow-up request, started together and then awaited
thread.start([
    threading.Thread(target = fetch)
    for fetch in (
        # - info
        lambda: globals().update(
            df_info = df_from_json(
                youtube_o.video.info(id = df_search['id'])
            )
        ),
        # - channels
        lambda: globals().update(
            df_channels = df_from_json(
                youtube_o.channel.info(id = df_search['creator.id'])
            )
        ),
        # - ad placements
        lambda: globals().update(
            df_ads = df_from_json(
                youtubei_o.ad.placements(id = df_search['id'])
            )
        ),
    )
])
thread.join()



##### STEP 3  Data Cleaning


In [22]:
# - ads (filter)
def filter_has_ad(ads):
    """Tell whether an `ads` cell carries ad data.

    Args:
        ads: one value from the `ads` column; None marks "no ads".

    Returns:
        bool: False only when `ads` is None, True for anything else
        (including an empty collection).
    """
    # Identity check instead of `== None`: equality would dispatch to the
    # value's own `__eq__`, which may return a non-bool or a wrong answer.
    # NOTE(review): a missing value stored as NaN rather than None still
    # counts as "has ad" here - confirm how missing values arrive.
    return ads is not None
def filter_has_ad_beginning(ads):
    """Tell whether any ad in `ads` is of the START kind.

    Args:
        ads: None, or an iterable of mappings that each have a 'kind' key.

    Returns:
        bool: True if at least one entry's 'kind' equals
        `youtubei.resource.ad.kinds.START`; False for None, for an empty
        iterable, or when no entry matches.
    """
    # Identity check: `== None` would go through the value's own `__eq__`.
    if ads is None:
        return False
    # `any` stops at the first matching entry, like the explicit loop it replaces.
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Drop from `df` every column whose label also appears in `df_other`.

    Args:
        df: frame to drop columns from.
        df_other: frame whose column labels decide what gets dropped.
        *args, **kwargs: forwarded unchanged to `DataFrame.drop`
            (for example `inplace = True`).

    Returns:
        Whatever `DataFrame.drop` returns: a new frame, or None when
        `inplace = True` is passed.
    """
    # `Index.intersection` states the set operation explicitly. The `&`
    # operator on two Index objects is deprecated as a set operation in
    # pandas 1.x and is an element-wise logical AND in later versions.
    common = df.columns.intersection(df_other.columns)
    return df.drop(*args, columns = common, **kwargs)

# Index every frame by its key column(s).

# - search: keyed by video id and creator id
df_search.set_index(keys = ['id', 'creator.id'], inplace = True)
# - info: same key pair as the search result
df_info.set_index(keys = ['id', 'creator.id'], inplace = True)
# - channels: prefix all columns with 'creator.', then key by 'creator.id'
df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])
# - ads: key by video id, derive the two boolean flags from the `ads`
#   column, then discard that column
df_ads.set_index(keys = ['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop(columns = 'ads', inplace = True)

# `df_search` and `df_info` are the only pair with clashing column names:
# remove the shared columns from `df_search` before the frames are merged
drop_common(df_search, df_info, inplace = True)

  del sys.path[0]


In [23]:
# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    left_on = ['id', 'creator.id'], 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - channels
df_search_details_channels = df_channels

##### STEP 4  Data Inspection


In [24]:
# take a brief look at our data
df_report(df_search_details, name = 'Search Result')
df_report(df_search_details_channels, name = 'Search Result (Channels)')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
WYdi1bL6s10,UC3XTzVzaHQEd30rQbuvCtTQ,Raids: Last Week Tonight with John Oliver (HBO),John Oliver explains how raids became a favori...,2021-03-01 07:45:00+00:00,0 days 00:25:55,,Entertainment,154642.0,3621.0,18604.0,3213484.0,HD,False,False
rZGurRM6s_o,UCpEhnqL0y41EpW2TvWAHD7Q,Pawandeep के Performance ने किया Neha को Senti...,Click here to Subscribe to SET INDIA Channel: ...,2021-03-01 15:00:20+00:00,0 days 00:04:28,"[indian reality shows, singing talent, indian ...",Entertainment,61562.0,864.0,1766.0,2037485.0,HD,True,True
B3lyWZl2zbA,UCpEhnqL0y41EpW2TvWAHD7Q,Tejas का Performance देख के Tiger हुए Shock | ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:00:02+00:00,0 days 00:08:11,"[set india, romance on SET, romantic performan...",Entertainment,19404.0,957.0,140.0,1774021.0,HD,True,True
_c4Qh4_T_Ks,UCpEhnqL0y41EpW2TvWAHD7Q,Jayshree ने अपने Performance से जीता सबका दिल ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:30:03+00:00,0 days 00:08:02,"[set india, romance on SET, romantic performan...",Entertainment,11557.0,579.0,235.0,854075.0,HD,True,True
v_Ofrk5JmYQ,UC8BzJM6_VbZTdiNLD4R1jxQ,วันนี้คุณนาบีดูแลลูกค้าเองเลยเหรอครับ? | นาบี ...,นาบี ผู้จัดการสาวสวยของ DUBAI CLUB น้องสาวของ ...,2021-03-01 13:45:01+00:00,0 days 00:03:27,"[GMMTV, GMM-TV, GMM, TV, GMMTV SPOTLIKE, gmmtv...",Entertainment,3150.0,101.0,35.0,439629.0,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,50,48.0,48.0,45.0,39.0
mean,0 days 00:42:37.620000,29728.791667,3626.541667,4482.4,1488915.0
std,0 days 00:52:00.897482704,117341.594461,21116.830051,21172.765408,3223670.0
min,0 days 00:00:30,25.0,0.0,0.0,11219.0
25%,0 days 00:03:04.500000,492.0,36.25,42.0,78414.5
50%,0 days 00:10:44,4010.0,248.5,235.0,535588.0
75%,0 days 01:31:46.750000,12962.5,867.75,1322.0,1660571.0
max,0 days 02:29:30,806357.0,146781.0,141855.0,19740510.0


---

---

# Results - Search Result (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCpr5mSSUbIQU816dNjaDdyg,Indonesian Idol,This is the Official YouTube Channel of Indone...,2011-12-29 07:12:28+00:00,6100000.0,2255775607,1386
UCejaUOXYgKkeVlayiF8ZFiw,YouTube Movies,YouTube's movies destination featuring the lat...,2018-04-15 21:01:27+00:00,,5011,10
UCd97ukfGaYt4LKtIgKm9Vhw,HiTechEntertainment,HI-TECH ENTERTAINMENT brings you super hit Nep...,2012-02-11 04:24:39+00:00,1110000.0,357109563,561
UC6t1Tkom_lz6nPYsBjOeD9w,Shemaroo Kannada,Welcome to shemarookannada channel one of the ...,2011-04-08 09:26:35+00:00,2810000.0,1183089450,5147
UCsN174eDjrNll0zJc7kHjUA,FoneArena,FoneArena is a leading mobile phone portal whi...,2009-03-31 19:16:28+00:00,174000.0,63086455,4144


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,25.0,30.0,30.0
mean,7680268.0,4461696000.0,15369.266667
std,19327370.0,14643790000.0,31839.254225
min,43000.0,5011.0,5.0
25%,493000.0,95878480.0,355.75
50%,2810000.0,512146000.0,1284.5
75%,6450000.0,2831778000.0,15591.25
max,98100000.0,80739250000.0,145567.0


---

##### STEP 5  Data Archiving


In [25]:
# Forwarded as `proto` to `df_update_pickle`
# (presumably the pickle protocol version - confirm in goodies).
pickle_proto = 3

# Both archive names embed the file-name-safe keyword and the dataset id
# entered during setup.
pickle_fname = f'dsamples/youtube_search_{as_fname(keyword)}_{dataset_id}.pkl'
pickle_fname_channels = f'dsamples/youtube_search_channels_{as_fname(keyword)}_{dataset_id}.pkl'

# Record the keyword on the frame itself before it is written out.
# Only the video frame is tagged; the channels frame is not.
df_search_details.attrs['_search_keyword'] = keyword

# NOTE(review): whether `df_update_pickle` overwrites an existing file or
# merges into it cannot be told from this notebook - confirm in goodies.
df_update_pickle(df_search_details, pickle_fname, proto = pickle_proto)
df_update_pickle(df_search_details_channels, pickle_fname_channels, proto = pickle_proto)

# verify that we saved the correct data
# The files are read back from disk and reported again so the result can be
# compared with the reports from STEP 4. Unpickling executes code embedded in
# the file, so only load archives that this notebook produced.
df_search_details_verify = pd.read_pickle(pickle_fname)
df_report(df_search_details_verify, name = 'Search Result (Verification)')
df_search_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(df_search_details_channels_verify, name = 'Search Result (Channels) (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
WYdi1bL6s10,UC3XTzVzaHQEd30rQbuvCtTQ,Raids: Last Week Tonight with John Oliver (HBO),John Oliver explains how raids became a favori...,2021-03-01 07:45:00+00:00,0 days 00:25:55,,Entertainment,154642.0,3621.0,18604.0,3213484.0,HD,False,False
rZGurRM6s_o,UCpEhnqL0y41EpW2TvWAHD7Q,Pawandeep के Performance ने किया Neha को Senti...,Click here to Subscribe to SET INDIA Channel: ...,2021-03-01 15:00:20+00:00,0 days 00:04:28,"[indian reality shows, singing talent, indian ...",Entertainment,61562.0,864.0,1766.0,2037485.0,HD,True,True
B3lyWZl2zbA,UCpEhnqL0y41EpW2TvWAHD7Q,Tejas का Performance देख के Tiger हुए Shock | ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:00:02+00:00,0 days 00:08:11,"[set india, romance on SET, romantic performan...",Entertainment,19404.0,957.0,140.0,1774021.0,HD,True,True
_c4Qh4_T_Ks,UCpEhnqL0y41EpW2TvWAHD7Q,Jayshree ने अपने Performance से जीता सबका दिल ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:30:03+00:00,0 days 00:08:02,"[set india, romance on SET, romantic performan...",Entertainment,11557.0,579.0,235.0,854075.0,HD,True,True
v_Ofrk5JmYQ,UC8BzJM6_VbZTdiNLD4R1jxQ,วันนี้คุณนาบีดูแลลูกค้าเองเลยเหรอครับ? | นาบี ...,นาบี ผู้จัดการสาวสวยของ DUBAI CLUB น้องสาวของ ...,2021-03-01 13:45:01+00:00,0 days 00:03:27,"[GMMTV, GMM-TV, GMM, TV, GMMTV SPOTLIKE, gmmtv...",Entertainment,3150.0,101.0,35.0,439629.0,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,50,48.0,48.0,45.0,39.0
mean,0 days 00:42:37.620000,29728.791667,3626.541667,4482.4,1488915.0
std,0 days 00:52:00.897482704,117341.594461,21116.830051,21172.765408,3223670.0
min,0 days 00:00:30,25.0,0.0,0.0,11219.0
25%,0 days 00:03:04.500000,492.0,36.25,42.0,78414.5
50%,0 days 00:10:44,4010.0,248.5,235.0,535588.0
75%,0 days 01:31:46.750000,12962.5,867.75,1322.0,1660571.0
max,0 days 02:29:30,806357.0,146781.0,141855.0,19740510.0


---

---

# Results - Search Result (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCpr5mSSUbIQU816dNjaDdyg,Indonesian Idol,This is the Official YouTube Channel of Indone...,2011-12-29 07:12:28+00:00,6100000.0,2255775607,1386
UCejaUOXYgKkeVlayiF8ZFiw,YouTube Movies,YouTube's movies destination featuring the lat...,2018-04-15 21:01:27+00:00,,5011,10
UCd97ukfGaYt4LKtIgKm9Vhw,HiTechEntertainment,HI-TECH ENTERTAINMENT brings you super hit Nep...,2012-02-11 04:24:39+00:00,1110000.0,357109563,561
UC6t1Tkom_lz6nPYsBjOeD9w,Shemaroo Kannada,Welcome to shemarookannada channel one of the ...,2011-04-08 09:26:35+00:00,2810000.0,1183089450,5147
UCsN174eDjrNll0zJc7kHjUA,FoneArena,FoneArena is a leading mobile phone portal whi...,2009-03-31 19:16:28+00:00,174000.0,63086455,4144


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,25.0,30.0,30.0
mean,7680268.0,4461696000.0,15369.266667
std,19327370.0,14643790000.0,31839.254225
min,43000.0,5011.0,5.0
25%,493000.0,95878480.0,355.75
50%,2810000.0,512146000.0,1284.5
75%,6450000.0,2831778000.0,15591.25
max,98100000.0,80739250000.0,145567.0


---