# Data Sampling

## Setup

In [1]:
import sys

# Install dependencies with the interpreter that backs this kernel
# (`sys.executable`) rather than whichever `pip` is first on PATH.
# pandas is held to the 1.x series; everything else comes from requirements.txt.
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd

## Data Collection

In [3]:
from dcollect import plugins

# passed as `headers` to every API constructor below; no custom headers are set
headers = None
# plugin table passed as `modules` to every API constructor below;
# a single entry, keyed 'http', is registered
modules = dict(http = plugins.fasthttp())

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Hidden prompt: unlike `input()`, `getpass()` does not echo what is typed,
# so the key does not end up in the saved cell output.
from getpass import getpass

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
# The testing key (when set) takes precedence; otherwise ask for one.
# Surrounding whitespace from a copy-paste is stripped.
api_key = api_key_testing or getpass('YouTube Data API Key: ').strip()

# Suffix used in the archive file names so separate runs do not overwrite
# each other; an empty answer falls back to `dataset_id_testing`.
dataset_id_testing = ''
dataset_id = input('Dataset ID for collision avoidance: ') or dataset_id_testing

YouTube Data API Key: [REDACTED]
Dataset ID for collision avoidance: 


#### Search

In [5]:
# Search parameters, forwarded to `youtube_o.video.search` in STEP 2.
# NOTE(review): presumably an empty keyword means an unfiltered search — confirm.
keyword = ''
# number of results requested from the search
count = 50

##### STEP 1  API Object Creation

In [6]:
# Both API objects receive the same plugin table and headers.
# - YouTube API object (the only one that is given the API key)
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)
# - YouTube Internals API object (constructed without a key)
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [7]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set the collector's log level to WARNING
log.enable(level = log.levels.WARNING)


# run the search, then tabulate the JSON response
search_response = youtube_o.video.search(
    count = count,
    keyword = keyword,
    safesearch = youtube.resource.safesearch.NONE
)
df_search = df_from_json(search_response)

# placeholders; each worker below rebinds exactly one of these globals
df_info = df_channels = df_ads = None


def _collect_into(name, fetch):
    """Call `fetch`, tabulate its JSON result and bind it to the global `name`."""
    globals().update({name: df_from_json(fetch())})


# One thread per follow-up request. The lambdas defer both the lookup of
# `df_search[...]` and the API call until the worker actually runs.
# NOTE(review): `thread.start` / `thread.join` are project helpers — presumably
# they start the given threads and wait for all of them; confirm.
thread.start([
    threading.Thread(target = _collect_into, args = (name, fetch))
    for name, fetch in (
        # - info
        ('df_info', lambda: youtube_o.video.info(id = df_search['id'])),
        # - channels
        ('df_channels', lambda: youtube_o.channel.info(id = df_search['creator.id'])),
        # - ad placements
        ('df_ads', lambda: youtubei_o.ad.placements(id = df_search['id'])),
    )
])
thread.join()



##### STEP 3  Data Cleaning


In [8]:
# - ads (filter)
def filter_has_ad(ads):
    """Tell whether an ad-placement entry is present.

    Parameters:
        ads: the value of one cell of the `ads` column; `None` marks a
            video without ad placements.

    Returns:
        `False` when `ads` is `None`, `True` for anything else
        (including an empty list).
    """
    # Identity check instead of `== None`: equality can be overridden
    # (or be element-wise, as with numpy arrays) and then gives a wrong
    # answer or raises.
    return ads is not None
def filter_has_ad_beginning(ads):
    """Tell whether any ad placement is of the START kind.

    Parameters:
        ads: `None`, or an iterable of mappings that each carry a
            `'kind'` entry.

    Returns:
        `True` if at least one entry's `'kind'` equals
        `youtubei.resource.ad.kinds.START`; `False` otherwise, including
        when `ads` is `None` or empty.
    """
    # identity check: `== None` may be overridden or element-wise
    if ads is None:
        return False
    # the constant is looked up inside the generator, so an empty `ads`
    # never touches `youtubei`
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Drop from `df` every column whose label also appears in `df_other`.

    Parameters:
        df: frame to drop columns from.
        df_other: frame whose column labels decide what is dropped.
        *args, **kwargs: forwarded unchanged to `DataFrame.drop`
            (for example `inplace = True`).

    Returns:
        Whatever `DataFrame.drop` returns: a new frame, or `None` when
        called with `inplace = True`.
    """
    # `Index.intersection` is the explicit set operation. The `&` operator
    # between two Index objects is deprecated as a set operation in
    # pandas 1.x (FutureWarning) and is a logical operation in pandas 2.x.
    shared = df.columns.intersection(df_other.columns)
    return df.drop(columns = shared, *args, **kwargs)

# Index every frame by the key(s) it is merged on in the next cell.
# - search
df_search = df_search.set_index(['id', 'creator.id'])
# - info
df_info = df_info.set_index(['id', 'creator.id'])
# - channels (every column gets the 'creator.' prefix before indexing)
df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])
# - ads: derive the two boolean flags, then discard the raw placement lists
df_ads = (
    df_ads
    .set_index(['id'])
    .assign(
        has_ad = lambda frame: frame['ads'].apply(filter_has_ad),
        has_ad_at_beginning = lambda frame: frame['ads'].apply(filter_has_ad_beginning),
    )
    .drop(columns = 'ads')
)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
df_search = drop_common(df_search, df_info)

  del sys.path[0]


In [9]:
# - search (with details): start from a copy of the search frame and attach
#   the per-video info, then the ad flags, matching on the index levels
df_search_details = (
    df_search.copy()
    # - info
    .merge(
        df_info,
        left_on = ['id', 'creator.id'],
        right_index = True,
        copy = False
    )
    # - ads
    .merge(
        df_ads,
        left_on = 'id',
        right_index = True,
        copy = False
    )
)
# - channels (same object as `df_channels`, under the name used for archiving)
df_search_details_channels = df_channels

##### STEP 4  Data Inspection


In [10]:
# Take a brief look at our data: one report per frame, titled by `name`.
# Judging by the rendered output below, `df_report` shows a data preview
# followed by summary statistics.
df_report(df_search_details, name = 'Search Result')
df_report(df_search_details_channels, name = 'Search Result (Channels)')

---

# Results - Search Result

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
WYdi1bL6s10,UC3XTzVzaHQEd30rQbuvCtTQ,Raids: Last Week Tonight with John Oliver (HBO),John Oliver explains how raids became a favori...,2021-03-01 07:45:00+00:00,0 days 00:25:55,,Entertainment,154640.0,3620.0,18604.0,3213279.0,HD,False,False
rZGurRM6s_o,UCpEhnqL0y41EpW2TvWAHD7Q,Pawandeep के Performance ने किया Neha को Senti...,Click here to Subscribe to SET INDIA Channel: ...,2021-03-01 15:00:20+00:00,0 days 00:04:28,"[indian reality shows, singing talent, indian ...",Entertainment,61548.0,864.0,1764.0,2036713.0,HD,True,True
B3lyWZl2zbA,UCpEhnqL0y41EpW2TvWAHD7Q,Tejas का Performance देख के Tiger हुए Shock | ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:00:02+00:00,0 days 00:08:11,"[set india, romance on SET, romantic performan...",Entertainment,19374.0,955.0,140.0,1770151.0,HD,True,True
_c4Qh4_T_Ks,UCpEhnqL0y41EpW2TvWAHD7Q,Jayshree ने अपने Performance से जीता सबका दिल ...,Click here to Subscribe to SET India: https://...,2021-03-01 15:30:03+00:00,0 days 00:08:02,"[set india, romance on SET, romantic performan...",Entertainment,11537.0,579.0,235.0,851660.0,HD,True,True
v_Ofrk5JmYQ,UC8BzJM6_VbZTdiNLD4R1jxQ,วันนี้คุณนาบีดูแลลูกค้าเองเลยเหรอครับ? | นาบี ...,นาบี ผู้จัดการสาวสวยของ DUBAI CLUB น้องสาวของ ...,2021-03-01 13:45:01+00:00,0 days 00:03:27,"[GMMTV, GMM-TV, GMM, TV, GMMTV SPOTLIKE, gmmtv...",Entertainment,3151.0,101.0,35.0,439112.0,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,50,47.0,47.0,45.0,40.0
mean,0 days 00:37:45.020000,30438.531915,3694.446809,4456.422222,1431974.0
std,0 days 00:50:48.263336219,118505.451073,21339.028899,21176.3698,3195662.0
min,0 days 00:00:30,25.0,0.0,0.0,11218.0
25%,0 days 00:02:57.750000,518.0,51.5,35.0,72992.25
50%,0 days 00:07:33.500000,5048.0,222.0,243.0,456178.0
75%,0 days 01:24:02.250000,13177.0,871.5,911.0,1656623.0
max,0 days 02:29:30,806327.0,146776.0,141854.0,19736500.0


---

---

# Results - Search Result (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCvrhwpnp2DHYQ1CbXby9ypQ,Vijay Television,"Vijay Television (""Vijay"") is a leading tamil ...",2007-02-01 11:57:18+00:00,13800000.0,11459867979,21551
UCejaUOXYgKkeVlayiF8ZFiw,YouTube Movies,YouTube's movies destination featuring the lat...,2018-04-15 21:01:27+00:00,,5011,10
UC6FwzVftf5AQrXYhQ5FdzRA,RVISION: Советские фильмы,"Для ценителей киноклассики времён СССР, на наш...",2013-05-24 14:58:03+00:00,1030000.0,196542310,663
UCCGxeGqC5C7lEbbZYk7xs-Q,Gadget Diary,Subscribe to our channel for various gadget re...,2012-05-10 16:56:08+00:00,272000.0,84093034,1594
UCJhEfZoLs5P_idxX--yhWOA,Çok Güzel Hareketler,"Çok Güzel Hareketler Bunlar, ilk bölümü 7 Mayı...",2014-03-12 15:37:07+00:00,2830000.0,2706866889,1183


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,25.0,29.0,29.0
mean,7638545.0,4597178000.0,15592.517241
std,19340100.0,14883110000.0,32397.883156
min,925.0,5011.0,5.0
25%,493000.0,131234800.0,331.0
50%,1940000.0,667182500.0,716.0
75%,6450000.0,2873416000.0,16750.0
max,98100000.0,80739250000.0,145566.0


---

##### STEP 5  Data Archiving


In [14]:
# passed as `proto` to every `df_update_pickle` call below
pickle_proto = 3

# --- videos: record the keyword on the frame itself, then archive it
pickle_fname = f'dsamples/youtube_search_{as_fname(keyword)}_{dataset_id}.pkl'
df_search_details.attrs['_search_keyword'] = keyword
df_update_pickle(df_search_details, pickle_fname, proto = pickle_proto)

# --- channels
pickle_fname_channels = f'dsamples/youtube_search_channels_{as_fname(keyword)}_{dataset_id}.pkl'
df_update_pickle(df_search_details_channels, pickle_fname_channels, proto = pickle_proto)

# read both archives back and report on them to check what was stored
# (these pickles were written by this notebook; do not unpickle untrusted files)
df_search_details_verify = pd.read_pickle(pickle_fname)
df_search_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(df_search_details_verify, name = 'Search Result (Verification)')
df_report(df_search_details_channels_verify, name = 'Search Result (Channels) (Verification)')

---

# Results - Search Result (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,category,creator.description,creator.stats.follower,creator.stats.post,creator.stats.view,creator.time,creator.title,description,has_ad,has_ad_at_beginning,length,stats.comment,stats.dislike,stats.like,stats.view,tags,time,title,video.quality
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
WYdi1bL6s10,UC3XTzVzaHQEd30rQbuvCtTQ,Entertainment,Breaking news on a weekly basis. Sundays at 11...,8610000.0,334.0,2873416000.0,2014-03-18 17:41:39+00:00,LastWeekTonight,John Oliver explains how raids became a favori...,False,False,0 days 00:25:55,18604.0,3620.0,154640.0,3213279.0,,2021-03-01 07:45:00+00:00,Raids: Last Week Tonight with John Oliver (HBO),HD
rZGurRM6s_o,UCpEhnqL0y41EpW2TvWAHD7Q,Entertainment,Sony Entertainment Television is a 24 hour Hin...,98100000.0,50945.0,80739250000.0,2006-09-20 22:24:59+00:00,SET India,Click here to Subscribe to SET INDIA Channel: ...,True,True,0 days 00:04:28,1764.0,864.0,61548.0,2036713.0,"[indian reality shows, singing talent, indian ...",2021-03-01 15:00:20+00:00,Pawandeep के Performance ने किया Neha को Senti...,HD
B3lyWZl2zbA,UCpEhnqL0y41EpW2TvWAHD7Q,Entertainment,Sony Entertainment Television is a 24 hour Hin...,98100000.0,50945.0,80739250000.0,2006-09-20 22:24:59+00:00,SET India,Click here to Subscribe to SET India: https://...,True,True,0 days 00:08:11,140.0,955.0,19374.0,1770151.0,"[set india, romance on SET, romantic performan...",2021-03-01 15:00:02+00:00,Tejas का Performance देख के Tiger हुए Shock | ...,HD
_c4Qh4_T_Ks,UCpEhnqL0y41EpW2TvWAHD7Q,Entertainment,Sony Entertainment Television is a 24 hour Hin...,98100000.0,50945.0,80739250000.0,2006-09-20 22:24:59+00:00,SET India,Click here to Subscribe to SET India: https://...,True,True,0 days 00:08:02,235.0,579.0,11537.0,851660.0,"[set india, romance on SET, romantic performan...",2021-03-01 15:30:03+00:00,Jayshree ने अपने Performance से जीता सबका दिल ...,HD
v_Ofrk5JmYQ,UC8BzJM6_VbZTdiNLD4R1jxQ,Entertainment,https://www.facebook.com/GMMTVOFFICIAL\nhttps:...,10600000.0,11872.0,6831155000.0,2010-09-13 05:07:36+00:00,GMMTV,นาบี ผู้จัดการสาวสวยของ DUBAI CLUB น้องสาวของ ...,True,True,0 days 00:03:27,35.0,101.0,3151.0,439112.0,"[GMMTV, GMM-TV, GMM, TV, GMMTV SPOTLIKE, gmmtv...",2021-03-01 13:45:01+00:00,วันนี้คุณนาบีดูแลลูกค้าเองเลยเหรอครับ? | นาบี ...,HD


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.post,creator.stats.view,length,stats.comment,stats.dislike,stats.like,stats.view
count,41.0,46.0,46.0,50,45.0,47.0,47.0,40.0
mean,10575700.0,21702.652174,6893515000.0,0 days 00:37:45.020000,4456.422222,3694.446809,30438.531915,1431974.0
std,25212770.0,33790.176925,19859720000.0,0 days 00:50:48.263336219,21176.3698,21339.028899,118505.451073,3195662.0
min,925.0,5.0,5011.0,0 days 00:00:30,0.0,0.0,25.0,11218.0
25%,628000.0,456.0,197841500.0,0 days 00:02:57.750000,35.0,51.5,518.0,72992.25
50%,1940000.0,1594.0,711110200.0,0 days 00:07:33.500000,243.0,222.0,5048.0,456178.0
75%,6450000.0,32455.0,2831778000.0,0 days 01:24:02.250000,911.0,871.5,13177.0,1656623.0
max,98100000.0,145566.0,80739250000.0,0 days 02:29:30,141854.0,146776.0,806327.0,19736500.0


---

---

# Results - Search Result (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCvrhwpnp2DHYQ1CbXby9ypQ,Vijay Television,"Vijay Television (""Vijay"") is a leading tamil ...",2007-02-01 11:57:18+00:00,13800000.0,11459867979,21551
UCejaUOXYgKkeVlayiF8ZFiw,YouTube Movies,YouTube's movies destination featuring the lat...,2018-04-15 21:01:27+00:00,,5011,10
UC6FwzVftf5AQrXYhQ5FdzRA,RVISION: Советские фильмы,"Для ценителей киноклассики времён СССР, на наш...",2013-05-24 14:58:03+00:00,1030000.0,196542310,663
UCCGxeGqC5C7lEbbZYk7xs-Q,Gadget Diary,Subscribe to our channel for various gadget re...,2012-05-10 16:56:08+00:00,272000.0,84093034,1594
UCJhEfZoLs5P_idxX--yhWOA,Çok Güzel Hareketler,"Çok Güzel Hareketler Bunlar, ilk bölümü 7 Mayı...",2014-03-12 15:37:07+00:00,2830000.0,2706866889,1183


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,25.0,29.0,29.0
mean,7638545.0,4597178000.0,15592.517241
std,19340100.0,14883110000.0,32397.883156
min,925.0,5011.0,5.0
25%,493000.0,131234800.0,331.0
50%,1940000.0,667182500.0,716.0
75%,6450000.0,2873416000.0,16750.0
max,98100000.0,80739250000.0,145566.0


---