# Data Sampling

## Setup

In [1]:
import sys

# Install through the interpreter that runs this kernel (`sys.executable`)
# rather than whichever `pip` is first on PATH.
# pandas is constrained to the 1.x series.
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
# Remaining dependencies are listed in requirements.txt.
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [None]:
from goodies import *
import pandas as pd

## Data Collection

In [4]:
from dcollect import plugins

# Shared settings handed to every API object created in this notebook.
# `modules` maps the 'http' slot to a `plugins.fasthttp()` instance.
modules = {'http': plugins.fasthttp()}
# No custom headers are supplied.
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [5]:
import getpass

from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
# Read the key with getpass instead of input(): input() echoes what was typed
# into the cell output, which would then be saved together with the notebook.
# Surrounding whitespace from copy/paste is stripped.
api_key = api_key_testing or getpass.getpass('YouTube Data API Key: ').strip()

# The dataset ID ends up in the archive file names; an empty answer falls back
# to `dataset_id_testing`.
dataset_id_testing = ''
dataset_id = input('Dataset ID for collision avoidance: ') or dataset_id_testing

#### Search

In [6]:
# Parameters for the video search below.
# `count` is passed to the search call as its `count` argument.
count = 50
# `keyword` is passed to the search call and is also embedded in the archive
# file names. NOTE(review): an empty keyword presumably means "no keyword
# filter" -- confirm against the search API.
keyword = ''

##### STEP 1  API Object Creation

In [7]:
# YouTube Data API client (needs the API key)
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)

# YouTube Internals API client (no key argument)
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [8]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set logging level
log.enable(level = log.levels.WARNING)


# The search runs first: the three lookups below are keyed on its
# 'id' / 'creator.id' columns.
df_search = df_from_json(
    youtube_o.video.search(
        count = count,
        keyword = keyword,
        safesearch = youtube.resource.safesearch.NONE
    )
)

# Placeholders; each worker thread replaces its own entry through
# `globals().update(...)` once its request has been converted to a frame.
df_info = None
df_channels = None
df_ads = None

thread.start([
    threading.Thread(
        # - info
        target = lambda: \
            globals().update(
                df_info = df_from_json(
                    youtube_o.video.info(
                        id = df_search['id']
                    )
                )
            )
    ),
    threading.Thread(
        # - channels
        target = lambda: \
            globals().update(
                df_channels = df_from_json(
                    youtube_o.channel.info(
                        id = df_search['creator.id']
                    )
                )
            )
    ),
    threading.Thread(
        # - ad placements
        target = lambda: \
            globals().update(
                df_ads = df_from_json(
                    youtubei_o.ad.placements(
                        id = df_search['id']
                    )
                )
            )
    )
])
# NOTE(review): presumably blocks until all started threads finish -- confirm.
thread.join()

# A worker that raised never reaches `globals().update`, so its frame is still
# None here. Fail now with a clear message instead of an AttributeError in the
# cleaning cell.
_collect_failed = [
    _name for _name in ('df_info', 'df_channels', 'df_ads')
    if globals()[_name] is None
]
if _collect_failed:
    raise RuntimeError(
        'data collection did not produce: ' + ', '.join(_collect_failed)
    )



##### STEP 3  Data Cleaning


In [9]:
# - ads (filter)
def filter_has_ad(ads):
    """Return True when `ads` holds a value, False when it is None.

    Uses an identity check so that array-like values, whose `==` compares
    element-wise, do not raise when the result is turned into a bool.
    """
    return ads is not None
def filter_has_ad_beginning(ads):
    """Return True when any entry of `ads` has kind `START`.

    `ads` is either None (treated as "no ads") or an iterable of mappings
    with a 'kind' key; each kind is compared against
    `youtubei.resource.ad.kinds.START`.
    """
    if ads is None:
        return False
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Drop from `df` every column whose label also appears in `df_other`.

    Extra positional and keyword arguments are forwarded to `DataFrame.drop`
    (for example `inplace = True`), and its return value is passed through.
    """
    # `Index.intersection` is the explicit set operation; `&` between two
    # Index objects was deprecated as a set operation and is a logical
    # operation in pandas 2.
    shared = df.columns.intersection(df_other.columns)
    return df.drop(columns = shared, *args, **kwargs)

# - search: key by (video id, channel id)
df_search = df_search.set_index(['id', 'creator.id'])
# - info: same key as the search frame
df_info = df_info.set_index(['id', 'creator.id'])
# - channels: namespace every column under 'creator.' and key by channel id
df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])
# - ads: key by video id and reduce the raw 'ads' column to two flags
df_ads = df_ads.set_index(['id'])
df_ads = df_ads.assign(
    has_ad = df_ads['ads'].apply(filter_has_ad),
    has_ad_at_beginning = df_ads['ads'].apply(filter_has_ad_beginning)
).drop(columns = 'ads')

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
df_search = drop_common(df_search, df_info)

In [10]:
# - search (with details): start from a copy of the search frame, then attach
#   the info columns (matched on both index levels) and the ad flags
#   (matched on the video id)
df_search_details = (
    df_search.copy()
    .merge(
        df_info,
        left_on = ['id', 'creator.id'],
        right_index = True,
        copy = False
    )
    .merge(
        df_ads,
        left_on = 'id',
        right_index = True,
        copy = False
    )
)
# - channels: kept as a separate frame under a matching name
df_search_details_channels = df_channels

##### STEP 4  Data Inspection


In [11]:
# take a brief look at our data: one report for the merged per-video frame,
# one for the per-channel frame
df_report(df_search_details, name = 'Search Result')
df_report(df_search_details_channels, name = 'Search Result (Channels)')

---

# Results - Search Results

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6bGpGQ08gQs,UCF4Wxdo3inmxP-Y59wXDsFw,피로 물든 미얀마…무차별 발포에 30명 사망 (2021.03.01/뉴스데스크/MBC),"미얀마 군부가 쿠데타를 일으킨지 한 달 째, 미얀마에선 '피의 일요일'이 반복되고 ...",2021-03-01 11:24:24+00:00,0 days 00:02:31,"[MBC, MBC뉴스, 뉴스데스크, newsdesk, 뉴스투데이, newstoday...",News & Politics,4210,123,4224.0,553619,HD,MBCNEWS,MBC 뉴스 공식 유튜브 채널입니다. 시청자 여러분의 의견과 제보를 항상 기다립니다...,2006-11-05 21:58:51+00:00,1140000,2169439928,101323,True,True
Hs-mgG2uEjA,UCNAf1k0yIjyGu3k9BwAg3lg,"Keane and Redknapp get HEATED over ""average"" S...",SUBSCRIBE ► http://bit.ly/SSFootballSub\nPREMI...,2021-02-28 14:07:11+00:00,0 days 00:07:15,"[sky sports, sky sports football, premier leag...",Sports,24995,388,6973.0,1377277,HD,Sky Sports Football,Sky Sports Football is the home of Sky Sports'...,2015-07-06 11:11:54+00:00,2780000,1654548702,4818,True,True
dKJh3_OnPZ8,UCBJeMCIeLQos7wacox4hmLQ,Roma 1-2 Milan | Milan keep the pressure on In...,"Rebic scored the winner for Milan, as they kee...",2021-02-28 22:43:51+00:00,0 days 00:04:14,"[Ronaldo, Serie A, Dybala, highlights, Juventu...",Sports,35514,908,1794.0,2272038,HD,Serie A,Welcome to the Official Serie A channel. Over ...,2012-10-30 13:54:30+00:00,6540000,2175901252,20042,True,True
w1B-NvBVtrw,UC9-OpMMVoNP5o10_Iyq7Ndw,Giannis Antetokounmpo Throws Down Monster Slam...,GREEK FREAK CALLED GAME. \n\r\nSubscribe: http...,2021-02-28 23:11:18+00:00,0 days 00:00:52,"[bleacher report, br, nba, giannis antetokounm...",Sports,4951,93,1582.0,437520,HD,Bleacher Report,The official YouTube page of Bleacher Report. ...,2007-09-14 18:23:54+00:00,2180000,1056524145,4622,True,True
eYO7Ccj1Iok,UCDkl5M0WVaddTWE4rr2cSeA,DES MEILLEURES AMIES ECHANGENT LEURS VIES PEND...,Je connais ma meilleure amie Solene depuis 22 ...,2021-02-28 20:13:20+00:00,0 days 00:23:29,"[lena situations, lena, lena situation, situat...",People & Blogs,112683,518,2077.0,1391601,HD,Léna Situations,désolée je suis nulle pour les descriptions,2013-01-14 17:22:41+00:00,1810000,263268098,296,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,20,20.0,20.0,19.0,20.0,20.0,20.0,20.0
mean,0 days 00:09:17.850000,21240.45,545.85,1642.631579,985827.9,14218000.0,11586470000.0,30655.1
std,0 days 00:11:17.421758931,25947.217241,678.091228,1912.237003,745354.1,23967140.0,21052100000.0,35430.73451
min,0 days 00:00:52,2347.0,66.0,65.0,94644.0,467000.0,130805700.0,177.0
25%,0 days 00:03:10.750000,5027.5,194.75,324.5,459342.8,1642500.0,831046800.0,1071.75
50%,0 days 00:05:08.500000,9690.0,349.0,756.0,692996.0,3045000.0,1931002000.0,20513.5
75%,0 days 00:08:49.750000,27624.75,607.75,1935.5,1380858.0,15100000.0,8150189000.0,51529.5
max,0 days 00:46:59,112683.0,3154.0,6973.0,2820763.0,98000000.0,80646880000.0,105638.0


---

##### STEP 5  Data Archiving


In [12]:
# Pickle protocol used for both archive files.
pickle_proto = 3

# File names carry the search keyword and the user-supplied dataset ID.
pickle_fname = f'dsamples/youtube_search_{as_fname(keyword)}_{dataset_id}.pkl'
pickle_fname_channels = f'dsamples/youtube_search_channels_{as_fname(keyword)}_{dataset_id}.pkl'

# Record the keyword on the frame itself.
df_search_details.attrs['_search_keyword'] = keyword

df_update_pickle(df_search_details, pickle_fname, protocol = pickle_proto)
# The channels frame goes to its own file. It was previously written to
# `pickle_fname`, so the channels file read back below was never produced.
df_update_pickle(df_search_details_channels, pickle_fname_channels, protocol = pickle_proto)

# verify that we saved the correct data
df_search_details_verify = pd.read_pickle(pickle_fname)
df_report(df_search_details_verify, name = 'Search Result (Verification)')
df_search_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(df_search_details_channels_verify, name = 'Search Result (Channels) (Verification)')

---

# Results - Search Results (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6bGpGQ08gQs,UCF4Wxdo3inmxP-Y59wXDsFw,피로 물든 미얀마…무차별 발포에 30명 사망 (2021.03.01/뉴스데스크/MBC),"미얀마 군부가 쿠데타를 일으킨지 한 달 째, 미얀마에선 '피의 일요일'이 반복되고 ...",2021-03-01 11:24:24+00:00,0 days 00:02:31,"[MBC, MBC뉴스, 뉴스데스크, newsdesk, 뉴스투데이, newstoday...",News & Politics,4210,123,4224.0,553619,HD,MBCNEWS,MBC 뉴스 공식 유튜브 채널입니다. 시청자 여러분의 의견과 제보를 항상 기다립니다...,2006-11-05 21:58:51+00:00,1140000,2169439928,101323,True,True
Hs-mgG2uEjA,UCNAf1k0yIjyGu3k9BwAg3lg,"Keane and Redknapp get HEATED over ""average"" S...",SUBSCRIBE ► http://bit.ly/SSFootballSub\nPREMI...,2021-02-28 14:07:11+00:00,0 days 00:07:15,"[sky sports, sky sports football, premier leag...",Sports,24995,388,6973.0,1377277,HD,Sky Sports Football,Sky Sports Football is the home of Sky Sports'...,2015-07-06 11:11:54+00:00,2780000,1654548702,4818,True,True
dKJh3_OnPZ8,UCBJeMCIeLQos7wacox4hmLQ,Roma 1-2 Milan | Milan keep the pressure on In...,"Rebic scored the winner for Milan, as they kee...",2021-02-28 22:43:51+00:00,0 days 00:04:14,"[Ronaldo, Serie A, Dybala, highlights, Juventu...",Sports,35514,908,1794.0,2272038,HD,Serie A,Welcome to the Official Serie A channel. Over ...,2012-10-30 13:54:30+00:00,6540000,2175901252,20042,True,True
w1B-NvBVtrw,UC9-OpMMVoNP5o10_Iyq7Ndw,Giannis Antetokounmpo Throws Down Monster Slam...,GREEK FREAK CALLED GAME. \n\r\nSubscribe: http...,2021-02-28 23:11:18+00:00,0 days 00:00:52,"[bleacher report, br, nba, giannis antetokounm...",Sports,4951,93,1582.0,437520,HD,Bleacher Report,The official YouTube page of Bleacher Report. ...,2007-09-14 18:23:54+00:00,2180000,1056524145,4622,True,True
eYO7Ccj1Iok,UCDkl5M0WVaddTWE4rr2cSeA,DES MEILLEURES AMIES ECHANGENT LEURS VIES PEND...,Je connais ma meilleure amie Solene depuis 22 ...,2021-02-28 20:13:20+00:00,0 days 00:23:29,"[lena situations, lena, lena situation, situat...",People & Blogs,112683,518,2077.0,1391601,HD,Léna Situations,désolée je suis nulle pour les descriptions,2013-01-14 17:22:41+00:00,1810000,263268098,296,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,20,20.0,20.0,19.0,20.0,20.0,20.0,20.0
mean,0 days 00:09:17.850000,21240.45,545.85,1642.631579,985827.9,14218000.0,11586470000.0,30655.1
std,0 days 00:11:17.421758931,25947.217241,678.091228,1912.237003,745354.1,23967140.0,21052100000.0,35430.73451
min,0 days 00:00:52,2347.0,66.0,65.0,94644.0,467000.0,130805700.0,177.0
25%,0 days 00:03:10.750000,5027.5,194.75,324.5,459342.8,1642500.0,831046800.0,1071.75
50%,0 days 00:05:08.500000,9690.0,349.0,756.0,692996.0,3045000.0,1931002000.0,20513.5
75%,0 days 00:08:49.750000,27624.75,607.75,1935.5,1380858.0,15100000.0,8150189000.0,51529.5
max,0 days 00:46:59,112683.0,3154.0,6973.0,2820763.0,98000000.0,80646880000.0,105638.0


---