# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *

## Data Collection

In [3]:
from dcollect import plugins

# Plugin mapping handed to every API object created further down; only an
# 'http' entry (a plugins.fasthttp() instance) is configured here.
modules = {'http': plugins.fasthttp()}
# No custom headers are supplied -- presumably the API objects then fall back
# to their own defaults (TODO confirm against dcollect).
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Credentials must never be stored in the notebook itself: the key is read
# from the YOUTUBE_API_KEY environment variable and, when that is unset or
# empty, requested through a hidden prompt so it does not end up in the
# saved cell output.
# NOTE(review): a key was previously embedded here in plain text -- treat it
# as leaked and revoke it in the Google Cloud console.
import os
from getpass import getpass

api_key_testing = os.environ.get('YOUTUBE_API_KEY')
api_key = api_key_testing or getpass('YouTube Data API Key: ')

#### Trending

In [5]:
# number of trending videos requested in STEP 2 (passed as `count` to
# youtube_o.video.trending)
count = 20
# NOTE(review): not referenced by any cell visible in this notebook -- looks
# like a leftover search term; confirm before removing
keyword = ''

##### STEP 1  API Object Creation

In [6]:
# YouTube Data API client: shares the plugin mapping and headers configured
# above and authenticates with the API key.
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)

# YouTube Internals API client: same plugins and headers, no key argument.
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [7]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set logging level
log.enable(level = log.levels.WARNING)


# Trending videos are fetched first: both follow-up requests below are keyed
# on columns of this frame ('creator.id' and 'id').
df_trending = df_from_json(
    youtube_o.video.trending(
        count = count
    )
)

df_channels = None
df_ads = None

# Outcome of each worker thread, keyed by the notebook variable it feeds.
# Each worker writes to its own key, so the two threads never touch the same
# entry.
_collect_results = {}
_collect_errors = {}

def _collect(name, fetch):
    """Run ``fetch()``, convert the result with ``df_from_json`` and store it.

    Parameters:
        name:  key under which the resulting frame (or the error) is recorded.
        fetch: zero-argument callable performing the API request.

    An exception raised by the request or the conversion is recorded in
    ``_collect_errors`` rather than escaping inside the worker thread, so the
    cell can report it after all threads have been joined.
    """
    try:
        _collect_results[name] = df_from_json(fetch())
    except Exception as error:
        _collect_errors[name] = error

thread.start([
    threading.Thread(
        # - channels
        target = _collect,
        args = (
            'df_channels',
            lambda: youtube_o.channel.info(
                id = df_trending['creator.id']
            )
        )
    ),
    threading.Thread(
        # - ad placements
        target = _collect,
        args = (
            'df_ads',
            lambda: youtubei_o.ad.placements(
                id = df_trending['id']
            )
        )
    )
])
thread.join()

# Fail here, with the original exception chained, instead of letting a later
# cell trip over a frame that is still None.
if _collect_errors:
    raise RuntimeError(
        'data collection failed for: ' + ', '.join(sorted(_collect_errors))
    ) from next(iter(_collect_errors.values()))

df_channels = _collect_results['df_channels']
df_ads = _collect_results['df_ads']



##### STEP 3  Data Cleaning


In [8]:
# - ads (filter)
def filter_has_ad(ads):
    """Return True when ``ads`` holds a value, i.e. is not ``None``.

    The check is an identity test, so objects with a custom ``__eq__``
    (array-likes, for example) cannot distort or break it. An empty
    collection still counts as "has a value", as before.
    """
    return ads is not None
def filter_has_ad_beginning(ads):
    """Return True when at least one entry of ``ads`` has kind ``START``.

    Parameters:
        ads: ``None`` or an iterable of mappings that each carry a 'kind' key.

    Returns:
        False for ``None`` or when no entry's 'kind' equals
        ``youtubei.resource.ad.kinds.START``; True otherwise.
    """
    # Identity test: ``== None`` would defer to the object's own __eq__.
    if ads is None:
        return False
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Return ``df`` without the columns whose labels also occur in ``df_other``.

    Parameters:
        df:       frame to drop columns from.
        df_other: frame whose column labels decide what is dropped.
        *args, **kwargs: forwarded unchanged to ``DataFrame.drop``.

    Returns:
        Whatever ``DataFrame.drop`` returns for the given arguments.
    """
    # Index.intersection is the explicit set operation; the `&` operator on
    # Index objects is deprecated for this purpose and means element-wise
    # logical AND in newer pandas releases.
    shared_columns = df.columns.intersection(df_other.columns)
    return df.drop(*args, columns = shared_columns, **kwargs)

# - trending videos: index by video id and channel id
df_trending.set_index(keys = ['id', 'creator.id'], inplace = True)

# - channels: mark every column as channel data, then index by channel id
df_channels = df_channels.add_prefix('creator.').set_index(['creator.id'])

# - ads: index by video id, derive the two boolean flags from the raw
#   placements column, then discard that column
df_ads.set_index(keys = ['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop(columns = ['ads'], inplace = True)

In [9]:
# Trending videos enriched with channel columns and ad flags.
# The chain starts from a copy of the trending frame, joins the channel frame
# on 'creator.id' and the ads frame on 'id', each against the other frame's
# index.
df_trending_details = (
    df_trending
    .copy()
    # - channels
    .merge(df_channels, left_on = 'creator.id', right_index = True, copy = False)
    # - ads
    .merge(df_ads, left_on = 'id', right_index = True, copy = False)
)

##### STEP 4  Data Inspection


In [10]:
# take a brief look at our data
# NOTE(review): df_report is not defined in any visible cell -- presumably it
# comes from the `from goodies import *` cell; confirm.
df_report(df_trending_details, name = 'Trending')

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CkTVoLamPio,UC-SJ6nODDmufqBzPBwCvYvQ,"Meghan, Duchess of Sussex, opens up about her ...","For the first time Meghan, Duchess of Sussex, ...",2021-03-08 14:01:55+00:00,0 days 00:10:19,"[cbs this morning, duchess of sussex, Meghan M...",News & Politics,34772,6218,7977.0,3781614,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29387,True,True
auOrYTASVqQ,UC-SJ6nODDmufqBzPBwCvYvQ,Oprah Winfrey on her bombshell Harry and Megha...,"Oprah Winfrey joined ""CBS This Morning"" the da...",2021-03-08 14:04:06+00:00,0 days 00:05:39,"[Oprah Winfrey, CBS This Morning, prince harry...",News & Politics,15670,3729,5318.0,1947624,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29387,True,True
Tl9KT9RwiGc,UC-SJ6nODDmufqBzPBwCvYvQ,Harry and Meghan on how race factored into the...,"Prince Harry and Meghan, Duchess of Sussex, co...",2021-03-08 14:00:03+00:00,0 days 00:08:33,"[prince harry, Meghan Markle, duchess of susse...",News & Politics,12867,2944,5134.0,1361274,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29387,True,True
Wx4Vpm1KzSY,UCKjU3KzdbJE1EFcHVqXC3_g,"Prince Harry, Meghan reveal struggles behind r...",Prince Harry and Meghan Markle's revealing int...,2021-03-08 05:18:20+00:00,0 days 00:04:46,"[harry and meghan, prince harry and meghan mar...",News & Politics,34011,11456,,4291939,HD,CBC News: The National,The National is the flagship news and current ...,2007-10-17 21:34:25+00:00,998000,531856230,17987,True,True
tRMGdWValIk,UClQubH2NeMmGLTLgNdLBwXg,Last To Stop Customizing Wins Lamborghini,I can't believe we painted 6 Lamborghinis then...,2021-03-08 03:54:59+00:00,0 days 00:18:04,"[lamborghini, lambo, last to, last to stop, zh...",Entertainment,212641,4607,16656.0,4611022,HD,ZHC,I want to make the world a happier place\nI al...,2013-08-07 03:22:54+00:00,19500000,1482509471,283,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,20,20.0,20.0,17.0,20.0,20.0,20.0,20.0
mean,0 days 00:07:46.750000,145938.9,3603.0,12863.352941,2910063.0,7347000.0,2330870000.0,11380.1
std,0 days 00:05:16.503741559,342181.4,3095.07461,20492.938036,4510417.0,12321400.0,2658541000.0,12308.492596
min,0 days 00:00:14,326.0,137.0,339.0,90521.0,449000.0,197336900.0,18.0
25%,0 days 00:03:50.500000,11064.25,1441.5,2837.0,1020944.0,1677500.0,856941900.0,298.75
50%,0 days 00:07:50.500000,23192.5,3082.5,5318.0,1391480.0,2115000.0,1455944000.0,8335.5
75%,0 days 00:10:38,80368.25,4757.5,14025.0,2743979.0,8347500.0,1775379000.0,18962.25
max,0 days 00:19:01,1513964.0,11456.0,87819.0,20780750.0,54700000.0,9321449000.0,37993.0


---

##### STEP 5  Data Archiving (Cumulative)


In [15]:
import os

pickle_proto = 3
pickle_fname = 'dsamples/youtube_trending.pkl'

df_trending_details_final = None

# Cumulative archive: when a previous archive exists, the current sample is
# appended to it; otherwise the current sample becomes the archive.
# NOTE: every run of this cell appends the current sample again, so run it
# once per collected sample.
if os.path.isfile(pickle_fname):
    # SECURITY: unpickling can execute arbitrary code -- only load archive
    # files that were written by this notebook.
    df_trending_details_final = pd.read_pickle(pickle_fname)
    df_trending_details_final = pd.concat(
        [df_trending_details_final, df_trending_details], 
        copy = False
    )
else:
    df_trending_details_final = df_trending_details

# The target directory may not exist on a first run; create it so that
# to_pickle does not fail with FileNotFoundError.
pickle_dir = os.path.dirname(pickle_fname)
if pickle_dir:
    os.makedirs(pickle_dir, exist_ok = True)

df_trending_details_final.to_pickle(pickle_fname, protocol = pickle_proto)

# verify that we saved the correct data: read the archive back, check that no
# rows were lost, and show the same report as for the fresh sample
df_trending_details_verify = pd.read_pickle(pickle_fname)
assert len(df_trending_details_verify) == len(df_trending_details_final), \
        'archived row count differs from the frame that was written'
df_report(df_trending_details_verify, name = 'Trending (Verification)')

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CkTVoLamPio,UC-SJ6nODDmufqBzPBwCvYvQ,"Meghan, Duchess of Sussex, opens up about her ...","For the first time Meghan, Duchess of Sussex, ...",2021-03-08 14:01:55+00:00,0 days 00:10:19,"[cbs this morning, duchess of sussex, Meghan M...",News & Politics,32015,5461,7184.0,3461395,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29384,True,True
auOrYTASVqQ,UC-SJ6nODDmufqBzPBwCvYvQ,Oprah Winfrey on her bombshell Harry and Megha...,"Oprah Winfrey joined ""CBS This Morning"" the da...",2021-03-08 14:04:06+00:00,0 days 00:05:39,"[Oprah Winfrey, CBS This Morning, prince harry...",News & Politics,14725,3387,4883.0,1834547,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29384,True,True
Tl9KT9RwiGc,UC-SJ6nODDmufqBzPBwCvYvQ,Harry and Meghan on how race factored into the...,"Prince Harry and Meghan, Duchess of Sussex, co...",2021-03-08 14:00:03+00:00,0 days 00:08:33,"[prince harry, Meghan Markle, duchess of susse...",News & Politics,12080,2825,4935.0,1276809,HD,CBS This Morning,"Each weekday morning, Gayle King, Anthony Maso...",2013-05-23 10:59:52+00:00,1880000,1455944229,29384,True,True
Wx4Vpm1KzSY,UCKjU3KzdbJE1EFcHVqXC3_g,"Prince Harry, Meghan reveal struggles behind r...",Prince Harry and Meghan Markle's revealing int...,2021-03-08 05:18:20+00:00,0 days 00:04:46,"[harry and meghan, prince harry and meghan mar...",News & Politics,32428,10771,,4112536,HD,CBC News: The National,The National is the flagship news and current ...,2007-10-17 21:34:25+00:00,996000,531856230,17987,True,True
mbwMspaiUVg,UCiWLfSweyRNmLpgEHekhoAg,First Take reacts to Blake Griffin to the Nets,First Take reacts to Blake Griffin to the Nets...,2021-03-08 17:07:45+00:00,0 days 00:07:08,"[espn, first take, blake griffin, nba, nba on ...",Sports,16410,424,3321.0,1104602,HD,ESPN,ESPN on YouTube features up-to-the-minute spor...,2005-10-31 23:34:22+00:00,7730000,7099162935,37990,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,40,40.0,40.0,34.0,40.0,40.0,40.0,40.0
mean,0 days 00:07:46.200000,144907.6,3494.4,12759.941176,2871927.0,7326100.0,2318013000.0,11201.025
std,0 days 00:05:13.021192040,336267.7,2986.376475,20123.478824,4426368.0,12171970.0,2632447000.0,12259.068489
min,0 days 00:00:14,326.0,128.0,339.0,90521.0,447000.0,197336900.0,18.0
25%,0 days 00:03:50.500000,11064.25,1415.25,2814.5,991478.5,1362500.0,744268600.0,298.75
50%,0 days 00:07:50.500000,23064.0,3021.5,5226.0,1380364.0,1950000.0,1455944000.0,8335.0
75%,0 days 00:10:38,78013.75,4643.75,13865.75,2663924.0,8347500.0,1775379000.0,18962.25
max,0 days 00:19:01,1513964.0,11456.0,87819.0,20780750.0,54700000.0,9321449000.0,37993.0


---