# Data Sampling

## Setup

In [12]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [13]:
from goodies import *
import pandas as pd

## Data Collection

In [14]:
from dcollect import plugins

# no custom headers; this value is passed to both API objects created later
headers = None
# modules passed to both API objects created later, keyed by name
modules = dict(http = plugins.fasthttp())

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [15]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Local import: getpass prompts without echoing, so the typed key is not
# stored in the notebook's saved cell output the way input() would store it.
from getpass import getpass

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
# Use the testing key when one is set; otherwise ask for the key interactively.
api_key = api_key_testing or getpass('YouTube Data API Key: ')

YouTube Data API Key: <redacted>


#### Trending

In [16]:
# Passed as `count` to the trending query in STEP 2
# (presumably the number of videos to request -- confirm against dcollect).
count = 200

##### STEP 1  API Object Creation

In [17]:
# YouTube API object: shared modules/headers plus the API key
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)

# YouTube Internals API object: same modules/headers, constructed without a key
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [18]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set logging level
log.enable(level = log.levels.WARNING)

# Local import: futures re-raise a worker's exception in the calling cell,
# which bare threading.Thread targets do not (a failed fetch would otherwise
# only surface later as an unrelated error on a `None` frame).
from concurrent.futures import ThreadPoolExecutor

# The trending list is fetched first because both follow-up requests are
# keyed on its columns.
df_trending = df_from_json(
    youtube_o.video.trending(
        count = count
    )
)

# Reset so a failed run cannot leave frames from an earlier run behind.
df_channels = None
df_ads = None

# Fetch channel info and ad placements concurrently.
with ThreadPoolExecutor(max_workers = 2) as pool:
    # - channels
    future_channels = pool.submit(
        lambda: df_from_json(
            youtube_o.channel.info(
                id = df_trending['creator.id']
            )
        )
    )
    # - ad placements
    future_ads = pool.submit(
        lambda: df_from_json(
            youtubei_o.ad.placements(
                id = df_trending['id']
            )
        )
    )
    # .result() blocks until the task is done and re-raises its exception
    df_channels = future_channels.result()
    df_ads = future_ads.result()

##### STEP 3  Data Cleaning


In [19]:
# - ads (filter)
def filter_has_ad(ads):
    """Return True when `ads` is anything other than None.

    Uses an identity check so that objects overriding `__eq__` (or
    array-like values whose `==` is element-wise) cannot distort the result.
    An empty collection still counts as "not None" and yields True.
    """
    return ads is not None
def filter_has_ad_beginning(ads):
    """Return True when any entry of `ads` has kind START.

    `ads` is either None (returns False) or an iterable of mappings with a
    'kind' key; each kind is compared against
    `youtubei.resource.ad.kinds.START`. An empty iterable returns False.
    """
    # identity check: only a real None means "no ad data"
    if ads is None:
        return False
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Drop from `df` every column whose label also appears in `df_other`.

    Extra positional and keyword arguments are forwarded to `DataFrame.drop`.
    Returns whatever `DataFrame.drop` returns (a new frame by default).
    """
    # Index.intersection is the explicit set operation; the `&` operator on
    # Index objects is deprecated for this purpose in pandas 1.x and is an
    # element-wise logical operation in pandas 2.x.
    common = df.columns.intersection(df_other.columns)
    return df.drop(*args, columns = common, **kwargs)

# Each frame is rebuilt with a chained expression and rebound to its name
# instead of being mutated with `inplace = True`.

# - trending: index rows by video id and channel id
df_trending = df_trending.set_index(['id', 'creator.id'])
# - channels: prefix every column with 'creator.', then index by channel id
df_channels = (
    df_channels
    .add_prefix('creator.')
    .set_index(['creator.id'])
)
# - ads: index by video id and replace the raw `ads` column with two flags
df_ads = (
    df_ads
    .set_index(['id'])
    .assign(
        has_ad = lambda frame: frame['ads'].apply(filter_has_ad),
        has_ad_at_beginning = lambda frame: \
            frame['ads'].apply(filter_has_ad_beginning),
    )
    .drop(columns = 'ads')
)

In [20]:
# - trending (with details): start from a copy of the trending frame and
#   attach the ad flags, matching its 'id' against the index of df_ads
df_trending_details = df_trending.copy().merge(
    right = df_ads,
    left_on = 'id',
    right_index = True,
    copy = False
)
# - channels: same object as df_channels (an alias, not a copy)
df_trending_details_channels = df_channels

##### STEP 4  Data Inspection


In [21]:
# take a brief look at our data: one report per frame, in this order
for report_frame, report_name in (
    (df_trending_details, 'Trending'),
    (df_trending_details_channels, 'Trending (Channels)'),
):
    df_report(report_frame, name = report_name)

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
sG9rX6Ifzhw,UCq18eeL7D9Vd8DhjMcLh9QQ,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,46817,7645,,4144988,HD,True,True
CkTVoLamPio,UC-SJ6nODDmufqBzPBwCvYvQ,"Meghan, Duchess of Sussex, opens up about her ...","For the first time Meghan, Duchess of Sussex, ...",2021-03-08 14:01:55+00:00,0 days 00:10:19,"[cbs this morning, duchess of sussex, Meghan M...",News & Politics,51245,11042,12387.0,6580878,HD,True,True
pIQIKIDZJjc,UCi3OE-aN09WOcN9d2stCvPg,David Dobrik & I Bought Markell A Car | Charli...,hi everyone. i wanted to organize this with @D...,2021-03-09 18:00:12+00:00,0 days 00:11:26,"[charli, charlie, charli d'amelio, charli dame...",People & Blogs,130353,4796,9655.0,1415497,HD,True,True
jJdlgKzVsnI,UCpTaAz_BxtkUB1qc8JTU_7g,Doja Cat - Streets (Official Video),Doja Cat // Streets (Official Video)\nHot Pink...,2021-03-09 05:00:12+00:00,0 days 00:04:34,"[doja cat, streets, doja, doja cat streets, st...",Music,869472,8405,56206.0,7154068,HD,True,True
qUUloBe5vEo,UCRijo3ddMTht_IHyNSNXpNQ,Longest Dunk Wins,Slippery Stairs and a GIANT catapult!?! Today'...,2021-03-08 23:00:02+00:00,0 days 00:10:19,"[dude perfect, dude perfect stereotypes, dude ...",Sports,371935,4280,13898.0,7541282,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,194.0,200.0
mean,0 days 00:10:06.850000,144468.3,2372.63,13496.324742,2705343.0
std,0 days 00:09:27.893274618,260088.2,3428.171371,30690.249011,3830323.0
min,0 days 00:00:12,719.0,58.0,103.0,150392.0
25%,0 days 00:03:18.250000,21205.0,431.0,2264.5,720892.5
50%,0 days 00:07:32.500000,52133.0,961.5,4552.5,1303486.0
75%,0 days 00:13:22,126815.5,2749.0,12074.5,2943700.0
max,0 days 00:54:22,1743261.0,21802.0,295502.0,24820870.0


---

---

# Results - Trending (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,,2018-07-08 20:48:12+00:00,2790000,203263431,192
UCqAil6_A1dWHdFXzOwVLqlg,Sub Urban,https://suburban.lnk.to/ThrillSeeker,2019-08-16 18:40:17+00:00,3360000,589237473,14
UCbPY1Efha9VPRBYW2x1M16A,JJ Redick,"Hosted by JJ Redick and Tommy Alter, The Old M...",2020-07-09 20:42:32.930403+00:00,226000,40337163,219
UC1GFqqHIBaiCW1ivJJCFTvg,amustycow,I'm a rocket league YouTuber known as musty or...,2016-08-27 15:59:41+00:00,2350000,291577645,262
UCi0ZJJC7ElVN0xl9GJ4V9aQ,Rebecca Maddie Challenges,,2020-05-02 21:08:04.510685+00:00,921000,62673789,37


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,200.0,200.0,200.0
mean,5445723.0,2420539000.0,2432.42
std,11544240.0,8153330000.0,7932.176754
min,8150.0,7057505.0,6.0
25%,748000.0,67760450.0,64.0
50%,1745000.0,291067200.0,263.0
75%,5800000.0,1457738000.0,1220.0
max,74300000.0,55838550000.0,53319.0


---

##### STEP 5  Data Archiving (Cumulative)


In [22]:
# pickle protocol version, forwarded as `proto` to df_update_pickle
pickle_proto = 3

# archive locations (relative paths): one file per frame
pickle_fname = 'dsamples/youtube_trending.pkl'
pickle_fname_channels = 'dsamples/youtube_trending_channels.pkl'

# NOTE(review): df_update_pickle is not defined in this notebook (presumably
# provided by the `goodies` star import). The section title suggests it adds
# to an existing archive rather than overwriting it -- confirm.
df_update_pickle(df_trending_details, pickle_fname, proto = pickle_proto)
df_update_pickle(df_trending_details_channels, pickle_fname_channels, proto = pickle_proto)

# verify that we saved the correct data
# (reads each archive back and prints a report; nothing is asserted, so the
# comparison with the reports from STEP 4 is done by eye)
df_trending_details_verify = pd.read_pickle(pickle_fname)
df_report(df_trending_details_verify, name = 'Trending (Verification)')
df_trending_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(df_trending_details_channels_verify, name = 'Trending (Channels) (Verification)')

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
sG9rX6Ifzhw,UCq18eeL7D9Vd8DhjMcLh9QQ,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,46817,7645,,4144988,HD,True,True
CkTVoLamPio,UC-SJ6nODDmufqBzPBwCvYvQ,"Meghan, Duchess of Sussex, opens up about her ...","For the first time Meghan, Duchess of Sussex, ...",2021-03-08 14:01:55+00:00,0 days 00:10:19,"[cbs this morning, duchess of sussex, Meghan M...",News & Politics,51245,11042,12387.0,6580878,HD,True,True
pIQIKIDZJjc,UCi3OE-aN09WOcN9d2stCvPg,David Dobrik & I Bought Markell A Car | Charli...,hi everyone. i wanted to organize this with @D...,2021-03-09 18:00:12+00:00,0 days 00:11:26,"[charli, charlie, charli d'amelio, charli dame...",People & Blogs,130353,4796,9655.0,1415497,HD,True,True
jJdlgKzVsnI,UCpTaAz_BxtkUB1qc8JTU_7g,Doja Cat - Streets (Official Video),Doja Cat // Streets (Official Video)\nHot Pink...,2021-03-09 05:00:12+00:00,0 days 00:04:34,"[doja cat, streets, doja, doja cat streets, st...",Music,869472,8405,56206.0,7154068,HD,True,True
qUUloBe5vEo,UCRijo3ddMTht_IHyNSNXpNQ,Longest Dunk Wins,Slippery Stairs and a GIANT catapult!?! Today'...,2021-03-08 23:00:02+00:00,0 days 00:10:19,"[dude perfect, dude perfect stereotypes, dude ...",Sports,371935,4280,13898.0,7541282,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,194.0,200.0
mean,0 days 00:10:06.850000,144468.3,2372.63,13496.324742,2705343.0
std,0 days 00:09:27.893274618,260088.2,3428.171371,30690.249011,3830323.0
min,0 days 00:00:12,719.0,58.0,103.0,150392.0
25%,0 days 00:03:18.250000,21205.0,431.0,2264.5,720892.5
50%,0 days 00:07:32.500000,52133.0,961.5,4552.5,1303486.0
75%,0 days 00:13:22,126815.5,2749.0,12074.5,2943700.0
max,0 days 00:54:22,1743261.0,21802.0,295502.0,24820870.0


---

---

# Results - Trending (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,,2018-07-08 20:48:12+00:00,2790000,203263431,192
UCqAil6_A1dWHdFXzOwVLqlg,Sub Urban,https://suburban.lnk.to/ThrillSeeker,2019-08-16 18:40:17+00:00,3360000,589237473,14
UCbPY1Efha9VPRBYW2x1M16A,JJ Redick,"Hosted by JJ Redick and Tommy Alter, The Old M...",2020-07-09 20:42:32.930403+00:00,226000,40337163,219
UC1GFqqHIBaiCW1ivJJCFTvg,amustycow,I'm a rocket league YouTuber known as musty or...,2016-08-27 15:59:41+00:00,2350000,291577645,262
UCi0ZJJC7ElVN0xl9GJ4V9aQ,Rebecca Maddie Challenges,,2020-05-02 21:08:04.510685+00:00,921000,62673789,37


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,200.0,200.0,200.0
mean,5445723.0,2420539000.0,2432.42
std,11544240.0,8153330000.0,7932.176754
min,8150.0,7057505.0,6.0
25%,748000.0,67760450.0,64.0
50%,1745000.0,291067200.0,263.0
75%,5800000.0,1457738000.0,1220.0
max,74300000.0,55838550000.0,53319.0


---