# Data Sampling

## Setup

In [1]:
# `sys.executable` is the interpreter running this kernel; invoking pip through
# it installs into the same Python that executes the cells below.
import sys

# pandas is held to the 1.x series; the remaining packages come from requirements.txt
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd

## Data Collection

In [3]:
from dcollect import plugins

# Shared settings handed to every API object created in this notebook:
# - `modules` maps the 'http' slot to the fasthttp plugin
# - `headers` is left unset (presumably extra request headers; confirm in dcollect)
modules = dict(http=plugins.fasthttp())
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# Read the key with hidden input: `input()` echoes whatever is typed into the
# cell output, which is then saved (and shared) together with the notebook.
from getpass import getpass

# This key is for testing ONLY. DO NOT release to the public!
# Leave it as None to be prompted for a key instead.
api_key_testing = None
# `api_key_testing` takes priority when set; otherwise prompt without echo.
api_key = api_key_testing or getpass('YouTube Data API Key: ')
# Forwarded as `experiment` when the YouTube API object is created.
api_experiment = True

YouTube Data API Key: <redacted>


#### Trending

In [5]:
# number of trending videos to request (passed as `count` to `video.trending` in STEP 2)
count = 200

##### STEP 1  API Object Creation

In [6]:
# YouTube Data API client: shared modules/headers plus the API key and the
# experiment flag chosen during setup
youtube_o = youtube.api(
    modules=modules, headers=headers, key=api_key, experiment=api_experiment
)

# YouTube Internals API client: built from the shared modules/headers only
youtubei_o = youtubei.api(modules=modules, headers=headers)

##### STEP 2  Data Collection

In [7]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set logging level
log.enable(level = log.levels.WARNING)


# Trending videos are fetched first: both follow-up requests below are keyed
# on columns of this frame ('creator.id' and 'id').
df_trending = df_from_json(
    youtube_o.video.trending(
        count = count
    )
)

# Placeholders. Each worker publishes its result by rebinding the matching
# notebook-level name through `globals().update(...)`.
df_channels = None
df_ads = None

# The two lookups do not depend on each other, so they are issued from
# separate threads.
thread.start([
    threading.Thread(
        # - channels
        target = lambda: \
            globals().update(
                df_channels = df_from_json(
                    youtube_o.channel.info(
                        id = df_trending['creator.id']
                    )
                )
            )
    ),
    threading.Thread(
        # - ad placements
        target = lambda: \
            globals().update(
                df_ads = df_from_json(
                    youtubei_o.ad.placements(
                        id = df_trending['id'],
                        throttle_size = None
                    )
                )
            )
    )
])
# presumably blocks until the threads started above have finished - confirm in
# dcollect.utils.thread
thread.join()

# A worker that fails never rebinds its placeholder. Stop here with a clear
# message instead of letting the cleaning step trip over a `None` frame.
if df_channels is None or df_ads is None:
    raise RuntimeError(
        'data collection incomplete: df_channels and/or df_ads was not '
        'produced; check the output above for the failed request'
    )

##### STEP 3  Data Cleaning


In [8]:
# - ads (filter)
def filter_has_ad(ads):
    """Tell whether an ad entry is present.

    Args:
        ads: The value of one 'ads' cell; `None` stands for "no ads".

    Returns:
        bool: False only when `ads` is None, True for anything else
        (including an empty list).
    """
    # Identity test: `== None` dispatches to the operand's `__eq__`, which
    # array-like values answer element-wise instead of with a single bool.
    return ads is not None
def filter_has_ad_beginning(ads):
    """Tell whether any ad in `ads` is of the START kind.

    Args:
        ads: None, or an iterable of mappings that each carry a 'kind' key.

    Returns:
        bool: True when at least one entry's 'kind' equals
        `youtubei.resource.ad.kinds.START`; False for None or when no entry
        matches.
    """
    # Identity test rather than `== None`, which would go through `__eq__`.
    if ads is None:
        return False
    # `any` stops at the first matching ad, like the early return it replaces.
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Return `df` without the columns it shares with `df_other`.

    Args:
        df: Frame to drop columns from; it is not modified unless the caller
            passes `inplace=True` through `kwargs`.
        df_other: Frame whose column labels decide what gets dropped.
        *args, **kwargs: Forwarded unchanged to `DataFrame.drop`.

    Returns:
        Whatever `DataFrame.drop` returns for the forwarded arguments.
    """
    # `Index.intersection` is the explicit set operation. `Index & Index` is
    # deprecated for this purpose in pandas 1.x and is an element-wise logical
    # AND from pandas 2.0 on.
    shared = df.columns.intersection(df_other.columns)
    return df.drop(columns = shared, *args, **kwargs)

# NOTE(review): this part mutates the frames in place, so it is meant to run
# once per collection. On a second run `add_prefix` would prefix the columns
# again and `set_index` would no longer find the columns it already moved
# into the index.

# - trending videos: index by the 'id' and 'creator.id' columns
df_trending.set_index(['id', 'creator.id'], inplace = True)
# - channels: prefix every column with 'creator.', then index by 'creator.id'
#   (presumably the channel's own 'id' column before prefixing - confirm)
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads: index by 'id', derive two boolean flags from the 'ads' column,
#   then drop the 'ads' column itself
df_ads.set_index(['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = \
        df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop('ads', axis = 'columns', inplace = True)

In [9]:
# - trending videos joined with their ad flags: the 'id' key of the trending
#   frame is matched against the index of df_ads (default inner join)
df_trending_details = df_trending.copy().merge(
    df_ads,
    left_on='id',
    right_index=True,
    copy=False,
)
# - channels: exposed under the "details" name; this is the same object as
#   df_channels, not a copy
df_trending_details_channels = df_channels

##### STEP 4  Data Inspection


In [10]:
# quick look at both result frames: the per-video details first, then the
# channel table
df_report(df_trending_details, name='Trending')
df_report(df_trending_details_channels, name='Trending (Channels)')

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AeaR5QbXgpM,UCenjunBhBhvKjfDAESnoppw,Rod Wave - Street Runner (Official Video),#rodwave #streetrunner #soulfly\n\nPre-save th...,2021-03-10 05:00:13+00:00,0 days 00:04:10,"[rod wave, hunger games, hunger games 3, ptsd,...",Music,100402,455,8503.0,631126,HD,True,True
sG9rX6Ifzhw,UCq18eeL7D9Vd8DhjMcLh9QQ,Piers and Alex Clash Over Prince Harry and Meg...,Following the Duke and Duchess of Sussex’s lan...,2021-03-09 10:58:19+00:00,0 days 00:14:05,"[good morning britain, breakfast show, news, m...",Entertainment,55602,9169,,5046626,HD,True,True
pIQIKIDZJjc,UCi3OE-aN09WOcN9d2stCvPg,David Dobrik & I Bought Markell A Car | Charli...,hi everyone. i wanted to organize this with @D...,2021-03-09 18:00:12+00:00,0 days 00:11:26,"[charli, charlie, charli d'amelio, charli dame...",People & Blogs,149389,6249,11135.0,1701305,HD,True,True
CkTVoLamPio,UC-SJ6nODDmufqBzPBwCvYvQ,"Meghan, Duchess of Sussex, opens up about her ...","For the first time Meghan, Duchess of Sussex, ...",2021-03-08 14:01:55+00:00,0 days 00:10:19,"[cbs this morning, duchess of sussex, Meghan M...",News & Politics,55138,11245,13351.0,7005216,HD,True,True
jJdlgKzVsnI,UCpTaAz_BxtkUB1qc8JTU_7g,Doja Cat - Streets (Official Video),Doja Cat // Streets (Official Video)\nHot Pink...,2021-03-09 05:00:12+00:00,0 days 00:04:34,"[doja cat, streets, doja, doja cat streets, st...",Music,936127,9817,59250.0,8043745,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,200,200.0,200.0,193.0,200.0
mean,0 days 00:10:04.885000,145427.4,2429.195,13601.336788,2717102.0
std,0 days 00:09:24.722319610,266478.7,3589.553115,31269.048841,3956645.0
min,0 days 00:00:12,555.0,60.0,104.0,87710.0
25%,0 days 00:03:18.250000,21856.25,412.25,2241.0,703151.0
50%,0 days 00:07:32.500000,55136.5,899.5,4480.0,1238538.0
75%,0 days 00:13:22,126550.5,2697.5,11315.0,2988878.0
max,0 days 00:54:22,1760165.0,22133.0,301036.0,25774470.0


---

---

# Results - Trending (Channels)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UCk6eFAsW72m7jJoEdXi1xBg,AsheMusicVEVO,,2017-07-06 10:08:45+00:00,11000,74829732,41
UCFKE7WVJfvaHW5q283SxchA,Yoga With Adriene,WELCOME to Yoga With Adriene! Our mission is t...,2012-08-30 16:59:10+00:00,9560000,879703632,599
UC2MHTOXktfTK26aDKyQs3cQ,Windies Cricket,The Official channel of the WINDIES internatio...,2017-10-20 00:49:37+00:00,1970000,386016372,892
UC2S7CGceq5tsuhcVyACdA3g,CrankGameplays,WHAT IS UP MAH CRANKY CREW?!?!\nI play those s...,2015-12-02 05:32:13+00:00,2050000,148568460,1598
UCWxCyZibDIWIrGIgP25mbfw,iKON,iKON Official YouTube Channel\n아이콘 공식 유튜브 채널입니...,2014-09-03 02:28:23+00:00,7880000,1922083730,453


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,112.0,112.0,112.0
mean,5425139.0,1811890000.0,4011.285714
std,7309595.0,3555647000.0,8445.08955
min,9090.0,16194750.0,6.0
25%,822500.0,163229000.0,81.5
50%,2095000.0,528281900.0,240.0
75%,7950000.0,1596461000.0,1838.75
max,33400000.0,17649540000.0,34992.0


---

##### STEP 5  Data Archiving (Cumulative)


In [11]:
# pickle protocol handed to df_update_pickle for both archives
pickle_proto = 3

# archive locations: one file for the videos, one for their channels
pickle_fname = 'dsamples/youtube_trending.pkl'
pickle_fname_channels = 'dsamples/youtube_trending_channels.pkl'

# fold this run's frames into the archives
df_update_pickle(df_trending_details, pickle_fname, proto=pickle_proto)
df_update_pickle(
    df_trending_details_channels,
    pickle_fname_channels,
    proto=pickle_proto,
)

# read both archives back and report on them to confirm what was stored
# NOTE(review): the recorded channel verification output lists the same
# creator.id several times - check whether df_update_pickle is expected to
# de-duplicate rows.
df_trending_details_verify = pd.read_pickle(pickle_fname)
df_report(df_trending_details_verify, name='Trending (Verification)')
df_trending_details_channels_verify = pd.read_pickle(pickle_fname_channels)
df_report(
    df_trending_details_channels_verify,
    name='Trending (Channels) (Verification)',
)

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
-aToDFwlM1Y,UC2rWFYjCHFcnjNnWhnGvv7Q,CARMEN RUSHED TO THE HOSPITAL BECAUSE OF MISCA...,Carmen rushed to the hospital signs of miscarr...,2021-03-08 22:31:08+00:00,0 days 00:13:20,"[Carmen and Corey, Carmen’s pregnant, Pregnanc...",People & Blogs,30406.0,410.0,4480.0,364643.0,HD,True,True
-hAztPfxZEk,UCtj45MepAoKxZoyR_Mnt86Q,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,25547.0,465.0,1532.0,389765.0,HD,True,True
0L8kRN_bPXM,UC7zOpx9wgvGBCDEjujnAPQA,Big Sean - Deep Reverence ft. Nipsey Hussle,Stream/Download “Detroit 2 https://bigsean.lnk...,2021-03-05 20:00:12+00:00,0 days 00:04:00,"[Big, Sean, Deep, Reverence, Getting, Out, Our...",Music,117003.0,835.0,5298.0,1659213.0,HD,True,True
0PzWnljCpL0,UCvmofFg-oZc4jvBUIfZbjzg,What Is International Women's Day?,International Women’s Day is celebrated every ...,2021-02-09 05:22:52+00:00,0 days 00:06:00,"[international women's day, what is internatio...",Education,2063.0,194.0,646.0,154588.0,HD,True,True
1hlon2py3VU,UCAXEGk-l_ioBMvHa9_uHJjg,Having His Lil Brother Be Mean To Me!,I shouldn't have let it get this far...\n\nJOI...,2021-03-04 00:33:36+00:00,0 days 00:13:21,"[prank, funny, brother, girlfriend, argue, mad...",People & Blogs,32284.0,1377.0,2080.0,934628.0,HD,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view
count,217,217.0,217.0,209.0,217.0
mean,0 days 00:09:55.317972350,139070.9,2353.668203,12995.052632,2616348.0
std,0 days 00:09:18.177271140,257878.8,3470.035672,30139.818414,3823714.0
min,0 days 00:00:12,555.0,60.0,104.0,87710.0
25%,0 days 00:03:13,19864.0,413.0,2241.0,686333.0
50%,0 days 00:07:27,53419.0,926.0,4493.0,1225780.0
75%,0 days 00:13:20,119702.0,2676.0,11135.0,2910005.0
max,0 days 00:54:22,1760165.0,22133.0,301036.0,25774470.0


---

---

# Results - Trending (Channels) (Verification)

## Data Preview

Unnamed: 0_level_0,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post
creator.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UC-SV8-bUJfXjrRMnp7F8Wzw,Roman Atwood Vlogs,"I’m Roman Atwood, this is my life as a Father ...",2013-08-12 19:42:23+00:00,15600000.0,5346514000.0,1665.0
UC-SV8-bUJfXjrRMnp7F8Wzw,Roman Atwood Vlogs,"I’m Roman Atwood, this is my life as a Father ...",2013-08-12 19:42:23+00:00,15600000.0,5346514000.0,1665.0
UC-SV8-bUJfXjrRMnp7F8Wzw,Roman Atwood Vlogs,"I’m Roman Atwood, this is my life as a Father ...",2013-08-12 19:42:23+00:00,15600000.0,5346514000.0,1665.0
UC-SV8-bUJfXjrRMnp7F8Wzw,Roman Atwood Vlogs,"I’m Roman Atwood, this is my life as a Father ...",2013-08-12 19:42:23+00:00,15600000.0,5346514000.0,1665.0
UC-yW8iz7ICKv_bhuxLtcJaw,TateMcRaeVEVO,,2019-02-05 17:08:22+00:00,8150.0,138895600.0,28.0


## Stats

Unnamed: 0,creator.stats.follower,creator.stats.view,creator.stats.post
count,408.0,408.0,408.0
mean,5698879.0,2253129000.0,2520.509804
std,9922691.0,6433059000.0,7226.666308
min,8150.0,7057505.0,6.0
25%,748000.0,78886610.0,67.0
50%,2000000.0,402920400.0,217.0
75%,7880000.0,1922084000.0,1183.0
max,74300000.0,55838550000.0,53319.0


---