# Data Sampling

## Setup

In [1]:
import sys

!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [2]:
from goodies import *
import pandas as pd

## Data Collection

In [3]:
from dcollect import plugins

modules = {'http': plugins.fasthttp()}
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [4]:
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

# This key is for testing ONLY. DO NOT release to the public!
api_key_testing = None
api_key = api_key_testing or input('YouTube Data API Key: ')

#### Search

In [5]:
count = 200
keyword = ''

##### STEP 1  API Object Creation

In [6]:
# create a YouTube API object
youtube_o = youtube.api(
    modules = modules,
    headers = headers,
    key = api_key
)

# create a YouTube Internals API object
youtubei_o = youtubei.api(
    modules = modules,
    headers = headers
)

##### STEP 2  Data Collection

In [7]:
from dcollect.utils.thread import threading, thread
from dcollect.utils.log import log

# set logging level
log.enable(level = log.levels.WARNING)


df_trending = df_from_json(
    youtube_o.video.trending(
        count = count
    )
)

df_channels = None
df_ads = None

thread.start([
    threading.Thread(
        # - channels
        target = lambda: \
            globals().update(
                df_channels = df_from_json(
                    youtube_o.channel.info(
                        id = df_trending['creator.id']
                    )
                )
            )
    ),
    threading.Thread(
        # - ad placements
        target = lambda: \
            globals().update(
                df_ads = df_from_json(
                    youtubei_o.ad.placements(
                        id = df_trending['id']
                    )
                )
            )
    )
])
thread.join()

##### STEP 3  Data Cleaning


In [8]:
# - ads (filter)
def filter_has_ad(ads):
    return not ads == None 
def filter_has_ad_beginning(ads):
    if ads == None:
        return False
    for ad in ads:
        if ad['kind'] == youtubei.resource.ad.kinds.START:
            return True
    return False
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    return df.drop(columns = df.columns & df_other.columns, *args, **kwargs)

# - search
df_trending.set_index(['id', 'creator.id'], inplace = True)
# - channels
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
df_ads.set_index(['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = \
        df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop('ads', axis = 'columns', inplace = True)

In [19]:
# - search (with details)
df_trending_details = df_trending.copy()
# - ads
df_trending_details = df_trending_details.merge(
    df_ads, 
    right_index = True, 
    left_on = 'id', 
    copy = False
)
# - channels
df_trending_details_channels = df_channels

##### STEP 4  Data Inspection


In [10]:
# take a brief look at our data
df_report(df_trending_details, name = 'Trending')
df_report(df_trending_details_channels, name = 'Trending (Channels)')

---

# Results - Trending

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
R0R0gSE1jmM,UCw1SQ6QRRtfAhrN_cjkrOgA,lazarbeam skin is BROKEN,the lazarbeam skin has a few bugs...\n\nCODE L...,2021-03-09 03:06:47+00:00,0 days 00:08:15,[gaming],Gaming,214199,1559,14148.0,3840764,HD,LazarBeam,THIS CHANNEL FEATURES AUSTRALIAN MAN\nFor Spon...,2015-01-04 23:17:08+00:00,18300000,7052999799,1220,True,True
R0R0gSE1jmM,UCw1SQ6QRRtfAhrN_cjkrOgA,lazarbeam skin is BROKEN,the lazarbeam skin has a few bugs...\n\nCODE L...,2021-03-09 03:06:47+00:00,0 days 00:08:15,[gaming],Gaming,214199,1559,14148.0,3840764,HD,LazarBeam,THIS CHANNEL FEATURES AUSTRALIAN MAN\nFor Spon...,2015-01-04 23:17:08+00:00,18300000,7052999799,1220,True,True
R0R0gSE1jmM,UCw1SQ6QRRtfAhrN_cjkrOgA,lazarbeam skin is BROKEN,the lazarbeam skin has a few bugs...\n\nCODE L...,2021-03-09 03:06:47+00:00,0 days 00:08:15,[gaming],Gaming,214199,1559,14148.0,3840764,HD,LazarBeam,THIS CHANNEL FEATURES AUSTRALIAN MAN\nFor Spon...,2015-01-04 23:17:08+00:00,18300000,7052999799,1220,True,True
R0R0gSE1jmM,UCw1SQ6QRRtfAhrN_cjkrOgA,lazarbeam skin is BROKEN,the lazarbeam skin has a few bugs...\n\nCODE L...,2021-03-09 03:06:47+00:00,0 days 00:08:15,[gaming],Gaming,214199,1559,14148.0,3840764,HD,LazarBeam,THIS CHANNEL FEATURES AUSTRALIAN MAN\nFor Spon...,2015-01-04 23:17:08+00:00,18300000,7052999799,1220,True,True
g5TxSM16S6Q,UCw1SQ6QRRtfAhrN_cjkrOgA,first game with MY SKIN,LazarBeam X Fortnite MARCH 4TH - CODE LAZAR \n...,2021-03-03 12:22:32+00:00,0 days 00:09:59,"[gaming, fortnite]",Gaming,402680,4977,37761.0,8506267,HD,LazarBeam,THIS CHANNEL FEATURES AUSTRALIAN MAN\nFor Spon...,2015-01-04 23:17:08+00:00,18300000,7052999799,1220,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,224,224.0,224.0,224.0,224.0,224.0,224.0,224.0
mean,0 days 00:09:03.214285714,112235.3,1857.5,13507.928571,2466097.0,5963271.0,2543321000.0,2299.071429
std,0 days 00:08:27.451266582,162959.2,2652.20721,39329.942919,3515777.0,11266730.0,7772647000.0,7504.823009
min,0 days 00:00:23,3787.0,110.0,504.0,271944.0,8150.0,7057505.0,6.0
25%,0 days 00:03:09.250000,23788.25,392.0,2114.5,744914.8,835000.0,77872390.0,68.5
50%,0 days 00:05:58.500000,56274.5,715.0,4760.5,1282617.0,2090000.0,301932100.0,326.5
75%,0 days 00:11:31.750000,105753.0,1564.0,10137.0,2452179.0,6627500.0,1589453000.0,1223.5
max,0 days 00:37:48,1021220.0,10751.0,293380.0,20894410.0,74300000.0,55838550000.0,53319.0


---

##### STEP 5  Data Archiving (Cumulative)


In [11]:
pickle_proto = 3

pickle_fname = 'dsamples/youtube_trending.pkl'
pickle_fname_channels = 'dsamples/youtube_trending_channels.pkl'

df_update_pickle(df_trending_details, pickle_fname, proto = pickle_proto)
df_update_pickle(df_trending_details_channels, pickle_fname_channels, proto = pickle_proto)

# verify that we saved the correct data
df_trending_details_verify = pd.read_pickle(pickle_fname)
df_report(df_trending_details_verify, name = 'Trending (Verification)')
df_trending_details_channels_verify = pd.read_pickle(pickle_fname)
df_report(df_trending_details_channels_verify, name = 'Trending (Channels) (Verification)')

---

# Results - Trending (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
-hAztPfxZEk,UCtj45MepAoKxZoyR_Mnt86Q,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,24363.0,400.0,1450.0,360642.0,HD,Royal Family,,2018-07-08 20:48:12+00:00,2790000.0,203263400.0,192.0,True,True
-hAztPfxZEk,UCtj45MepAoKxZoyR_Mnt86Q,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,24363.0,400.0,1450.0,360642.0,HD,Royal Family,,2018-07-08 20:48:12+00:00,2790000.0,203263400.0,192.0,True,True
-hAztPfxZEk,UCtj45MepAoKxZoyR_Mnt86Q,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,24363.0,400.0,1450.0,360642.0,HD,Royal Family,,2018-07-08 20:48:12+00:00,2790000.0,203263400.0,192.0,True,True
-hAztPfxZEk,UCtj45MepAoKxZoyR_Mnt86Q,FIRST DAY IN OUR NEW HOME! (HECTIC),LAST VIDEO: \r\n\r\n*MAKE SURE OUR POST NOTIFI...,2021-03-09 03:38:11+00:00,0 days 00:14:23,"[queen Naija, Medicine, Queen, Royalty Squad, ...",Entertainment,24363.0,400.0,1450.0,360642.0,HD,Royal Family,,2018-07-08 20:48:12+00:00,2790000.0,203263400.0,192.0,True,True
1XtcaLagLEo,UCCj956IF62FbT7Gouszaj9w,Meghan accuses royals of 'perpetuating falseho...,Subscribe and 🔔 to OFFICIAL BBC YouTube 👉 http...,2021-03-04 06:58:02+00:00,0 days 00:10:45,"[British TV, British TV Shows, Watch UK TV Onl...",News & Politics,2356.0,876.0,2861.0,665451.0,HD,BBC,The BBC is the world’s leading public service ...,2005-11-12 02:36:31+00:00,10600000.0,7772940000.0,13055.0,False,False


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,664,664.0,664.0,651.0,664.0,664.0,664.0,664.0
mean,0 days 00:09:51.944277108,122025.0,2884.320783,12511.391705,2516563.0,5488461.0,1983159000.0,3718.052711
std,0 days 00:09:30.577349023,190765.9,11265.946403,30623.037625,3439644.0,8854426.0,4958621000.0,9220.233843
min,0 days 00:00:14,326.0,110.0,339.0,90521.0,8150.0,7057505.0,6.0
25%,0 days 00:03:04,17322.0,356.0,2182.0,702998.0,751000.0,109236700.0,126.0
50%,0 days 00:06:11,50163.0,697.0,4513.0,1295399.0,1880000.0,488165100.0,703.0
75%,0 days 00:11:55,126769.0,2705.75,10029.0,2416874.0,8370000.0,1770375000.0,1647.0
max,0 days 00:49:23,1516146.0,142630.0,293380.0,20894410.0,74300000.0,55838550000.0,53319.0


---