# Data Sampling

## Setup

In [2]:
import sys

# Install dependencies with the interpreter that runs this kernel
# (`sys.executable`) rather than whatever `pip` happens to be on PATH.
# `--user` targets the per-user site-packages directory.
# pandas is pinned to the 1.x series; everything else comes from
# requirements.txt.
!{sys.executable} -m pip install --quiet --user --upgrade pandas==1.*
!{sys.executable} -m pip install --quiet --user --upgrade -r requirements.txt

In [3]:
import IPython
import IPython.display as disp

# Display the value of every top-level expression in a cell, not only the last.
IPython.core.interactiveshell.InteractiveShell.ast_node_interactivity = "all"

def clear():
    """Clear the current cell output, waiting for new output before erasing."""
    disp.clear_output(wait=True)

def output(disp_os):
    """Display each object of ``disp_os``, in order, through IPython's display."""
    for item in disp_os:
        disp.display(item)

def results_report(disp_os, name = ''):
    """Display ``disp_os`` as a titled report framed by horizontal rules.

    disp_os -- list of displayable objects forming the report body
    name    -- label appended to the "Results - " heading
    """
    rule_top = disp.Markdown('---')
    heading = disp.Markdown(f'# Results - {name}')
    rule_bottom = disp.Markdown('---')
    output([rule_top, heading] + disp_os + [rule_bottom])

In [5]:
from dcollect import plugins

from dcollect import api_tiktok as tiktok
from dcollect import api_youtube as youtube
from dcollect import api_youtubei as youtubei

import pandas as pd


def df_from_json(items, *args, **kwargs):
    """Flatten JSON-like records into a DataFrame via ``pd.json_normalize``.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    frame = pd.json_normalize(items, *args, **kwargs)
    return frame

def df_report(dfs, full = False, *args, **kwargs):
    """Render a preview-and-statistics report for one or more DataFrames.

    dfs  -- a DataFrame, or a list of DataFrames (one report per frame)
    full -- show the whole frame in the preview instead of ``head()``
    Remaining arguments are forwarded to ``results_report`` (e.g. ``name``).
    """
    frames = dfs if isinstance(dfs, list) else [dfs]
    for frame in frames:
        preview = frame if full else frame.head()
        results_report([
            disp.Markdown('## Data Preview'),
            preview,
            disp.Markdown('## Stats'),
            frame.describe()
        ], *args, **kwargs)

def df_report_from_json(items, name = '', full = False, *args, **kwargs):
    """Normalize JSON-like ``items`` into a DataFrame and report on it.

    Extra arguments go to ``df_from_json``; ``name`` and ``full`` go to
    ``df_report``, whose return value is passed back.
    """
    frame = df_from_json(items, *args, **kwargs)
    return df_report(frame, name = name, full = full)

## Data Collection

In [6]:
# Shared arguments handed to every API object created below.
# `modules` registers a plugin under the 'http' key — presumably the HTTP
# client the API objects use; confirm in dcollect.plugins.
modules = {'http': plugins.fasthttp()}
# No custom headers are supplied.
headers = None

### YouTube (United States)

Initial setup. Be sure to have your API key ready. For details on how to obtain an API key, read [YouTube Data API Overview, Introduction: Before you start](https://developers.google.com/youtube/v3/getting-started#before-you-start).

In [7]:
# Credentials must never be committed with the notebook. The key is read from
# the YOUTUBE_API_KEY environment variable; when that is unset (or empty) the
# notebook asks for it with a hidden prompt so it does not end up in the output.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and revoked.
import getpass
import os

api_key_testing = os.environ.get('YOUTUBE_API_KEY')
api_key = api_key_testing or getpass.getpass('YouTube Data API Key: ')

#### Search

In [8]:
# Search parameters passed to `youtube_o.video.search` in STEP 2.
# result count handed to the search call
count = 5
# search term; empty here — NOTE(review): confirm how the API treats an empty keyword
keyword = ''

##### STEP 1  API Object Creation

In [9]:
# YouTube Data API object (receives the API key)
youtube_o = youtube.api(modules = modules, headers = headers, key = api_key)

# YouTube Internals API object (no key is passed)
youtubei_o = youtubei.api(modules = modules, headers = headers)

##### STEP 2  Data Collection

In [10]:
from dcollect.utils.thread import threading, thread


# The search runs first; every follow-up request below is keyed by its results.
df_search = df_from_json(
    youtube_o.video.search(
        count = count,
        keyword = keyword
    )
)

# Placeholders filled in by the worker threads. Each worker publishes its
# frame through `globals()` because a thread target cannot return a value.
df_info = None
df_channels = None
df_ads = None

thread.start([
    threading.Thread(
        # - info
        target = lambda: \
            globals().update(
                df_info = df_from_json(
                    youtube_o.video.info(
                        id = df_search['id']
                    )
                )
            )
    ),
    threading.Thread(
        # - channels
        target = lambda: \
            globals().update(
                df_channels = df_from_json(
                    youtube_o.channel.info(
                        id = df_search['creator.id']
                    )
                )
            )
    ),
    threading.Thread(
        # - ad placements
        target = lambda: \
            globals().update(
                df_ads = df_from_json(
                    youtubei_o.ad.placements(
                        id = df_search['id']
                    )
                )
            )
    )
])
# NOTE(review): assumes `thread.join()` blocks until every thread started
# above has finished — confirm in dcollect.utils.thread.
thread.join()

# With standard `threading.Thread` semantics an exception inside a worker is
# not re-raised in this cell, so a failed request would otherwise only surface
# later as an unrelated error on a frame that is still None. Fail here instead.
_unfinished = [
    _name for _name in ('df_info', 'df_channels', 'df_ads')
    if globals()[_name] is None
]
if _unfinished:
    raise RuntimeError(
        'data collection did not complete for: ' + ', '.join(_unfinished)
    )

##### STEP 3  Data Cleaning


In [11]:
# - ads (filter)
def filter_has_ad(ads):
    """Return True when ad-placement data is present, i.e. ``ads`` is not None.

    An identity check is used instead of ``==`` so that values with a custom
    ``__eq__`` (array-likes, for instance) cannot distort or break the test.
    """
    return ads is not None
def filter_has_ad_beginning(ads):
    """Return True when any ad in ``ads`` is of kind ``START``.

    ``ads`` is either None (treated as "no ads") or an iterable of mappings
    whose 'kind' entry is compared with ``youtubei.resource.ad.kinds.START``.
    None is detected by identity, so a custom ``__eq__`` on ``ads`` is never
    invoked.
    """
    if ads is None:
        return False
    return any(
        ad['kind'] == youtubei.resource.ad.kinds.START
        for ad in ads
    )
# - * (filter)
def drop_common(df, df_other, *args, **kwargs):
    """Drop from ``df`` every column whose label also appears in ``df_other``.

    Extra arguments are forwarded to ``DataFrame.drop`` (e.g. ``inplace``);
    the return value is whatever ``drop`` returns (None when done in place).
    """
    # `Index.intersection` is the explicit set operation; using `&` between
    # two Index objects for this purpose is deprecated in pandas 1.x.
    shared = df.columns.intersection(df_other.columns)
    return df.drop(columns = shared, *args, **kwargs)

# NOTE: everything below mutates the frames from STEP 2 in place, so this
# cell cannot simply be re-run: once 'id' has become the index, `set_index`
# no longer finds it as a column. Re-run STEP 2 before re-running this cell.

# - search
df_search.set_index(['id', 'creator.id'], inplace = True)
# - info
df_info.set_index(['id', 'creator.id'], inplace = True)
# - channels
# prefix every channel column with 'creator.' so that the channel's own 'id'
# column becomes 'creator.id' before it is used as the index
df_channels = df_channels.add_prefix('creator.')
df_channels.set_index(['creator.id'], inplace = True)
# - ads
# replace the raw 'ads' column with two boolean flags derived from it
df_ads.set_index(['id'], inplace = True)
df_ads['has_ad'] = df_ads['ads'].apply(filter_has_ad)
df_ads['has_ad_at_beginning'] = \
        df_ads['ads'].apply(filter_has_ad_beginning)
df_ads.drop('ads', axis = 'columns', inplace = True)

# drop common columns to avoid clashing
# in this case, only `df_search` and `df_info` have merging conflicts
drop_common(df_search, df_info, inplace = True)

# No `how` is passed to the merges below, so pandas' default inner join
# applies: a video missing from any of the three lookups is dropped.
# NOTE(review): each merge combines `on` with `right_index = True`; some
# pandas releases reject that combination with a MergeError — confirm
# against the installed pandas version.

# - search (with details)
df_search_details = df_search.copy()
# - info
df_search_details = df_search_details.merge(
    df_info, 
    right_index = True, 
    on = 'id', 
    copy = False
)
# - channels
df_search_details = df_search_details.merge(
    df_channels, 
    right_index = True, 
    on = 'creator.id', 
    copy = False
)
# - ads
df_search_details = df_search_details.merge(
    df_ads, 
    right_index = True, 
    on = 'id', 
    copy = False
)

##### STEP 4  Data Inspection


In [12]:
# take a brief look at our data:
# a `head()` preview plus `describe()` statistics under a "Search Results" heading
df_report(df_search_details, name = 'Search Results')

---

# Results - Search Results

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
L4VGtQBXjY8,UCqFzWxSCi39LnW1JKFR3efg,Weekend Update: Rep. Marjorie Taylor Greene on...,Rep. Marjorie Taylor Greene (Cecily Strong) st...,2021-02-28 06:28:15+00:00,0 days 00:03:30,"[snl, saturday night live, season 46, snl 46, ...",Entertainment,34706,755,2400,1875436,HD,Saturday Night Live,Welcome to the official Saturday Night Live ch...,2013-07-23 21:32:27+00:00,11500000,11125565070,7537,True,True
_o_nBVl55W4,UCJ5v_MCY6GNUBTO8-D3XoAg,Top 10 Friday Night SmackDown moments: WWE Top...,WWE Top 10 takes you back to this week's Frida...,2021-02-27 20:07:19+00:00,0 days 00:06:24,"[wwe, world wrestling entertainment, wrestling...",Sports,31359,982,1054,1243962,HD,WWE,WWE on YouTube is your number one spot to catc...,2007-05-11 01:20:02+00:00,74200000,55758124340,53285,True,True
O9nkGW_MIgA,UCWJ2lWNubArHWmf3FIHbfcQ,"NBA Top 10 Plays Of The Night | February 26, 2021",Check out the top 10 plays of the night from F...,2021-02-27 07:25:07+00:00,0 days 00:02:28,"[nba, highlights, basketball, plays, amazing, ...",Sports,7757,278,578,475580,HD,NBA,The official YouTube Page of the NBA\n\nDon’t ...,2005-11-21 01:20:33+00:00,16100000,8182624000,34987,True,True
OpjATUDG1Io,UC0YatYmg5JRYzXJPxIdRd8g,Sevilla vs Barcelona | LALIGA HIGHLIGHTS | 2/2...,A double-header with Sevilla will go some way ...,2021-02-27 17:17:28+00:00,0 days 00:11:49,"[beIN SPORTS, beIN SPORTS USA, Futbol, Footbal...",Sports,10849,344,1672,1021545,HD,beIN SPORTS USA,The fastest growing network in the US offering...,2012-07-16 20:11:38+00:00,458000,131374826,7323,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,4,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0 days 00:06:02.750000,21167.75,589.75,1426.0,1154131.0,25564500.0,18799420000.0,25783.0
std,0 days 00:04:11.522530468,13825.949114,336.030133,788.813455,579183.6,33081390.0,25073530000.0,22470.437883
min,0 days 00:02:28,7757.0,278.0,578.0,475580.0,458000.0,131374800.0,7323.0
25%,0 days 00:03:14.500000,10076.0,327.5,935.0,885053.8,8739500.0,6169812000.0,7483.5
50%,0 days 00:04:57,21104.0,549.5,1363.0,1132754.0,13800000.0,9654095000.0,21262.0
75%,0 days 00:07:45.250000,32195.75,811.75,1854.0,1401830.0,30625000.0,22283700000.0,39561.5
max,0 days 00:11:49,34706.0,982.0,2400.0,1875436.0,74200000.0,55758120000.0,53285.0


---

##### STEP 5  Data Archiving


In [13]:
import os

# pickle protocol used for the archive
# NOTE(review): presumably 3 was chosen so older Python 3 readers can load it — confirm
pickle_proto = 3
pickle_fname = 'dsamples/youtube_search.pkl'

# make sure the target directory exists; `to_pickle` does not create it
os.makedirs(os.path.dirname(pickle_fname), exist_ok = True)
df_search_details.to_pickle(pickle_fname, protocol = pickle_proto)

# read the archive back and display it so it can be compared with the
# STEP 4 report
# (only unpickle files you produced yourself: loading a pickle can run code)
df_search_details_check = pd.read_pickle(pickle_fname)
df_report(df_search_details_check, name = 'Search Results (Verification)')


---

# Results - Search Results (Verification)

## Data Preview

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,time,length,tags,category,stats.like,stats.dislike,stats.comment,stats.view,video.quality,creator.title,creator.description,creator.time,creator.stats.follower,creator.stats.view,creator.stats.post,has_ad,has_ad_at_beginning
id,creator.id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
L4VGtQBXjY8,UCqFzWxSCi39LnW1JKFR3efg,Weekend Update: Rep. Marjorie Taylor Greene on...,Rep. Marjorie Taylor Greene (Cecily Strong) st...,2021-02-28 06:28:15+00:00,0 days 00:03:30,"[snl, saturday night live, season 46, snl 46, ...",Entertainment,34706,755,2400,1875436,HD,Saturday Night Live,Welcome to the official Saturday Night Live ch...,2013-07-23 21:32:27+00:00,11500000,11125565070,7537,True,True
_o_nBVl55W4,UCJ5v_MCY6GNUBTO8-D3XoAg,Top 10 Friday Night SmackDown moments: WWE Top...,WWE Top 10 takes you back to this week's Frida...,2021-02-27 20:07:19+00:00,0 days 00:06:24,"[wwe, world wrestling entertainment, wrestling...",Sports,31359,982,1054,1243962,HD,WWE,WWE on YouTube is your number one spot to catc...,2007-05-11 01:20:02+00:00,74200000,55758124340,53285,True,True
O9nkGW_MIgA,UCWJ2lWNubArHWmf3FIHbfcQ,"NBA Top 10 Plays Of The Night | February 26, 2021",Check out the top 10 plays of the night from F...,2021-02-27 07:25:07+00:00,0 days 00:02:28,"[nba, highlights, basketball, plays, amazing, ...",Sports,7757,278,578,475580,HD,NBA,The official YouTube Page of the NBA\n\nDon’t ...,2005-11-21 01:20:33+00:00,16100000,8182624000,34987,True,True
OpjATUDG1Io,UC0YatYmg5JRYzXJPxIdRd8g,Sevilla vs Barcelona | LALIGA HIGHLIGHTS | 2/2...,A double-header with Sevilla will go some way ...,2021-02-27 17:17:28+00:00,0 days 00:11:49,"[beIN SPORTS, beIN SPORTS USA, Futbol, Footbal...",Sports,10849,344,1672,1021545,HD,beIN SPORTS USA,The fastest growing network in the US offering...,2012-07-16 20:11:38+00:00,458000,131374826,7323,True,True


## Stats

Unnamed: 0,length,stats.like,stats.dislike,stats.comment,stats.view,creator.stats.follower,creator.stats.view,creator.stats.post
count,4,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0 days 00:06:02.750000,21167.75,589.75,1426.0,1154131.0,25564500.0,18799420000.0,25783.0
std,0 days 00:04:11.522530468,13825.949114,336.030133,788.813455,579183.6,33081390.0,25073530000.0,22470.437883
min,0 days 00:02:28,7757.0,278.0,578.0,475580.0,458000.0,131374800.0,7323.0
25%,0 days 00:03:14.500000,10076.0,327.5,935.0,885053.8,8739500.0,6169812000.0,7483.5
50%,0 days 00:04:57,21104.0,549.5,1363.0,1132754.0,13800000.0,9654095000.0,21262.0
75%,0 days 00:07:45.250000,32195.75,811.75,1854.0,1401830.0,30625000.0,22283700000.0,39561.5
max,0 days 00:11:49,34706.0,982.0,2400.0,1875436.0,74200000.0,55758120000.0,53285.0


---

In [14]:
# Category id -> name table exposed by the YouTube module, keyed by region
# (the recorded output below shows the 'us' entries).
youtube.types.topic.all


{'us': {'1': 'Film & Animation',
  '2': 'Autos & Vehicles',
  '10': 'Music',
  '15': 'Pets & Animals',
  '17': 'Sports',
  '18': 'Short Movies',
  '19': 'Travel & Events',
  '20': 'Gaming',
  '21': 'Videoblogging',
  '22': 'People & Blogs',
  '23': 'Comedy',
  '24': 'Entertainment',
  '25': 'News & Politics',
  '26': 'Howto & Style',
  '27': 'Education',
  '28': 'Science & Technology',
  '29': 'Nonprofits & Activism',
  '30': 'Movies',
  '31': 'Anime/Animation',
  '32': 'Action/Adventure',
  '33': 'Classics',
  '34': 'Comedy',
  '35': 'Documentary',
  '36': 'Drama',
  '37': 'Family',
  '38': 'Foreign',
  '39': 'Horror',
  '40': 'Sci-Fi/Fantasy',
  '41': 'Thriller',
  '42': 'Shorts',
  '43': 'Shows',
  '44': 'Trailers'}}