# Setup

In [1]:
import sys
import os

# Make the parent of the current working directory importable.
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from constants import video_ids, channel_ids, search_keywords, playlist_ids
from tests.utils import measure_time

# Read the API key through a context manager so the file handle is closed,
# and strip surrounding whitespace (e.g. a trailing newline) from the key.
with open("devKey") as key_file:
    devKey = key_file.read().strip()

# Type 1: Iterable

## CommentThreadsRetriever

In [None]:
from yt_pipeline.retriever import CommentThreadsRetriever, PipeSettings, RetrieverSettings

# Settings shared by every request issued by this retriever
pipe_settings = PipeSettings(
    retrieval="all",
    max_page=10,
)

print("Size of iterable:", len(video_ids))

# One retriever over all video ids, run with multithreading enabled
worker = CommentThreadsRetriever(
    iterable=video_ids,
    developerKey=devKey,
    settings=pipe_settings,
)
results = worker.invoke(multithread=True)

print("Size of results:", len(results))

In [None]:
# Print the reason of every error the worker ignored during retrieval
# (print `failure.uri` as well when the failing request needs inspecting)
for failure in worker.ignored_errors:
    print(failure.reason)

### Highlights
- disabled comment sections captured
- invalid video id captured

## CommentThreadsContainer

In [None]:
from yt_pipeline.container import CommentThreadsContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = CommentThreadsContainer(raw_items)

In [None]:
# Preview the first five container items, separated by a blank line
preview = box.items[:5]
for item in preview:
    print(item, end="\n\n")

## CommentThreadsShipper

In [None]:
from yt_pipeline.shipper import CommentThreadsShipper

# Feed the container items to the shipper; the cell displays whatever invoke() returns
shipper = CommentThreadsShipper()
shipper.invoke(box.items)

In [None]:
import pandas as pd

# Tabulate the shipper's main records and show the first five rows
df = pd.DataFrame(data=shipper.main_records)
df.head(n=5)

## PlaylistItemsRetriever

In [None]:
from yt_pipeline.retriever import PlaylistItemsRetriever, PipeSettings

# Settings shared by every request issued by this retriever
settings = PipeSettings(
    retrieval="all",
    max_page=10,
)

print("Size of playlist ids:", len(playlist_ids))

worker = PlaylistItemsRetriever(
    iterable=playlist_ids,
    developerKey=devKey,
    settings=settings,
    max_workers=16,
)
# flatten_result=False is passed through to invoke() unchanged
results = worker.invoke(multithread=True, flatten_result=False)

print("Size of results:", len(results))

In [None]:
# Print the reason of every error the worker ignored during retrieval
# (print `failure.uri` as well when the failing request needs inspecting)
for failure in worker.ignored_errors:
    print(failure.reason)

### Highlights
- invalid playlist id captured and skipped
- an empty id produces an empty result

## PlaylistItemsContainer

In [None]:
from yt_pipeline.container import PlaylistItemsContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = PlaylistItemsContainer(raw_items)

# Preview the first five items, separated by a blank line
for i in box.items[:5]:
    print(i, end='\n\n')

## PlaylistItemShipper

In [None]:
from yt_pipeline.shipper import PlaylistItemShipper

import pandas as pd

# Run the shipper over the container items
shipper = PlaylistItemShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

## SearchRetriever

In [None]:
# normal run; check what happens when the quota is exceeded
from yt_pipeline.retriever import SearchRetriever, PipeSettings, SearchParamProps, SearchTypeCheckboxProps


settings = PipeSettings(
    retrieval="all",
    max_page=1,
)

# One search parameter object per keyword in the [20:80) slice of the keyword list
keywords = search_keywords[20:80]
params = [SearchParamProps(keyword) for keyword in keywords]

# Search across all three resource types
types = SearchTypeCheckboxProps(
    channel=True,
    video=True,
    playlist=True,
)

print("Size of search params:", len(params))

worker = SearchRetriever(
    iterable=params,
    developerKey=devKey,
    types=types,
    settings=settings,
)
results = worker.invoke(multithread=True)

print("Size of results:", len(results))

In [None]:
# Print the reason of every error the worker ignored during retrieval
# (print `failure.uri` as well when the failing request needs inspecting)
for failure in worker.ignored_errors:
    print(failure.reason)

In [None]:
# run 2: different params or invalid params passed
from yt_pipeline.retriever import SearchRetriever, PipeSettings, SearchParamProps, SearchTypeCheckboxProps

settings = PipeSettings(
    retrieval="all",
    max_page=1,
)

# Every variation shares the same keyword and channel filter
keyword = search_keywords[0]
channel = 'UC84t1K5ri-7u9bFCaUKTXDA'

params = [
    # channel filter only
    SearchParamProps(keyword, channelId=channel, order="relevance"),
    # plus a duration filter
    SearchParamProps(keyword, channelId=channel, videoDuration="long", order="relevance"),
    # plus a publication-date lower bound
    SearchParamProps(
        keyword,
        channelId=channel,
        videoDuration="long",
        order="relevance",
        publishedAfter="2023-02-01T13:00:02Z",
    ),
    # channel filter combined with a video category
    SearchParamProps(keyword, channelId=channel, videoCategoryId="19", order="relevance"),
]

# Video only; adding playlist=True is the alternative to try for the invalid-params case
types = SearchTypeCheckboxProps(video=True)

print("Size of search params:", len(params))

worker = SearchRetriever(
    iterable=params,
    developerKey=devKey,
    types=types,
    settings=settings,
)
results = worker.invoke(multithread=True)

print("Size of results:", len(results))

In [None]:
# Display the search parameter objects built in the previous run
params

In [None]:
# Inspect the final element of `results`.
# Alternative view (publish timestamps only; assumes that element is a list of items):
# [i['snippet']['publishedAt'] for i in results[-1]]
results[-1]

### Highlights:
- Quota-exceeded error is captured
- A badRequest error will be returned and the pipe will halt, e.g. if channelId is specified, the type must be set to video; more details at https://developers.google.com/youtube/v3/docs/search/list
- A request without a specified 'type' is allowed by the API; this pipeline discourages that behaviour $\rightarrow$ the 'types' parameter is required at instantiation

### Observation:
- Sometimes, even when the type is specified as 'video', results of type 'channel' can still occur; this appears to be a bug in the YouTube Data API

## SearchContainer

In [None]:
from yt_pipeline.container import SearchContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = SearchContainer(raw_items)

# Preview the first five items, separated by a blank line
for i in box.items[:5]:
    print(i, end='\n\n')

## SearchShipper

In [None]:
from yt_pipeline.shipper import SearchShipper

import pandas as pd

# Run the shipper over the container items
shipper = SearchShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

## PlaylistsRetriever

In [None]:
from yt_pipeline.retriever import PlaylistsRetriever, PipeSettings, SearchParamProps, SearchTypeCheckboxProps

settings = PipeSettings(retrieval="all", max_page=3)

# The retriever is driven by channel ids, so the label reports channel ids
# (it previously said "playlist ids" while counting `channel_ids`).
print("Size of channel ids:", len(channel_ids))

worker = PlaylistsRetriever(
    iterable=channel_ids,
    developerKey=devKey,
    settings=settings,
    max_workers=16,
)
results = worker.invoke(multithread=True)
print("Size of results:", len(results))

### Highlights:
- an invalid id will be captured and an error returned, as `channelId` serves as a filter rather than an input id

## PlaylistsContainer

In [None]:
from yt_pipeline.container import PlaylistsContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = PlaylistsContainer(raw_items)

# Preview the first five items, separated by a blank line
for i in box.items[:5]:
    print(i, end='\n\n')

## PlaylistShipper

In [None]:
from yt_pipeline.shipper import PlaylistShipper

import pandas as pd

# Run the shipper over the container items
shipper = PlaylistShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

# Type 2: Unique

## VideosRetriever

In [None]:
from yt_pipeline.retriever import VideosRetriever

In [None]:
print("Size of video ids:", len(video_ids))

# Retrieve the videos with multithreading enabled
worker = VideosRetriever(
    iterable=video_ids,
    developerKey=devKey,
    max_workers=16,
)
results = worker.invoke(multithread=True)

print("Size of results:", len(results))

## VideosContainer

In [None]:
from yt_pipeline.container import VideosContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = VideosContainer(raw_items)

# Preview the first five items, separated by a blank line
for i in box.items[:5]:
    print(i, end='\n\n')

## VideoShipper

In [None]:
from yt_pipeline.shipper import VideoShipper

import pandas as pd

# Run the shipper over the container items
shipper = VideoShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

## ChannelsRetriever

In [None]:
from yt_pipeline.retriever import ChannelsRetriever

print("Size of channel ids:", len(channel_ids))

# Retrieve the channels with multithreading enabled
worker = ChannelsRetriever(
    iterable=channel_ids,
    developerKey=devKey,
    max_workers=16,
)
results = worker.invoke(multithread=True)

print("Size of results:", len(results))

## ChannelsContainer

In [None]:
from yt_pipeline.container import ChannelsContainer

# `results` is either the raw items or an (items, error) pair; handle both shapes.
if isinstance(results, tuple):
    raw_items, error = results
    # A notebook cell has no enclosing object, so there is no `self.errors`
    # to append to; report the error directly instead.
    print("Retrieval error:", error)
else:
    raw_items = results

# Wrap the raw items in the container
box = ChannelsContainer(raw_items)

# Preview the first five items, separated by a blank line
for i in box.items[:5]:
    print(i, end='\n\n')

## ChannelShipper

In [None]:
from yt_pipeline.shipper import ChannelShipper

import pandas as pd

# Run the shipper over the container items
shipper = ChannelShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

# Type 3: SingleRetriever

## VideoCategoriesRetriever

In [None]:
from yt_pipeline.retriever import VideoCategoriesRetriever
from yt_pipeline.retriever.video_categories import VideoCategoriesParams


# Single request: video categories for region code "JP"
category_params = VideoCategoriesParams(regionCode="JP")
worker = VideoCategoriesRetriever(
    params=category_params,
    developerKey=devKey,
)
results = worker.invoke()
results

## VideoCategoriesContainer

In [None]:
from yt_pipeline.container import VideoCategoriesContainer

# Wrap the retrieved results in the container and display its items
box = VideoCategoriesContainer(results)
box.items

## VideoCategoriesShipper

In [None]:
from yt_pipeline.shipper import VideoCategoriesShipper

import pandas as pd

# Run the shipper over the container items
shipper = VideoCategoriesShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

# Type 4: Captions

## CaptionsRetriever

In [3]:
from yt_pipeline.retriever import CaptionsRetriever
from yt_pipeline.retriever.captions import CaptionsParams

# A single caption lookup, keyed by video id
params = [CaptionsParams(videoId="eP_P4KOjwhs")]

worker = CaptionsRetriever(
    iterable=params,
    developerKey=devKey,
)
results = worker.invoke()
results

 1  /  1  batch(s) retrieved: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.61it/s]


[{'kind': 'youtube#caption',
  'etag': 'KvyLPCCMDI4qR7b375Hjdons9Xo',
  'id': 'AUieDaaWfE5gSTZluRBEpOonEu8C-48ZrDo4gea9EpGoqHwRp1c',
  'snippet': {'videoId': 'eP_P4KOjwhs',
   'lastUpdated': '2025-01-24T09:35:52.395103Z',
   'trackKind': 'asr',
   'language': 'en',
   'name': '',
   'audioTrackType': 'unknown',
   'isCC': False,
   'isLarge': False,
   'isEasyReader': False,
   'isDraft': False,
   'isAutoSynced': False,
   'status': 'serving'}}]

## CaptionsContainer

In [4]:
from yt_pipeline.container import CaptionsContainer

# Wrap the retrieved results in the container and display its items
box = CaptionsContainer(results)
box.items

[CaptionItem(kind='youtube#caption', etag='KvyLPCCMDI4qR7b375Hjdons9Xo', id='AUieDaaWfE5gSTZluRBEpOonEu8C-48ZrDo4gea9EpGoqHwRp1c', snippet=CaptionSnippet(videoId='eP_P4KOjwhs', lastUpdated='2025-01-24T09:35:52.395103Z', trackKind='asr', language='en', name='', audioTrackType='unknown', isCC=False, isLarge=False, isEasyReader=False, isDraft=False, isAutoSynced=False, status='serving'))]

## CaptionsShipper

In [6]:
from yt_pipeline.shipper import CaptionsShipper

import pandas as pd

# Run the shipper over the container items
shipper = CaptionsShipper()
shipper.invoke(box.items)

# Tabulate the shipper's main records and display the full frame
df = pd.DataFrame(data=shipper.main_records)
df

Unnamed: 0,kind,etag,id,videoId,lastUpdated,trackKind,language,name,audioTrackType,isCC,isLarge,isEasyReader,isDraft,isAutoSynced,status
0,youtube#caption,KvyLPCCMDI4qR7b375Hjdons9Xo,AUieDaaWfE5gSTZluRBEpOonEu8C-48ZrDo4gea9EpGoqH...,eP_P4KOjwhs,2025-01-24T09:35:52.395103Z,asr,en,,unknown,False,False,False,False,False,serving
