# Subtitle Collection

## Step #1: Collect Youtube Video ID list for Children's works


### Setup

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import json
import os
import math
import glob
import time
import numpy as np
import dask.dataframe as dd
import motes_corpus.youtube as yt
import youtube_transcript_api
from pathlib import Path

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

In [4]:
# Build an authenticated, read-only YouTube Data API client (`youtube_api`).
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

# Disable OAuthlib's HTTPS verification when running locally.

os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

api_service_name = "youtube"
api_version = "v3"

# Look for the OAuth client secrets file in the parent directory. Sorting makes
# the choice deterministic if several match, and an explicit error is clearer
# than the IndexError raised by indexing an empty list.
client_secret_candidates = sorted(Path('..').glob('client_secret*'))
if not client_secret_candidates:
    raise FileNotFoundError(
        "No 'client_secret*' file found in the parent directory; download the "
        "OAuth client secrets JSON from the Google API console first.")
client_secrets_file = client_secret_candidates[0]

flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)

# The copy/paste console flow (run_console) is not available in newer
# google-auth-oauthlib releases; fall back to the local-server flow there.
if hasattr(flow, 'run_console'):
    credentials = flow.run_console()
else:
    credentials = flow.run_local_server(port=0)

youtube_api = googleapiclient.discovery.build(
    api_service_name, api_version, credentials=credentials)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=312872897581-qii8av1f39ko8iavk4qp3sm8lqnp74op.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.readonly&state=TqMKcUlLl6JGskvXpbJkLnnWFizLPS&prompt=consent&access_type=offline


Enter the authorization code:  4/4QGOuLG-pPT9fU3kKwOriGjAsAFUo5yLlKV5r5mAFlngB10RzNQFJx0


### Collection Strategy 1 - Searching

Using the search API, search for common kids' keywords and identify videos. These are then checked for `madeForKids` tags, and an analysis of their titles generates further search keywords.

https://developers.google.com/youtube/v3/docs/search/list

Also, cycle through topicId, to ensure more results per query.

In [None]:
# Run keyword searches across several topic ids and save each (query, topic, sort)
# result set to its own parquet file under data/yt_info/.
#topicId = '/m/01k8wb' # Educational; see: https://developers.google.com/youtube/v3/docs/search/list
#topicId = '/m/02jjt' # Entertainment 
#topicId = '/m/0f2f9' # TV-shows - a subset of education
#topicId = None
#topicId = '/m/0bzvm2' # Gaming
#sortBy = 'rating' # date | rating | relevance | title | videoCount | viewCount
# Queries ('relevance', in categories '/m/01k8wb', '/m/0f2f9', '/m/0bzvm2', with 'kids' appended to query):
# To run: ['periwinkle', 'samurai', 'jr', 'mister', 'nickelodeon', 'gospel', 'maggie', 'meena', 'bears', 'gumball', 'hotline', 'jojo', 'kinder', 'songs', 'network', 'wash', 'clues', 'phonics', 'infantiles', 'uncle', 'yoga', 'toddlers', 'poems', 'бег', 'junior', 'pequeños', 'macdonald', 'episodes', 'seen', 'sight', 'bible']

past_queries = ['pizza', 'christmas', 'firefighter', 'sticker', 'halloween', 'yummy', 'superhero',
                "children|kids|kid|teen|teens",'superpowers', 'homework', 'easter', 'cupcake',
                'mickey', 'upbeat', 'meatball', 'mario', 'hotdog', 'zombie', 'robotic', 'inator',
                'astronaut', 'gadget','vroom', 'kidz', 'wheels', 'mountain', 'barbie', 'danger',
                'cbc', 'xd', 'rangers','sesame street', 'paw patrol', 'playtime', 'cartoons']

# The topic list contained repeated ids; repeating an identical (query, topic, sort)
# search only spends API quota on the same results. dict.fromkeys drops the
# duplicates while keeping the original order (None = no topic filter).
topic_ids = list(dict.fromkeys(
    ['/m/01k8wb', '/m/0f2f9', '/m/0bzvm2', '/m/0f2f9','/m/0403l3g', '/m/025zzc', '/m/019_rr',
     '/m/02vxn', '/m/01k8wb', '/m/02jjt', '/m/09kqc', None, '/m/03glg', '/m/0bzvm2']))

# to_parquet does not create missing directories.
os.makedirs('data/yt_info', exist_ok=True)

for q in ['cartoons']:
    q = q + ' kids'
    for topicId in topic_ids:
        for sortBy in ['relevance']: #['date', 'relevance', 'viewCount', 'rating']:
            print(q, topicId, sortBy)
            pt = None
            results_collector = []

            # Page through at most 40 result pages, stopping early when the API
            # returns no further page token or an empty page.
            for i in range(40):
                pt, df = yt.search_youtube(youtube_api, q, pageToken=pt, topicId=topicId, order=sortBy)
                print(i, 'Next pageToken:', pt)
                results_collector.append(df)
                if (pt is None) or (df.shape[0] == 0):
                    break

            all_searches = pd.concat(results_collector)
            if all_searches.empty:
                continue
            # Record the search conditions alongside the results.
            all_searches['q'] = q
            all_searches['topicId'] = topicId
            all_searches['sortBy'] = sortBy
            # Nanosecond timestamp: two searches finishing within the same second
            # would otherwise write to the same file and overwrite each other.
            now = time.time_ns()
            all_searches.to_parquet('data/yt_info/initial_search_results_{}.parquet'.format(now))

### Collection Strategy 2 - Downloading all videos for relevant channels

Calling the channels endpoint gives you an 'uploads' playlist, which can be used to download all the videos of that channel. Quota costs are much lower in that case.

This analyzes the 'details' dataframes, which are collected below, so if you have not collected that data yet, this section won't run yet!

### Determine channels with madeForKids content and download details

In [80]:
# Count de-duplicated madeForKids videos per channel, most prolific channels first.
ddf = dd.read_parquet('data/yt_info/details*').drop_duplicates('videoId')
ddf = ddf[ddf.madeForKids == True]
videos_per_channel = ddf.groupby(['channelTitle', 'channelId'])[['videoId']].count().compute()
top_channels = (
    videos_per_channel
    .reset_index()
    .sort_values('videoId', ascending=False)
    .rename(columns={'videoId': 'count'})
)

# Channels that already have a saved channelInfo record are skipped.
channelinfo = dd.read_parquet('data/yt_info/channelInfo*')
already_processed_channels = (
    channelinfo.channelId.compute().tolist() if len(channelinfo) > 0 else []
)

# Keep unprocessed channels that have more than one madeForKids video.
is_new_channel = ~top_channels.channelId.isin(already_processed_channels)
has_multiple_videos = top_channels['count'] > 1
top_channels = top_channels[is_new_channel & has_multiple_videos]
top_channels.head(10)

Unnamed: 0,channelTitle,channelId,count
897,Phineas and Ferb,UCsbTCrt-Ndfa2DfR5NJn5fg,30
823,NickJrPlay,UCkR8cM4gLPE7iV49cOitR3Q,30
753,Mickey Mouse Clubhouse,UC8bcK-NGFJbOfGrXX5Y_VIQ,29
1009,Sesame Street,UCEn8ua5KNErvgsHADUWIVWQ,15
490,Henry Danger,UCgVQOLcpiphDMySH8CTm5WQ,11
539,JESSIE,UCtLOBSdu37g_iLYMTW1vn7g,11
173,Caillou,UCnPBcrNgQXXjE1SUGH2AtVA,10
1108,Team Umizoomi,UCLLyHMqMLCt8cu6SsmMiQMw,9
1139,The Jetsons,UCNRep6MD5OE2li7wfLpi-MQ,8
373,Family And Kids Games,UC4TSKUgUvVvec5o-3Fzkb7Q,8


In [81]:
# Fetch channel-level metadata for the candidate channels and persist it.
channel_info = yt.load_channel_info(youtube_api, top_channels.channelId)
# Use a full date-and-time stamp: a month-day-only name made a second run on the
# same day (or the same date in another year) overwrite the earlier file. The
# name still matches the 'channelInfo*' glob used when reading these files back.
channel_info_path = 'data/yt_info/channelInfo_{}.parquet'.format(time.strftime('%Y-%m-%d_%H%M%S'))
channel_info.to_parquet(channel_info_path)
print(channel_info.shape)
channel_info.sample(1)

0,1,(35, 9)


Unnamed: 0,channelId,title,description,customUrl,publishedAt,madeForKids,uploads,favorites,likes
13,UC01n7Sn93HVMIi3HKZNxrTA,Hampton Primary School,"Hampton School, Mauritius. Helpful videos for ...",hamptonprimaryschoolquatrebornes,2012-10-31T17:22:02Z,True,UU01n7Sn93HVMIi3HKZNxrTA,,


The `uploads` field is a special playlist that links to all the uploads for the channel. `favorites` and `likes` may be slightly useful.

Channels can be `True`, `False`, or `NaN` for madeForKids. `True` is the sensible focus, though `NaN` may be useful too.

In [None]:
# Download video details for the uploads playlist of every madeForKids channel
# that has not been processed yet.
all_channel_info = dd.read_parquet('data/yt_info/channelInfo*')
all_channel_info = all_channel_info[all_channel_info.madeForKids == True]
# .compute() returns a pandas Series; convert it to a list so that `+=` appends
# the exclusions. On a Series, `+= [...]` is elementwise arithmetic (it either
# raises or concatenates the string onto every id), so nothing was excluded.
processed = dd.read_parquet('data/yt_info/channel_details*').channelId.compute().tolist()
processed += ['UCwJncBmXoz4njCjrfvI9bng'] # Exclude list for inappropriately tagged channels, like the Kentucky news station with 17k vids.
all_channel_info = all_channel_info[~all_channel_info.channelId.isin(processed)]
all_channel_info = all_channel_info.compute()

collector = []
for i, playlistID in enumerate(all_channel_info.uploads):
    print(i, playlistID)
    details = yt.playlist_details(youtube_api, playlistID)
    print("{} videos found".format(len(details)))
    collector.append(details)

if collector:
    df = pd.concat(collector)
    # Unfortunately, playlistItems didn't give us 'caption' info, so need to
    # ping the API one more time
    df2 = yt.augment_initial_search(youtube_api, df)
    df2.to_parquet('data/yt_info/channel_details_{}.parquet'.format(time.time()))
else:
    # pd.concat raises on an empty list, so skip when every channel is already done.
    print('No unprocessed madeForKids channels to download.')

0 UUe1VpF4wS_kdcjyTRSXBcnQ
138 videos found
1 UUpgxmlXoDtkYzRQ4cJgCT5A
2698 videos found
2 UUUe6ZpY6TJ0no8jI4l2iLxw
3625 videos found
3 UU3KknIJZXRygH2pZ6MDtGbg
2218 videos found
4 UUaKkjxZBucoNihPw1NOqThA
322 videos found
5 UUcIVSA2vFpZmuPXG1oAZVaQ
652 videos found
6 UUwC-DciMGGk9AgpJ3hpreHw
201 videos found
7 UUABKyuM59C8Z0iDf-3kIRSA
1035 videos found
8 UURFIPG2u1DxKLNuE3y2SjHA
379 videos found
9 UUx27Pkk8plpiosF14qXq-VA
546 videos found
10 UUkbi6oVP_8Yzk9qPDfz9etw
105 videos found
11 UUcbE7twlpJfQVF2dO21fB1A
2044 videos found
12 UUcJT_hEkkQ03NAzp_6NEBiw
2249 videos found
13 UUQnSQLzMbkSN5j0rtYyPnmQ
535 videos found
14 UUAy25gXkma1g7_kaxupFqWg
161 videos found
15 UUV1SycDpnU1A2dXqob6Aowg
135 videos found
16 UU5M_h2S8Ldoc9M6f7B-_m6A
3259 videos found
17 UU-4za12KjAMe1RIHeL3grAw
174 videos found
18 UUdtojT_ZwTRlZThoBSMVhoQ
558 videos found
19 UU_6TpaXzCHZBTEs7oHLPXYA
752 videos found
20 UUoookXUzPciGrEZEXmh4Jjg
3056 videos found
21 UUm9SlTdSngChZypOUVcRYNQ
516 videos found
22 UU3Gv4u2_

### Download Details

Particularly important is whether the video is 'madeForKids'.

In [79]:
# For every *initial* search-results file that has no matching *details* file
# yet, query the API for the extra info and save it under the details name.
data_paths = glob.glob('data/yt_info/*')
initial_paths = [path for path in data_paths if 'initial' in path]
for initial_path in initial_paths:
    details_path = initial_path.replace('initial', 'details')
    if details_path not in data_paths:
        print('Augmenting {}-->{}'.format(initial_path, details_path))
        df = pd.read_parquet(initial_path).drop_duplicates('videoId')
        df2 = yt.augment_initial_search(youtube_api, df)
        df2.to_parquet(details_path)

Augmenting data/yt_info/initial_search_results_1602364309.parquet-->data/yt_info/details_search_results_1602364309.parquet
0,1,2,3,4,5,6,7,8,9,10,
Augmenting data/yt_info/initial_search_results_1602364314.parquet-->data/yt_info/details_search_results_1602364314.parquet
0,1,2,3,4,5,6,7,8,
Augmenting data/yt_info/initial_search_results_1602364320.parquet-->data/yt_info/details_search_results_1602364320.parquet
0,1,2,3,4,5,6,7,8,9,10,
Augmenting data/yt_info/initial_search_results_1602364325.parquet-->data/yt_info/details_search_results_1602364325.parquet
0,1,2,3,4,5,6,7,8,9,10,
Augmenting data/yt_info/initial_search_results_1602364330.parquet-->data/yt_info/details_search_results_1602364330.parquet
0,1,2,3,4,5,6,7,8,9,
Augmenting data/yt_info/initial_search_results_1602364333.parquet-->data/yt_info/details_search_results_1602364333.parquet
0,1,2,3,4,5,6,7,


### Investigate common words in made for kids titles vs others

What are the top topic ids?

In [29]:
# Topic id <-> topic name lookup tables, read from a headerless two-column TSV.
topics = pd.read_csv('yt_topic_ids.tsv', sep='\t', names=['topic_id', 'name'])
topicid2name = topics.set_index('topic_id')['name'].to_dict()
topicname2id = topics.set_index('name')['topic_id'].to_dict()

In [None]:
# Peek at the first topic row as a dict. `to_dict` has to be called; without the
# parentheses the cell only displays the bound method object.
pd.read_csv('yt_topic_ids.tsv', sep='\t', names=['topic_id', 'name']).set_index('topic_id').iloc[0].to_dict()

In [33]:
# Display the topic id column.
topics.loc[:, 'topic_id']

0      /m/04rlf
1     /m/02mscn
2     /m/0ggq0m
3      /m/01lyv
4      /m/02lkt
        ...    
57     /m/0kt51
58    /m/01h6rj
59     /m/05qt0
60     /m/06bvp
61    /m/01k8wb
Name: topic_id, Length: 62, dtype: object

In [44]:
# Most frequent topic ids among madeForKids videos, shown by topic name.
# Obviously skewed to the ones searched for!
detail_files = glob.glob('data/yt_info/details_*')
df = dd.read_parquet(detail_files).drop_duplicates(['videoId']).compute()
past_searches= [] #['/m/0f2f9', '/m/0403l3g', '/m/019_rr', '/m/02vxn', '/m/01k8wb', '/m/02jjt', '/m/09kqc', None, '/m/03glg', '/m/0bzvm2']
kids = df[df.madeForKids > 0]

# Flatten the per-video topic id lists, skipping any ids listed in past_searches.
kids_topic_ids = []
for video_topic_ids in kids.relevantTopicIds.dropna().tolist():
    for topic_id in video_topic_ids:
        if topic_id not in past_searches:
            kids_topic_ids.append(topic_id)

topic_counts = pd.Series(kids_topic_ids).value_counts()
# Keep only ids present in the topic table, then relabel them with their names.
top_topics = topic_counts[topic_counts.index.isin(topics['topic_id'])]
top_topics.index = [topicid2name[topic_id] for topic_id in top_topics.index]
top_topics.head(8)

Entertainment (parent topic)    2934
Movies                          2394
TV shows                        2365
Knowledge                       1798
Gaming (parent topic)           1468
Action game                      248
Music (parent topic)             219
Role-playing video game          173
dtype: int64

### What are the top words used in kids titles vs non-madeForKids titles?

Using odds-ratio here, without a correction for missing words.

In [85]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
import spacy
nlp = spacy.load('en_core_web_sm')

In [106]:
# Stack search-derived and channel-derived video details on their shared columns,
# drop the blacklisted channel, and de-duplicate by video id.
searchDetails = dd.read_parquet('data/yt_info/details*')
channelDetails = dd.read_parquet('data/yt_info/channel_details*')
shared_columns = set(searchDetails.columns).intersection(channelDetails.columns)
intersect = list(shared_columns)
blacklist = ['UCwJncBmXoz4njCjrfvI9bng']
ddf = dd.concat([searchDetails[intersect], channelDetails[intersect]])
not_blacklisted = ~ddf.channelId.isin(blacklist)
ddf = ddf[not_blacklisted]
df = ddf.drop_duplicates('videoId').compute()

In [107]:
# Log odds-ratio of each title word: madeForKids titles vs. non-madeForKids titles.
# Words missing from either group drop out as NaN (no smoothing is applied).
kids_titles = " ".join(df[df.madeForKids > 0].title.tolist())
adult_titles = " ".join(df[df.madeForKids == 0].title.tolist())

# Lower-cased, whitespace-split word frequencies; words seen only once are dropped.
kids_words = pd.Series(kids_titles.lower().split()).value_counts()
kids_words = kids_words[kids_words > 1]
adult_words = pd.Series(adult_titles.lower().split()).value_counts()
adult_words = adult_words[adult_words > 1]

# Relative frequency within each group, then odds = p / (1 - p).
kprob = kids_words / kids_words.sum()
kodds = kprob / (1-kprob)
aprob = adult_words / adult_words.sum()
aodds = aprob / (1-aprob)

shared_word_ratio = (kodds/aodds).dropna()
odds_ratio = shared_word_ratio.apply(np.log)

In [112]:
# Top 50 purely alphabetic words, by log odds-ratio, that were not already used as queries.
is_new_alpha_word = odds_ratio.index.str.isalpha() & ~odds_ratio.index.isin(past_queries)
ranked_words = odds_ratio[is_new_alpha_word].sort_values(ascending=False)
print(ranked_words.index.tolist()[:50])

['cbc', 'xd', 'uk', 'rangers', 'sesame', 'paw', 'loud', 'motu', 'compilation', 'patlu', 'charge', 'pals', 'nick', 'hoffman', 'shine', 'playtime', 'abby', 'cartoons', 'periwinkle', 'samurai', 'jr', 'mister', 'nickelodeon', 'gospel', 'maggie', 'meena', 'bears', 'gumball', 'hotline', 'jojo', 'kinder', 'songs', 'network', 'wash', 'clues', 'phonics', 'infantiles', 'uncle', 'yoga', 'toddlers', 'poems', 'бег', 'junior', 'patrol', 'pequeños', 'macdonald', 'episodes', 'seen', 'sight', 'bible']


In [110]:
# Same candidate words as above, shown with their log odds-ratio values (top 20).
candidate_words = odds_ratio[odds_ratio.index.str.isalpha() & ~odds_ratio.index.isin(past_queries)]
candidate_words.sort_values(ascending=False).head(20)

cbc            4.857021
xd             4.177063
uk             3.840047
rangers        3.589981
sesame         3.531126
paw            3.500636
loud           3.451253
motu           3.348485
compilation    3.318697
patlu          3.288349
charge         3.150285
pals           3.121334
nick           3.095340
hoffman        3.035694
shine          3.026230
playtime       3.008160
abby           3.002937
cartoons       2.992741
periwinkle     2.991513
samurai        2.988994
dtype: float64

### Which channels are best represented?

In [895]:
# Channels with the most madeForKids videos in the collected data.
kids_videos = df[df.madeForKids > 0]
kids_videos.channelTitle.value_counts().head(20)

PBS KIDS                              2190
HiHiPuffyAmiYumiRules2001              601
ZZ Kids TV                             179
Math Department                        120
SAIL TAHITI                             97
Parkfield Primary                       94
Funny Shark for Kids                    93
Nivu's World                            79
Cross Church                            57
Heidi and Toys                          53
Hemet Unified                           50
kitki                                   47
Nickelodeon                             35
Paw Pack - Funny Cartoons for Kids      33
A Kid Explains History                  32
Wolfoo Family                           28
YouTube Movies                          25
Guddies by Aditi Neel                   24
TuRuLaRa- Gags For Kids                 23
GamerGirl                               18
Name: channelTitle, dtype: int64

### Which search condition has the best hit rate for 'madeForKids' content?

In [544]:
# Share of madeForKids videos (sum over non-null count) for each searched topicId.
# NOTE(review): requires `df` to have a 'topicId' column — confirm which frame this cell expects.
made_for_kids_by_topic = df.groupby(['topicId'])[['madeForKids']]
topic_hit_rate = made_for_kids_by_topic.apply(lambda grp: grp.sum()/grp.count())
topic_hit_rate.sort_values('madeForKids', ascending=False)

Unnamed: 0_level_0,madeForKids
topicId,Unnamed: 1_level_1
/m/02vxn,0.905797
02jjt,0.611111
01k8wb,0.570934
/m/03glg,0.514423
/m/02jjt,0.450704
/m/0f2f9,0.440678
/m/01k8wb,0.411894
/m/09kqc,0.373626
/m/019_rr,0.286932
/m/0bzvm2,0.270415


In [None]:
# Share of madeForKids videos (sum over non-null count) for each search sort order.
# NOTE(review): requires `df` to have a 'sortBy' column — confirm which frame this cell expects.
made_for_kids_by_sort = df.groupby(['sortBy'])[['madeForKids']]
sort_hit_rate = made_for_kids_by_sort.apply(lambda grp: grp.sum()/grp.count())
sort_hit_rate.sort_values('madeForKids', ascending=False)

## Step #2: Download Transcripts

In [114]:
# Record the ids (file name minus '.json') of every caption file already on disk.
completed = []
for caption_file in glob.glob('/data/motes/yt_captions/**/*'):
    if caption_file.endswith('.json'):
        completed.append(os.path.basename(caption_file)[:-5])

with open('completed.txt', 'w') as f:
    f.write("\n".join(completed))

In [115]:
# Build the list of madeForKids videos with captions that still need downloading.
searchDetails = dd.read_parquet('data/yt_info/details*')
channelDetails = dd.read_parquet('data/yt_info/channel_details*')
intersect = list(set(searchDetails.columns).intersection(channelDetails.columns))
exclude_channels = ['WYMT Television']
ddf = dd.concat([searchDetails[intersect], channelDetails[intersect]]).drop_duplicates('videoId')
ddf = ddf[~ddf.channelTitle.isin(exclude_channels)]

# Keep videos flagged madeForKids whose 'caption' field is the string 'true'.
is_captioned_kids_video = (ddf.madeForKids == 1) & (ddf.caption == 'true')
just_for_kids = ddf[is_captioned_kids_video].compute()
subtitle_path = '/data/motes/yt_captions/{}/'.format(time.strftime('%m-%d'))

# Video ids listed in the problems file are treated as done so they are skipped.
problem_vids = []
if os.path.exists('data/yt_caption_problems.txt'):
    with open('data/yt_caption_problems.txt') as f:
        problem_vids = f.read().split('\n')

# Ids (file name minus '.json') of caption files already on disk.
already_completed = []
for caption_file in glob.glob('/data/motes/yt_captions/**/*'):
    if caption_file.endswith('.json'):
        already_completed.append(os.path.basename(caption_file)[:-5])
already_completed.extend(problem_vids)

print(len(just_for_kids), len(already_completed))
still_needed = ~just_for_kids.videoId.isin(already_completed)
just_for_kids = just_for_kids[still_needed]
print(len(just_for_kids))
just_for_kids.head(2)

49219 42221
10558


Unnamed: 0,tags,caption,channelTitle,relevantTopicIds,description,topicCategories,videoId,channelId,duration,publishedAt,title,categoryId,madeForKids
58,,True,YouTube Movies,"[/m/02jjt, /m/02vxn, /m/02jjt, /m/02vxn, /m/0f...","Arthur, D.W., and their family and friends pre...","[https://en.wikipedia.org/wiki/Film, https://e...",YBa8fm5w1WQ,UC6Qs46AHswdgMq08dqaQbQQ,PT54M35S,2015-11-02T14:10:57Z,Arthur&#39;s Perfect Christmas,30,1.0
68,,True,Top Elf,"[/m/02jjt, /m/0f2f9, /m/02jjt, /m/0f2f9, /m/0f...",5 talented Elf-testants are in the running for...,"[https://en.wikipedia.org/wiki/Entertainment, ...",1vHTAHEeTco,UCJhhiv2zac33YdOC2He3uLg,PT43M5S,2019-12-14T05:00:01Z,Teamwork Makes the Tree Work,43,1.0


In [116]:
# Save the ids still to be downloaded, one per line.
pending_video_ids = just_for_kids.videoId.tolist()
with open('captions_to_download.txt', 'w') as f:
    f.write("\n".join(pending_video_ids))

In [None]:
# Fetch and save the transcript of every pending video, collecting the ids of
# videos whose transcripts are disabled or unavailable in `err_vids`.
err_vids = []
for i, (ind, vid) in enumerate(just_for_kids.iterrows()):
    print(i, vid['title'])
    try:
        yt.fetch_and_save_transcript(vid.videoId, subtitle_path)
    except (youtube_transcript_api.TranscriptsDisabled,
            youtube_transcript_api.NoTranscriptAvailable):
        # KeyboardInterrupt is not caught by these specific handlers, so it
        # still propagates and stops the loop without an explicit re-raise.
        print('Err with {}'.format(vid.videoId))
        err_vids.append(vid.videoId)
    except youtube_transcript_api.VideoUnavailable:
        print('Probably rate limited')
        break
    # Pause 1-10 seconds (randomized) after every attempt. Previously a failed
    # fetch hit `continue` and skipped this delay, so runs of failures sent
    # requests back-to-back.
    time.sleep(np.random.randint(100, 1000)/100)

#if len(err_vids):
#    problem_vids += err_vids
#    err_vids = []
#with open('data/yt_caption_problems.txt', mode='w') as f:
#    f.write("\n".join(problem_vids))

## Step 3: Parse Dataset

See [YTCaptionParsing.ipynb](YTCaptionParsing.ipynb)