# Explore the YouTube-8M dataset

## Retrieve video meta-data

In [125]:
import re
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import youtube_dl
# from IPython.display import YouTubeVideo


In [137]:
# Gather the frame-level training shards: every "*.tfrecord" file found in
# ./frame-level, sorted by file name and prefixed with the directory path.
arr = os.listdir("./frame-level")
r = re.compile(".+\\.tfrecord$")
video_train = [
    "./frame-level/" + file_name for file_name in sorted(filter(r.match, arr))
]
# Digit runs found in each shard path, flattened into a single 1-D array.
record_num = np.array(
    [re.findall(r"\d+", shard_path) for shard_path in video_train]
).flatten()


In [139]:
import urllib.request
import re


# Map a pseudo-random id to the real YouTube id.
# The lookup URL has the form data.yt8m.org/2/j/i/AB/ABCD.js, where AB is the
# first two characters of the pseudo id ABCD.
def get_real_id(pesudo_id):
    """Return the real YouTube id that corresponds to a pseudo id.

    Args:
        pesudo_id: Pseudo id string; its first two characters select the
            sub-directory of the lookup URL.

    Returns:
        The last quoted value of the lookup response, as a string.

    Raises:
        urllib.error.URLError: If the lookup request fails (HTTPError is a
            subclass and is raised for HTTP error statuses).
        ValueError: If the response does not end with a quoted value followed
            by a closing parenthesis.
    """
    url = "http://data.yt8m.org/2/j/i/{}/{}.js".format(pesudo_id[0:2], pesudo_id)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(url) as response:
        payload = response.read().decode()
    # NOTE(review): the payload is presumably of the form i("ABCD","<real id>");
    # -- confirm. Anchoring on the trailing quoted value tolerates trailing
    # whitespace, which fixed-width slicing does not.
    match = re.search(r"""["']([^"']+)["']\s*\)\s*;?\s*$""", payload)
    if match is None:
        raise ValueError(
            "unexpected id lookup response for {!r}: {!r}".format(pesudo_id, payload)
        )
    return match.group(1)


# Collect the metadata that youtube-dl reports for a real YouTube id.
def get_video_metadata(real_id):
    """Return selected youtube-dl metadata for a video, without downloading it.

    Args:
        real_id: Real YouTube video id, appended to the watch URL.

    Returns:
        A list with one value per entry of ``fields`` below, in that order.
        A field that is absent from the youtube-dl result is reported as
        ``np.nan`` instead of raising ``KeyError``, so one missing optional
        field does not discard the rest of the video's metadata.

    Raises:
        Whatever ``YoutubeDL.extract_info`` raises when the video cannot be
        accessed.
    """
    url = "https://www.youtube.com/watch?v=" + real_id
    ydl = youtube_dl.YoutubeDL()
    result = ydl.extract_info(url, download=False)
    fields = [
        "title",
        "categories",
        "tags",
        "description",
        "is_live",
        "view_count",
        "like_count",
        "channel_url",
        "duration",
        "average_rating",
        "age_limit",
        "webpage_url",
    ]
    video_metadata = [result.get(field, np.nan) for field in fields]
    return video_metadata


In [140]:
# Probe a single video and print every metadata key youtube-dl returns for it.
url = "https://www.youtube.com/watch?v=J4Wdy0Wc_xQ"
ydl = youtube_dl.YoutubeDL()
result = ydl.extract_info(url, download=False)
# Each key is followed by ", " (the last one included).
fields = "".join(key + ", " for key in result.keys())
print("available metadata:\n", fields)


[youtube] J4Wdy0Wc_xQ: Downloading webpage
available metadata:
 id, title, formats, thumbnails, description, upload_date, uploader, uploader_id, uploader_url, channel_id, channel_url, duration, view_count, average_rating, age_limit, webpage_url, categories, tags, is_live, automatic_captions, subtitles, chapters, like_count, channel, extractor, webpage_url_basename, extractor_key, playlist, playlist_index, thumbnail, display_id, requested_subtitles, requested_formats, format, format_id, width, height, resolution, fps, vcodec, vbr, stretched_ratio, acodec, abr, ext, 


In [141]:
# Column names for the per-video metadata. The fallback row below and the
# DataFrame both rely on this list, so it must have one entry per value
# returned by get_video_metadata.
col_names = [
    "title",
    "categories",
    "tags",
    "description",
    "is_live",
    "view_count",
    "like_count",
    "channel_url",
    "duration",
    "average_rating",
    "age_limit",
    "webpage_url",
]

# Make sure the output directory exists before the first CSV is written.
os.makedirs("./frame-with-metadata", exist_ok=True)

# Process at most the first 10 shards. Slicing (rather than indexing with
# range(10)) avoids an IndexError when fewer than 10 shards are available.
for k, video_train_i in enumerate(video_train[:10]):
    puesdo_ids = []
    real_ids = []
    labels = []  # collected per record but not written to the DataFrame below
    all_rgbs = []
    all_audios = []
    video_metadata_list = []

    # Only the first 3 records of each shard are read.
    for raw_record in tf.data.TFRecordDataset(video_train_i).take(3):
        tf_example = tf.train.SequenceExample()
        tf_example.ParseFromString(raw_record.numpy())

        # extract context features
        vid_id = (
            tf_example.context.feature["id"].bytes_list.value[0].decode(encoding="UTF-8")
        )
        puesdo_ids.append(vid_id)
        labels.append(tf_example.context.feature["labels"].int64_list.value)

        # extract rgb and audio feature at each frame
        rgb_frames = tf_example.feature_lists.feature_list["rgb"].feature
        audio_frames = tf_example.feature_lists.feature_list["audio"].feature
        ty = len(rgb_frames)
        rgb = np.zeros((ty, 1024))
        audio = np.zeros((ty, 128))
        for i, rgb_frame in enumerate(rgb_frames):
            rgb[i] = tf.io.decode_raw(rgb_frame.bytes_list.value[0], tf.uint8)
            audio[i] = tf.io.decode_raw(
                audio_frames[i].bytes_list.value[0], tf.uint8
            )

        all_rgbs.append(rgb)
        all_audios.append(audio)

        # get video metadata
        try:
            real_id = get_real_id(vid_id)
            # Get the youtube-dl valuable metadata
            metadata = get_video_metadata(real_id)
        except Exception:
            # Best effort: a video that cannot be resolved or accessed (for
            # example a private one) still gets a row, filled with NaN.
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
            real_id = np.nan
            metadata = [np.nan] * len(col_names)

        real_ids.append(real_id)
        video_metadata_list.append(metadata)

    df = pd.DataFrame(video_metadata_list, columns=col_names)
    df.insert(0, "puesdo_id", puesdo_ids)
    df.insert(1, "real_id", real_ids)
    df["rgb_by_frame"] = all_rgbs
    df["audio_by_frame"] = all_audios
    df = df.fillna(value=np.nan)
    print(df.shape)

    # NOTE(review): to_csv stores each ndarray cell as text; numpy presumably
    # abbreviates large arrays with "...", in which case the frame features
    # cannot be recovered from the CSV -- confirm and consider a binary format.
    df.to_csv(f"./frame-with-metadata/{record_num[k]}record.csv")

# df.head()



[youtube] CXg4IMG3e7k: Downloading webpage
[youtube] koPinjfqp9w: Downloading webpage
[youtube] koPinjfqp9w: Downloading MPD manifest
[youtube] tLxlqIXkc7A: Downloading webpage
[youtube] tLxlqIXkc7A: Downloading MPD manifest
(3, 16)
[youtube] _DZhdk6IFTw: Downloading webpage
[youtube] _DZhdk6IFTw: Downloading MPD manifest
[youtube] PSzzpLGFzs4: Downloading webpage
[youtube] Cyx-UJAXZrg: Downloading webpage
[youtube] Cyx-UJAXZrg: Downloading MPD manifest
(3, 16)
[youtube] E6yOEaHaOSE: Downloading webpage
[youtube] tzvZpG-QbUA: Downloading webpage
[youtube] tzvZpG-QbUA: Downloading MPD manifest
[youtube] Fa7aaJ-yWgs: Downloading webpage
(3, 16)
[youtube] TGDyx23lLNA: Downloading webpage
[youtube] TGDyx23lLNA: Downloading MPD manifest
[youtube] 43_fYApnHbs: Downloading webpage
[youtube] e6W0tHusof4: Downloading webpage
(3, 16)
[youtube] i6-Db8Lcypg: Downloading webpage
[youtube] CfTnX7S4Jvs: Downloading webpage
[youtube] NbsYM9_ClGI: Downloading webpage
(3, 16)
[youtube] TcXg8DCgi7U: Down

ERROR: Private video
Sign in if you've been granted access to this video


[youtube] ziY3jVVR4VE: Downloading webpage
(3, 16)
[youtube] AFFpf6Tb9wk: Downloading webpage
[youtube] fZuoPoq-UsQ: Downloading webpage
[youtube] 7RSahADt_hs: Downloading webpage
(3, 16)
[youtube] OR4T0d2n1F4: Downloading webpage
[youtube] px7V6H8F5Ew: Downloading webpage
[youtube] px7V6H8F5Ew: Downloading MPD manifest
[youtube] bQ4Wdm4--es: Downloading webpage
(3, 16)
