# Preprocessing YouTube-8M dataset

## Download a subset of the  dataset

In [None]:
"""
! mkdir -p ~/data/yt8m/frame; cd ~/data/yt8m/frame
! curl data.yt8m.org/download.py | shard=1,100 partition=2/frame/train mirror=us python
! curl data.yt8m.org/download.py | shard=1,100 partition=3/frame/validate mirror=us python
! curl data.yt8m.org/download.py | shard=1,100 partition=3/frame/test mirror=us python
! curl https://research.google.com/youtube8m/csv/2/vocabulary.csv --output vocabulary.csv
"""

## Load Data, Import Modules

In [1]:
import re
import os
import time
import json
import numpy as np 
import pandas as pd
import tensorflow as tf
from IPython.display import YouTubeVideo

In [2]:
directory = "../../../data/yt8m/frame"

## Configure Logging

In [3]:
import logging

logger = logging.getLogger("__name__")
logger.setLevel(logging.DEBUG)
logging.basicConfig(filename='app.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s-%(message)s')

In [4]:
def time_elapsed(start: float) -> float:
    return round(time.time() - start, 2)

### Set `tf` to `tf.compat.v1`

In [5]:
tf = tf.compat.v1

## Extract Video, Label, Frame, and Frame Feature Data

In [12]:
class extractData(object):
    """
    Extracts Video, Frame, Label, and Visual Feature Data from YouTube-8M dataset
    """
    def __init__(self, directory: str):
        self.directory: str = directory
        self.data_type: List[str] = []
        self.vid_ids: List[str] = []
        self.labels: List[List[int]] = []
        self.frames: List[int] = []
        self.feat_rgb: List[List[float]] = []

    def run_extraction(self):
        """Extract data from a directory of TF records"""
        time_start = time.time()
        logging.info(f'Flow starting at: 0 s')
        print(f'Flow starting at: 0 s')

        # iterate through file records in the directory path
        for filename in os.listdir(self.directory):
            filepath: str = os.path.join(directory, filename)
            file, ext = os.path.splitext(filename)

            # delimit to files with a .tfrecord extension
            if ext == '.tfrecord':
                # split filename into train/test/validate and the record id
                match = re.match(r"([a-z]+)([0-9]+)", file, re.I)
                if match:
                    record_type, record_id = match.groups()
                    logging.info(
                        f"Start: {filename} at {time_elapsed(time_start)} s")
                    print(f"Start: {filename} at {time_elapsed(time_start)} s")

                    # iterate through video records in the .tfrecord file
                    for record in tf.python_io.tf_record_iterator(filepath):
                        tf_record = tf.train.Example.FromString(record)
                        video_id = tf_record.features.feature[
                            'id'].bytes_list.value[0].decode(encoding='UTF-8')
                        label_list = tf_record.features.feature[
                            'labels'].int64_list.value
                        print(label_list)

                        tf_seq_record = tf.train.SequenceExample.FromString(
                            record)
                        n_frames = len(tf_seq_record.feature_lists.
                                       feature_list['audio'].feature)

                        # iterate through individual frames
                        sess = tf.InteractiveSession()
                        for frame_idx in range(n_frames):
                            rgb_frame = tf.cast(
                                tf.decode_raw(
                                    tf_seq_record.feature_lists.
                                    feature_list['rgb'].feature[frame_idx].
                                    bytes_list.value[0], tf.uint8),
                                tf.float32).numpy()

                            # capture frame-level data and save to lists
                            self.data_type.append(record_type)
                            self.vid_ids.append(video_id)
                            self.labels.append(label_list)
                            self.frames.append(frame_idx)
                            self.feat_rgb.append(rgb_frame)
                        sess.close()

                    logging.info(
                        f"Complete: {filename} at {time_elapsed(time_start)} s"
                    )
                    print(
                        f"Complete: {filename} at {time_elapsed(time_start)} s"
                    )

    def create_df(self):
        """Save extracted data to pd.DataFrame for inspection"""
        self.df = pd.DataFrame(
            zip(self.data_type, self.vid_ids, self.labels, self.frames,
                self.feat_rgb),
            columns=['data_type', 'video_id', 'labels', 'frames', 'features'])

In [19]:
extractor = extractData(directory)
extractor.run_extraction()
extractor.create_df()

Flow starting at: 0 s
Start: test2069.tfrecord at 0.09 s
[]
[]
[]
[]




[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
Complete: test2069.tfrecord at 0.6 s
Start: train0274.tfrecord at 0.6 s
[15, 18, 59, 1511]
[3]
[2, 7, 83]
[82, 103, 118]
[4, 14]
[12, 237]
[26, 453]
[1, 25]
[3, 4, 13]
[300]
[0, 52, 70, 125]
[8]
[3, 4, 10, 13, 41, 54]
[14]
[11, 20, 22]
[3, 13]
[2, 7, 17, 19, 128, 129, 3247]
[2, 30]
[0, 1, 5, 69, 744]
[15, 363, 1816]
[29, 32, 370, 564, 3175]
[400, 952]
[9]
[39, 181, 216, 222, 249]
[21, 23, 53, 58]
[0, 1, 223, 438]
[4, 9, 10, 34, 970]
[3, 6]
[9, 10, 550]
[50, 65]
[15, 502, 619]
[2, 30, 55]
[2, 7, 48, 116, 224, 267, 691]
[6, 8]
[39]
[12]
[4]
[64, 149]
[3, 4]
[31, 40, 47, 57, 117]
[31]
[0, 8]
[61]
[401, 750]
[1, 623, 2174]
[82, 103, 346, 350, 537]
[14]
[5, 1802]
[11]
[21, 23, 24, 73, 74, 131, 209, 554]
[85]
[2, 7]
[14]
[3, 4, 13]
[2, 7, 2371]
[53, 58]
[0, 245, 352, 1185]
[0, 1, 27]
[21, 23, 775]
[8]
[4, 13, 41]
[0, 12]
[15, 18, 87]
[74, 131]
[3, 6]
[3, 4, 6, 8, 13]
[0, 12]
[1375]
[482, 1290]
[43, 60]
[133, 1027]
[400, 416, 1152, 1927]
[0, 1, 42, 111]


In [94]:
extractor.df.head()

Unnamed: 0,data_type,video_id,labels,frames,features,filter
0,test,o7Hx,[],0,"[115.0, 155.0, 101.0, 128.0, 155.0, 160.0, 137...",0
1,test,o7Hx,[],1,"[163.0, 159.0, 109.0, 109.0, 134.0, 178.0, 121...",0
2,test,o7Hx,[],2,"[157.0, 170.0, 96.0, 120.0, 93.0, 157.0, 148.0...",0
3,test,o7Hx,[],3,"[168.0, 173.0, 110.0, 129.0, 101.0, 144.0, 116...",0
4,test,o7Hx,[],4,"[117.0, 187.0, 97.0, 144.0, 118.0, 161.0, 146....",0


In [136]:
Counter(extractor.df.data_type).most_common()

[('train', 9532755), ('validate', 112471), ('test', 109051)]

### Preprocess DataFrame, Adding Vocabulary

In [73]:
# indoor vs. outdoor labels
indoors = ['Art exhibition',
 'Ballroom dance',
 'Basketball',
 'Bathroom',
 'Bed',
 'Bedroom',
 'Bookcase',
 'Bowling',
 'Boxing',
 'Ceiling',
 'Ceiling fan',
 'Classical ballet',
 'Classroom',
 'Closet',
 'Clothes dryer',
 'Cooking show',
 'Couch',
 'Countertop',
 'Dining room',
 'Drawer (furniture)',
 'Drywall',
 'Figure skating',
 'Fireplace',
 'Floor',
 'Flooring',
 'Flush toilet',
 'Furnace',
 'Furniture',
 'Grocery store',
 'Gymnastics',
 'Hair coloring',
 'Indoor cycling',
 'Indoor soccer',
 'Kickboxing',
 'Kitchen',
 'Kitchen stove',
 'Laboratory',
 'Laminate flooring',
 'Laundry',
 'Loudspeaker',
 'Nursery (room)',
 'Office',
 'Oven',
 'Pipe organ',
 'Refrigerator',
 'Room',
 'Table (furniture)',
 'Table football',
 'Table tennis',
 'Television',
 'Toilet',
 'Wallpaper',
 'Wardrobe',
 'Wood flooring']

outdoors = ['Agriculture',
 'Airboat',
 'Alpine skiing',
 'Backpacking (wilderness)',
 'Barbecue',
 'Baseball park',
 'Beach',
 'Beach volleyball',
 'Big wave surfing',
 'Bodyboarding',
 'Bowling (cricket)',
 'Bull riding',
 'Bulldozer',
 'Bungee jumping',
 'Campervan',
 'Camping',
 'Campsite',
 'Campus',
 'Cannon',
 'Canoe',
 'Canoeing',
 'Canyon',
 'Carousel',
 'Catamaran',
 'Cave',
 'Cement',
 'Cliff',
 'Cloud',
 'Construction',
 'Demolition',
 'Desert',
 'Drive-through',
 'Dump truck',
 'Dune buggy',
 'Eiffel Tower',
 'Elk',
 'Ferris wheel',
 'Fishing',
 'Garden',
 'Gate',
 'Glacier',
 'Gliding',
 'Go-kart',
 'GoPro',
 'Golf cart',
 'Graffiti',
 'Horse racing',
 'Ice climbing',
 'Kayak',
 'Landscape',
 'Locomotive',
 'Lunar eclipse',
 'Marathon',
 'Meteor shower',
 'Military parade',
 'Mountain',
 'Mountain biking',
 'Mountain pass',
 'Mud',
 'Music festival',
 'National park',
 'Nature',
 'Ocean',
 'Off-road racing',
 'Outdoor recreation',
 'Parachuting',
 'Parade',
 'Paragliding',
 'Pitcher',
 'Playground',
 'Pollution',
 'Pond',
 'Quarry',
 'Race track',
 'Rail transport',
 'Railroad car',
 'Rain',
 'Recreational fishing',
 'Recreational vehicle',
 'River',
 'Road',
 'Rocket launch',
 'Roller coaster',
 'Roof',
 'Sand',
 'Skateboarding',
 'Skiing',
 'Sky',
 'Snowboarding',
 'Solar eclipse',
 'Stadium',
 'Street art',
 'Street food',
 'Street racing',
 'Suite (hotel)',
 'Sunrise',
 'Sunset',
 'Surfing',
 'Swimming pool',
 'Thunderstorm',
 'Times Square',
 'Trail',
 'Train',
 'Tubing (recreation)',
 'Village',
 'Vineyard',
 'Volcano',
 'Wakeboarding',
 'Water park',
 'Yosemite National Park']

In [140]:
LABELS_DICT = {name:('indoors' if name in indoors else 'outdoors') for name in (indoors+outdoors)}

In [155]:
class processDataFrame(object):
    """Filters dataset, and """
    
    def __init__(self, vocab_path: str, feature_df: pd.DataFrame):
        self.df = feature_df
        self.vocab = pd.read_csv(vocab_path)

    def drop_frames(self):
        '''Drop frames missing labels for training and validation'''
        self.df['filter'] = self.df['labels'].apply(lambda x: 0
                                                    if str(x) == '[]' else 1)
        for data in ['train', 'validate']:
            self.df = self.df.drop(self.df[(self.df['data_type'] == data)
                                           & (self.df['filter'] == 0)].index)
        self.df = self.df.iloc[:, :-1].reset_index(drop=True)

    def assign_binary_labels(self):
        '''Assign indoors/outdoors label to frames'''
        data = [(self.vocab[self.vocab.Name == name].index.values[0], name,
                 label) for name, label in LABELS_DICT.items()]
        self.vocab_df = pd.DataFrame(data,
                                     columns=['label_id', 'name', 'scene'])
        scene_labels = []
        for label_set in self.df.labels:
            temp = []
            for label_idx in label_set:
                for idx in self.vocab_df.label_id:
                    if label_idx == idx:
                        try:
                            temp.append(
                                self.vocab_df[self.vocab_df.label_id ==
                                              label_idx].scene.values[0])
                        except IndexError:
                            print(label_idx)
            if temp and len(set(temp)) == 1:
                scene_labels.append(temp[0])
            else:
                scene_labels.append('None')
        self.df['scene_labels'] = scene_labels

In [156]:
path = os.path.join(directory, 'vocabulary.csv')
processor = processDataFrame(path, extractor.df)
processor.drop_frames()
processor.assign_binary_labels()

In [157]:
processor.df.head()

Unnamed: 0,data_type,video_id,labels,frames,features,scene_labels
0,test,o7Hx,[],0,"[115.0, 155.0, 101.0, 128.0, 155.0, 160.0, 137...",
1,test,o7Hx,[],1,"[163.0, 159.0, 109.0, 109.0, 134.0, 178.0, 121...",
2,test,o7Hx,[],2,"[157.0, 170.0, 96.0, 120.0, 93.0, 157.0, 148.0...",
3,test,o7Hx,[],3,"[168.0, 173.0, 110.0, 129.0, 101.0, 144.0, 116...",
4,test,o7Hx,[],4,"[117.0, 187.0, 97.0, 144.0, 118.0, 161.0, 146....",
