In [1]:
import tensorflow as tf
import numpy as np
from IPython.display import YouTubeVideo


In [2]:
# to download dataset shards
# curl data.yt8m.org/download.py | shard=1,100 partition=2/frame/train mirror=us python

# Loading data and creating TF dataset

In [3]:
import tensorflow as tf
import os

# Collect the TFRecord shards found in the working directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the "first MAX_SHARDS files" selection reproducible across runs and machines.
MAX_SHARDS = 20  # at most 20 files
filenames = sorted(f for f in os.listdir('./') if f.endswith('.tfrecord'))
if not filenames:
    # Fail early with a clear message instead of silently building an empty dataset.
    raise FileNotFoundError("No .tfrecord files found in './'")
dataset = tf.data.TFRecordDataset(filenames[:MAX_SHARDS])

feat_rgb = []     # one entry per record: list of per-frame float arrays
labels_list = []  # one entry per record: list of integer label ids

for raw_record in dataset:
    tf_seq_example = tf.train.SequenceExample.FromString(raw_record.numpy())
    features_rgb = tf_seq_example.feature_lists.feature_list["rgb"]
    # Copy the label ids into a plain Python list.
    labels = list(tf_seq_example.context.feature['labels'].int64_list.value)

    # Decode the raw bytes of every frame as uint8 values, then cast to float.
    # np.frombuffer yields the same values as tf.io.decode_raw(..., tf.uint8)
    # without dispatching one TensorFlow op per frame.
    rgb_frame = [
        np.frombuffer(frame.bytes_list.value[0], dtype=np.uint8).astype(float)
        for frame in features_rgb.feature
    ]

    feat_rgb.append(rgb_frame)
    labels_list.append(labels)


2024-03-10 17:16:33.777756: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2024-03-10 17:16:33.777774: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-10 17:16:33.777778: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-10 17:16:33.777813: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-10 17:16:33.777828: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
import pandas as pd
# Build a lookup from the 'label_id' column to the 'label_name' column of label_names.csv.
mapping = pd.read_csv('label_names.csv')
mapping = {
    label_id: label_name
    for label_id, label_name in zip(mapping['label_id'], mapping['label_name'])
}

In [5]:
from collections import Counter

# Flatten the per-video label ids into one list of label names (via `mapping`).
all_labels_names = [
    mapping[label_id]
    for video_labels in labels_list
    for label_id in video_labels
]

# Tally how many times each label name occurs over all videos.
label_count = Counter(all_labels_names)
print(label_count)

Counter({'Games': 4159, 'Vehicle': 2852, 'Video game': 2206, 'Concert': 1984, 'Car': 1536, 'Dance': 1251, 'Animation': 1041, 'Musician': 994, 'Football': 916, 'Music video': 816, 'Animal': 746, 'Food': 699, 'Motorsport': 692, 'Musical ensemble': 660, 'Guitar': 596, 'Cartoon': 555, 'Performance art': 526, 'Racing': 499, 'Outdoor recreation': 446, 'PC game': 434, 'Drums': 400, 'Stadium': 389, 'Trailer': 372, 'Nature': 350, 'Mobile phone': 349, 'String instrument': 343, 'Cooking': 332, 'Drummer': 323, 'Toy': 305, 'Fashion': 298, 'Motorcycle': 290, 'Smartphone': 289, 'Disc jockey': 288, 'Action-adventure game': 276, 'Weapon': 276, 'Minecraft': 273, 'Recipe': 269, 'Piano': 258, 'Ball': 253, 'Gadget': 246, 'Orchestra': 232, 'Road': 231, 'Sports car': 215, 'Pet': 212, 'Cosmetics': 204, 'Fishing': 193, 'Choir': 193, 'Call of Duty': 189, 'Strategy video game': 182, 'Personal computer': 177, 'School': 175, 'Keyboard': 168, 'Aircraft': 168, 'Race track': 167, 'Highlight film': 164, 'Transport': 1

In [6]:
# Label ids kept for the rest of the notebook; everything else is discarded later.
# NOTE(review): matching the per-label counts printed below against the Counter output
# suggests these are Vehicle, Concert, Dance, Football, Animal, Food,
# Outdoor recreation, Nature, Mobile phone, Cooking -- confirm against label_names.csv.
predefined_labels = [1, 3, 5, 8, 10, 12, 18, 22, 23, 26]

In [7]:
def number_of_videos(label, label_lists=None):
    ''' Returns the number of videos that contain a given label.

    Args:
        label: the label id to look for.
        label_lists: optional iterable with one collection of label ids per video.
            When omitted, the notebook-level `labels_list` is used, which keeps the
            original single-argument call working. Passing it explicitly avoids
            depending on whichever value `labels_list` holds at call time (it is
            reassigned later in the notebook).

    Returns:
        int: how many of the per-video collections contain `label`. A video is
        counted once even if the label appears several times in its collection.
    '''
    if label_lists is None:
        label_lists = labels_list
    return sum(1 for labels in label_lists if label in labels)

# Report, for each label id in predefined_labels, how many videos carry it.
for label in predefined_labels:
    print(f'Number of videos with label {label}: {number_of_videos(label)}')

Number of videos with label 1: 2852
Number of videos with label 3: 1984
Number of videos with label 5: 1251
Number of videos with label 8: 916
Number of videos with label 10: 746
Number of videos with label 12: 699
Number of videos with label 18: 446
Number of videos with label 22: 350
Number of videos with label 23: 349
Number of videos with label 26: 332


In [8]:
# Map each predefined label id to its position in predefined_labels (0, 1, 2, ...),
# giving a contiguous index that can be used as a column of the label matrix.
remap = {label: position for position, label in enumerate(predefined_labels)}

In [9]:
# Keep only the samples that have at least one label in predefined_labels, and for
# those samples keep only the predefined labels.
# `matched[i]` holds the predefined labels of sample i; an empty list means "drop it".
matched = [
    [label for label in labels if label in predefined_labels]
    for labels in labels_list
]
feat_rgb = [feat_rgb[i] for i in range(len(feat_rgb)) if matched[i]]

# Drop the samples without predefined labels and translate the surviving label ids
# to their contiguous indices via `remap`.
labels_list = [[remap[label] for label in kept] for kept in matched if kept]

In [10]:
# Number of label classes, i.e. the width of the one-hot label matrix.
# The remapped ids are positions in predefined_labels (0 .. len - 1), so the width is
# the number of predefined labels. Deriving it from the data as `max - min + 1`
# under-counts whenever the smallest remapped id (0) is absent from the loaded
# shards, which would make one-hot indexing run out of bounds.
observed_labels = {label for labels in labels_list for label in labels}
total_labels = len(predefined_labels)
assert all(0 <= label < total_labels for label in observed_labels), \
    "remapped labels must lie in [0, total_labels)"
total_labels

10

In [11]:
# Every video is brought to exactly this many frames.
max_seq_length = 45

# Pad short sequences with zeros at the end and cut long ones at the end.
# dtype is set explicitly: pad_sequences defaults to int32, which would truncate any
# fractional feature values before the later conversion to a float32 tensor.
feat_rgb_padded = tf.keras.preprocessing.sequence.pad_sequences(
    feat_rgb,
    maxlen=max_seq_length,  # Maximum length of all sequences
    dtype='float32',  # Keep features as floats instead of the int32 default
    padding='post',  # Pad after each sequence
    truncating='post',  # Truncate after each sequence
    value=0  # Padding value
)



# Function to one-hot encode labels
def one_hot_encode(labels, total_labels):
    """Encode per-sample label indices as a binary indicator matrix.

    Args:
        labels: sequence with one iterable of integer label indices per sample.
        total_labels: number of columns of the result.

    Returns:
        Integer array of shape (len(labels), total_labels) where entry [i, l] is 1
        if index l occurs in labels[i] and 0 otherwise. A sample may have several
        columns set.
    """
    encoded = np.zeros((len(labels), total_labels), dtype=int)
    # Each `row` is a view into `encoded`, so writing to it fills the matrix in place.
    for row, active_indices in zip(encoded, labels):
        for index in active_indices:
            row[index] = 1
    return encoded

# Binary indicator matrix with one row per sample and one column per label class.
labels_one_hot = one_hot_encode(labels_list, total_labels)

feat_rgb_tensor = tf.convert_to_tensor(feat_rgb_padded, dtype=tf.float32)
labels_tensor = tf.convert_to_tensor(labels_one_hot, dtype=tf.int32)

# tf dataset: each element is a (features, labels) pair for one sample
dataset = tf.data.Dataset.from_tensor_slices((feat_rgb_tensor, labels_tensor))

# save dataset
# tf.data.experimental.save is deprecated; the method on the dataset itself is the
# replacement named by TensorFlow's deprecation message.
save_dir = './saved_dataset'
dataset.save(save_dir)

Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.


In [12]:
# Number of elements (samples) in `dataset`.
len(dataset)

9310