### AudioSet Processing

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import os

In [10]:
# Folder that holds the TFRecord shards to read.
directory = "audioset_v1_embeddings/eval/"

# Collect the path of every ".tfrecord" file directly inside `directory`.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# keep the shard order (and therefore the order of the extracted records)
# reproducible from one run to the next.
dataset = sorted(
    os.path.join(directory, file_name)
    for file_name in os.listdir(directory)
    if file_name.endswith(".tfrecord")
)

In [11]:
# Turn on eager execution: the parsing loop below iterates the dataset directly
# and calls `.numpy()` on each element.
# NOTE(review): presumably redundant on TensorFlow 2.x, where eager execution is
# expected to be on by default — confirm before removing.
tf.compat.v1.enable_eager_execution()

In [12]:
# Stream the records stored in the shard files listed in `dataset`. Each element
# is later converted with `.numpy()` and parsed as a tf.train.SequenceExample.
raw_dataset = tf.data.TFRecordDataset(dataset)

In [13]:
# Load the label table and keep every human-readable class name.
class_labels = pd.read_csv("class_labels_indices.csv")
labels = list(class_labels["display_name"])

# TODO: Include music genre labels that do not contain the word 'music'
# Rows whose display name mentions "music" (case-insensitive), and the integer
# values of their "index" column, which are matched against each record's labels.
music_class = class_labels.loc[
    lambda table: table["display_name"].str.contains("Music", case=False)
]
music_labels = music_class["index"].tolist()

In [14]:
# Walk every serialized SequenceExample, keep the ones tagged with at least one
# music label and at least NUM_SECONDS embedding frames, and store their
# metadata together with the flattened embedding in `audios`.
audios = []
counter = 0  # number of records kept so far (not the number of records read)
NUM_SECONDS = 10  # embedding frames kept per record (presumably one per second — confirm)

# Build the lookup set once instead of rebuilding it for every record.
music_label_set = set(music_labels)

for raw_record in raw_dataset:
    example = tf.train.SequenceExample()
    example.ParseFromString(raw_record.numpy())

    # Audio Meta Data
    # Copy the protobuf repeated fields into plain Python lists so the stored
    # records are ordinary built-in containers; indexing and iteration behave
    # the same as before.
    context = example.context.feature
    audio_labels = list(context["labels"].int64_list.value)
    start_time = list(context["start_time_seconds"].float_list.value)
    end_time = list(context["end_time_seconds"].float_list.value)
    video_id = list(context["video_id"].bytes_list.value)

    # Skip records that carry no music label at all.
    if music_label_set.isdisjoint(audio_labels):
        continue

    # Audio Feature
    feature_list = example.feature_lists.feature_list["audio_embedding"].feature

    # Reject short records before doing any decoding work on them.
    if len(feature_list) < NUM_SECONDS:
        continue

    # Each frame is a byte string; list(bytes) yields its values as ints in 0-255.
    final_features = [list(feature.bytes_list.value[0]) for feature in feature_list]
    # Concatenate the first NUM_SECONDS frames into one flat list.
    audio_embedding = [item for sublist in final_features[:NUM_SECONDS] for item in sublist]

    audio = {
        "label": audio_labels,
        "video_id": video_id,
        "start_time": start_time,
        "end_time": end_time,
        "data": audio_embedding
    }

    audios.append(audio)
    counter += 1
    if (counter % 100 == 0):
        print(f"Processing {counter}th file...")

2023-06-03 22:01:02.443229: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [4062]
	 [[{{node Placeholder/_0}}]]


Processing 100th file...
Processing 200th file...
Processing 300th file...
Processing 400th file...
Processing 500th file...
Processing 600th file...
Processing 700th file...
Processing 800th file...
Processing 900th file...
Processing 1000th file...
Processing 1100th file...
Processing 1200th file...
Processing 1300th file...
Processing 1400th file...
Processing 1500th file...
Processing 1600th file...
Processing 1700th file...
Processing 1800th file...
Processing 1900th file...
Processing 2000th file...
Processing 2100th file...
Processing 2200th file...
Processing 2300th file...
Processing 2400th file...
Processing 2500th file...
Processing 2600th file...
Processing 2700th file...
Processing 2800th file...
Processing 2900th file...
Processing 3000th file...
Processing 3100th file...
Processing 3200th file...
Processing 3300th file...
Processing 3400th file...
Processing 3500th file...
Processing 3600th file...
Processing 3700th file...
Processing 3800th file...
Processing 3900th fil

In [15]:
# Persist the extracted records to "music_set.json".
# NOTE(review): the payload is a single JSON string whose content is the Python
# repr() of `audios`, not a JSON array of objects. json.load() on this file
# returns one str that the reader must parse itself. Confirm downstream readers
# before switching to structured JSON.
with open("music_set.json", "w") as json_file:
    json_file.write(json.dumps(repr(audios)))

In [17]:
# Sanity check: show the first and last ten embedding values of the first
# three extracted records.
[
    clip["data"][:10] + clip["data"][-10:]
    for clip in audios[:3]
]

[[162,
  117,
  221,
  134,
  205,
  60,
  121,
  142,
  88,
  161,
  63,
  0,
  42,
  240,
  93,
  151,
  152,
  69,
  249,
  0],
 [77,
  142,
  153,
  58,
  223,
  76,
  200,
  139,
  203,
  122,
  157,
  112,
  207,
  126,
  208,
  15,
  216,
  95,
  125,
  67],
 [176,
  64,
  84,
  130,
  82,
  183,
  36,
  70,
  220,
  238,
  193,
  0,
  255,
  179,
  203,
  255,
  255,
  255,
  0,
  192]]