## Load AudioSet data from file

Data extracted by:
https://github.com/qiuqiangkong/audioset_classification

In [2]:
import h5py
import numpy as np

# Directory where this notebook writes its pickled arrays and label binarizer.
feat_dir = "./features"
# Directory for trained models; not referenced by the cells shown here.
models_dir = "./models"
# Directory holding the packed AudioSet HDF5 files (bal_train.h5, eval.h5, unbal_train.h5).
hdf5_path = "../dataset/packed_features"

def load_data(hdf5_path):
    """Load one packed AudioSet feature file fully into memory.

    Args:
        hdf5_path: Path to an HDF5 file containing the datasets ``x``,
            ``y`` and ``video_id_list``.

    Returns:
        A tuple ``(x, y, video_id_list)``. ``x`` is a NumPy array; ``y`` and
        ``video_id_list`` are lists with one entry per row of the
        corresponding dataset.

    Raises:
        KeyError: If one of the three datasets is missing from the file.
    """
    # NOTE: everything is read eagerly. A recorded run of this notebook failed
    # with MemoryError while allocating a (2041789, 10, 128) uint8 array, so
    # the largest file needs several GiB of free RAM.
    with h5py.File(hdf5_path, 'r') as hf:
        # Index with [] rather than .get(): a missing dataset raises KeyError
        # here instead of surfacing later as a TypeError on None.
        # [()] reads a dataset in one bulk read; list(dataset) would iterate
        # the h5py dataset and issue one read per row.
        x = hf['x'][()]
        y = list(hf['y'][()])
        video_id_list = list(hf['video_id_list'][()])

    return x, y, video_id_list

# Load the balanced-train, evaluation and unbalanced-train splits.
X_train, y_train, train_video_id_list = load_data(f"{hdf5_path}/bal_train.h5")
X_eval, y_eval, eval_video_id_list = load_data(f"{hdf5_path}/eval.h5")
X_unbal, y_unbal, unbal_video_id_list = load_data(f"{hdf5_path}/unbal_train.h5")

# The labels come back as boolean rows; turn each split into an integer matrix.
y_train = np.asarray(y_train, dtype=int)  # shape: (N, 527)
y_eval = np.asarray(y_eval, dtype=int)
y_unbal = np.asarray(y_unbal, dtype=int)

MemoryError: Unable to allocate 2.43 GiB for an array with shape (2041789, 10, 128) and data type uint8

In [None]:
import pandas as pd

classes_file = "../dataset/class_labels_indices.csv"

# Read the class table and key it by its 'index' column.
labels_df = pd.read_csv(classes_file).set_index('index')

selected_classes = [
    "Speech",
    "Musical instrument",
    "Car",
    "Dog",
    "Child speech, kid speaking",
    "Rail transport",
    "Siren",
    "Vehicle horn, car horn, honking",
    "Jackhammer",
    "Pigeon, dove",
]

# Keep only the rows whose display name is one of the selected classes.
is_selected = labels_df['display_name'].isin(selected_classes)
df_sel = labels_df[is_selected]
df_sel.head(10)

## Filter training sets
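Keep only our selected classes (reducing the labels from 527 to 10 columns) and, in every split (balanced train, eval and unbalanced train), only the rows labelled with exactly one of them.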

In [None]:
filter_classes = list(df_sel.index)


def _single_label_rows(labels):
    """Return the (k, 1) argwhere indices of rows holding exactly one positive label."""
    return np.argwhere(labels.sum(axis=1) == 1)


# Balanced training split: keep the selected class columns, then only the rows
# that contain exactly ONE of our classes.
y_train_filtered = y_train[:, filter_classes]
filter_ = _single_label_rows(y_train_filtered)
rows = filter_[:, 0]
X_train = X_train[rows]
y_train = y_train_filtered[rows]
train_video_id_list = np.asarray(train_video_id_list)[rows]

# Evaluation split: same column and row selection.
y_eval_filtered = y_eval[:, filter_classes]
filter_ = _single_label_rows(y_eval_filtered)
rows = filter_[:, 0]
X_eval = X_eval[rows]
y_eval = y_eval_filtered[rows]
eval_video_id_list = np.asarray(eval_video_id_list)[rows]

# Unbalanced training split: same column and row selection.
y_unbal_filtered = y_unbal[:, filter_classes]
filter_ = _single_label_rows(y_unbal_filtered)
rows = filter_[:, 0]
X_unbal = X_unbal[rows]
y_unbal = y_unbal_filtered[rows]
unbal_video_id_list = np.asarray(unbal_video_id_list)[rows]

## One-hot encoding for our classes

In [None]:
import pickle
from pathlib import Path

from sklearn.preprocessing import MultiLabelBinarizer

# Fit a binarizer whose classes are exactly the selected class indices.
mlb = MultiLabelBinarizer()
mlb.fit([filter_classes])
print(list(mlb.classes_))

# Save mlb for our project.
# This cell runs before the cell that creates feat_dir, so make sure the
# directory exists first; otherwise open() raises FileNotFoundError on a
# fresh checkout.
Path(feat_dir).mkdir(parents=True, exist_ok=True)
dump_filename = f"{feat_dir}/new_multiLabelBinarizer.pkl"
with open(dump_filename, "wb") as dump_file:
    pickle.dump(mlb, dump_file)

## Save data

In [None]:
# Save everything
import pickle
from pathlib import Path

# Make sure the output directory exists, then pickle every split.
Path(feat_dir).mkdir(exist_ok=True)

arrays_to_save = {
    "X_train": X_train,
    "y_train": y_train,
    "X_eval": X_eval,
    "y_eval": y_eval,
    "X_unbal": X_unbal,
    "y_unbal": y_unbal,
}

# Each array is written to <feat_dir>/new_<name>.pkl, in the order listed above.
for array_name, array in arrays_to_save.items():
    dump_filename = f"{feat_dir}/new_{array_name}.pkl"
    with open(dump_filename, "wb") as dump_file:
        pickle.dump(array, dump_file)

## Balanced data

In [None]:
# Count the positive examples of every selected class in the filtered
# balanced-train labels.
y_df = pd.DataFrame(y_train)
# The labels are 0/1, so a column sum equals the number of positives. Unlike
# value_counts().loc[1], this gives 0 (instead of raising KeyError) for a
# class that has no positive rows.
positives_per_class = y_df.sum(axis=0)
counters = {}
for i in range(y_train.shape[1]):
    # Column i of the filtered labels corresponds to row i of df_sel.
    counters[df_sel.iloc[i]["display_name"]] = int(positives_per_class.iloc[i])

In [None]:
# Display the per-class counts dictionary as this cell's output.
counters

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the per-class counts, one bar per class name.
class_names = counters.keys()
class_counts = counters.values()

plt.figure(figsize=(20, 4))
plt.bar(class_names, class_counts, width=1)
plt.xticks(rotation=90)  # vertical labels so long class names do not overlap
plt.show()

### Remove speech from the bar plot
The code drops the class with the highest count (expected to be Speech), which makes the remaining bars easier to compare.

In [None]:
# Rank the classes by count, largest first, then skip the first entry.
# NOTE: this drops whichever class is most frequent (expected to be Speech)
# and keeps at most the next nine.
ranked = sorted(counters.items(), key=lambda pair: pair[1], reverse=True)
top = dict(ranked[1:10])

remaining_names = top.keys()
remaining_counts = top.values()

plt.figure(figsize=(20, 4))
plt.bar(remaining_names, remaining_counts, width=1)
plt.xticks(rotation=90)
plt.show()

### Join balanced and unbalanced data

In [None]:
# Stack the filtered balanced and unbalanced training splits; features and
# labels are concatenated in the same order so their rows stay aligned.
# The feature arrays are bound as X_train / X_unbal (capital X); the lower-case
# names used here before are never defined and raised NameError.
X = np.concatenate((X_train, X_unbal))
y = np.concatenate((y_train, y_unbal))

In [None]:
# Count the positive examples of every selected class in the joined
# (balanced + unbalanced) training labels.
y_df = pd.DataFrame(y)
# The labels are 0/1, so a column sum equals the number of positives. Unlike
# value_counts().loc[1], this gives 0 (instead of raising KeyError) for a
# class that has no positive rows.
positives_per_class = y_df.sum(axis=0)
counters = {}
for i in range(y.shape[1]):
    # Column i of the labels corresponds to row i of df_sel.
    counters[df_sel.iloc[i]["display_name"]] = int(positives_per_class.iloc[i])

In [None]:
# Display the per-class counts dictionary as this cell's output.
counters

In [None]:
# Bar chart of the current per-class counts, one bar per class name.
bar_labels = counters.keys()
bar_heights = counters.values()

plt.figure(figsize=(20, 4))
plt.bar(bar_labels, bar_heights, width=1)
plt.xticks(rotation=90)  # vertical labels so long class names do not overlap
plt.show()

## Save everything

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the selected class indices. `filter_classes` is the list
# built from df_sel.index in the filtering cell; the name used here before
# (`selected_classes_indexes`) is not defined in any visible cell and raised
# NameError.
mlb = MultiLabelBinarizer()
mlb.fit([filter_classes])
print(list(mlb.classes_))

# Save mlb for our project
base_dir = "./features/"

dump_filename = f"{base_dir}multiLabelBinarizer.pkl"
with open(dump_filename, "wb") as dump_file:
    pickle.dump(mlb, dump_file)