In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import pandas as pd
import numpy as np
import utils, irmasTrainUtils, irmasTestUtils
from scipy.io.wavfile import read as read_wav
from scipy.io.wavfile import write as write_wav
import re
import shutil
import torch
from torch.utils.data import DataLoader, TensorDataset
import pickle

In [9]:
# Batch size passed to every DataLoader built later in this notebook.
BATCH_SIZE = 32

### Data Augmentation

Parses the IRMAS training data and places it into two directories ("Preprocessed_Trainset/Train" and "Preprocessed_Trainset/Validation") using a 90-10 split.

In [3]:
# Presumably (input directory, output directory) — the helper lives in
# irmasTrainUtils and is not visible here; confirm the argument order and the
# split ratio against that module.
irmasTrainUtils.parse_irmas_trainset("Data/IRMAS-TrainingData", "Data/Preprocessed_Trainset")

Processing directory: 1
Processing directory: 2
Processing directory: 3
Processing directory: 4
Processing directory: 5
Processing directory: 6
Processing directory: 7
Processing directory: 8
Processing directory: 9
Processing directory: 10
Processing directory: 11
Processing directory: 12
Processing directory: 13


Parses the IRMAS testing data and places it into a single directory.

In [3]:
# Presumably (input directory, output directory) — the helper lives in
# irmasTestUtils and is not visible here; confirm against that module.
irmasTestUtils.parse_irmas_testset("Data/IRMAS-TestingData", "Data/Preprocessed_Testset")

Processing directory: 1


### Audio to Image

Loads the training dataset, converts each item into a mel-spectrogram, and saves the output to a pandas DataFrame. Note that the data has already undergone the 90-10 split. The same steps are repeated for the validation subset and for the test set.

In [11]:
# Load the "Train" subset; the result is pickled to Data/train.pkl in a later cell.
df = irmasTrainUtils.load_train_dataset("Data/Preprocessed_Trainset/Train")

Count:  0
Count:  500
Count:  1000
Count:  1500
Count:  2000
Count:  2500
Count:  3000
Count:  3500
Count:  4000
Count:  4500
Count:  5000
Count:  5500
Count:  6000
Count:  6500
Count:  7000
Count:  7500
Count:  8000
Count:  8500
Count:  9000
Count:  9500
Count:  10000
Count:  10500
Count:  11000
Count:  11500


In [14]:
# Cache the training frame on disk; it is read back with pd.read_pickle when the
# DataLoaders are built.
df.to_pickle("Data/train.pkl")

In [6]:
# Load the "Validation" subset with the same loader used for the "Train" subset.
df2 = irmasTrainUtils.load_train_dataset("Data/Preprocessed_Trainset/Validation")

Count:  0
Count:  500
Count:  1000


In [15]:
# Cache the validation frame on disk; it is read back with pd.read_pickle when the
# DataLoaders are built.
df2.to_pickle("Data/validation.pkl")

In [3]:
# Load the preprocessed test set (uses the test-specific loader from irmasTestUtils).
df3 = irmasTestUtils.load_test_dataset("Data/Preprocessed_Testset")

Count:  0
Count:  500
Count:  1000
Count:  1500
Count:  2000
Count:  2500


In [None]:
# Sanity check: shape of the first entry in the test frame's `data` column.
df3.data[0].shape

In [5]:
# Cache the test frame on disk; it is read back with pd.read_pickle when the
# test DataLoader is built.
df3.to_pickle("Data/test.pkl")

### Dataframe to DataLoader

We will be using PyTorch, so this section converts the pickled DataFrames into torch `DataLoader` objects.

In [14]:
def get_data_loader(X_data, y_data, batch_size, shuffle):
    """
    Build a torch DataLoader from per-sample features and labels.

    Parameters
    ----------
    X_data : sequence of array-like, pandas Series/DataFrame, or torch.Tensor
        One feature array per sample; all samples must share the same shape.
    y_data : sequence of array-like, pandas Series/DataFrame, or torch.Tensor
        One label array per sample, aligned positionally with ``X_data``.
    batch_size : int
        Number of samples per batch.
    shuffle : bool
        Forwarded to ``DataLoader``; True reshuffles the samples every epoch.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader over a ``TensorDataset`` holding two float32 tensors
        (features, labels).

    Raises
    ------
    ValueError
        If the samples cannot be combined into one rectangular array
        (e.g. they have differing shapes).
    """
    def _as_float_tensor(samples):
        # Already a tensor: only the dtype needs normalising.
        if isinstance(samples, torch.Tensor):
            return samples.float()
        # Iterating a DataFrame yields column names, so take its row values.
        if isinstance(samples, pd.DataFrame):
            samples = samples.to_numpy()
        # list(...) walks the values positionally, so a Series whose index
        # does not contain the label 0 still works.  Collapsing into ONE
        # ndarray first avoids torch's slow element-by-element conversion of
        # a sequence of ndarrays.
        stacked = np.asarray(list(samples), dtype=np.float32)
        return torch.from_numpy(stacked)

    X_tensor = _as_float_tensor(X_data)
    y_tensor = _as_float_tensor(y_data)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Validation split: reload the pickled frame, flatten each label array, and
# wrap features + labels in a DataLoader.
# NOTE(review): shuffle=True is used for the validation loader as well —
# confirm that is intended.
df_val = pd.read_pickle("Data/validation.pkl")
df_val_label = df_val["label"].apply(lambda label: label.flatten())
dl_val = get_data_loader(
    X_data=df_val["data"],
    y_data=df_val_label,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Training split: identical steps applied to the training frame.
df_train = pd.read_pickle("Data/train.pkl")
df_train_label = df_train["label"].apply(lambda label: label.flatten())
dl_train = get_data_loader(
    X_data=df_train["data"],
    y_data=df_train_label,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [15]:
# Test split: reload the pickled frame, flatten each label array, and wrap
# features + labels in a DataLoader.
# NOTE(review): shuffle=True is used for the test loader as well — confirm
# that is intended.
df_test = pd.read_pickle("Data/test.pkl")
df_test_label = df_test["label"].apply(lambda label: label.flatten())
dl_test = get_data_loader(
    X_data=df_test["data"],
    y_data=df_test_label,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Write the validation and training DataLoaders to pickle files.
# `with` closes each file even if pickle.dump raises part-way through.
# No protocol argument is given, so pickle's default protocol is used.
with open('Data/dataLoaderVal.pkl', 'wb') as output:
    pickle.dump(dl_val, output)
with open('Data/dataLoaderTrain.pkl', 'wb') as output:
    pickle.dump(dl_train, output)

In [18]:
# Write the test DataLoader to a pickle file.
# `with` closes the file even if pickle.dump raises part-way through.
# No protocol argument is given, so pickle's default protocol is used.
with open('Data/dataLoaderTest.pkl', 'wb') as output:
    pickle.dump(dl_test, output)