# Preprocess the dataset and save it as TFRecords

In [None]:
# Dependencies

 # TensorFlow and tf.keras
import tensorflow as tf
print('Tensorflow Version:', tf.__version__)
from tensorflow import keras

# Helper libraries
import os
import os.path
import glob
import librosa
import librosa.display
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as pd
import pprint
import random
import time


# Check if the GPU is available (otherwise computing will take a looooonnnnggggg time)
print("GPU", "available (YESS!!!!)" if tf.config.list_physical_devices("GPU") else "not available :(")



In [None]:
# read the shared project settings into the config dictionary
CONFIG_PATH = './MA_CONFIG.json'
with open(CONFIG_PATH, 'r') as config_file:
    config = json.load(config_file)

# extra value derived in this notebook: (samples per segment, channels)
config.update(input_shape=(441000, 1))

# show the resulting settings
print(json.dumps(config, indent=4))

# write the extended settings back to the same file
# (JSON has no tuple type, so input_shape is stored as a list)
with open(CONFIG_PATH, 'w+') as config_file:
    json.dump(config, config_file, sort_keys=True, indent=4)


# Online Preprocessing

In [None]:

def load_and_process_data(file_path):
    """Load a recording and its produced counterpart, cut into paired 10 s frames.

    The produced ("ground truth") file is looked up by the first 10 characters
    of the input file name, which encode speaker and script name.

    Args:
        file_path: Path of the input wav file, either as ``str``, as ``bytes``
            or as the byte-string tensor that ``tf.data`` passes into
            ``tf.py_function``.

    Returns:
        Tuple ``(y_10s, y_truth_10s)`` of float32 arrays, one row per frame of
        ``seg`` samples. Both arrays contain the same number of frames.

    Raises:
        FileNotFoundError: If no produced file contains the label of
            ``file_path``.
    """
    # paths of the ground-truth ("produced") files
    fps_prod = glob.glob('/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/producedSpeech/**.wav')

    # tf.data stores the path as a byte string inside a tensor -> convert back to str
    if hasattr(file_path, 'numpy'):
        file_path = file_path.numpy()
    if isinstance(file_path, bytes):
        file_path = file_path.decode('utf-8')

    # load audio data
    y, _ = librosa.load(file_path, sr=config['sr'], mono=True, offset=0.0, duration=None,
                        dtype=np.float32, res_type='kaiser_fast')

    # the first 10 characters of the file name hold speaker and script name
    label = os.path.basename(file_path)[:10]

    # first produced file whose path contains the label
    truth_path = next((name for name in fps_prod if label in name), None)
    if truth_path is None:
        # without this check the load below would fail on an unbound variable
        raise FileNotFoundError(
            f"no produced speech file found for label '{label}' (input: {file_path})")

    # load corresponding produced audio file
    y_truth, _ = librosa.load(truth_path, sr=config['sr'], mono=True, offset=0.0, duration=None,
                              dtype=np.float32, res_type='kaiser_fast')

    # cut audio into non-overlapping 10 s frames; a trailing partial frame is dropped
    # NOTE(review): seg assumes 44.1 kHz while the audio is loaded at config['sr'] -
    # confirm that both agree
    seg = 10*44100
    y_10s = librosa.util.frame(y, frame_length=seg, hop_length=seg).T
    y_truth_10s = librosa.util.frame(y_truth, frame_length=seg, hop_length=seg).T

    # both outputs are unbatched together later on, so they need the same
    # number of frames even when the two recordings differ in length
    n_frames = min(len(y_10s), len(y_truth_10s))

    return y_10s[:n_frames], y_truth_10s[:n_frames]


def preprocessing_wrapper(file_path):
    """Call load_and_process_data through tf.py_function for use in Dataset.map.

    Args:
        file_path: Path element of the file-path dataset.

    Returns:
        Tuple of two float32 tensors: the framed input audio and the framed
        produced audio.
    """
    # NOTE(review): no static shape is set on the outputs here - confirm that
    # downstream consumers do not rely on one
    framed_input, framed_truth = tf.py_function(
        func=load_and_process_data,
        inp=[file_path],
        Tout=[tf.float32, tf.float32],
    )
    return framed_input, framed_truth

def transpose(tensor, tensor_spec):
    """Return the 2-D ``tensor`` with its two axes swapped.

    ``tensor_spec`` is accepted but not used.
    """
    return tf.transpose(tensor, perm=[1, 0])




## Create the datasets from the wav files with the preprocessing function and save them

In [None]:
# let tf.data choose the degree of parallelism
AUTOTUNE = tf.data.experimental.AUTOTUNE

# glob patterns for the training and the test recordings
train_files = '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/train/*.wav'
test_files = '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/test/*.wav'

# TRAIN: unshuffled file paths -> small subset of 2 files for testing -> preprocessing
train_dataset = (
    tf.data.Dataset.list_files(train_files, shuffle=False)
    .take(2)
    .map(preprocessing_wrapper, num_parallel_calls=AUTOTUNE)
)

# TEST: same pipeline on the test recordings
test_dataset = (
    tf.data.Dataset.list_files(test_files, shuffle=False)
    .take(2)
    .map(preprocessing_wrapper, num_parallel_calls=AUTOTUNE)
)



print('---------------------------------------')
print('-----------TRAIN DATASET---------------')
# count by streaming over the dataset instead of collecting every decoded
# tensor in a list; each pass still re-runs the preprocessing
n_train_files = sum(1 for _ in train_dataset)
print(f" Number of Tensors before unbatching: {n_train_files}")

# one dataset element per file -> one dataset element per 10 s frame
train_dataset = train_dataset.unbatch()
n_train_frames = sum(1 for _ in train_dataset)
print(f" Number of Tensors after unbatching: {n_train_frames}")

# add a trailing channel dimension to both tensors of the tuple,
# so each element is a pair shaped like config['input_shape']: (441000, 1)
train_dataset = train_dataset.map(lambda x, y: (tf.expand_dims(x, axis=1), tf.expand_dims(y, axis=1)), num_parallel_calls=AUTOTUNE)

# check the shapes on the first element only
for train_input, train_truth in train_dataset.take(1):
    print('Shapes per element')
    print(f'1. Tensor shape: {train_input.shape}')
    print(f'2. Tensor shape: {train_truth.shape}')


print('')
print('--------------------------------------')
print('-----------TEST DATASET---------------')
# count by streaming over the dataset instead of collecting every decoded
# tensor in a list; each pass still re-runs the preprocessing
n_test_files = sum(1 for _ in test_dataset)
print(f" Number of Tensors before unbatching: {n_test_files}")

# one dataset element per file -> one dataset element per 10 s frame
test_dataset = test_dataset.unbatch()
n_test_frames = sum(1 for _ in test_dataset)
print(f" Number of Tensors after unbatching: {n_test_frames}")

# add a trailing channel dimension to both tensors of the tuple
test_dataset = test_dataset.map(lambda x, y: (tf.expand_dims(x, axis=1), tf.expand_dims(y, axis=1)), num_parallel_calls=AUTOTUNE)

# check the shapes on the first element only
for test_input, test_truth in test_dataset.take(1):
    print('Shapes per element')
    print(f'1. Tensor shape: {test_input.shape}')
    print(f'2. Tensor shape: {test_truth.shape}')




# save datasets to tfrecords

# print('')
# print('--------------------------------------')
# print('-----------SAVING DATASETS---------------')


# NOTE(review): tf.data.experimental.save writes a directory in its own format
# that is read back with tf.data.experimental.load - confirm the '.tfrecord'
# names are only meant as labels
train_save_path = '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/train.tfrecord'
test_save_path = '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/test.tfrecord'

# save the train dataset and report how long it took
print(" saving train dataset to tfrecords...")
start = time.time()
tf.data.experimental.save(train_dataset, train_save_path, compression='GZIP')
elapsed = time.time() - start
print(f" saving train dataset took {elapsed} seconds")


# save the test dataset and report how long it took
print(" saving test dataset to tfrecords...")
start = time.time()
tf.data.experimental.save(test_dataset, test_save_path, compression='GZIP')
elapsed = time.time() - start
print(f" saving test dataset took {elapsed} seconds")






In [None]:
# for noise, gt in train_dataset.take(1):  # only take first element of dataset
#     noisy_speech = noise.numpy()
#     gt_speech = gt.numpy()

# print(noisy_speech.shape)
# print(gt_speech.shape)


# for noise, gt in test_dataset.take(4):  # only take first element of dataset
#     noisy_speech = noise.numpy()
#     gt_speech = gt.numpy()

# print(noisy_speech.shape)
# print(gt_speech.shape)

# # plot noisy speech
# plt.figure(figsize=(10, 4))
# plt.title('Noisy speech')
# librosa.display.waveplot(np.squeeze(noisy_speech), sr=44100)
# plt.show()




# Check the data with plots and audio playback

In [None]:
# pull a few example pairs from the train dataset as numpy arrays
wavs = train_dataset.as_numpy_iterator()
noisy = []
gt = []

# grid of subplots, one panel per example
nrows, ncols = 2, 2
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, figsize=(16, 9))

# zip with the range stops after nrows*ncols examples (or earlier if the dataset is shorter)
for i, sample in zip(range(nrows * ncols), wavs):

    # row k and column j of the panel for example i
    k, j = divmod(i, ncols)
    panel = ax[k][j]

    # element 0 is the input speech, element 1 the produced speech; drop the channel axis
    wave, ground_truth = sample
    wave_1d = np.squeeze(wave)
    ground_truth_1d = np.squeeze(ground_truth)

    # overlay both waveforms in the same panel
    librosa.display.waveshow(wave_1d, x_axis='time', sr=config['sr'], ax=panel, label='test_file')
    librosa.display.waveshow(ground_truth_1d, alpha=0.3, x_axis='time', sr=config['sr'], ax=panel, label='ground_truth')
    panel.legend()
    panel.axis('on')
    panel.set_title('10s speech')

    # keep the audio for playback below
    noisy.append(wave_1d)
    gt.append(ground_truth_1d)

# adjust whitespace in between subplots
plt.subplots_adjust(hspace=0.25, wspace=0.15)
plt.show()


# listen to the audio samples (pd is the IPython.display module here)
for number, (noisy_clip, gt_clip) in enumerate(zip(noisy, gt), start=1):
    print(f'----------- {number}. speechsnippet ---------------')
    print('')
    print('Voicefixer file')
    pd.display(pd.Audio(noisy_clip.T, rate=config['sr']))
    print('corresponding produced file')
    pd.display(pd.Audio(gt_clip.T, rate=config['sr']))
    print('')
