# Preprocessing and save datasets as tfrecords

In [10]:
# Dependencies

 # TensorFlow and tf.keras
import tensorflow as tf
print('Tensorflow Version:', tf.__version__)
from tensorflow import keras

# Helper libraries
import os
import os.path
import glob
import librosa
import librosa.display
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as pd
import pprint
import random
import time


# Check if the GPU is available (otherwise computing will take a looooonnnnggggg time)
print("GPU", "available (YESS!!!!)" if tf.config.list_physical_devices("GPU") else "not available :(")



Tensorflow Version: 2.9.1
GPU not available :(


In [11]:
# load global settings in config-dictionary
with open('./MA_CONFIG.json', 'r') as fp:
  config = json.load(fp)

# define some extra values
config['input_shape'] = (441000, 1)

# print config
print(json.dumps(config, indent=4))

# save config to disk
with open('./MA_CONFIG.json', 'w+') as fp:
    json.dump(config, fp, sort_keys=True, indent=4)


{
    "batch_size": 16,
    "fps_noisy": "/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/noisySpeech",
    "fps_produced": "/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/producedSpeech",
    "fps_voicefixer": "/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/voicefixerOutput",
    "hop_length": 64,
    "input_shape": [
        441000,
        1
    ],
    "n_epochs": 10,
    "n_fft": 512,
    "n_mels": 16,
    "offset": 6,
    "sample_length": 20,
    "shuffle_buffer_size": 300,
    "sr": 44100,
    "test_dataset_path": "../Dataset/test.tfrecord",
    "train_dataset_path": "../Dataset/train.tfrecord",
    "win_length": 512
}


# func to save dataset to tfrecords of ~100mb

In [12]:
def load_and_process_data(file_path):

    # paths for ground truths prod files
    fps_prod = glob.glob('/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/producedSpeech/**.wav')
    
    # load audio data 
    y, _ = librosa.core.load(file_path, sr=config['sr'], mono=True, offset=0.0, duration=None, 
                             dtype=np.float32, res_type='kaiser_fast')

    # get string with speaker and scriptname
    label = file_path.split('/')[-1]
    label = label[:10]
    for filename in fps_prod:
        if label in filename:
            fp = filename
            break

    # load corresponding produced audio file
    y_truth, _ = librosa.core.load(fp, sr=config['sr'], mono=True, offset=0.0, duration=None, 
                             dtype=np.float32, res_type='kaiser_fast')
    

    # cut audio into 10s frames
    seg = 10*44100
    y_10s = librosa.util.frame(y, frame_length=seg, hop_length=seg).T
    y_truth_10s = librosa.util.frame(y_truth, frame_length=seg, hop_length=seg).T


    # zero pad last segment to seg if not zero padded already
    #for i in range(len(y_10s)):
    #    if not len(y_10s[i]) == seg:
    #        y_10s[i] = librosa.util.fix_length(y_10s[i], size=seg)
        
    

    return y_10s, y_truth_10s



def _bytes_feature(value):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))



# tfrecords writer
def save_tfrecords(tfrecords_path: str,
                   voicefixer_list: list,
                   produced_list: list,
                   config: dict):

    # tfrecords writer
    with tf.io.TFRecordWriter(tfrecords_path) as writer:
        for v_array, p_array in zip(voicefixer_list, produced_list):

            # encode
            v_encoded = tf.audio.encode_wav(v_array[:, np.newaxis], config['sr'])
            p_encoded = tf.audio.encode_wav(p_array[:, np.newaxis], config['sr'])
        
            # save feature
            feature = {'voicefixer': _bytes_feature(v_encoded),
                       'produced': _bytes_feature(p_encoded)}
            features = tf.train.Features(feature=feature)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())


        # close writer
        writer.close()

# Save Train Dataset

In [None]:
# folder with the training data
train_files = glob.glob('/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/train/*.wav')
train_files = train_files

voicefixer_list = []
produced_list = []
length_tf_records = 60

i=0
for idx, file in enumerate(train_files):
    voicefixer, produced = load_and_process_data(file)
    voicefixer_list.extend(voicefixer)
    produced_list.extend(produced)
    print(len(voicefixer_list), len(produced_list))

    if len(voicefixer_list) > (length_tf_records):
        save_tfrecords(f'/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/train_tfrecords/train_{i}.tfrecords', voicefixer_list[:length_tf_records], produced_list[:length_tf_records], config)
        voicefixer_list = voicefixer_list[length_tf_records:]
        produced_list = produced_list[length_tf_records:]
        i=i+1
        print(f'tfrecord #{i} saved')



# Save test Dataset

In [14]:
# folder with the training data
test_files = glob.glob('/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/test/*.wav')
test_files = test_files

voicefixer_list = []
produced_list = []
length_tf_records = 60

i=0
for idx, file in enumerate(test_files):
    voicefixer, produced = load_and_process_data(file)
    voicefixer_list.extend(voicefixer)
    produced_list.extend(produced)
    print(len(voicefixer_list), len(produced_list))

    if len(voicefixer_list) > (length_tf_records):
        save_tfrecords(f'/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/test_tfrecords/test_{i}.tfrecords', voicefixer_list[:length_tf_records], produced_list[:length_tf_records], config)
        voicefixer_list = voicefixer_list[length_tf_records:]
        produced_list = produced_list[length_tf_records:]
        i=i+1
        print(f'tfrecord #{i} saved')



13 13
27 27
41 41
52 52
67 67
tfrecord #1 saved
20 20
33 33
46 46
60 60
75 75
tfrecord #2 saved
30 30
43 43
57 57
73 73
tfrecord #3 saved
27 27
42 42
53 53
68 68
tfrecord #4 saved
21 21
35 35
50 50
64 64
tfrecord #5 saved
15 15
28 28
41 41
54 54
67 67
tfrecord #6 saved
20 20
31 31
42 42
55 55
69 69
tfrecord #7 saved
23 23
38 38
51 51
64 64
tfrecord #8 saved
15 15
29 29
42 42
53 53
68 68
tfrecord #9 saved
19 19
32 32
48 48
62 62
tfrecord #10 saved
18 18
31 31
44 44
57 57
72 72
tfrecord #11 saved
25 25
38 38
54 54
68 68
tfrecord #12 saved
21 21
36 36
52 52
66 66
tfrecord #13 saved
20 20
33 33
47 47
60 60
75 75
tfrecord #14 saved
30 30
45 45
58 58
73 73
tfrecord #15 saved
29 29
45 45
59 59
72 72
tfrecord #16 saved
28 28
41 41
55 55
70 70
tfrecord #17 saved
24 24
37 37
50 50
65 65
tfrecord #18 saved
16 16
31 31
46 46
60 60
74 74
tfrecord #19 saved
27 27
41 41
55 55
66 66
tfrecord #20 saved
22 22
36 36
51 51
66 66
tfrecord #21 saved
20 20
35 35
51 51
64 64
tfrecord #22 saved
20 20
33 33
47 

# Save valid Dataset

In [13]:
# folder with the training data
valid_files = glob.glob('/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/valid/*.wav')
valid_files = valid_files

voicefixer_list = []
produced_list = []
length_tf_records = 60

i=0
for idx, file in enumerate(valid_files):
    voicefixer, produced = load_and_process_data(file)
    voicefixer_list.extend(voicefixer)
    produced_list.extend(produced)
    print(len(voicefixer_list), len(produced_list))

    if len(voicefixer_list) > (length_tf_records):
        save_tfrecords(f'/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Dataset/valid_tfrecords/valid_{i}.tfrecords', voicefixer_list[:length_tf_records], produced_list[:length_tf_records], config)
        voicefixer_list = voicefixer_list[length_tf_records:]
        produced_list = produced_list[length_tf_records:]
        i=i+1
        print(f'tfrecord #{i} saved')
    
    


19 19
33 36
52 55
69 72
tfrecord #1 saved
26 29
43 46
62 65
tfrecord #2 saved
21 24
43 43
60 60
77 77
tfrecord #3 saved
39 36
54 53
68 70
tfrecord #4 saved
27 29
46 48
60 65
82 84
tfrecord #5 saved
37 41
56 60
78 79
tfrecord #6 saved
35 36
53 53
72 72


KeyboardInterrupt: 

# Check the Data by plot and audio display

In [None]:
# # look at some example data from train dataset
# wavs = train_dataset.as_numpy_iterator()
# noisy = []
# gt = []

# # Setup Subplot
# nrows, ncols = 2, 2
# fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, figsize=(16, 9))


# # iterate over dataset
# for i, sample in enumerate(wavs):
    
#     # get the column and row by modulo and remainder
#     j = i % ncols
#     k = int(i / ncols)
    
#     # extract noisy and produced speech file from tensors
#     wave = sample[0]
#     ground_truth = sample[1]
        
#     # plot files
#     librosa.display.waveshow(np.squeeze(wave), x_axis='time', sr=config['sr'], ax=ax[k][j], label='test_file')
#     librosa.display.waveshow(np.squeeze(ground_truth), alpha=0.3, x_axis='time', sr=config['sr'], ax=ax[k][j], label='ground_truth')
#     ax[k][j].legend()
#     ax[k][j].axis('on')
#     ax[k][j].set_title('10s speech')  

#     # save speech to arrays
#     noisy.append(np.squeeze(wave))
#     gt.append(np.squeeze(ground_truth))
    
#     if i+1 == ncols*nrows:
#         break
    
# # adjust whitespace in between subplots        
# plt.subplots_adjust(hspace=0.25, wspace=0.15)
# plt.show()


# # listen to the audio samples
# for i in range(len(gt)):
#     print(f'----------- {i+1}. speechsnippet ---------------')
#     print('')
#     print(f'Voicefixer file')
#     pd.display(pd.Audio(noisy[i].T, rate=config['sr']))
#     print(f'corresponding produced file')
#     pd.display(pd.Audio(gt[i].T, rate=config['sr']))
#     print('')
