# Prepare data



In [None]:
# Dependencies

import os
import os.path
import glob
import librosa
import librosa.display
import json
import tensorflow as tf
print('Tensorflow Version:', tf.__version__)
import random
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
from IPython.display import Audio
import pprint


## Write config file and save as JSON

In [None]:
# Config file

# Global settings used by the cells of this notebook:
#   sr                       sample rate passed to every librosa load call
#   fps_*                    root folders that are walked for .wav files
#   n_fft/hop_length/
#   win_length/n_mels        STFT and mel-spectrogram parameters
#   sample_length / offset   duration and start offset passed to librosa
#                            when loading excerpts for playback and spectrograms
config = {'sr': 44100,
          'fps_noisy': '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/noisySpeech',
          'fps_produced': '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/producedSpeech',
          'fps_voicefixer': '/Users/marius/Documents/Uni/TU_Berlin_Master/Masterarbeit/Data/allFiles/voicefixerOutput',
          'n_fft': 512,
          'hop_length': 64,
          'win_length': 512,
          'n_mels': 16,
          'sample_length': 20,
          'offset': 6}

# print config
print(json.dumps(config, indent=4))

# save config to disk
# Use the same fixed indentation as the printout above; tying the indent to
# len(config) would change the file layout whenever a key is added.
# Plain 'w' is enough because the file is only written, never read back here.
with open('./MA_CONFIG.json', 'w', encoding='utf-8') as fp:
    json.dump(config, fp, indent=4)


## Create data dict with all labels and paths

In [None]:
# initialize data dictionary

noisy_data_path = config['fps_noisy']
produced_data_path = config['fps_produced']
voicefixer_data_path = config['fps_voicefixer']


def _collect_wav_files(root):
    """Walk *root* recursively and describe every ``.wav`` file found.

    Args:
        root: directory that is handed to ``os.walk``.

    Returns:
        A tuple ``(labels, stems, paths)`` of three parallel lists with one
        entry per ``.wav`` file, in the order ``os.walk`` yields them:

        * ``labels`` - the first two ``_``-separated fields of the file
          name joined by ``_`` (speaker and script name),
        * ``stems``  - the file name up to its first ``.`` (used below as
          the room/microphone identifier),
        * ``paths``  - the walked directory joined with the file name.
    """
    labels, stems, paths = [], [], []
    for dirpath, _dirnames, filenames in os.walk(root):
        for fname in filenames:
            # skip files that do not end with ".wav"
            if not fname.endswith(".wav"):
                continue

            # Joining the slice (instead of indexing field 1) gives the same
            # label for regular names but does not raise IndexError for a
            # file name that contains no underscore.
            labels.append('_'.join(fname.split('_')[:2]))
            stems.append(fname.split('.')[0])
            paths.append(os.path.join(dirpath, fname))
    return labels, stems, paths


# scan the three dataset folders with the same routine
noisy_labels, noisy_stems, noisy_paths = _collect_wav_files(noisy_data_path)
produced_labels, _produced_stems, produced_paths = _collect_wav_files(produced_data_path)
voicefixer_labels, voicefixer_stems, voicefixer_paths = _collect_wav_files(voicefixer_data_path)

# All lists belonging to one dataset are index-aligned: entry k of
# 'label_x', 'path_x' and 'mic_room_x' describes the same file.
# The produced files get no 'mic_room' entry.
data = {'label_noisy': noisy_labels,
        'path_noisy': noisy_paths,
        'mic_room_noisy': noisy_stems,

        'label_produced': produced_labels,
        'path_produced': produced_paths,

        'label_voicefixer': voicefixer_labels,
        'path_voicefixer': voicefixer_paths,
        'mic_room_voicefixer': voicefixer_stems
        }


# debugging aid: uncomment to inspect the collected entries
#pprint.pprint(data, depth=2)


## Plot 3 corresponding files from all data (noisy, produced, voicefixer)

In [None]:
# Pick one random produced recording and look up the first noisy and the
# first voicefixer recording that belong to it, then plot all three waveforms.
if not data['label_produced']:
    raise ValueError("no produced .wav files were found - check config['fps_produced']")

rand = np.random.randint(len(data['label_produced']))
label1 = data['label_produced'][rand]
#print(label1)

# corresponding = [produced path, noisy path, voicefixer path]
corresponding = [data["path_produced"][rand]]
# mic = [noisy file stem, voicefixer file stem]
mic = []

# first noisy file whose name contains the speaker/script label
noisy_idx = next((idx for idx, name in enumerate(data["mic_room_noisy"])
                  if label1 in name), None)
if noisy_idx is None:
    # without this check the lookup of mic[0] below fails with a bare IndexError
    raise LookupError(f"no noisy recording found for label '{label1}'")
corresponding.append(data["path_noisy"][noisy_idx])
mic.append(data['mic_room_noisy'][noisy_idx])

# first voicefixer file whose name contains the name of that noisy file
voicefixer_idx = next((idx for idx, name in enumerate(data['mic_room_voicefixer'])
                       if mic[0] in name), None)
if voicefixer_idx is None:
    raise LookupError(f"no voicefixer recording found for '{mic[0]}'")
corresponding.append(data["path_voicefixer"][voicefixer_idx])
mic.append(data['mic_room_voicefixer'][voicefixer_idx])


pprint.pprint(corresponding, depth=2)


# Plot corresponding audiofiles
# load files from corresponding list with filepaths
audio1, _ = librosa.load(corresponding[0], sr=config['sr'])
audio2, _ = librosa.load(corresponding[1], sr=config['sr'])
audio3, _ = librosa.load(corresponding[2], sr=config['sr'])

# Only plot segment of speech files
time_in_sec = 20
seg = int(time_in_sec * config['sr'])

# time vector in seconds (sample index divided by the sample rate)
t = np.arange(seg) / config['sr']

# setup subplot
fig, axs = plt.subplots(3, 1, figsize=(8, 12))
for axis, waveform, wav_path in zip(axs, (audio1, audio2, audio3), corresponding):
    # A recording can be shorter than `seg` samples; cut the time axis to
    # the same length so that x and y always match for plotting.
    segment = waveform[:seg]
    axis.plot(t[:len(segment)], segment, c='k')
    axis.set_title(Path(wav_path).parts[-1])
axs[2].set_xlabel('Time')

plt.show()


In [None]:
# Play back an excerpt of each of the three corresponding recordings.
# The excerpt start (offset) and length (duration) come from the config.
excerpt_args = dict(sr=config['sr'],
                    duration=config['sample_length'],
                    offset=config['offset'])
for fp in corresponding:
    print(fp)
    audio, sr = librosa.core.load(fp, **excerpt_args)
    player = ipd.Audio(audio, rate=config['sr'])
    ipd.display(player)

# Plot spectrograms of the 3 corresponding audio files

In [None]:
## Calculating spectrograms

# One row per corresponding recording; left column shows the STFT magnitude,
# right column the mel spectrogram (both in dB relative to their maximum).
nrows, ncols = 3, 2
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(14, 10))

for i, fp in enumerate(corresponding):
    audio, sr = librosa.load(fp, sr=config['sr'], duration=config['sample_length'], offset=config['offset'])

    # calculate stft
    stft = librosa.stft(audio, n_fft=config['n_fft'], hop_length=config['hop_length'], win_length=config['win_length'])

    # calculate melspec
    # sr must be passed explicitly: melspectrogram otherwise falls back to its
    # default of 22050 Hz and builds the mel filter bank for the wrong rate.
    melspec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=config['n_fft'], hop_length=config['hop_length'], win_length=config['win_length'], n_mels=config['n_mels'], fmax=int(config['sr']/2))
    # melspectrogram returns a power spectrogram by default (power=2.0),
    # so the dB conversion has to be power_to_db, not amplitude_to_db.
    melspec = librosa.power_to_db(melspec, ref=np.max)

    # calculate magnitude and scale to dB
    magspec = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

    # plot with librosa
    librosa.display.specshow(magspec, x_axis='time', y_axis='linear', sr=sr, hop_length=config['hop_length'], ax=ax[i][0])
    librosa.display.specshow(melspec, x_axis='time', y_axis='mel', sr=sr, hop_length=config['hop_length'], ax=ax[i][1])

    # adjustments
    # Label the right-hand plot with the plain file name; passing the tuple
    # `parts[-1:]` would be rendered as "('name.wav',)".
    ax[i][1].set_ylabel(Path(fp).name, rotation=270, labelpad=20)
    ax[i][1].yaxis.set_label_position("right")

    # settings for all axes but the bottom ones
    if i != len(corresponding) - 1:
        ax[i][0].set_xticks([])
        ax[i][1].set_xticks([])
        ax[i][0].set_xlabel('')
        ax[i][1].set_xlabel('')

    # settings for the upper axes
    if i == 0:
        ax[i][0].set_title('STFT')
        ax[i][1].set_title('Mel Spectrogram')

# adjust whitespace in between subplots
plt.subplots_adjust(hspace=0.3, wspace=0.15)

# The figures below refer to the last recording processed in the loop.
print('Melspec shape: %s' % (str(melspec.shape)))
print('Stft shape: %s' % (str(stft.shape)))
print(f'Total data points in mel-spectrogram: {melspec.shape[0]*melspec.shape[1]}')
print(f'Total data points in stft-spectrogram: {stft.shape[0]*stft.shape[1]}')
print(f'-> Data Reduction by factor: {(stft.shape[0]*stft.shape[1]) / (melspec.shape[0]*melspec.shape[1])}')
print()

# Plot random WAV files from the dataset (voicefixer files)

In [None]:
# Collect the file paths stored under the selected keys of the data dict
# and flatten them into one list.
keys = ['path_voicefixer']
fps = list(np.concatenate([data.get(key) for key in keys]))
#pprint.pprint(fps, depth=2)

# file paths of the randomly drawn recordings, in plotting order
fps_random = []

# 2 x 2 grid of waveform plots
nrows, ncols = 2, 2
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 6))

# draw one random file per grid cell, row by row
for r, c in np.ndindex(nrows, ncols):
    panel = ax[r][c]
    fp_random = fps[np.random.randint(len(fps))]
    audio, sr = librosa.core.load(fp_random, sr=config['sr'], duration=config['sample_length'])
    panel.plot(audio, c='k')
    # panel.axis('off')
    panel.set_title(Path(fp_random).parts[-1])
    # the top row gets no x ticks
    if r == 0:
        panel.set_xticks([])
    # remember the drawn file path
    fps_random.append(fp_random)

In [None]:
# Play back an excerpt of every randomly drawn recording.
# The excerpt start (offset) and length (duration) come from the config.
playback_args = dict(sr=config['sr'],
                     duration=config['sample_length'],
                     offset=config['offset'])
for fp in fps_random:
    print(fp)
    audio, sr = librosa.core.load(fp, **playback_args)
    widget = ipd.Audio(audio, rate=config['sr'])
    ipd.display(widget)