# Data preparation

Main dataset: 
Cantonese youtube videos are downloaded and converted to WAV file as the clean audio data

Noise dataset: 
A folder of background noise WAV file from [Microsoft Scalable Noisy Speech Dataset (MS-SNSD)](https://github.com/microsoft/MS-SNSD) with white noise and recordings of machinery and everyday household activities are used in this analysis to add noises on the cantonese audio. The loudness of the background noise are used in 4 levels, with 10 the largest and 40 the smallest. 


In [9]:
# import libraries
import os
from os import path
from pydub import AudioSegment
from pathlib import Path
import pandas as pd
import soundfile as sf
import os
import numpy as np
import glob
import sounddevice as sd
import scipy.signal as sps

## Convert audio files to wav. format

In [162]:
cwd = os.getcwd()
cwd

'/Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2'

In [163]:
os. chdir('/Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2/clean_train') 

In [136]:
# Set clean audio directory
clean_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'clean_train')
print('clean_dir', clean_dir)

clean_dir /Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2/clean_train/clean_train


In [152]:
# read clean audio
clean_audio = pd.DataFrame({
    "recording_id": [path.stem for path in Path(clean_dir).glob("*.mov")],
})

In [164]:
# Convert files to wav. format
for file in clean_audio['recording_id']:
    # files
    mov_file = str(file + '.mov')
    output = str(file + '.wav')
    # convert mov to wav                                                            
    sound = AudioSegment.from_file(mov_file, format="mov")
    sound = sound.set_channels(1)
    sound.export(output, format="wav")

## Create noisy audio files

In [2]:
# change currnet directory  
os. chdir('/Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2') 

In [3]:
# Function to read audio
def audioread(path, norm = True, start=0, stop=None):
    path = os.path.abspath(path)
    if not os.path.exists(path):
        raise ValueError("[{}] does not exist!".format(path))
    try:
        x, sr = sf.read(path, start=start, stop=stop)
    except RuntimeError:  # fix for sph pcm-embedded shortened v2
        print('WARNING: Audio type not supported')

    if len(x.shape) == 1:  # mono
        if norm:
            rms = (x ** 2).mean() ** 0.5
            scalar = 10 ** (-25 / 20) / (rms)
            x = x * scalar
        return x, sr
    else:  # multi-channel
        x = x.T
        x = x.sum(axis=0)/x.shape[0]
        if norm:
            rms = (x ** 2).mean() ** 0.5
            scalar = 10 ** (-25 / 20) / (rms)
            x = x * scalar
        return x, sr
    
# Funtion to write audio    
def audiowrite(data, fs, destpath, norm=False):
    if norm:
        rms = (data ** 2).mean() ** 0.5
        scalar = 10 ** (-25 / 10) / (rms+eps)
        data = data * scalar
        if max(abs(data))>=1:
            data = data/max(abs(data), eps)
    
    destpath = os.path.abspath(destpath)
    destdir = os.path.dirname(destpath)
    
    if not os.path.exists(destdir):
        os.makedirs(destdir)
    
    sf.write(destpath, data, fs)
    return

# Function to mix clean speech and noise at various signal to noise ratio (SNR) levels
def snr_mixer(clean, noise, snr):
    # Normalizing to -25 dB FS
    rmsclean = (clean**2).mean()**0.5
    scalarclean = 10 ** (-25 / 20) / rmsclean
    clean = clean * scalarclean
    
    rmsnoise = (noise**2).mean()**0.5
    scalarnoise = 10 ** (-25 / 20) /rmsnoise
    noise = noise * scalarnoise
   
    # Set the noise level for a given SNR
    noisescalar = np.sqrt(rmsclean / (10**(snr/20)) / rmsnoise)
    noisenewlevel = noise * noisescalar
    noisyspeech = clean + noisenewlevel
    return clean, noisenewlevel, noisyspeech


In [4]:
# Configuration for generating Noisy Speech Dataset

# - sampling_rate: Specify the sampling rate. Default is 16 kHz
# - audioformat: default is .wav
# - audio_length: Max Length of each audio clip (noisy and clean speech) in seconds that will be generated by augmenting utterances. 
# - silence_length: Duration of silence introduced between clean speech utterances.
# - total_hours: Total number of hours of data required. Units are in hours. 
# - snr_lower: Lower bound for SNR required (default: 0 dB)
# - snr_upper: Upper bound for SNR required (default: 40 dB)
# - total_snrlevels: Number of SNR levels required (default: 5, which means there are 5 levels between snr_lower and snr_upper)
# - noise_dir: Default is None. But specify the noise directory path if noise files are not in the source directory
# - Speech_dir: Default is None. But specify the speech directory path if speech files are not in the source directory
# - noise_types_excluded: Noise files starting with the following tags to be excluded in the noise list. Example: noise_types_excluded: Babble, AirConditioner
#                         Specify 'None' if no noise files to be excluded.

cfg={
'sampling_rate': 44100,
'audioformat': '*.wav',
'audio_length': 60,
'silence_length': 0.2,
'total_hours': 1 ,
'snr_lower': 0,
'snr_upper': 40,
'total_snrlevels': 5,  

'noise_dir': None,
'speech_dir': None,
'noise_types_excluded': None
}

In [5]:
# Path to directories
clean_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'clean_train')
print('clean_dir', clean_dir)
noise_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'noise_train')
print('noise_dir', noise_dir)

clean_dir /Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2/clean_train
noise_dir /Users/michliu/Documents/HSBC - Data scientist/audio/cantonese2/noise_train


In [6]:
# Set up configurations 
snr_lower = int(cfg['snr_lower'])
snr_upper = int(cfg['snr_upper'])
total_snrlevels = int(cfg['total_snrlevels'])
fs = float(cfg["sampling_rate"])
audioformat = cfg["audioformat"]
total_hours = float(cfg["total_hours"])
audio_length = float(cfg["audio_length"])
silence_length = float(cfg["silence_length"])

# Create folders for output files
noisyspeech_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'NoisySpeech_training')
if not os.path.exists(noisyspeech_dir):
    os.makedirs(noisyspeech_dir)
clean_proc_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'CleanSpeech_training')
if not os.path.exists(clean_proc_dir):
    os.makedirs(clean_proc_dir)
noise_proc_dir = os.path.join(os.path.dirname(os.path.abspath("__file__")), 'Noise_training')
if not os.path.exists(noise_proc_dir):
    os.makedirs(noise_proc_dir)

In [7]:
total_secs = total_hours*60*60
total_samples = int(total_secs * fs)
audio_length = int(audio_length*fs)
SNR = np.linspace(snr_lower, snr_upper, total_snrlevels)
cleanfilenames = glob.glob(os.path.join(clean_dir, audioformat))
if cfg["noise_types_excluded"]==None:
    noisefilenames = glob.glob(os.path.join(noise_dir, audioformat))
else:
    filestoexclude = cfg["noise_types_excluded"].split(',')
    noisefilenames = glob.glob(os.path.join(noise_dir, audioformat))
    for i in range(len(filestoexclude)):
        noisefilenames = [fn for fn in noisefilenames if not os.path.basename(fn).startswith(filestoexclude[i])]
    

In [14]:
# main

filecounter = 0
num_samples = 0
    
while num_samples < total_samples:
    idx_s = np.random.randint(0, np.size(cleanfilenames))
    clean, fs = audioread(cleanfilenames[idx_s])
    
    if len(clean)<audio_length:
        clean = clean
       
        
    else:
            
        while len(clean)<=audio_length:
            idx_s = idx_s + 1
            if idx_s >= np.size(cleanfilenames)-1:
                idx_s = np.random.randint(0, np.size(cleanfilenames)) 
            newclean, fs = audioread(cleanfilenames[idx_s])
            cleanconcat = np.append(clean, np.zeros(int(fs*silence_length)))
            clean = np.append(cleanconcat, newclean)


    idx_n = np.random.randint(0, np.size(noisefilenames))
    noise, fs2 = audioread(noisefilenames[idx_n])
    noise_file_name = noisefilenames[idx_n].split("/")[-1].split('.')[0]
    print(noise_file_name)
    idx2 = idx_n
    
    # Resample data
    new_rate = 44100
    number_of_samples = round(len(noise) * float(new_rate) / fs2)
    noise = sps.resample(noise, number_of_samples)
                
    if len(noise)>=len(clean):
        noise = noise[0:len(clean)]
        
        
    else:
        
        while len(noise)<=len(clean):
            idx_n = idx_n + 1
            if idx_n >= np.size(noisefilenames)-1:
                idx_n = np.random.randint(0, np.size(noisefilenames))
            newnoise, fs2 = audioread(noisefilenames[idx_n])
            idx2 = idx_n
             # Resample data
            new_rate = 44100
            number_of_samples = round(len(noise) * float(new_rate) / fs2)
            noise = sps.resample(noise, number_of_samples)
            
            noiseconcat = np.append(noise, np.zeros(int(fs*silence_length)))
            noise = np.append(noiseconcat, newnoise)

    noise = noise[0:len(clean)]
    filecounter = filecounter + 1

    for i in range(np.size(SNR)):
        clean_snr, noise_snr, noisy_snr = snr_mixer(clean=clean, noise=noise, snr=SNR[i])
        noisyfilename = 'noisy_'+(cleanfilenames[idx_s].split("/")[-1].split('.')[0])+'_SNRdb_'+str(int(SNR[i]))+ "_"+noise_file_name +'.wav'
        cleanfilename = 'clnsp_'+(cleanfilenames[idx_s].split("/")[-1].split('.')[0])+'.wav'
        noisefilename = 'noisy_'+(cleanfilenames[idx_s].split("/")[-1].split('.')[0])+'_SNRdb_'+str(int(SNR[i]))+ "_"+ noise_file_name +'.wav'
        noisypath = os.path.join(noisyspeech_dir, noisyfilename)
        cleanpath = os.path.join(clean_proc_dir, cleanfilename)
        noisepath = os.path.join(noise_proc_dir, noisefilename)
        print(noisyfilename )
        audiowrite(noisy_snr, fs, noisypath, norm=False)
        audiowrite(clean, fs, cleanpath, norm=False)
        audiowrite(noise_snr, fs, noisepath, norm=False)
        num_samples = num_samples + len(noisy_snr)
        
        #sd.play(clean, fs)
        #status = sd.wait() 

CopyMachine_2
noisy_Cantonese5_SNRdb_0_CopyMachine_2.wav
noisy_Cantonese5_SNRdb_10_CopyMachine_2.wav
noisy_Cantonese5_SNRdb_20_CopyMachine_2.wav
noisy_Cantonese5_SNRdb_30_CopyMachine_2.wav
noisy_Cantonese5_SNRdb_40_CopyMachine_2.wav
NeighborSpeaking_1
noisy_Cantonese1_SNRdb_0_NeighborSpeaking_1.wav
noisy_Cantonese1_SNRdb_10_NeighborSpeaking_1.wav
noisy_Cantonese1_SNRdb_20_NeighborSpeaking_1.wav
noisy_Cantonese1_SNRdb_30_NeighborSpeaking_1.wav
noisy_Cantonese1_SNRdb_40_NeighborSpeaking_1.wav
Field_1
noisy_Cantonese1_SNRdb_0_Field_1.wav
noisy_Cantonese1_SNRdb_10_Field_1.wav
noisy_Cantonese1_SNRdb_20_Field_1.wav
noisy_Cantonese1_SNRdb_30_Field_1.wav
noisy_Cantonese1_SNRdb_40_Field_1.wav
Restaurant_1
noisy_Cantonese4_SNRdb_0_Restaurant_1.wav
noisy_Cantonese4_SNRdb_10_Restaurant_1.wav
noisy_Cantonese4_SNRdb_20_Restaurant_1.wav
noisy_Cantonese4_SNRdb_30_Restaurant_1.wav
noisy_Cantonese4_SNRdb_40_Restaurant_1.wav
Metro_1
noisy_Cantonese7_SNRdb_0_Metro_1.wav
noisy_Cantonese7_SNRdb_10_Metro_1.w