In [None]:
%load_ext autoreload
%autoreload 2



import librosa
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from tqdm import tqdm
from preprocess import *
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt


In [None]:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import math
import os.path
import random
import re
import sys
import tarfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import compat

# If it's available, load the specialized feature generator. If this doesn't
# work, try building with bazel instead of running the Python script directly.
try:
    from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
except ImportError:
    frontend_op = None

### properties for data extraction

In [None]:
# Reserved class labels together with their fixed indices.
SILENCE_LABEL, SILENCE_INDEX = '_silence_', 0
UNKNOWN_WORD_LABEL, UNKNOWN_WORD_INDEX = '_unknown_', 1
# Name of the sub-directory (inside the speech data folder) holding noise clips.
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
RANDOM_SEED = 59185

silence_percentage = 10
unknown_percentage = 10

# Audio framing: the values below are converted from milliseconds to sample
# counts using sample_rate.
sample_rate = 16000
clip_duration_ms = 1000
window_size_ms = 30
window_stride_ms = 10
# Used as the number of MFCC coefficients per frame.
feature_bin_count = 40
time_shift_ms = 100.0

# Mixing levels: the background gain is drawn from [0, background_volume_range),
# the foreground clip is scaled by foreground_volume.
background_volume_range = 0.1
foreground_volume = 0.9

In [None]:
# Input data directories, one per data set.
SPEECH_DATA_PATH = "./speech/data/"
URBAN_NOISE_DATA_PATH = "./urban/data/"
COMBINED_DATA_PATH = "./combined/data/"

# Output directories for the saved .npy arrays; all three point at the same folder.
SPEECH_NPY_PATH = URBAN_NOISE_NPY_PATH = COMBINED_NPY_PATH = "./data_aug/npy/"

In [None]:
# Clip length and analysis-window sizes converted from milliseconds to samples.
desired_samples = int(sample_rate * clip_duration_ms / 1000)
window_size_samples = int(sample_rate * window_size_ms / 1000)
window_stride_samples = int(sample_rate * window_stride_ms / 1000)
length_minus_window = (desired_samples - window_size_samples)
fingerprint_width = feature_bin_count

# Draw ONE random time shift (in samples). It is drawn once here, so every clip
# processed afterwards is shifted by the same amount.
time_shift_samples = int((time_shift_ms * sample_rate) / 1000)
if time_shift_samples > 0:
    time_shift_amount = np.random.randint(-time_shift_samples, time_shift_samples)
else:
    # np.random.randint(0, 0) raises ValueError, so a disabled (zero) shift
    # has to be handled explicitly.
    time_shift_amount = 0

if time_shift_amount > 0:
    # Positive shift: pad zeros in front and keep the start of the padded clip.
    time_shift_padding = [[time_shift_amount, 0], [0, 0]]
    time_shift_offset = [0, 0]
else:
    # Zero/negative shift: pad zeros at the end and skip the first samples.
    time_shift_padding = [[0, -time_shift_amount], [0, 0]]
    time_shift_offset = [-time_shift_amount, 0]

In [None]:
# Show every derived framing / time-shift value, one per line.
print(
    desired_samples,
    window_size_samples,
    window_stride_samples,
    length_minus_window,
    fingerprint_width,
    time_shift_samples,
    time_shift_amount,
    time_shift_padding,
    time_shift_offset,
    sep='\n',
)

In [None]:
# No background clips loaded yet; the list is filled by prepare_background_data() below.
background_data = list()
# Folder inside the speech data set that holds the background-noise recordings.
background_dir = os.path.join(SPEECH_DATA_PATH, BACKGROUND_NOISE_DIR_NAME)

### preparing background data

In [None]:
def prepare_background_data():
    """Load every background-noise WAV file into memory.

    Globs ``*.wav`` inside ``SPEECH_DATA_PATH/BACKGROUND_NOISE_DIR_NAME`` and
    decodes each match as a single-channel clip using a dedicated TensorFlow
    graph and session.

    Returns:
        list: one flattened sample array per file, in the order produced by
        ``gfile.Glob``; empty if the glob yields no paths.
    """
    background_data = []
    background_dir = os.path.join(SPEECH_DATA_PATH, BACKGROUND_NOISE_DIR_NAME)
    search_path = os.path.join(background_dir, '*.wav')
    # The session owns a fresh graph, so the decoding ops below do not end up
    # in the graph used by the rest of the notebook.
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        for wav_path in gfile.Glob(search_path):
            wav_data = sess.run(
                wav_decoder,
                feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
            background_data.append(wav_data)
    return background_data

In [None]:
# Load all background-noise recordings and report how many were found.
background_data = prepare_background_data()
print(len(background_data))

In [None]:
# `background_data` is rebound to a single clip at the end of this cell. Keep
# the list of full recordings under its own name so that re-running the cell
# does not index into the previously selected clip.
if isinstance(background_data, list):
    background_recordings = background_data
if not background_recordings:
    raise ValueError('No background recordings are loaded; cannot pick a noise clip.')

# Pick one recording at random, then a random window of desired_samples from it.
background_index = np.random.randint(len(background_recordings))
background_samples = background_recordings[background_index]
if len(background_samples) <= desired_samples:
    raise ValueError(
        'Background recording %d is too short: need more than %d samples, found %d.'
        % (background_index, desired_samples, len(background_samples)))
background_offset = np.random.randint(0, len(background_samples) - desired_samples)
background_clipped = background_samples[background_offset:(background_offset + desired_samples)]
background_reshaped = background_clipped.reshape([desired_samples, 1])
# Gain applied to the noise clip when it is mixed into the foreground audio.
background_volume = np.random.uniform(0, background_volume_range)
# prepare_mfcc() feeds the module-level `background_data`, so it must end up
# holding the single (desired_samples, 1) clip.
background_data = background_reshaped


In [None]:
# Show the chosen recording, the window cut from it and the drawn noise gain.
print(
    background_index,
    background_samples,
    background_offset,
    background_clipped,
    background_reshaped,
    background_volume,
    sep='\n',
)

In [None]:
# Example clip used below to try out prepare_mfcc(), which loads a WAVE file,
# decodes it, scales the volume, shifts it in time, adds in background noise,
# calculates a spectrogram, and then builds an MFCC fingerprint from that.
# os.path.join keeps the path valid even if SPEECH_DATA_PATH has no trailing slash.
wav_filename = os.path.join(SPEECH_DATA_PATH, 'happy/27c30960_nohash_0.wav')




### function to add augmentation and extract mfcc features

In [None]:
# Does data augmentation with a time shift and background sound mixed in.
# Use this instead of the wav2mfcc function.

# Cache of already-built pipelines, keyed by (graph, framing parameters).
# Building the ops once and reusing them keeps the graph from growing with
# every processed file.
_MFCC_PIPELINES = {}


def _build_mfcc_pipeline():
    """Create the augmentation + MFCC ops in the current default graph.

    Returns:
        tuple: ``(placeholders, mfcc_output)`` where ``placeholders`` maps a
        short name to the corresponding ``tf.placeholder``.
    """
    placeholders = {
        'wav_filename': tf.placeholder(tf.string, [], name='wav_filename'),
        'foreground_volume': tf.placeholder(tf.float32, [], name='foreground_volume'),
        'time_shift_padding': tf.placeholder(tf.int32, [2, 2], name='time_shift_padding'),
        'time_shift_offset': tf.placeholder(tf.int32, [2], name='time_shift_offset'),
        'background_data': tf.placeholder(tf.float32, [desired_samples, 1], name='background_data'),
        'background_volume': tf.placeholder(tf.float32, [], name='background_volume'),
    }

    wav_loader = io_ops.read_file(placeholders['wav_filename'])
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    scaled_foreground = tf.multiply(wav_decoder.audio, placeholders['foreground_volume'])
    # Shift the sample's start position, and pad any gaps with zeros.
    padded_foreground = tf.pad(
        scaled_foreground, placeholders['time_shift_padding'], mode='CONSTANT')
    sliced_foreground = tf.slice(
        padded_foreground, placeholders['time_shift_offset'], [desired_samples, -1])
    # Mix in background noise and clamp the sum to [-1.0, 1.0].
    background_mul = tf.multiply(
        placeholders['background_data'], placeholders['background_volume'])
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    tf.summary.image('spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
    mfcc_output = contrib_audio.mfcc(
        spectrogram, wav_decoder.sample_rate, dct_coefficient_count=fingerprint_width)
    tf.summary.image('mfcc', tf.expand_dims(mfcc_output, -1), max_outputs=1)
    return placeholders, mfcc_output


def prepare_mfcc(wav_filename):
    """Return the flattened MFCC fingerprint of one augmented WAV clip.

    The clip is decoded to ``desired_samples`` samples, scaled by
    ``foreground_volume``, shifted according to ``time_shift_padding`` /
    ``time_shift_offset``, mixed with ``background_data * background_volume``,
    clamped to [-1, 1] and converted to a spectrogram and then to MFCCs.

    All augmentation values are read from module-level variables, so every call
    uses the same shift and the same background clip until those variables are
    reassigned. The function also relies on a module-level ``sess`` (an open
    ``tf.Session``); call it inside ``with tf.Session() as sess:``.

    Args:
        wav_filename: path of the WAV file to load.

    Returns:
        1-D numpy array holding the flattened MFCC output.
    """
    graph = sess.graph
    # The framing parameters are baked into the ops, so they are part of the key:
    # changing them in the notebook yields a freshly built pipeline.
    cache_key = (graph, desired_samples, window_size_samples,
                 window_stride_samples, fingerprint_width)
    pipeline = _MFCC_PIPELINES.get(cache_key)
    if pipeline is None:
        with graph.as_default():
            pipeline = _build_mfcc_pipeline()
        _MFCC_PIPELINES[cache_key] = pipeline
    placeholders, mfcc_output = pipeline

    input_dict = {
        placeholders['wav_filename']: wav_filename,
        placeholders['time_shift_padding']: time_shift_padding,
        placeholders['time_shift_offset']: time_shift_offset,
        placeholders['background_data']: background_data,
        placeholders['background_volume']: background_volume,
        placeholders['foreground_volume']: foreground_volume,
    }
    return sess.run(mfcc_output, feed_dict=input_dict).flatten()

In [None]:
# prepare_mfcc() looks up the module-level name `sess`, so the session has to
# be bound to exactly that name here.
with tf.Session() as sess:
    data = prepare_mfcc(wav_filename)

In [None]:
# The fingerprint comes back flat: show its shape, then fold it into
# (spectrogram_length, feature_bin_count) and show the result.
print(data.shape)
spectrogram_length = int(length_minus_window / window_stride_samples) + 1
data = data.reshape((spectrogram_length, feature_bin_count))
print(data.shape)
print(data)

### prepare and save features of the augmented data

In [None]:
# Extracts features for the speech data and saves them per label, split into
# test / validation / train arrays.
# Make sure the background_noise folder is not in the path.
def save_data_speech(path=SPEECH_DATA_PATH, testFile=testFile, valFile=valFile, max_len=11, savepath=SPEECH_NPY_PATH):
    """Compute MFCC features for every speech clip and save them per label.

    For each label returned by ``get_labels(path)`` all files in
    ``path/label`` are run through ``prepare_mfcc`` and written to
    ``<savepath><label>_test.npy``, ``<savepath><label>_val.npy`` and
    ``<savepath><label>_train.npy``.

    Args:
        path: root folder with one sub-folder per label (expected to end with '/').
        testFile: text file listing test clips, one ``label/filename`` per line.
        valFile: text file listing validation clips, same format.
        max_len: unused; kept so existing calls keep working.
        savepath: output folder prefix (expected to end with '/'); created if missing.
    """
    # Read both split lists into sets: handles are closed right away and the
    # per-clip membership checks below are O(1) instead of scanning a list.
    with open(testFile, "r") as test_file:
        testFilesList = set(test_file.read().split('\n'))
    with open(valFile, "r") as val_file:
        valFilesList = set(val_file.read().split('\n'))

    # Create the output folder up front so a missing directory does not abort
    # the run only after a whole label has been processed.
    if savepath and not os.path.isdir(savepath):
        os.makedirs(savepath)

    labels, _, _ = get_labels(path)
    print(labels)
    for label in labels:
        mfcc_train = []
        mfcc_test = []
        mfcc_val = []
        # Pair each clip's full path with its 'label/name' form, which is the
        # format used in the test and validation lists.
        wavfiles = [(path + label + '/' + wavfile, label + '/' + wavfile)
                    for wavfile in os.listdir(path + '/' + label)]

        for wav_path, list_entry in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label)):
            mfcc = prepare_mfcc(wav_path)
            if list_entry in testFilesList:
                mfcc_test.append(mfcc)
            elif list_entry in valFilesList:
                mfcc_val.append(mfcc)
            else:
                mfcc_train.append(mfcc)

        np.save(savepath + label + '_test.npy', mfcc_test)
        np.save(savepath + label + '_val.npy', mfcc_val)
        np.save(savepath + label + '_train.npy', mfcc_train)

In [None]:
# Just saves the urban noise data as npy files.
# The data is split into test, train and val after loading the data and the
# labels, by using a test_train split function.
def save_urbanNoise_data(path=URBAN_NOISE_DATA_PATH, max_len=11, savePath=URBAN_NOISE_NPY_PATH):
    """Compute MFCC features for every urban-noise clip and save one array per label.

    For each label returned by ``get_labels(path)`` all files in ``path/label``
    are run through ``prepare_mfcc``; the results are written to
    ``<savePath><label>.npy``. Files that fail to process are reported and
    skipped.

    Args:
        path: root folder with one sub-folder per label (expected to end with '/').
        max_len: unused; kept so existing calls keep working.
        savePath: output folder prefix (expected to end with '/'); created if missing.
    """
    if savePath and not os.path.isdir(savePath):
        os.makedirs(savePath)

    labels, _, _ = get_labels(path)
    for label in labels:
        mfccs = []
        print(label)

        wavfiles = [path + label + '/' + wavfile for wavfile in os.listdir(path + '/' + label)]

        for wavfile in tqdm(wavfiles, "saving vectors of label - '{}'".format(label)):
            try:
                # prepare_mfcc() accepts only the file name; passing max_len made
                # every call raise TypeError, which was then silently swallowed.
                mfcc = prepare_mfcc(wavfile)
            except Exception as error:
                # Best effort: report the unreadable clip with the reason and go on.
                print("Skipping {}: {}".format(wavfile, error))
                continue
            mfccs.append(mfcc)

        np.save(savePath + label + '.npy', mfccs)

In [None]:
# Speech data: extract features and write the per-label test/val/train .npy files.
# The session is bound to `sess` because prepare_mfcc() reads that module-level name.
with tf.Session() as sess:
    save_data_speech()

In [None]:
# Urban noise data: extract features and write one .npy file per label.
# The session is bound to `sess` because prepare_mfcc() reads that module-level name.
with tf.Session() as sess:
    save_urbanNoise_data()