In [None]:
import sys
## save variables
import pickle
## folder names
from glob import glob
## wav import
from scipy.io import wavfile 
## standard libraries
import numpy as np
## MFCC
#!{sys.executable} -m pip install msgpack --user
#!{sys.executable} -m pip install python_speech_features --user

from python_speech_features import mfcc
from python_speech_features import logfbank
from python_speech_features import delta

import random as rnd
import os.path
import tarfile

from six.moves import urllib

import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib notebook
from scipy import signal
from scipy.io import wavfile


#!{sys.executable} -m pip install opencv-python --user
#!{sys.executable} -m pip install opencv-contrib-python --user
#import cv2
#garbage collector
import gc
#OS detection
import platform

#!{sys.executable} -m pip install librosa --user
import librosa

In [None]:
#shift signal of #value samples padding with zero to return the same dimension
def shiftVec(signal, value):
    """Shift a 1-D array by ``value`` samples while keeping its length.

    A positive ``value`` moves the content towards the end (zeros enter at
    the start); a negative ``value`` moves it towards the start (zeros enter
    at the end).

    Args:
        signal: 1-D numpy array to shift.
        value: signed number of samples to shift by.

    Returns:
        Array with the same length as ``signal``.
    """
    n_samples = signal.shape[0]
    margin = abs(value)
    # Zero-pad both ends by |value| so any shifted window stays in bounds.
    padded = np.pad(signal, (margin, margin), 'constant', constant_values=0)
    first = margin - value
    return padded[first:first + n_samples]
#return a random noise of nSample
def noiseSelector(noise, nSample):
    """Pick a random excerpt of ``nSample`` samples from a random noise clip.

    Args:
        noise: dict mapping a clip name to a 1-D numpy array.
        nSample: length of the excerpt to return.

    Returns:
        A contiguous slice of ``nSample`` samples taken from one of the clips.
    """
    # First draw: which clip; second draw: where the excerpt starts.
    clip_index = rnd.randint(0, len(noise) - 1)
    clip = noise[list(noise.keys())[clip_index]]
    offset = rnd.randint(0, clip.shape[0] - nSample - 1)
    return clip[offset:offset + nSample]

## Return the word between two string starting from left
def find_between( s, first, last ):
    """Return the text of ``s`` between the first ``first`` and the next ``last``.

    The search for ``last`` starts right after the end of ``first``. An empty
    string is returned when either delimiter cannot be found.
    """
    begin = s.find(first)
    if begin == -1:
        return ""
    begin += len(first)
    finish = s.find(last, begin)
    if finish == -1:
        return ""
    return s[begin:finish]

# stretching the sound
def stretch(data, rate=1):
    """Time-stretch ``data`` and return a result with the original length.

    The signal is stretched with ``librosa.effects.time_stretch``. A result
    longer than the input is cropped around its centre; a shorter one is
    zero-padded at the end.

    Args:
        data: 1-D floating point numpy array.
        rate: stretch factor forwarded to librosa.

    Returns:
        1-D array with exactly ``data.shape[0]`` samples.
    """
    input_length = data.shape[0]
    # ``rate`` is passed by keyword: recent librosa releases only accept it
    # that way, and the keyword form also works with older releases.
    data = librosa.effects.time_stretch(data, rate=rate)
    surplus = len(data) - input_length
    if surplus > 0:
        # Derive both crop bounds from one start index so the result always
        # has exactly ``input_length`` samples (two independent round() calls
        # can disagree by one sample on .5 values).
        start = surplus // 2
        data = data[start:start + input_length]
    else:
        data = np.pad(data, (0, -surplus), "constant", constant_values=0)

    return data

In [None]:
# Make sure the directory used for the pickled intermediate results exists.
dest_directory = 'variables/'
os.makedirs(dest_directory, exist_ok=True)

# URL the dataset archive is downloaded from.
data_url = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'

# Make sure the dataset directory exists.
dest_directory = 'dataset/'
os.makedirs(dest_directory, exist_ok=True)

# The archive keeps the name of the last URL component.
filename = data_url.split('/')[-1]
filepath = os.path.join(dest_directory, filename)

# Download and extract only when the archive is not already on disk.
if not os.path.exists(filepath):
    def progress(count, block_size, total_size):
        """Report hook for urlretrieve: print the download percentage."""
        sys.stdout.write(
            '\r>> Downloading %s %.1f%%' %
            (filename, float(count * block_size) / float(total_size) * 100.0))
        sys.stdout.flush()
    try:
        filepath, _ = urllib.request.urlretrieve(data_url, filepath, progress)
    except Exception as error:
        print('Download of', data_url, 'failed:', error)
        # A partial archive may be left behind; remove it so the next run
        # does not mistake it for a complete download and skip extraction.
        if os.path.exists(filepath):
            os.remove(filepath)
        raise

    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use it with archives from a trusted source.
    with tarfile.open(filepath, 'r:gz') as archive:
        archive.extractall(dest_directory)

In [None]:
# Words treated as "core" commands, and the sample rate assumed for every clip.
coreKey = [
    "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go",
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
]
sampleRate = 16000  # [Hz]

In [None]:
import hashlib
import re
def which_set(filename, validation_percentage, testing_percentage, totClass):
    """Assign a file to the training, validation or testing partition.

    The decision depends only on the file name and the requested proportions,
    so a file keeps its partition when other files are added later. Anything
    from '_nohash_' onwards in the base name is ignored, which keeps
    'bobby_nohash_0.wav' and 'bobby_nohash_1.wav' in the same partition.

    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.
        totClass: Modulus base used to map the name hash onto a percentage;
            the hash is reduced modulo ``totClass + 1`` and scaled by
            ``100 / totClass``.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    # Drop the directory and the '_nohash_...' suffix so that close variations
    # of the same recording hash to the same value.
    group_name = re.sub(r'_nohash_.*$', '', os.path.basename(filename))
    # A SHA-1 of the name gives a stable pseudo-random number for the group.
    digest = hashlib.sha1(group_name.encode('utf-8')).hexdigest()
    bucket = int(digest, 16) % (totClass + 1)
    percentage_hash = bucket * (100.0 / totClass)
    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

In [None]:
# Load the raw waveforms (rawDict) and their file paths (pathName) from the
# pickle cache when both cache files exist; otherwise rebuild them from the
# wav files. Set reDo to True to force a rebuild.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# that were produced by this notebook.
reDo = False
pathName = {}
if (os.path.exists('variables/rawDict.pkl')
        and os.path.exists('variables/pathName.pkl')
        and not reDo):
    print('rawDict found')
    with open('variables/rawDict.pkl', 'rb') as f:  
        rawDict = pickle.load(f)
    with open('variables/pathName.pkl', 'rb') as f:  
        pathName = pickle.load(f)
    print('rawDict loaded')

#else they will be loaded from wav files
else:
    print('Creating rawDict')
    # One folder per word. The folder name is taken with os.path so the cell
    # works with both '/' and '\\' separators; sorting makes the sample order
    # independent of the file system.
    folders = [folder for folder in sorted(glob("dataset/*/"))
               if os.path.basename(os.path.normpath(folder)) != '_background_noise_']
    print('SIGNALS')
    rawDict = {}
    for key in folders:
        print('Processing ', key)
        dictKey = os.path.basename(os.path.normpath(key))
        pathList = sorted(glob(os.path.join(key, '*.wav')))
        array = []
        for file in pathList:
            tmp = wavfile.read(file)[1].copy()
            # Truncate or zero-extend every clip to 16000 samples.
            tmp.resize(16000, refcheck=False)
            array.append(tmp)
        rawDict[dictKey] = np.array(array)
        pathName[dictKey] = pathList
    #and saved with pickle
    with open('variables/rawDict.pkl', 'wb') as f:  
        pickle.dump(rawDict, f)
    with open('variables/pathName.pkl', 'wb') as f:  
        pickle.dump(pathName, f)
    print('rawDict created and saved to variables/rawDict.pkl')
# NOTE(review): reDo is forced to True here, so the noise clips are always
# re-read from the wav files and the cached branch below is never taken.
reDo = True
#the same with noise signals
if not os.path.exists('variables/noiseDict.pkl') or reDo:    
    print('\nNOISE')
    noiseDict = {}
    folders = glob('dataset/_background_noise_/*.wav')
    for key in folders:
        print('Processing ', key)
        # Key each clip by its bare file name, whatever the path separator.
        noiseDict[os.path.basename(key)] = wavfile.read(key)[1]
    with open('variables/noiseDict.pkl', 'wb') as f:  
        pickle.dump(noiseDict, f) 
    print('noiseDict created and saved to variables/noiseDict.pkl')
else:
    print('noiseDict found')
    with open('variables/noiseDict.pkl', 'rb') as f:  
        noiseDict = pickle.load(f)
    print('noiseDict loaded')

# Preprocessing

## Shift

In [None]:
%%time
# Randomly time-shift a fraction of the clips of every word, in place.
shift_percentage = 0.1
time_shift_max = 100  # [ms]
sample_shift_max = round(time_shift_max / 1000 * sampleRate)

for word, clips in rawDict.items():
    n_clips = clips.shape[0]
    n_to_shift = round(n_clips * shift_percentage)
    # Draw the clips to alter first, then one signed offset per selected clip.
    for clip_index in rnd.sample(range(n_clips), n_to_shift):
        offset = rnd.randint(-sample_shift_max, sample_shift_max)
        clips[clip_index] = shiftVec(clips[clip_index], offset)

## Noise

In [None]:
%%time
# Add scaled background noise to a random fraction of the clips of every word.
noise_percentage = 0.1 
max_noise = .5
int16_limits = np.iinfo(np.int16)
for key in rawDict:
    length = rawDict[key].shape[0]
    toNoise = round(length*noise_percentage)
    for i in rnd.sample(range(length),toNoise):
        noise = noiseSelector(noiseDict, 16000)
        scaled_noise = np.round(np.random.uniform(high = max_noise) * noise)
        # Mix in floating point and saturate to the int16 range: adding the
        # noise directly to int16 samples wraps around on overflow.
        # NOTE(review): assumes the clips are stored as int16 — confirm.
        mixed = rawDict[key][i] + scaled_noise
        rawDict[key][i] = np.clip(mixed, int16_limits.min, int16_limits.max).astype('int16')

## Silence creation

In [None]:
# Build the 'silence' class from randomly attenuated background-noise excerpts.
key = 'silence'
silence_percentage = 0.05
max_noise_sound = 0.5
#silence_max = round(tot_samples*silence_percentage)
silence_max = 5000
# Each entry is one 16000-sample excerpt multiplied by a gain drawn below
# max_noise_sound.
values = np.array([
    noiseSelector(noiseDict, 16000) * np.random.uniform(high = max_noise_sound)
    for _ in range(silence_max)
])
rawDict[key]=values

## Data division

In [None]:
%%time
# Split every class of rawDict into training, validation and testing parts.
trainDict = {}
validationDict = {}
testDict = {}

validation_percentage = 15
testing_percentage = 15

for key in rawDict:
    if key == 'silence':
        # Silence clips are already random, so they are split by position.
        train_end = round(silence_max*(1-validation_percentage/100-testing_percentage/100))
        valid_end = round(silence_max*(1-testing_percentage/100))
        trainDict[key] = rawDict[key][0:train_end]
        validationDict[key] = rawDict[key][train_end:valid_end]
        testDict[key] = rawDict[key][valid_end:silence_max]
        continue

    # Word clips are routed by the hash of their file name (see which_set).
    buckets = {'training': [], 'validation': [], 'testing': []}
    for position, sample in enumerate(rawDict[key]):
        partition = which_set(pathName[key][position], validation_percentage,
                              testing_percentage, 2**27 - 1)
        buckets[partition].append(sample)
    trainDict[key] = np.array(buckets['training'])
    validationDict[key] = np.array(buckets['validation'])
    testDict[key] = np.array(buckets['testing'])

## Data augmentation
Only on core keys

In [None]:
%%time
# Augment the training set of every core word with shifted, noisy and
# time-stretched copies of randomly chosen clips.
# Seed both generators used below: `rnd` draws the clips and shifts,
# `np.random` draws the noise gain and the stretch rate.
rnd.seed(1)
np.random.seed(1)
#small shift
time_shift_max = 50 #[ms]
sample_shift_max = round(time_shift_max / 1000 * sampleRate)

#small noise perturbation
max_noise = .1

tot_samples = 0
augmented_percentage = 0.3
int16_limits = np.iinfo(np.int16)

for key in coreKey:
    print(key)
    length = trainDict[key].shape[0]
    tot_samples+=length
    toAugment = round(length*augmented_percentage)
    if toAugment == 0:
        # Nothing to add; stacking an empty batch would break np.vstack.
        continue
    new_bunch_of_samples = []
    for i in rnd.sample(range(length),toAugment):
        shift = rnd.randint(-sample_shift_max, sample_shift_max)
        new_sample = shiftVec(trainDict[key][i],shift)     
        noise = noiseSelector(noiseDict, 16000)
        # Mix in floating point and saturate: adding the noise directly to
        # int16 samples wraps around on overflow.
        new_sample = new_sample + np.round(np.random.uniform(high = max_noise) * noise)
        new_sample = np.clip(new_sample, int16_limits.min, int16_limits.max)
        new_sample = stretch(np.array(new_sample,dtype='float32') , np.random.uniform(low = 0.8, high = 1.2))
        # Saturate again before the int16 cast, in case stretching produced
        # values outside the int16 range.
        new_sample = np.clip(new_sample, int16_limits.min, int16_limits.max).astype('int16')
        new_bunch_of_samples.append(new_sample)
    new_bunch_of_samples = np.array(new_bunch_of_samples)
    trainDict[key] = np.vstack((trainDict[key], new_bunch_of_samples))

# Feature extraction

## MFCC

In [None]:
# Feature-extraction configurations, one inner list per configuration, in the
# order [numcep, nfilt, winlen, winstep, nfft].
values = [
    [14, 26, 0.025, 0.01, 512],
]

\begin{verbatim}
Parameter         Description
signal 	          the audio signal from which to compute features. Should be an N*1 array
samplerate 	      the samplerate of the signal we are working with.
winlen 	          the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
winstep 	      the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
numcep 	          the number of cepstrum to return, default 13
nfilt 	          the number of filters in the filterbank, default 26.
nfft 	          the FFT size. Default is 512
lowfreq 	      lowest band edge of mel filters. In Hz, default is 0
highfreq          highest band edge of mel filters. In Hz, default is samplerate/2
preemph           apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97
ceplifter         apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22
appendEnergy      if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
returns           A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
\end{verbatim}

In [None]:
def computeMFCC(data, output, i, sampleRate = 16000):
    """Append the MFCC features of every signal in ``data`` to ``output``.

    Args:
        data: iterable of audio signals.
        output: list that receives one feature array per signal.
        i: parameter list in the order [numcep, nfilt, winlen, winstep, nfft].
        sampleRate: sample rate of the signals.
    """
    output.extend(
        mfcc(
            waveform,
            sampleRate,
            preemph=0.97,
            numcep=i[0],
            nfilt=i[1],
            winlen=i[2],
            winstep=i[3],
            nfft=i[4],
            lowfreq=100,
            highfreq=4000,
        )
        for waveform in data
    )
def computeLogF(data, output, i, sampleRate = 16000):
    """Append the log filterbank features of every signal in ``data`` to ``output``.

    Args:
        data: iterable of audio signals.
        output: list that receives one feature array per signal.
        i: parameter list in the order [numcep, nfilt, winlen, winstep, nfft];
            the first entry is not used here.
        sampleRate: sample rate of the signals.
    """
    output.extend(
        logfbank(
            waveform,
            sampleRate,
            preemph=0.97,
            nfilt=i[1],
            winlen=i[2],
            winstep=i[3],
            nfft=i[4],
            lowfreq=100,
            highfreq=4000,
        )
        for waveform in data
    )
def computeDelta(mfccValues, N = 2):
    """Return a list with the delta features of every entry of ``mfccValues``.

    Args:
        mfccValues: sequence of feature arrays.
        N: second argument forwarded to ``delta`` for every entry.
    """
    return [delta(features, N) for features in mfccValues]

In [None]:
%%time
# Store the feature parameters so that later cells can rebuild the names of
# the feature files.
mfcc_values_path = 'variables/mfccValues.pkl'
with open(mfcc_values_path, 'wb') as values_file:
    pickle.dump(values, values_file)

def getName(index):
    """Map a split index to its name: 0 -> 'Train', 1 -> 'Test', 2 -> 'Validation'.

    Any other index yields None.
    """
    for position, label in enumerate(('Train', 'Test', 'Validation')):
        if index == position:
            return label
    return None
    
# Compute the MFCC features of every split for every parameter set and pickle
# them, first alone and then stacked with their delta and delta-delta.
dictionaries = [trainDict, testDict, validationDict]
for params in values:
    for index, dictionary in enumerate(dictionaries):
        split = getName(index)
        print('\n#####Coumputing '+split+ ' Set#####')
        suffix = split+'[nC='+str(params[0])+' wL='+str(params[2])+' wS='+str(params[3])+'].pkl'
        nameMFCC = 'variables/mfccDict'+suffix
        nameDelta = 'variables/mfccDictDD'+suffix

        # Plain MFCC features, one array per word.
        mfccDict = {}
        total = len(dictionary)
        for position, key in enumerate(dictionary, start=1):
            print('Processing ',key, " (", position, "/", total,")" )
            features = []
            computeMFCC(dictionary[key], features, params)
            mfccDict[key] = np.array(features)

        with open(nameMFCC, 'wb') as f:
            pickle.dump(mfccDict, f)

        print("\n  Processing delta and delta-delta")
        # Replace every entry by [features, delta, delta-delta] stacked on a
        # new last axis.
        total = len(mfccDict)
        for position, key in enumerate(mfccDict, start=1):
            print('Processing ',key, " (", position, "/", total,")" )
            base = mfccDict[key]
            delt = np.array(computeDelta(base))
            deltdelt = np.array(computeDelta(delt))
            mfccDict[key] = np.stack([base, delt, deltdelt], axis = -1)

        with open(nameDelta, 'wb') as f:
            pickle.dump(mfccDict, f)

# LogFilter with delta


In [None]:
%%time
# Compute the log filterbank features of every split for every parameter set
# and pickle them, first alone and then stacked with their delta and
# delta-delta.
for count, i in enumerate(values):
    for index, dictionary in enumerate(dictionaries):
        print('\n#####Coumputing '+getName(index)+ ' Set#####')
        nameLog = 'variables/logfiltDict'+getName(index)+'[nF='+str(i[1])+' wL='+str(i[2])+' wS='+str(i[3])+'].pkl'
        nameLogDD = 'variables/logfiltDictDD'+getName(index)+'[nF='+str(i[1])+' wL='+str(i[2])+' wS='+str(i[3])+'].pkl'
        logFDict = {}
        for countKey, key in enumerate(dictionary):
            # The progress total is the size of the split being processed,
            # not of the unrelated global rawDict.
            print('Processing ',key, " (", countKey+1, "/", len(dictionary),")" )
            array = []
            computeLogF(dictionary[key], array, i)            
            logFDict[key] = np.array(array)
        
        with open(nameLog, 'wb') as f:  
            pickle.dump(logFDict, f)
            
        print("\n  Processing delta and delta-delta")
        
        # Replace every entry by [features, delta, delta-delta] stacked on a
        # new last axis.
        for countKey, key in enumerate(logFDict):
            print('Processing ',key, " (", countKey+1, "/", len(logFDict),")" )
            delt = np.array(computeDelta(logFDict[key]))
            deltdelt = np.array(computeDelta(delt))
            logFDict[key] = np.stack([logFDict[key],delt,deltdelt], axis = -1)
        
        with open(nameLogDD, 'wb') as f:  
            pickle.dump(logFDict, f)

# Creating dataset

In [1]:
%reset -f
def getName(index):
    """Map a split index to its name: 0 -> 'Train', 1 -> 'Test', 2 -> 'Validation'.

    Any other index yields None.
    """
    for position, label in enumerate(('Train', 'Test', 'Validation')):
        if index == position:
            return label
    return None
    
import pickle
from dependencies import functions

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
#choose which dictionary to use
choice =      'mfcc'# 'logfilter' #
useDelta =  True

data = {}

#retrieving of used values for the computation of mfcc
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook.
with open('variables/mfccValues.pkl', 'rb') as f:  
    values = pickle.load(f)
    
selected = 0

# The two feature kinds differ only in the file prefix and in the parameter
# that tags the file name (numcep for mfcc, nfilt for logfilter).
if choice == 'mfcc':
    prefix = 'variables/mfccDict'
    tag = '[nC='+str(values[selected][0])
elif choice == 'logfilter':
    prefix = 'variables/logfiltDict'
    tag = '[nF='+str(values[selected][1])
else:
    # Fail here instead of leaving `data` silently empty.
    raise ValueError("choice must be 'mfcc' or 'logfilter', got %r" % (choice,))

# Files that also contain delta and delta-delta carry a 'DD' marker.
if useDelta:
    prefix += 'DD'

for index in range(3):
    #name format of the selected data
    name = prefix+getName(index)+tag+' wL='+str(values[selected][2])+' wS='+str(values[selected][3])+'].pkl'
    #loading the selected feature dictionary into data
    with open(name, 'rb') as f: 
        data[getName(index)] = pickle.load(f)
    print('Loaded '+name)

Loaded variables/mfccDictDDTrain[nC=14 wL=0.025 wS=0.01].pkl
Loaded variables/mfccDictDDTest[nC=14 wL=0.025 wS=0.01].pkl
Loaded variables/mfccDictDDValidation[nC=14 wL=0.025 wS=0.01].pkl


In [6]:
#core words of the dataset
coreKey = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "zero",
           "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]

#split of the core set
numbers = ['one', 'two', 'three','four','five','six','seven','eight','nine', "zero"]

words = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

#selecting the subset of words to predict
# Work on a copy so that extending usedLabels (e.g. with 'silence') does not
# also modify `words`.
usedLabels = list(words)

#usedLabels.append('silence')

# Every class of the training data that is not a used label counts as unknown.
unknownLabels = list(data['Train'].keys())
for key in usedLabels:
    try:
        unknownLabels.remove(key)
    except ValueError:
        # list.remove raises ValueError when the label is not in the data.
        print(key, ' not in used')

In [7]:
%%time
# Build the scaled train/test data from the loaded feature dictionaries.
# depth is 1 when the feature arrays have 3 dimensions and 3 when they have 4
# (features stacked with delta and delta-delta on the last axis).
feature_rank = data['Train'][words[0]].ndim
functions.train_test_creator(
    data,
    usedLabels,
    unknownLabels,
    with_unknown=False,
    scalerType='robust',
    depth=(feature_rank - 3) * 2 + 1)

Wall time: 14 s
