### Create dataset from audio files

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
import scipy.misc  # for image resizing

#import scipy.io.wavfile
import soundfile

In [None]:
def spectrogram(wav_filepath):
    """Compute a log-magnitude spectrogram of an audio file.

    The waveform is centred and peak-normalised, then cut into 30ms
    Hamming-windowed frames taken every 10ms.  Each frame goes through an FFT
    and the log-magnitude of the lower half of the bins is kept.

    Multi-channel files are mixed down to mono first.  Magnitudes are floored
    at machine epsilon before the log so that silent frames give finite values.

    Parameters
    ----------
    wav_filepath : str
        Path of an audio file readable by ``soundfile.read``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, nfft//2)``; row ``i`` holds the
        log-magnitudes of the frame ending at sample ``nwin + i*nstep``.
    """
    # https://mail.python.org/pipermail/chicago/2010-December/007314.html

    #sample_rate, samples = scipy.io.wavfile.read(wav_filepath)
    samples, sample_rate = soundfile.read(wav_filepath)

    # soundfile returns a (frames, channels) array for multi-channel files;
    # the framing below needs a 1-D signal, so average the channels.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Rescale so that max/min are ~ +/- 1 around 0
    data_av = np.mean(samples)
    data_max = np.max(np.absolute(samples-data_av))
    if data_max > 0:
        sound_data = (samples - data_av)/data_max
    else:
        # A constant (e.g. silent) signal has zero peak: keep it at zero
        # instead of turning it into NaNs via 0/0.
        sound_data = samples - data_av

    ## Parameters: 10ms step, 30ms window
    nstep = int(sample_rate * 0.01)
    nwin  = int(sample_rate * 0.03)
    nfft = 2*int(nwin/2)

    window = np.hamming(nwin)

    # will take windows x[n1:n2].  generate and loop over
    # n2 such that all frames fit within the waveform
    nn = range(nwin, len(sound_data), nstep)

    X = np.zeros( (len(nn), nfft//2) )

    # Smallest magnitude passed to log(): avoids -inf for exactly-zero bins
    log_floor = np.finfo(float).eps

    for i,n in enumerate(nn):
        segment = sound_data[ n-nwin:n ]
        z = np.fft.fft(window * segment, nfft)
        X[i,:] = np.log(np.maximum(np.absolute(z[:nfft//2]), log_floor))

    return X

In [None]:
# Smooths a time-series with a normalised Hamming kernel
#   (used to segment the input into words by looking at the 'energy' profile)
def smooth(x, window_len=21):  # , window='hanning'
    """Return `x` smoothed by a unit-sum Hamming window of `window_len` taps.

    `x` is zero-padded with ``(window_len-1)//2`` samples on each side before a
    'valid' convolution, so for an odd `window_len` the result has the same
    length as `x`.
    """
    # http://scipy-cookbook.readthedocs.io/items/SignalSmooth.html
    edge_padding = np.zeros( ((window_len-1)//2,) )
    padded = np.r_[ edge_padding, x, edge_padding ]

    kernel = np.hamming(window_len)
    kernel = kernel/kernel.sum()

    return np.convolve(kernel, padded, mode='valid')

In [None]:
# Audio file used by the exploratory cells below, which read the global `f`
f = './data/num_phone_en-UK_m_Martin0.wav'
#f = './data/num_Bing_en-UK_f_Susan.wav'

# .ogg alternatives (commented out)
#f = './data/num_phone_en-UK_m_Martin0.ogg'
#f = './data/num_Bing_en-UK_f_Susan.ogg'

In [None]:
# Spectrogram of the chosen file: one row per frame, one column per FFT bin
X = spectrogram(f)
print("X.shape=", X.shape)

# Per-frame standard deviation across the bins, rescaled to the range [0, 1]
Y = np.std(X, axis=1)
#Y = np.max(X, axis=1)
Y_min = np.min(Y)
Y_range = Y.max()-Y_min
Y = (Y - Y_min)/Y_range

print("Y.shape=", Y.shape)

# Mark frames whose rescaled value exceeds 0.25 with 1.0, the rest with 0.0
Y_crop = np.where(Y>0.25, 1.0, 0.0)
# Apply some smoothing, then re-threshold with a very low hurdle
#   (this widens each marked region)

Y_crop = smooth(Y_crop)
Y_crop = np.where(Y_crop>0.01, 1.0, 0.0)
print("Y_crop.shape=", Y_crop.shape)

# Spectrogram as an image, transposed so that frames run along the x-axis
plt.imshow(X.T, interpolation='nearest',
    origin='lower',
    aspect='auto')

# Overlay the [0, 1] curves, scaled up to the height of the image
plt.plot(Y * X.shape[1])

plt.plot(Y_crop * X.shape[1])

plt.show()
#Y.min(), Y.max()
#X[100,:]
# np.argmin/np.argmax without an axis return an index into the flattened array.
# NOTE(review): 248 looks like a hard-coded row width (i.e. X.shape[1]) used to
#   convert that flat index into a frame number - confirm it matches the data
np.argmin(X)/248, np.argmax(X)/248

In [None]:
# Split the file into voiced segments

#http://stackoverflow.com/questions/4494404/find-large-number-of-consecutive-values-fulfilling-condition-in-a-numpy-array
def contiguous_regions(condition):
    """Locate the runs of consecutive True values in a 1-D boolean sequence.

    Parameters
    ----------
    condition : array-like of bool
        1-D mask (numpy array, list, ...).  Non-boolean entries are
        interpreted by truthiness.

    Returns
    -------
    list of [int, int]
        One ``[start, stop]`` pair of Python ints per run, in order of
        appearance, such that ``condition[start:stop]`` is all True and the
        run cannot be extended on either side.  Empty if nothing is True.
    """
    mask = np.asarray(condition, dtype=bool)

    # Pad with False on both sides so that runs touching either end of the
    # mask still produce both a rising and a falling edge.
    padded = np.concatenate(([False], mask, [False]))

    # Index k is an edge when mask[k] differs from mask[k-1]; edges strictly
    # alternate rising (run start) / falling (one past the run end).
    edges = np.flatnonzero(padded[1:] != padded[:-1])

    return [[int(start), int(stop)] for start, stop in zip(edges[::2], edges[1::2])]

# Cell output: the [start, stop] frame ranges where the widened mask Y_crop is set
contiguous_regions(Y_crop>0.5)

In [None]:
import re

# Raw strings hand the backslashes to the regex engine untouched, rather than
# relying on Python passing unknown string escapes through (which warns on
# current interpreters).  Inside a character class , . ? ! need no escaping.
remove_punc = re.compile(r'[,.?!]')
squash_spaces = re.compile(r'\s+')


def words(s):
    """Normalise a sentence into lower-case words separated by single spaces.

    Commas, full stops, question marks and exclamation marks become spaces,
    runs of whitespace collapse to one space, and the result is stripped and
    lower-cased.  Other characters (e.g. apostrophes) are kept.
    """
    s = remove_punc.sub(' ', s)
    s = squash_spaces.sub(' ', s)
    return s.strip().lower()

# Normalised transcripts, keyed by a short prefix
#   (e.g. the recordings of the 'num' sentence are named 'num_*.wav')
sentences = {
    'num': words("zero one two three four five six seven eight nine."),

    # https://www.quora.com/Is-there-a-text-that-covers-the-entire-English-phonetic-range/answer/Sheetal-Srivastava-1
    'qbf': words("That quick beige fox jumped in the air over each thin dog.  "
                 "Look out, I shout, for he's foiled you again, creating chaos."),
    'shy': words("Are those shy Eurasian footwear, cowboy chaps, "
                 "or jolly earthmoving headgear?"),
    'ate': words("The hungry purple dinosaur ate the kind, zingy fox, the jabbering crab, "
                 "and the mad whale and started vending and quacking."),
    'suz': words("With tenure, Suzie'd have all the more leisure for yachting, "
                 "but her publications are no good."),
    'tbh': words("Shaw, those twelve beige hooks are joined if I patch a young, gooey mouth."),

    #  https://en.wikipedia.org/wiki/The_North_Wind_and_the_Sun          #594
    #  http://videoweb.nie.edu.sg/phonetic/courses/aae103-web/wolf.html  #1111
}
# Cell output: the normalised 'num' sentence
sentences['num'] #.replace(' ', ',')

In [None]:
def for_msft(prefixes):  # comma separated
    """Return the words of the selected sentences, one word per line.

    `prefixes` is a comma-separated string of keys into `sentences`; the
    chosen sentences are concatenated in the order given.
    """
    # https://www.microsoft.com/cognitive-services/en-us/speech-api
    chosen = [sentences[key] for key in prefixes.split(',')]
    single_line = ' '.join(chosen)
    return single_line.replace(' ', '\n')
"""
This is the SSML that will be sent to the service:
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" 
      xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="en-GB">
  <voice xml:lang="en-GB" name="Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)">
zero
one
two
three
four
five
six
seven
eight
nine
  </voice>
</speak>
"""

# Build the one-word-per-line text for the speech service and report its length
# https://www.microsoft.com/cognitive-services/en-us/Speech-api/documentation/API-Reference-REST/BingVoiceOutput
a=for_msft('num')  # 49 long...
#a=for_msft('qbf,shy,ate,suz,tbh')  # 474 long...
print("length_in_chars=%d\n%s" % (len(a),a,))

In [None]:
# sox english.au --rate 16000 --channels 1 --encoding signed-integer english.wav norm -3
# sox english.au --rate 16000 --channels 1 english.ogg norm -3

In [None]:
import python_speech_features

sample_window_step = 0.01 # in seconds (10ms)

def get_sample_features(samples, sample_rate):
    """Compute filterbank features for a waveform.

    Thin wrapper around ``python_speech_features.fbank`` with the settings
    used throughout this notebook: 25ms Hamming windows advanced by
    `sample_window_step` seconds, 32 filters, 512-point FFT.

    Returns the ``(features, energy)`` pair produced by ``fbank``.
    """
    # Earlier experiments, kept for reference:
    #sample_feat = python_speech_features.mfcc(samples, sample_rate, numcep=13, nfilt=26, appendEnergy=True)
    #sample_feat = python_speech_features.mfcc(samples, sample_rate, numcep=28, nfilt=56, appendEnergy=True)

    #sample_feat, e = python_speech_features.fbank(samples,samplerate=sample_rate,
    #      winlen=0.025,winstep=0.01,nfilt=26,nfft=512,
    #      lowfreq=0,highfreq=None,preemph=0.97, winfunc=lambda x:np.ones((x,)))

    fbank_settings = dict(
        samplerate=sample_rate,
        winlen=0.025,
        winstep=sample_window_step,
        nfilt=32,
        nfft=512,
        lowfreq=0,
        highfreq=None,
        preemph=0.25,
        winfunc=np.hamming,  # called with the frame length
    )
    features, energy = python_speech_features.fbank(samples, **fbank_settings)
    return features, energy
    
def get_sample_isolated_words(energy, plot=False):
    """Find the frame ranges of the individual words from per-frame energy.

    Frames whose log-energy lies more than a quarter of the way up from the
    minimum to the maximum are marked, the marked areas are widened by
    smoothing, and the resulting runs are returned.

    When `plot` is true, the log-energy and both masks (scaled by 25) are
    drawn on the current matplotlib axes.

    Returns the list of ``[start, stop]`` pairs given by `contiguous_regions`.
    """
    log_energy = np.log(energy)
    quietest, loudest = log_energy.min(), log_energy.max()

    hurdle = (loudest - quietest)*0.25 + quietest
    above_hurdle = np.where(log_energy > hurdle, 1.0, 0.0)

    # By smoothing, and applying a very low hurdle, we expand the crop area safely
    expanded = np.where(smooth(above_hurdle) > 0.01, 1.0, 0.0)

    if plot:
        plt.plot(log_energy)
        plt.plot(above_hurdle * 25)
        plt.plot(expanded * 25)

    return contiguous_regions(expanded > 0.5)

In [None]:
# Load the example recording and compute its filterbank features and energy
samples, sample_rate = soundfile.read(f)

sample_feat, energy = get_sample_features(samples, sample_rate)

# Log-features as an image, transposed for display
#   (assumes the rows of sample_feat are frames - TODO confirm)
plt.imshow(np.log(sample_feat.T), interpolation='nearest',
    origin='lower',
    aspect='auto')

# Overlay the log-energy and the two word masks on the same axes
word_ranges = get_sample_isolated_words(energy, plot=True)

plt.show()
# Cell output: array shapes and one energy value, as a quick sanity check
sample_feat.shape, energy.shape, energy[10]

In [None]:
# Break sound into separate WAVs in word-based directories
def split_combined_file_into_wavs(f, prefix='num'):
    """Cut a recording of a whole sentence into one WAV file per word.

    `f` must be named ``<prefix>_<rest>.<ext>``.  Each detected segment is
    written to ``data/<prefix>/<word>/<rest>.wav``, with the words taken in
    order from ``sentences[prefix]``.

    Prints a message and returns without writing anything when the filename
    does not start with the prefix, or when the number of detected segments
    differs from the number of words.
    """
    # f ~ './data/num_Bing_en-UK_f_Susan.wav'
    original_name = os.path.basename( f )
    if not original_name.startswith(prefix+"_"):
        print("Wrong prefix for '%s'" % (original_name,))
        return

    # Output filename: drop the extension and the leading "<prefix>_"
    #   (the directory is decided per-word below)
    stem = os.path.splitext(original_name)[0]
    out_name = stem[len(prefix)+1:] + '.wav'

    samples, sample_rate = soundfile.read(f)
    _, energy = get_sample_features(samples, sample_rate)
    segments = get_sample_isolated_words(energy, plot=False)

    expected = sentences[prefix].split(' ')
    if len(segments) != len(expected):
        print("Found %d segments, rather than %d, in '%s'" % (len(segments), len(expected), f,))
        return

    # Segment bounds are in feature frames: convert to sample offsets
    samples_per_frame = int(sample_window_step*sample_rate)

    for word, (first_frame, end_frame) in zip(expected, segments):
        word_dir = os.path.join('data', prefix, word)
        os.makedirs(word_dir, exist_ok=True)

        clip = samples[ first_frame*samples_per_frame:end_frame*samples_per_frame ]
        soundfile.write(os.path.join(word_dir, out_name), clip, samplerate=sample_rate)

In [None]:
# Split one combined recording into per-word WAVs under data/num/<word>/
split_combined_file_into_wavs('./data/num_Bing_en-UK_f_Susan.wav')
# NOTE(review): a later cell loads 'phone_en-UK_m_Martin0.wav' from data/num/six,
#   which presumably needs the call below to have been run at least once - confirm
#split_combined_file_into_wavs('./data/num_phone_en-UK_m_Martin0.wav')

In [None]:
# Convert a given (isolated word) WAV into a 'stamp'

def wav_to_stamp(prefix, word, wav):
    """Turn one isolated-word recording into a fixed-size 'stamp' image.

    Reads ``data/<prefix>/<word>/<wav>``, takes the log of its filterbank
    features, and resizes that 2-D array to 64x32 with bilinear interpolation
    after rescaling its value range to 0..255.

    ``scipy.misc.imresize`` is used when the installed SciPy still has it
    (it was removed in SciPy 1.3).  Otherwise an equivalent is built from
    ``scipy.ndimage.zoom``; pixel values may differ slightly between the two,
    since the legacy function delegated the resampling to PIL.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(64, 32)``.
    """
    samples, sample_rate = soundfile.read( os.path.join('data', prefix, word, wav) )
    sample_feat, energy = get_sample_features(samples, sample_rate)

    data = np.log(sample_feat)

    stamp_shape = (64, 32)

    # Force the data into the 'stamp size' as an image (implicit range normalization occurs)
    legacy_imresize = getattr(scipy.misc, 'imresize', None)
    if legacy_imresize is not None:
        # https://github.com/scipy/scipy/issues/4458 :: The stamps are stored as uint8...
        return legacy_imresize(data, stamp_shape, 'bilinear')

    # Fallback for SciPy >= 1.3.  `from ... import` (rather than
    # `import scipy.ndimage`) avoids rebinding `scipy` as a local name.
    from scipy.ndimage import zoom as ndimage_zoom

    # Same range normalization as the legacy call: min -> 0, max -> 255
    lowest, highest = data.min(), data.max()
    value_span = (highest - lowest) or 1.0  # constant input: avoid 0/0
    scaled = (data - lowest) * (255.0 / value_span)

    zoom_factors = [float(target) / current
                    for target, current in zip(stamp_shape, data.shape)]
    resized = ndimage_zoom(scaled, zoom_factors, order=1)  # order=1 : (bi)linear

    # Round to nearest and store as uint8, like the legacy stamps
    return np.clip(resized + 0.5, 0, 255).astype(np.uint8)

In [None]:
# Build the stamp for one isolated-word recording and display it
stamp = wav_to_stamp('num', 'six', 'phone_en-UK_m_Martin0.wav')

# Transposed for display, so that the stamp's first axis runs along x
plt.imshow(stamp.T, interpolation='nearest', origin='lower', aspect='auto')
plt.show()

# Cell output: value range of the stamp
np.min(stamp), np.max(stamp)

In [None]:
# combine all words from a given prefix into a dataset of 'stamps'
import pickle

def create_dataset_from_folders(prefix, save_as='.pkl'):
    """Build a labelled dataset of stamps from ``data/<prefix>/<word>/*.wav``.

    The label of a stamp is the position of its word in ``sentences[prefix]``.

    Parameters
    ----------
    prefix : str
        Key into `sentences`, and name of the folder under ``data``.
    save_as : str or None
        Suffix of the pickle written to ``data/<prefix><save_as>``.
        Pass None to get the data back instead of saving it.

    Returns
    -------
    None when the dataset is saved, otherwise the tuple
    ``(stamps, labels, words)``.
    """
    word_list = sentences[prefix].split(' ')
    stamps, labels = [], []

    for label_i, word in enumerate( word_list ):
        # Find all the files for this word
        #   (sorted: os.listdir order is arbitrary, this keeps runs reproducible)
        for stamp_file in sorted(os.listdir( os.path.join('data', prefix, word ))):
            # Filter on the directory entry itself, not on the notebook-global `f`
            if not stamp_file.endswith('.wav'): continue
            #print(stamp_file)
            stamp = wav_to_stamp(prefix, word, stamp_file)

            stamps.append(stamp)
            labels.append(label_i)

    if save_as is None: # Return the data directly
        return stamps, labels, word_list

    data_dictionary = dict(
        stamp=stamps, label=labels,
        rand=np.random.rand( len(labels) ), # This is to enable us to sample the data (based on hurdles)
        words=word_list,
    )
    # Context manager: the file is flushed and closed even if pickling fails
    with open(os.path.join('data', prefix+save_as), 'wb') as pkl_file:
        pickle.dump(data_dictionary, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Rebuild the 'num' dataset unconditionally; the commented-out condition
#   would skip the rebuild when data/num.pkl already exists
#if not os.path.exists('data/num.pkl'):
if True:
    create_dataset_from_folders('num')

In [None]:
# Read in the dataset
#   (unpickling can execute arbitrary code: only load files created locally)
with open(os.path.join('data', 'num.pkl'), 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [None]:
# Plot all of a given 'word'
indices = [ i for i,label in enumerate(dataset['label']) if dataset['words'][label]=='six']

plt.figure(figsize=(12,2))
# The subplot slot is the position within the selection (1..16), not the
#   dataset index `i`, which generally falls outside the 2x8 grid
for slot, i in enumerate(indices[0:16], start=1):  # at most 16
    plt.subplot(2, 8, slot)  # nrows, ncols, subplot#
    plt.imshow(dataset['stamp'][i].T, cmap='gray', origin='lower', interpolation='nearest')
    plt.axis('off')
plt.show()

In [None]:
# Now do something similar for the 'test files': create a dataset from all the audio files in a given folder



In [None]:
# Prefix for the test-set counterpart of the 'num' data
# NOTE(review): not referenced by any other cell visible here - presumably for code still to be written
test_prefix = 'test-'+'num'