In [54]:
import numpy as np
from scipy.io import wavfile
from scipy.signal import spectrogram
from matplotlib import pyplot as plt
import speech_recognition as sr
from wit import Wit
from sklearn import preprocessing, model_selection

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Conv1D
from keras.optimizers import Adam

In [58]:
def createPaddedSpectrogram(audio):
    f, t ,Sxx = spectrogram(audio[1], fs=44100)
    length = Sxx.shape[1] if Sxx.shape[1] < 450 else 450
    f_cut = f[f<2000.0]
    Sxx = Sxx[:12, :]
    Sxx_padded = np.zeros((12,450))
    Sxx_padded[:12, :length] = Sxx[:12,:length]
#     plt.figure(figsize=(18,5))
#     plt.pcolormesh(t,f_cut, Sxx)
#     plt.show()
    return Sxx_padded

In [83]:
def sayKeyword(keyword, output_file):
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    wit_api = Wit('VE3COSMD3FBL4DBGPHNZJICM7ZCUJ7J5')
    with mic as source:
        print("Adjusting mic...")
        recognizer.adjust_for_ambient_noise(mic, 3)
        print("Say water")
        audio = recognizer.listen(mic, 2, 2.5)
        print("Detecting what you said...")
        response = wit_api.speech(audio.get_wav_data(), None, {'Content-Type': 'audio/wav'})
        if response['_text'] != keyword:
            i=i-1;
            print('Please speak more clearly')
        else:
            file = open(output_file,'wb')
            file.write(audio.get_wav_data())
            file.close()
            print("All good")
    
    audio_data = wavfile.read(output_file)
    return audio_data

In [3]:
#adding water pronunciation spectrograms from various speakers
Sxx_array = np.empty((110,12,450))

for i in range (1,111):
    filename = "data/pronunciation_en_water({0}).wav".format(i)
    audio = wavfile.read(filename)  
    Sxx_padded = createPaddedSpectrogram(audio)
    Sxx_array[i-1] = Sxx_padded


In [5]:
#adding personal spectrogram to the array
recording = False

if recording == True:
    for i in range(0,6):
        audio_data = sayKeyword('water', 'data/personal_{0}.wav'.format(i))
        Sxx_padded = createPaddedSpectrogram(audio_data)
        Sxx_array = np.append(Sxx_array, Sxx_padded)
else:
    for i in range(0,6):
        audio_data = wavfile.read('data/personal_{0}.wav'.format(i))
        Sxx_padded = createPaddedSpectrogram(audio_data)
        Sxx_array = np.append(Sxx_array, Sxx_padded)

In [34]:
Sxx_array = Sxx_array.reshape(116, 12, 450)
Y_array = np.zeros(116)
Y_array[110:] = 1 
# Y_array = Y_array.reshape((116,1,1))

In [56]:
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(Sxx_array,Y_array, test_size= 0.8, random_state = 7)

In [57]:

model = Sequential()

model.add(Dense(256, input_shape=(12, 450,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Activation('softmax'))
model.add(Flatten())
model.add(Dense(1))

model.compile(loss='mean_squared_error', metrics=['accuracy'], optimizer='adam')
model.fit(xtrain, ytrain, batch_size=32, epochs=10, validation_data=(xtest,ytest))

Train on 23 samples, validate on 93 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5b48953b00>

In [84]:
#now trying recording test audio
test_data = sayKeyword('water', 'data/testAudio.wav')
# test_data = wavfile.read('data/testAudio.wav')
test_padded = createPaddedSpectrogram(test_data)

Adjusting mic...
Say water
Detecting what you said...
All good


In [85]:
model.predict(test_padded.reshape(1,12,450))

array([[0.03484063]], dtype=float32)