In [1]:
import numpy as np
import time
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
from td_utils import *
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy.io.wavfile import write
from sklearn.model_selection import train_test_split
%matplotlib inline



In [2]:
# Number of time steps in one input spectrogram (second axis of the X arrays
# built further down, whose printed shape is (200, 798, 101)).
Tx = 798
# Number of frequency bins per time step (last axis of those X arrays).
n_freq = 101

# The model

In [3]:
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
def my_model(input_shape):
    """
    Build the (uncompiled) two-input encoder/decoder classifier.

    A Conv1D front-end feeds an LSTM encoder; the encoder's final hidden and
    cell states initialise a second LSTM that runs over a separate
    one-step input, and a softmax layer turns that step into 2 class scores.

    Arguments:
    input_shape -- shape of one spectrogram example without the batch axis

    Returns:
    model -- Keras Model taking [spectrogram, decoder_input], where
             decoder_input has shape (1, 1) per example, and producing a
             softmax over 2 classes for the single decoder step
    """
    spectrogram_in = Input(shape=input_shape)

    # Convolutional front-end: 196 filters, kernel width 15, stride 4.
    features = Conv1D(196, kernel_size=15, strides=4)(spectrogram_in)
    features = BatchNormalization()(features)
    features = Activation('relu')(features)
    # Dropout rate is the fraction of units dropped during training.
    features = Dropout(0.8)(features)

    # Encoder: its per-sequence output is discarded, only the final
    # hidden / cell states are carried over to the decoder.
    _, enc_h, enc_c = LSTM(128, return_state=True)(features)

    # Second model input: one time step with one feature per example.
    decoder_in = Input(shape=(1, 1))

    # Decoder starts from the encoder's final states.
    decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
    decoded, _, _ = decoder_lstm(decoder_in, initial_state=[enc_h, enc_c])

    class_probs = Dense(2, activation='softmax')(decoded)

    return Model([spectrogram_in, decoder_in], class_probs)

In [5]:
# Build the network from the notebook-level constants instead of repeating
# the literal (798, 101), so the input shape has a single source of truth.
model = my_model(input_shape=(Tx, n_freq))

In [6]:
# Two-class softmax output -> categorical cross-entropy; accuracy is tracked
# as the only extra metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)

In [19]:
# Restore previously saved weights from 'model.h5' (relative path) into the
# model built above.
# NOTE(review): this cell's saved execution count (19) is out of sequence with
# the surrounding cells, so it was last run out of order; confirm the notebook
# still works with Restart & Run All.
model.load_weights('model.h5')

## Helper Functions

In [8]:
def detect_triggerword_spectrum(x):
    """
    Run the global `model` on a single spectrogram and return its scores.

    Argument:
    x -- spectrum of shape (freqs, Tx)
    i.e. (number of frequencies, number of time steps)

    Returns:
    predictions -- the model output flattened to 1-D. With the network built
    by my_model this is a 2-element softmax vector; given the labels used in
    create_data ([1, 0] positive, [0, 1] negative) index 0 is the positive
    score and index 1 the negative one.
    """
    # The spectrogram comes in as (freqs, Tx); the model wants (batch, Tx, freqs).
    batch = np.expand_dims(np.swapaxes(x, 0, 1), axis=0)
    # Constant all-ones one-step input for the model's second (decoder) input.
    decoder_seed = np.ones(shape=(batch.shape[0], 1, 1))
    scores = model.predict([batch, decoder_seed])
    return scores.reshape(-1)

In [9]:
def get_random_time_segment(segment_ms, h=10000):
    """
    Pick a random window of `segment_ms` that fits inside a span of length `h`.

    Arguments:
    segment_ms -- length of the window to place
    h -- length of the span the window must fit in (default 10000)

    Returns:
    (segment_start, segment_end) -- inclusive bounds of the window, with
    0 <= segment_start and segment_end = segment_start + segment_ms - 1 <= h - 1

    Raises:
    ValueError -- if the window is longer than the span
    """
    if segment_ms > h:
        raise ValueError(
            "segment of length %d does not fit in a span of length %d" % (segment_ms, h)
        )

    # randint's `high` is exclusive. The last start that still fits is
    # h - segment_ms (its inclusive end is h - 1), hence the + 1. Without it
    # that position is never drawn and segment_ms == h fails outright.
    segment_start = np.random.randint(low=0, high=h - segment_ms + 1)
    segment_end = segment_start + segment_ms - 1

    return (segment_start, segment_end)

In [10]:
def insert_audio_clip(background, audio_clip):
    """
    Overlay `audio_clip` onto `background` at a randomly chosen offset.

    Arguments:
    background -- object supporting len() and .overlay(clip, position=...)
                  (presumably a pydub AudioSegment -- confirm against callers)
    audio_clip -- clip to mix in; its len() is used as the window length

    Returns:
    the result of background.overlay(...) with the clip placed at the start
    offset returned by get_random_time_segment
    """
    clip_length = len(audio_clip)
    # Only the start of the random window is needed for the overlay position.
    start_offset, _ = get_random_time_segment(clip_length, len(background))
    return background.overlay(audio_clip, position=start_offset)

In [11]:
def create_data(backgrounds, activates, negatives, pos=True, wav_path="train.wav"):
    """
    Synthesise one labelled example and return its spectrogram.

    A random background is cropped, a random positive ("activate") or negative
    clip is overlaid on it, the mix is written to `wav_path`, and the
    spectrogram of that file is returned.

    Arguments:
    backgrounds -- sequence of background recordings
    activates -- sequence of positive clips
    negatives -- sequence of negative clips
    pos -- truthy to build a positive example, falsy for a negative one
    wav_path -- where the mixed audio is written (default "train.wav",
                overwritten on every call)

    Returns:
    x -- whatever graph_spectrogram returns for the written file
    y -- one-hot label: [1, 0] for a positive example, [0, 1] for a negative one
    """
    background = backgrounds[np.random.choice(range(len(backgrounds)))]
    # Subtracting a number from a pydub segment lowers its gain by that many dB
    # (assumes these are pydub AudioSegments -- confirm against load_raw_audio).
    background = background - 20

    # NOTE(review): the slice end is exclusive while seg_end is an inclusive
    # bound, so the crop is one unit shorter than the 4000 requested. Left
    # unchanged because the spectrogram length depends on the clip length.
    seg_start, seg_end = get_random_time_segment(4000)
    background = background[seg_start: seg_end]

    if pos:
        y = [1, 0]
        clips = activates
    else:
        y = [0, 1]
        clips = negatives
    clip = clips[np.random.choice(range(len(clips)))]
    background = insert_audio_clip(background, clip)

    # match_target_amplitude comes from td_utils; presumably it normalises the
    # mix to the given dBFS level -- verify there.
    background = match_target_amplitude(background, -20.0)

    # export() hands back an open file handle; close it right away so handles
    # do not pile up over the thousands of calls made while building a dataset.
    background.export(wav_path, format="wav").close()

    x = graph_spectrogram(wav_path)

    return x, y

In [12]:
# Clip collections used to synthesise examples with create_data.
# load_raw_audio / load_test_audio are not defined in this notebook; presumably
# they come from the `from td_utils import *` star import and return sequences
# of pydub audio segments -- verify against td_utils.
activates, negatives, backgrounds = load_raw_audio()
# Separate collections used only by the test cells further down.
t_activates, t_negatives, t_backgrounds = load_test_audio()

## To Train 

In [None]:
# Synthesise the training set: each example is randomly positive or negative.
n_train = 2000
X, Y = [], []
for i in range(n_train):
    # Coarse progress indicator.
    if i % 500 == 0:
        print(i)
    is_positive = np.random.choice([True, False])
    x, y = create_data(backgrounds, activates, negatives, is_positive)
    X.append(x.T)   # transpose the spectrogram before stacking
    Y.append([y])   # extra axis: one (1, 2) label block per example

X, Y = np.array(X), np.array(Y)
print('X: ', X.shape)
print('Y: ', Y.shape)

0


  Z = 10. * np.log10(spec)


500
1000
1500


In [None]:
# TODO: unfinished cell. Its previous content was the incomplete statement
# `for i in rangz`, which is a SyntaxError and aborts "Restart & Run All".
# Presumably a training loop over the X / Y arrays built above was intended;
# either implement it here or delete this cell.

# To test on one sample!

In [38]:
# Choose the class of the single test example: True -> positive, False -> negative.
pos = True
# Builds the example from the test clip collections; as a side effect this
# overwrites train.wav, which the next cell plays back.
x, y = create_data(t_backgrounds, t_activates, t_negatives, pos=pos)

In [39]:
# Play back train.wav, the file written by the most recent create_data call.
IPython.display.Audio("train.wav")

In [40]:
# Labels are one-hot: [1, 0] is positive (argmax 0), [0, 1] is negative (argmax 1).
print('the correct value is:')
print('NEGATIVE!' if np.argmax(y) else 'POSITIVE!')

the correct value is:
POSITIVE!


In [41]:
# Same decoding as for the label: argmax 0 -> positive, argmax 1 -> negative.
print('the predicted value is:')
predicted_class = np.argmax(detect_triggerword_spectrum(x))
print('NEGATIVE!' if predicted_class else 'POSITIVE!')

the predicted value is:
NEGATIVE!


# To test on a batch of samples

In [20]:
# Synthesise a batch of test examples from the test clip collections.
nbr = 200
X, Y = [], []
for i in range(nbr):
    # Coarse progress indicator.
    if i % 50 == 0:
        print(i)
    is_positive = np.random.choice([True, False])
    x, y = create_data(t_backgrounds, t_activates, t_negatives, is_positive)
    X.append(x.T)   # transpose the spectrogram before stacking
    Y.append([y])   # extra axis: one (1, 2) label block per example

X, Y = np.array(X), np.array(Y)
print('X: ', X.shape)
print('Y: ', Y.shape)

0


  Z = 10. * np.log10(spec)


50
100
150
X:  (200, 798, 101)
Y:  (200, 1, 2)


In [21]:
# Score the model on the batch built above. The second input is a constant
# all-ones array with one (1, 1) entry per example, the same kind of dummy
# decoder input detect_triggerword_spectrum feeds at prediction time.
# With the compile settings used earlier (loss plus an accuracy metric) the
# displayed result is [loss, accuracy].
model.evaluate([X, np.ones(shape=(X.shape[0], 1, 1))], Y)



[0.43230145692825317, 0.875]