In [1]:
import pickle
import os
import numpy as np
from tqdm import tqdm
from scipy.io import wavfile
from python_speech_features import mfcc
from keras.models import load_model
import pandas as pd
from sklearn.metrics import accuracy_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('commands.csv')

In [3]:
classes = list(np.unique(df.label))

In [4]:
fn2class = dict(zip(df.fname, df.label))

In [45]:
def build_predictions(audio_dir):
	y_true = []
	y_pred = []
	fn_prob = {}

	print('Extracting features from audio')
	for fn in tqdm(os.listdir(audio_dir)):
		rate, wav = wavfile.read(os.path.join(audio_dir,  fn))
		label = fn2class[fn]
		c = classes.index(label)
		y_prob = []

		for i in range(0, wav.shape[0]-config.step, config.step):
			sample = wav[i:i+config.step]
			x = mfcc(sample, rate, numcep=config.nfeat, nfilt=config.nfilt, nfft=config.nfft)
			x = (x - config.min) / (config.max - config.min)

			if config.mode == 'conv':
				x = x.reshape(1, x.shape[0], x.shape[1], 1)
			elif config.mode == 'time':
				x = np.expand_dims(x, axis=0)
			y_hat = model.predict(x)
			y_prob.append(y_hat)
			y_pred.append(np.argmax(y_hat))
			y_true.append(c)

		fn_prob[fn] = np.mean(y_prob, axis=0).flatten()

	return y_true, y_pred, fn_prob

In [7]:
p_path = os.path.join('pickles', 'conv.p')

with open(p_path, 'rb') as handle:
	config = pickle.load(handle)

model = load_model(config.model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [46]:
y_true, y_pred, fn_prob = build_predictions('clean')

Extracting features from audio



  0%|                                                                                                                                          | 0/160 [00:00<?, ?it/s]
  1%|█▋                                                                                                                                | 2/160 [00:00<00:10, 15.63it/s]
  2%|███▎                                                                                                                              | 4/160 [00:00<00:10, 14.80it/s]
  4%|████▉                                                                                                                             | 6/160 [00:00<00:10, 14.51it/s]
  5%|██████▌                                                                                                                           | 8/160 [00:00<00:10, 14.70it/s]
  6%|████████                                                                                                                         | 10/160 [00:00<00:10, 14

 61%|██████████████████████████████████████████████████████████████████████████████▏                                                  | 97/160 [00:07<00:04, 13.42it/s]
 62%|███████████████████████████████████████████████████████████████████████████████▊                                                 | 99/160 [00:07<00:04, 12.58it/s]
 63%|████████████████████████████████████████████████████████████████████████████████▊                                               | 101/160 [00:07<00:04, 13.36it/s]
 64%|██████████████████████████████████████████████████████████████████████████████████▍                                             | 103/160 [00:07<00:04, 12.99it/s]
 66%|████████████████████████████████████████████████████████████████████████████████████                                            | 105/160 [00:07<00:04, 12.94it/s]
 67%|█████████████████████████████████████████████████████████████████████████████████████▌                                          | 107/160 [00:08<00:04, 13.

In [47]:
acc_score = accuracy_score(y_true=y_true, y_pred=y_pred)
print(acc_score)

0.6328502415458938


In [30]:
df.head()

Unnamed: 0,fname,label
0,abort (1).wav,abort
1,abort (10).wav,abort
2,abort (11).wav,abort
3,abort (12).wav,abort
4,abort (13).wav,abort


In [31]:
y_probs = []
for i, row in df.iterrows():
	y_prob = fn_prob[row.fname]
	y_probs.append(y_prob)
	for c, p in zip(classes, y_prob):
		df.at[i, c] = p

In [32]:
df.head()

Unnamed: 0,fname,label,abort,activate,centroid,edge,launch,switch,track,zoom
0,abort (1).wav,abort,0.374854,0.035011,0.07733,0.077322,0.015295,0.092699,0.180331,0.147158
1,abort (10).wav,abort,0.488626,0.038646,0.085118,0.018848,0.014186,0.073773,0.188515,0.092288
2,abort (11).wav,abort,0.639254,0.048512,0.051637,0.069968,0.014048,0.071721,0.069362,0.035497
3,abort (12).wav,abort,0.788865,0.016311,0.083199,0.061342,0.000641,0.040071,0.001065,0.008506
4,abort (13).wav,abort,0.672246,0.046262,0.053543,0.097856,0.003522,0.078063,0.023759,0.024748


In [44]:
fn_prob

{'abort (1).wav': array([0.37485355, 0.03501116, 0.07733013, 0.07732155, 0.01529505,
        0.09269878, 0.1803314 , 0.14715838], dtype=float32),
 'abort (10).wav': array([0.4886262 , 0.0386457 , 0.085118  , 0.01884769, 0.01418584,
        0.0737728 , 0.18851529, 0.09228849], dtype=float32),
 'abort (11).wav': array([0.63925415, 0.04851174, 0.05163749, 0.06996829, 0.01404767,
        0.07172107, 0.06936242, 0.03549713], dtype=float32),
 'abort (12).wav': array([7.8886509e-01, 1.6311355e-02, 8.3199352e-02, 6.1341558e-02,
        6.4115162e-04, 4.0071078e-02, 1.0645860e-03, 8.5057979e-03],
       dtype=float32),
 'abort (13).wav': array([0.6722459 , 0.04626221, 0.05354275, 0.09785598, 0.00352218,
        0.07806314, 0.02375945, 0.02474834], dtype=float32),
 'abort (14).wav': array([0.72219884, 0.00285891, 0.11236823, 0.01827447, 0.00215048,
        0.08035291, 0.01054556, 0.05125054], dtype=float32),
 'abort (15).wav': array([0.7599725 , 0.0042898 , 0.07493152, 0.0145863 , 0.00201624,
  