In [28]:
import os
import pathlib
import random
import tensorflow as tf
import numpy as np
import wave
import pandas as pd
import librosa


In [2]:
DATASET_PATH = 'data/mini_speech_commands'

data_dir = pathlib.Path(DATASET_PATH)
if not data_dir.exists():
    tf.keras.utils.get_file(
        'mini_speech_commands.zip',
        origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
        extract=True,
        cache_dir='.', cache_subdir='data')


In [3]:
categories = {i:j for i,j in enumerate(os.listdir(DATASET_PATH))}
categories

{0: 'down',
 1: 'go',
 2: 'left',
 3: 'no',
 4: 'right',
 5: 'stop',
 6: 'up',
 7: 'yes'}

In [4]:
def load_data(path):
    data = list()
    for idx, folder in categories.items():
        folder_path = os.path.join(path,folder)
        images = os.listdir(folder_path)
        images = [os.path.join(folder_path,image) for image in images]
        for image in images:
            data.append([image,idx])
    return data
data = load_data(DATASET_PATH)

In [5]:
random.shuffle(data)
random.shuffle(data)

In [6]:
def get_sample_rate(d):
    audio_path,label = d
    with wave.open(audio_path, 'rb') as wav_file:
        sample_rate = wav_file.getframerate()
        num_frames = wav_file.getnframes()
        duration = num_frames / sample_rate
    return sample_rate,duration,label

audio_info = [get_sample_rate(d) for d in data]
df = pd.DataFrame(audio_info, columns =['sample_rate', 'duration','label'])

In [7]:
df['label'].value_counts()

label
4    1000
2    1000
5    1000
3    1000
1    1000
0    1000
7    1000
6    1000
Name: count, dtype: int64

In [8]:
df.describe()

Unnamed: 0,sample_rate,duration,label
count,8000.0,8000.0,8000.0
mean,16000.0,0.983703,3.5
std,0.0,0.061426,2.291431
min,16000.0,0.426687,0.0
25%,16000.0,1.0,1.75
50%,16000.0,1.0,3.5
75%,16000.0,1.0,5.25
max,16000.0,1.0,7.0


In [46]:
def compute_mfcc(audio, sample_rate, n_mfcc=13):
    audio = audio.copy()
    audio[np.isnan(audio)] = 0.0
    audio[np.isinf(audio)] = 0.0
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return mfccs.T  

def process_audio(audio_path,target_length):
    with wave.open(audio_path, 'rb') as wav_file:
        frames = wav_file.readframes(-1)
        num_channels = wav_file.getnchannels()
        
        numpy_array = np.frombuffer(frames, dtype=np.float16)
        
        if num_channels == 2:
            numpy_array = numpy_array.reshape(-1, 2)
        
        if len(numpy_array) < target_length:
            padding_length = target_length - len(numpy_array)
            if num_channels == 2:
                numpy_array = np.pad(numpy_array, ((0, padding_length), (0, 0)), 'constant')
            else:
                numpy_array = np.pad(numpy_array, (0, padding_length), 'constant')
        
        elif len(numpy_array) > target_length:
            numpy_array = numpy_array[:target_length]

        mfcc = compute_mfcc(numpy_array,target_length)
        
        return mfcc #numpy_array.reshape((target_length,1))

In [47]:
class DataSequence(tf.keras.utils.Sequence):

    def __init__(self,data,batch_size=32,target_length=44100):
        self.data = data
        self.batch_size = batch_size
        self.target_length = target_length
    
    def __len__(self):
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self,idx):
        batch = self.data[idx * self.batch_size : (idx + 1) * self.batch_size]
        return self.data_gen(batch)

    def data_gen(self,data):
        audios,labels = list(),list()
        for audio,label in data:
            aud = process_audio(audio,self.target_length)
            audios.append(aud)
            labels.append(label)
        return np.array(audios),np.array(labels)

In [48]:
train_ratio = 0.7
val_ratio = 0.1
total_examples = len(data)
train_size = round(total_examples * train_ratio)
val_size = round(total_examples * val_ratio)
train_examples = data[:train_size]
val_examples = data[train_size:train_size+val_size]
test_examples = data[train_size+val_size:]

In [49]:
batch_size = 32
sample_rate = 16000
train_data = DataSequence(train_examples,batch_size,sample_rate)
test_data = DataSequence(test_examples,1,sample_rate)
eval_data = DataSequence(val_examples,batch_size,sample_rate)

In [51]:
train_data[1][0].shape

(32, 32, 13)

In [61]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=list(train_data[1][0].shape[1:])),

    tf.keras.layers.Conv1D(32,3),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Conv1D(32,3),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Conv1D(64,3),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(128),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(len(categories)),
    tf.keras.layers.Activation('softmax'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy']
)

In [62]:
model.summary()

In [63]:
history = model.fit(train_data,epochs=100,validation_data=eval_data,verbose=1)

Epoch 1/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 300ms/step - accuracy: 0.1205 - loss: 14.7918 - val_accuracy: 0.1275 - val_loss: 2.0794
Epoch 2/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 322ms/step - accuracy: 0.1266 - loss: 2.0795 - val_accuracy: 0.1250 - val_loss: 2.0794
Epoch 3/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 313ms/step - accuracy: 0.1180 - loss: 2.0798 - val_accuracy: 0.1275 - val_loss: 2.0795
Epoch 4/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 314ms/step - accuracy: 0.1255 - loss: 2.0797 - val_accuracy: 0.1275 - val_loss: 2.0796
Epoch 5/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 298ms/step - accuracy: 0.1209 - loss: 2.0800 - val_accuracy: 0.1213 - val_loss: 2.0797
Epoch 6/100
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 316ms/step - accuracy: 0.1220 - loss: 2.0793 - val_accuracy: 0.1075 - val_loss: 2.0798
Epo

KeyboardInterrupt: 