**Import the necessary libraries**

In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import shutil

**Set the seed value for experiment reproducibility.**

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

**Download and extract the dataset from Tensorflow's Mini Speech Commands dataset**

In [3]:
DATASET_PATH = '/content/mini_speech_commands'
#Since im using colab for this, the path is set like this,
#feel free to change the path if you're running on your own PC.
if not os.path.exists(DATASET_PATH):
    tf.keras.utils.get_file(
        'mini_speech_commands.zip',
        origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
        extract=True,
        cache_dir='/content/',
        cache_subdir='.')

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip


**Since we are only working with yes,go and left in this example, im gonna remove other folders so we get less processing time.**

In [4]:
commands = ['yes', 'go', 'left']
for folder in os.listdir(DATASET_PATH):
    folder_path = os.path.join(DATASET_PATH, folder)
    if os.path.isdir(folder_path) and folder not in commands:
        shutil.rmtree(folder_path)

**Preprocess the audio data**

In [5]:
def preprocess_audio(audio, labels):
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels

**Load and preprocess the training and validation datasets**

In [6]:
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=DATASET_PATH,
    batch_size=64,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset='both',
    labels='inferred',
    label_mode='int',
)

train_ds = train_ds.map(preprocess_audio, tf.data.AUTOTUNE)
val_ds = val_ds.map(preprocess_audio, tf.data.AUTOTUNE)

Found 3000 files belonging to 3 classes.
Using 2400 files for training.
Using 600 files for validation.


**Now time to define the model architecture**

*   We want the inputs in shapes of 16000 items.
*   Activation is set to 'relu'



In [7]:
input_shape = (16000,)
num_labels = len(commands)

model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Reshape((16000, 1)),
    layers.Conv1D(32, 3, activation='relu'),
    layers.Conv1D(64, 3, activation='relu'),
    layers.MaxPooling1D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])

**Compiling and then training the model.**

In [8]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [9]:
EPOCHS = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping


**Evaluate the model**

*   Finally we are going to evaluate our created model and see the test results.
*   This includes the Loss and Accuracy of our model.





In [10]:
test_ds = val_ds.take(1)
test_results = model.evaluate(test_ds, return_dict=True)
print("Test Loss:", test_results['loss'])
print("Test Accuracy:", test_results['accuracy'])

Test Loss: 1.1192364692687988
Test Accuracy: 0.671875


**Export the model**

Now we can export our created model and use it elsewhere.

In [11]:
model.save("/content/sound_classification_model")



---



# Test the exported model

**Now that we have exported our model, It's time to load it up in another notebook and test it.**

**Load the saved model**

In [12]:
model = tf.keras.models.load_model("/content/sound_classification_model")

**Load an example**

*   Here we just copied an example from our own dataset.
*   The example is "Yes".
*   Let's see if our model can detect the voice.







In [135]:
example_file_path = '/content/mini_speech_commands/yes/01648c51_nohash_1.wav' #yes
#example_file_path = '/content/mini_speech_commands/left/0e5193e6_nohash_0.wav' #left
#example_file_path = '/content/mini_speech_commands/go/07c5129e_nohash_0.wav' #go
audio_binary = tf.io.read_file(example_file_path)
waveform, _ = tf.audio.decode_wav(audio_binary)

**Preprocess the audio data**

In [136]:
waveform = tf.squeeze(waveform, axis=-1)
waveform = tf.reshape(waveform, (1, -1))

**Predict the class.**

In [137]:
predictions = model.predict(waveform)
predicted_class_index = np.argmax(predictions)



**We used these names during training so we are gonna stick with them in here too.**

In [138]:
class_names = ['go', 'left', 'yes']

**Get the predicted class index**

*   Go=0, Left=1, Yes=2



In [139]:
predicted_class_name = class_names[predicted_class_index]

**And finally we're gonna print out the predicted class index and name accordingly.**

In [141]:
print("Predicted class name:", predicted_class_name)

Predicted class name: yes


Feel free to test out other classes.

*   For "Yes", Change the example_file_path to : '/content/mini_speech_commands/yes/0132a06d_nohash_1.wav'
*   For "Go", Change the example_file_path to : '/content/mini_speech_commands/go/016e2c6d_nohash_0.wav'
*   For "Left" Change the example_file_path to : '/content/mini_speech_commands/left/0132a06d_nohash_0.wav'

