In [1]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

# Set the seed value for experiment reproducibility.
# `set_random_seed` seeds Python's built-in `random`, NumPy and TensorFlow in
# one call, so the built-in `random` module is no longer left unseeded.
seed = 42
tf.keras.utils.set_random_seed(seed)

In [2]:
# Remember the notebook's working directory and display its contents.
path = os.getcwd()
os.listdir(path)

['.ipynb_checkpoints',
 'build_model_mini.ipynb',
 'mini_modelsaved',
 'run.ipynb',
 'test',
 'Train']

In [3]:
# Dataset directory, given relative to the current working directory.
DATASET_PATH = 'Train/Train'

# The same location wrapped in a `pathlib.Path` object.
data_dir = pathlib.Path(DATASET_PATH)

In [4]:
import os
import shutil

# Set the root directory containing the audio files
root_dir = 'Train/Train'

# Define the directory names; the index of each name is the digit it stands for.
directory_names = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']


def copy_wavs_to_digit_dirs(root_dir, directory_names):
    """Copy each ``.wav`` file in ``root_dir`` into the folder of its leading digit.

    The first character of the file name selects the target folder:
    ``directory_names[int(filename[0])]``. Files are copied, not moved, so the
    originals stay in ``root_dir``.

    Args:
        root_dir: Directory that holds the ``.wav`` files directly.
        directory_names: Folder names indexed by digit; each one is created
            inside ``root_dir`` if it does not exist yet.

    Returns:
        A list of ``(filename, directory_name)`` tuples, one per copied file.
    """
    # Create the directories if they don't exist
    for dir_name in directory_names:
        os.makedirs(os.path.join(root_dir, dir_name), exist_ok=True)

    copied = []
    for filename in os.listdir(root_dir):
        src_path = os.path.join(root_dir, filename)
        if not filename.endswith('.wav') or not os.path.isfile(src_path):
            continue

        # Only ASCII digits are accepted here: str.isdigit() also accepts
        # characters such as superscripts, which int() would then reject.
        leading = filename[0]
        if leading not in '0123456789' or int(leading) >= len(directory_names):
            print(f"Skipped {filename}: name does not start with a known digit")
            continue

        dir_name = directory_names[int(leading)]
        shutil.copy(src_path, os.path.join(root_dir, dir_name, filename))
        print(f"Copied {filename} to directory {dir_name}")
        copied.append((filename, dir_name))
    return copied


# Copy the audio files to their respective directories
copied_files = copy_wavs_to_digit_dirs(root_dir, directory_names)


In [None]:
import os
import librosa
import soundfile as sf
import time

def convert_to_mono(input_path, output_path):
    """Write a single-channel copy of ``input_path`` to ``output_path``.

    The audio is loaded with ``sr=None`` and ``mono=False``. If the loaded
    array has more than one dimension it is down-mixed with
    ``librosa.to_mono``; otherwise it is written unchanged. The sample rate
    returned by the loader is reused for the output file.
    """
    samples, sample_rate = librosa.load(input_path, sr=None, mono=False)
    is_multichannel = samples.ndim > 1
    mono_samples = librosa.to_mono(samples) if is_multichannel else samples
    sf.write(output_path, mono_samples, sample_rate)

def preprocess_audio_files(root_directory):
    """Convert every ``.wav`` file under ``root_directory`` to mono, in place.

    The tree is walked recursively. Each file is first converted into a
    ``temp_<name>`` sibling, which then replaces the original. A short timing
    summary is printed at the end.

    Args:
        root_directory: Top of the directory tree to process.

    Returns:
        None.
    """
    start_time = time.time()
    file_count = 0

    for root, _dirs, files in os.walk(root_directory):
        for file in files:
            if not file.endswith('.wav'):
                continue

            file_path = os.path.join(root, file)
            temp_path = os.path.join(root, 'temp_' + file)

            # Convert to mono if needed
            convert_to_mono(file_path, temp_path)

            # Replace the original with the mono file in one step. A separate
            # remove + rename would lose the original if the process stopped
            # in between; os.replace overwrites the destination directly.
            os.replace(temp_path, file_path)

            file_count += 1

    total_time = time.time() - start_time

    print(f"Processed {file_count} files in {total_time:.2f} seconds.")
    # Guard the average: with no .wav files the division would raise
    # ZeroDivisionError.
    if file_count:
        print(f"Average time per file: {total_time / file_count:.2f} seconds.")
    else:
        print("No .wav files found; nothing to average.")

# Run the mono conversion over every .wav file below the training directory.
# The files are rewritten in place.
root_directory = 'Train/Train'
preprocess_audio_files(root_directory)


In [None]:
# Build the training and validation datasets from `data_dir` in one call
# (`subset='both'`), holding back 20% of the data for validation.
dataset_options = dict(
    directory=data_dir,
    batch_size=64,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset='both',
)
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(**dataset_options)

# The class names are exposed by the training dataset.
label_names = np.array(train_ds.class_names)
print("\nlabel names:", label_names)

In [None]:
# Display the type specification of the elements produced by the dataset.
train_ds.element_spec

In [None]:
def squeeze(audio, labels):
  """Drop the last axis of `audio`; `labels` are passed through unchanged."""
  return tf.squeeze(audio, axis=-1), labels

# Apply the squeeze to both splits.
train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Split the validation dataset into two shards: shard 0 is held out as the
# test set and shard 1 is kept as the validation set.
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)

In [None]:
# Take one batch from the training set and print the shapes of its audio and
# label tensors. `example_audio` and `example_labels` remain defined after
# the loop.
for example_audio, example_labels in train_ds.take(1):
  print(example_audio.shape, example_labels.shape, sep='\n')

In [None]:
# Plot the first rows*cols waveforms of the example batch, one per subplot,
# each titled with its label name.
rows = 3
cols = 3
n = rows * cols
fig, axes = plt.subplots(rows, cols, figsize=(16, 10))
for i, ax in zip(range(n), axes.flat):
  audio_signal = example_audio[i]
  ax.plot(audio_signal)
  ax.set_title(label_names[example_labels[i]])
  ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
  ax.set_ylim([-1.1, 1.1])

In [None]:
def get_spectrogram(waveform):
  """Return the STFT magnitude of `waveform` with a trailing channel axis.

  The short-time Fourier transform uses `frame_length=255` and
  `frame_step=128`. A new last axis is appended so the result can be fed to
  convolution layers as image-like data
  (`batch_size`, `height`, `width`, `channels`).
  """
  stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
  magnitude = tf.abs(stft)
  return magnitude[..., tf.newaxis]

In [None]:
# Show the label, tensor shapes and an audio player for the first three
# examples of the batch. `label`, `waveform` and `spectrogram` keep the values
# of the last iteration once the loop has finished.
for i in range(3):
  waveform = example_audio[i]
  label = label_names[example_labels[i]]
  spectrogram = get_spectrogram(waveform)

  print('Label:', label)
  print('Waveform shape:', waveform.shape)
  print('Spectrogram shape:', spectrogram.shape)
  print('Audio playback')
  player = display.Audio(waveform, rate=16000)
  display.display(player)

In [None]:
def plot_spectrogram(spectrogram, ax):
  """Draw the log of `spectrogram` on `ax` with `pcolormesh`.

  An input with more than two dimensions must have exactly three; its last
  axis is squeezed away. The array is transposed before plotting, so the
  first axis of the input runs along x.
  """
  rank = len(spectrogram.shape)
  if rank > 2:
    assert rank == 3
    spectrogram = np.squeeze(spectrogram, axis=-1)

  # A small epsilon keeps the logarithm finite where the magnitude is zero.
  eps = np.finfo(float).eps
  log_spec = np.log(spectrogram.T + eps)

  height = log_spec.shape[0]
  width = log_spec.shape[1]
  # x runs from 0 to the total number of spectrogram elements, one tick per
  # column; y is the row index.
  x_coords = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
  y_coords = range(height)
  ax.pcolormesh(x_coords, y_coords, log_spec)

In [None]:
# Plot the current `waveform` above its `spectrogram`, titled with `label`.
fig, axes = plt.subplots(2, figsize=(12, 8))
wave_ax, spec_ax = axes

timescale = np.arange(waveform.shape[0])
wave_ax.plot(timescale, waveform.numpy())
wave_ax.set_title('Waveform')
wave_ax.set_xlim([0, 16000])

plot_spectrogram(spectrogram.numpy(), spec_ax)
spec_ax.set_title('Spectrogram')

fig.suptitle(label.title())
plt.show()

In [None]:
def make_spec_ds(ds):
  """Map a dataset of (audio, label) pairs to (spectrogram, label) pairs."""
  def to_spectrogram(audio, label):
    return get_spectrogram(audio), label

  return ds.map(map_func=to_spectrogram, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Convert the waveform datasets of all three splits into spectrogram datasets.
train_spectrogram_ds = make_spec_ds(train_ds)
val_spectrogram_ds = make_spec_ds(val_ds)
test_spectrogram_ds = make_spec_ds(test_ds)

In [None]:
# Bind the first element of the training spectrogram dataset to
# `example_spectrograms` / `example_spect_labels`; the loop exists only to
# perform that assignment and stops immediately.
for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
  break

In [None]:
# Plot the first rows*cols example spectrograms in a grid, each titled with
# its label name.
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(16, 9))

for i in range(n):
    r, c = divmod(i, cols)
    ax = axes[r][c]
    spectrogram_array = example_spectrograms[i].numpy()
    label_index = example_spect_labels[i].numpy()
    plot_spectrogram(spectrogram_array, ax)
    ax.set_title(label_names[label_index])

plt.show()

In [None]:
# Cache and prefetch all three spectrogram datasets. Only the training set is
# shuffled, with a buffer of 10000 elements.
train_spectrogram_ds = train_spectrogram_ds.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)
val_spectrogram_ds = val_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)
test_spectrogram_ds = test_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Four 2x2 max-pooling layers each halve both spatial dimensions, so the
# feature maps leaving the convolutional stack are 16x smaller than the input
# along each axis. The last convolution has 256 filters. The maps are reshaped
# to (steps, features) before the Dense / Conv1D part of the model.

input_shape = example_spectrograms.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)
# Read the spectrogram size from the data instead of hard-coding it, so the
# Reshape below stays valid if the STFT parameters or the clip length change.
spect_width = int(input_shape[0])
spect_height = int(input_shape[1])
pool_factor = 2 ** 4       # four MaxPooling2D((2, 2)) layers
last_conv_filters = 256    # filters of the final Conv2D layer


# Instantiate the `tf.keras.layers.Normalization` layer.
norm_layer = layers.Normalization()
# Fit the state of the layer to the spectrograms
# with `Normalization.adapt`.
norm_layer.adapt(data=train_spectrogram_ds.map(map_func=lambda spec, label: spec))
# Floor division matches the repeated halving done by the default ('valid')
# pooling: (steps, pooled_height * filters).
new_shape = (spect_width // pool_factor,
             (spect_height // pool_factor) * last_conv_filters)

model = keras.models.Sequential([
    layers.Input(shape=input_shape),
    # Use the adapted normalization layer; previously it was fitted above but
    # never added to the model.
    norm_layer,
    # Convolutional feature extractor: four conv + pool stages.
    layers.Conv2D(32, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(last_conv_filters, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),
    # Collapse (height, channels) into one feature axis per step.
    layers.Reshape(target_shape=new_shape),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.2),
    layers.Conv1D(64, 3, activation="relu", padding="same"),
    layers.MaxPool1D(2),
    layers.Dropout(0.5),
    layers.Flatten(),  # Collapse the sequence dimension before the classifier
    layers.Dense(num_labels, activation="softmax"),
])
model.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers

# Input shape and label count are taken from the example batch and the label
# names.
input_shape = example_spectrograms.shape[1:]
num_labels = len(label_names)

# Fit a `tf.keras.layers.Normalization` layer to the training spectrograms
# (labels are dropped before `adapt`).
norm_layer = layers.Normalization()
spectrograms_only = train_spectrogram_ds.map(map_func=lambda spec, label: spec)
norm_layer.adapt(data=spectrograms_only)

# Number of features after the convolutional stack below: the 32x32 input
# shrinks to 28x28 through two unpadded 3x3 convolutions, pooling halves it to
# 14x14, and the last convolution has 64 channels.
desired_sequence_length = 14 * 14 * 64  # 12544

model = models.Sequential([
    layers.Input(shape=input_shape),
    # Downsample the input.
    layers.Resizing(32, 32),
    # Normalize.
    norm_layer,
    # Convolutional feature extractor.
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    # Reshape to (steps, desired_sequence_length) for the recurrent part.
    layers.Reshape((-1, desired_sequence_length)),
    layers.TimeDistributed(layers.Dense(128, activation='relu')),
    layers.LSTM(128, activation='relu'),
    layers.Dropout(0.5),
    # One softmax output per label.
    layers.Dense(num_labels, activation='softmax'),
])

model.summary()


In [None]:
# Compile with the Adam optimizer and sparse categorical cross-entropy.
# `from_logits=False` because the model's last layer already applies softmax.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Training configuration.
EPOCHS = 100
early_stopping_patience = 30
reduce_lr_patience=10

# Stop when validation accuracy has not improved for
# `early_stopping_patience` epochs, restoring the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=early_stopping_patience,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.1 after `reduce_lr_patience` epochs without
# improvement in validation accuracy.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=reduce_lr_patience,
)
training_callbacks = [early_stopping, reduce_lr]

# Train the model
history = model.fit(
    train_spectrogram_ds,
    validation_data=val_spectrogram_ds,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)

In [None]:
# Plot training and validation loss (left) and accuracy in percent (right)
# against the epoch number.
metrics = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 6))

loss_ax.plot(history.epoch, metrics['loss'], metrics['val_loss'])
loss_ax.legend(['loss', 'val_loss'])
loss_ax.set_ylim([0, max(loss_ax.get_ylim())])
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss [CrossEntropy]')

train_acc_pct = 100*np.array(metrics['accuracy'])
val_acc_pct = 100*np.array(metrics['val_accuracy'])
acc_ax.plot(history.epoch, train_acc_pct, val_acc_pct)
acc_ax.legend(['accuracy', 'val_accuracy'])
acc_ax.set_ylim([0, 100])
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy [%]')

In [None]:
# Evaluate the model on the test set; `return_dict=True` returns the metrics
# as a dictionary.
model.evaluate(test_spectrogram_ds, return_dict=True)

In [None]:
# Run the model on the test spectrograms.
y_pred = model.predict(test_spectrogram_ds)

In [None]:
# Replace the per-class scores with the index of the largest one along axis 1.
y_pred = tf.argmax(y_pred, axis=1)

In [None]:
# Collect the labels of every test-set element and join them into one tensor.
y_true = tf.concat(list(test_spectrogram_ds.map(lambda s,lab: lab)), axis=0)

In [None]:
# Draw the confusion matrix of true labels against predictions as an
# annotated heatmap.
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    confusion_mtx,
    xticklabels=label_names,
    yticklabels=label_names,
    annot=True,
    fmt='g',
    ax=ax,
)
ax.set_xlabel('Prediction')
ax.set_ylabel('Label')
plt.show()

In [None]:
class ExportModel(tf.Module):
  """Wrap a model so it can be called on a WAV filename or on waveforms.

  Calling the module returns a dict with the raw model output
  ('predictions'), the argmax index per example ('class_ids') and the
  matching entries of `label_names` ('class_names').
  """

  def __init__(self, model):
    self.model = model

    # Accept either a string-filename or a batch of waveforms.
    # You could add additional signatures for a single wave, or a ragged-batch.
    input_specs = (
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=[None, 16000], dtype=tf.float32),
    )
    for spec in input_specs:
      self.__call__.get_concrete_function(x=spec)

  @tf.function
  def __call__(self, x):
    # A string input is treated as a filename: read and decode the file to one
    # channel of 16000 samples, drop the channel axis and add a batch axis.
    if x.dtype == tf.string:
      wav_bytes = tf.io.read_file(x)
      decoded, _ = tf.audio.decode_wav(
          wav_bytes, desired_channels=1, desired_samples=16000)
      x = tf.squeeze(decoded, axis=-1)[tf.newaxis, :]

    spectrograms = get_spectrogram(x)
    result = self.model(spectrograms, training=False)

    class_ids = tf.argmax(result, axis=-1)
    class_names = tf.gather(label_names, class_ids)
    return {
        'predictions': result,
        'class_ids': class_ids,
        'class_names': class_names,
    }

In [None]:
# Wrap the model in the export module defined above.
export = ExportModel(model)

In [None]:
# Save the export module to "mini_modelsaved", load it back, and call the
# loaded object on `waveform` with a leading batch axis added.
tf.saved_model.save(export, "mini_modelsaved")
imported = tf.saved_model.load("mini_modelsaved")
imported(waveform[tf.newaxis, :])