# Lab 5: Bird Song Classification (xeno-canto recordings)

## Imports

In [1]:
import copy
import requests
import json
from urllib.parse import urlparse
import wave
from pathlib import Path
import os
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Conv1D, AvgPool1D, MaxPool1D, ZeroPadding1D, BatchNormalization, Flatten, Dense, Activation, Dropout, SeparableConv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D, ReLU, LeakyReLU
from keras.activations import softmax
from keras.utils import get_file
from keras.utils import to_categorical
import librosa
import soundfile as sf
import random
from sklearn.model_selection import train_test_split
from pydub import AudioSegment



## Download bird song recordings from xeno-canto

In [2]:
def sanitize_filename(name):
    """Return ``name`` with characters that are unsafe in file names replaced.

    Each of the characters ``< > : " / \\ | ? *`` is replaced by an
    underscore; every other character is kept unchanged.
    """
    substitutions = str.maketrans({forbidden: '_' for forbidden in '<>:"/\\|?*'})
    return name.translate(substitutions)

def download_file(url, filepath, timeout=30):
    """Download ``url`` and store the response body at ``filepath``.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    print(f"Downloading from {url} to {filepath}...")

    # Stream the body so a large recording is never held entirely in memory.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Raises an exception if the request failed

        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, so only create the parent directory when there is one.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write to a temporary name first and rename at the end, so that an
        # interrupted download never leaves a truncated file at ``filepath``.
        partial_path = f"{filepath}.part"
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        os.replace(partial_path, filepath)

# xeno-canto API queries: one per bird genus, restricted to recordings of type "song".
urls = [
    'https://xeno-canto.org/api/2/recordings?query=type:song+gen:emberiza',
    'https://xeno-canto.org/api/2/recordings?query=type:song+gen:muscicapa',
    'https://xeno-canto.org/api/2/recordings?query=type:song+gen:alauda'
]

dataset_path = 'dataset'
MAX_RECORDINGS_PER_GENUS = 300  # Cap per genus to keep download/processing time reasonable
API_TIMEOUT_S = 30  # Seconds to wait for the API before giving up

for url in urls:
    response = requests.get(url, timeout=API_TIMEOUT_S)
    response.raise_for_status()
    data = response.json()

    recordings = data['recordings']
    if not recordings:
        print(f"No recordings returned for {url}")
        continue

    # The genus of the first recording names the class folder.
    genus = recordings[0]['gen'].lower()
    genus_path = os.path.join(dataset_path, genus)

    for recording in recordings[:MAX_RECORDINGS_PER_GENUS]:
        audio_url = recording['file']
        if not audio_url:
            # NOTE(review): presumably some entries carry no downloadable file;
            # an empty URL would make requests raise, so skip it. Confirm.
            continue

        filename = sanitize_filename(recording['file-name'])
        filepath = os.path.join(genus_path, filename)

        # Skip files fetched by a previous run so that re-running the cell
        # resumes the download instead of starting from scratch.
        if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
            continue

        download_file(audio_url, filepath)


Downloading from https://xeno-canto.org/672086/download to dataset\emberiza\XC672086-bunting song[ExtAudio].mp3...
Downloading from https://xeno-canto.org/652967/download to dataset\emberiza\XC652967-crested bunting 20210530-100646.mp3...
Downloading from https://xeno-canto.org/796619/download to dataset\emberiza\XC796619-CrestedBunting_MuangNgan_230421_2087_edited.mp3...
Downloading from https://xeno-canto.org/796618/download to dataset\emberiza\XC796618-CrestedBunting_MuangNgan_230421_2085_edited.mp3...
Downloading from https://xeno-canto.org/481292/download to dataset\emberiza\XC481292-crbu_may2019_C0004a.mp3...
Downloading from https://xeno-canto.org/472696/download to dataset\emberiza\XC472696-LS101479 Crested Bunting song B.mp3...
Downloading from https://xeno-canto.org/463615/download to dataset\emberiza\XC463615-2060 CRESTED BUNTING 18.mp3...
Downloading from https://xeno-canto.org/177703/download to dataset\emberiza\XC177703-Crested-Bunting-Mishmi-Hills-May3-2014-Pritam-Baruah

## Splitting sounds into 1 sec clips and trimming silence / parasitic noise

In [157]:
def preprocess_audio(file_path, target_dir, duration=1.0, sample_rate=16000):
    """Trim one recording and split it into fixed-length WAV clips.

    The recording is loaded as mono audio at ``sample_rate``, leading and
    trailing silence is removed with ``librosa.effects.trim``, and the result
    is cut into consecutive, non-overlapping clips of ``duration`` seconds.
    A trailing remainder shorter than one clip is discarded. Clips are written
    to ``target_dir`` as ``<original stem>_<index>.wav`` in 16-bit PCM.

    Args:
        file_path: Path of a ``.wav`` or ``.mp3`` file (extension matched
            case-insensitively).
        target_dir: Directory receiving the clips; created if at least one
            clip is produced.
        duration: Clip length in seconds.
        sample_rate: Sampling rate used for loading and writing.

    Any error is reported on stdout and the file is skipped, so that a single
    bad recording does not abort the processing of a whole directory.
    """
    try:
        # Compare the lower-cased extension so "SONG.MP3" / "clip.WAV" are accepted too.
        extension = os.path.splitext(file_path)[1].lower()
        if extension == ".wav":
            audio, _ = librosa.load(file_path, sr=sample_rate)
        elif extension == ".mp3":
            audio_segment = AudioSegment.from_mp3(file_path)
            # NOTE(review): buf_to_float is called with its defaults, which
            # presumably expect 16-bit samples — confirm for unusual inputs.
            audio = audio_segment.set_frame_rate(sample_rate).set_channels(1).get_array_of_samples()
            audio = librosa.util.buf_to_float(audio)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        trimmed_audio, _ = librosa.effects.trim(audio)
        target_samples = int(duration * sample_rate)
        num_segments = len(trimmed_audio) // target_samples
        if num_segments == 0:
            # Recording shorter than one clip: nothing to write, no directory to create.
            return

        # Loop-invariant work is done once instead of once per clip.
        os.makedirs(target_dir, exist_ok=True)
        stem = os.path.splitext(os.path.basename(file_path))[0]

        for i in range(num_segments):
            start = i * target_samples
            segment = trimmed_audio[start:start + target_samples]
            segment_path = os.path.join(target_dir, f"{stem}_{i}.wav")
            sf.write(segment_path, segment, sample_rate, 'PCM_16')
    except Exception as e:
        print(f"Erreur lors du traitement de {file_path}: {str(e)}")

def process_directory(dataset_path, output_root):
    """Run ``preprocess_audio`` on every ``.wav``/``.mp3`` file under ``dataset_path``.

    The directory tree is walked recursively and the sub-directory layout of
    ``dataset_path`` is mirrored below ``output_root``.
    """
    audio_suffixes = (".wav", ".mp3")
    for current_dir, _subdirs, filenames in os.walk(dataset_path):
        # Same relative location as the source folder, but under output_root.
        mirrored_dir = os.path.join(output_root, os.path.relpath(current_dir, dataset_path))
        for filename in filenames:
            if not filename.endswith(audio_suffixes):
                continue
            source_path = os.path.join(current_dir, filename)
            print(f"Processing {source_path}...")
            preprocess_audio(source_path, mirrored_dir)

# Clean every downloaded recording: raw files are read from `dataset`, the
# 1-second clips are written to `dataset_cleaned` with the same sub-folders.
dataset_path, output_root = "dataset", "dataset_cleaned"
process_directory(dataset_path, output_root)

## Load the cleaned 1-second bird song clips, balance the classes and split into train/test sets

In [159]:
CLASSES = ['alauda', 'emberiza', 'muscicapa']
dataset_dir = Path('dataset_cleaned')
test_size_fraction = 0.2  # 20% of the data is held out for the test set
target_length = 16000  # Target length (in samples) for every audio clip
SEED = 42  # Fixed seed so class balancing and the train/test split are reproducible

x = []
y = []

for class_index, class_folder in enumerate(CLASSES):
    # Only the .wav clips, in sorted order: a stray non-audio file cannot crash
    # librosa.load, and the sample order no longer depends on the file system.
    # (The list is deliberately not called `files`, a name this notebook later
    # imports from importlib.resources.)
    class_recordings = sorted(dataset_dir.glob(f'{class_folder}/*.wav'))
    for recording in class_recordings:
        data, sr = librosa.load(str(recording), sr=16000)
        data = librosa.util.fix_length(data, size=target_length)  # Pad/truncate to the target length
        data = data.reshape(-1, 1)  # (samples, 1): a single input channel for Conv1D
        x.append(data)
        y.append(class_index)

x = np.array(x, dtype=np.float32)
y = np.array(y, dtype=np.int64)

# Fail early with a clear message instead of an opaque error further down.
class_counts = np.bincount(y, minlength=len(CLASSES))
if class_counts.min() == 0:
    missing = [name for name, count in zip(CLASSES, class_counts) if count == 0]
    raise ValueError(f"No clips found in {dataset_dir} for classes: {missing}")

# Smallest number of samples among all classes
min_size = int(class_counts.min())

# Undersample every class down to min_size to balance the classes
rng = np.random.default_rng(SEED)
indices_to_keep = np.concatenate([
    rng.permutation(np.where(y == class_index)[0])[:min_size]
    for class_index in range(len(CLASSES))
])

# Keep only the selected indices
x_balanced = x[indices_to_keep]
y_balanced = y[indices_to_keep]

# Stratified split into training and test sets
# NOTE(review): clips cut from the same recording can land in both sets, which
# may make the test accuracy optimistic — consider splitting per recording.
x_train, x_test, y_train, y_test = train_test_split(
    x_balanced,
    y_balanced,
    test_size=test_size_fraction,
    stratify=y_balanced,
    random_state=SEED,
)

# Convert labels to their categorical (one-hot) representation
y_train = to_categorical(y_train, num_classes=len(CLASSES))
y_test = to_categorical(y_test, num_classes=len(CLASSES))

# Normalise with statistics from the training set only
x_mean = np.mean(x_train)
x_std = np.std(x_train)
x_train = (x_train - x_mean) / x_std
x_test = (x_test - x_mean) / x_std

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
unique_labels, counts = np.unique(y_test.argmax(axis=1), return_counts=True)
print(dict(zip(unique_labels, counts)))

(5390, 16000, 1) (5390, 3)
(1348, 16000, 1) (1348, 3)
{0: 449, 1: 449, 2: 450}


## Prepare for inference with fixed-point Q7.9 samples by scaling input data accordingly

In [160]:
# Divide both splits in place by 2**FIXED_POINT.
# NOTE(review): this cell is not idempotent — running it twice scales the data
# twice; re-run the loading cell first if it has to be executed again.
FIXED_POINT = 9
input_scale = 2 ** FIXED_POINT
for split in (x_train, x_test):
    split /= input_scale

## Export small dataset (250 random vectors)

In [161]:
# Draw the exported subset with a dedicated, seeded generator so that the CSV
# files (and the accuracy measured by the generated C code) are identical from
# one run of the notebook to the next.
N_EXPORT = 250
export_rng = np.random.default_rng(0)
perms = export_rng.permutation(len(y_test))[:N_EXPORT]
x_test_250 = x_test[perms]
y_test_250 = y_test[perms]

# One row per clip: the inputs are flattened to (n_clips, n_samples); the
# labels keep their one-hot layout.
np.savetxt('x_test_birds_250.csv', x_test_250.reshape((x_test_250.shape[0], -1)), delimiter=',', fmt='%s')
np.savetxt('y_test_birds_250.csv', y_test_250, delimiter=',', fmt='%s')

## Build model M5

In [228]:
# Convolution stages as (filters, kernel_size, strides); each one is followed
# by the same 4x max-pooling layer.
conv_stages = [
    (8, 40, 4),
    (16, 2, 1),
    (32, 2, 1),
    (32, 2, 1),
]

model = Sequential()
model.add(Input(shape=(16000, 1)))
# Halve the temporal resolution before the first convolution.
model.add(MaxPool1D(pool_size=2, strides=2, padding='valid'))
for n_filters, kernel_width, stride in conv_stages:
    model.add(Conv1D(filters=n_filters, kernel_size=kernel_width, strides=stride, activation='relu'))
    model.add(MaxPool1D(pool_size=4, strides=4, padding='valid'))
# Classifier head: one output unit per class, softmax as a separate layer.
model.add(Flatten())
model.add(Dense(units=3))
model.add(Activation('softmax'))

opt = tf.keras.optimizers.Adam(learning_rate=1e-2)
model.summary()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 max_pooling1d_132 (MaxPooli  (None, 8000, 1)          0         
 ng1D)                                                           
                                                                 
 conv1d_108 (Conv1D)         (None, 1991, 8)           328       
                                                                 
 max_pooling1d_133 (MaxPooli  (None, 497, 8)           0         
 ng1D)                                                           
                                                                 
 conv1d_109 (Conv1D)         (None, 496, 16)           272       
                                                                 
 max_pooling1d_134 (MaxPooli  (None, 124, 16)          0         
 ng1D)                                                           
                                                     

## Train model

In [229]:
# Train for 50 epochs; the test split is passed as validation data, so the
# per-epoch validation metrics are computed on the test set.
model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=268,
    validation_data=(x_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1af89a8fdc0>

## Evaluate model on test dataset

In [230]:
# Loss/accuracy on the full test split, then the confusion matrix
# (rows: true class, columns: predicted class).
model.evaluate(x_test, y_test, verbose=2)
pred_test = model.predict(x_test)
true_classes = y_test.argmax(axis=1)
predicted_classes = pred_test.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes, predicted_classes))

43/43 - 0s - loss: 0.3939 - categorical_accuracy: 0.8657 - 271ms/epoch - 6ms/step
tf.Tensor(
[[342  87  20]
 [ 24 416   9]
 [  9  32 409]], shape=(3, 3), dtype=int32)


## Evaluate model on small dataset

In [231]:
# Same evaluation on the exported 250-vector subset, to compare against the
# accuracy reported by the generated C code.
model.evaluate(x_test_250, y_test_250, verbose=2)
pred_test_250 = model.predict(x_test_250)
true_classes_250 = y_test_250.argmax(axis=1)
predicted_classes_250 = pred_test_250.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes_250, predicted_classes_250))

8/8 - 0s - loss: 0.3423 - categorical_accuracy: 0.8800 - 72ms/epoch - 9ms/step
tf.Tensor(
[[55  9  7]
 [ 7 82  1]
 [ 1  5 83]], shape=(3, 3), dtype=int32)


## Save trained model

In [232]:
# Persist the trained network (still including its softmax layer) to disk.
saved_model_path = 'model.h5'
model.save(saved_model_path)

## Remove SoftMax layer

In [233]:
# Rebuild the model so that it ends at the layer just before the final
# softmax Activation; anything else is reported and the model is left untouched.
last_layer = model.layers[-1]
if not isinstance(last_layer, Activation) or last_layer.activation != softmax:
    print('Error: last layer is not SoftMax Activation')
else:
    model = tf.keras.Model(model.input, model.layers[-2].output, name=model.name)

## Install Qualia-CodeGen for C inference code generation

In [234]:
#%pip install qualia_codegen_core
import qualia_codegen_core
from qualia_codegen_core.graph.KerasModelGraph import KerasModelGraph
from qualia_codegen_core.graph.Quantization import Quantization
from qualia_codegen_core.graph.RoundMode import RoundMode
#
from importlib.resources import files
# Locate the example Linux `main.cpp` shipped inside the installed
# qualia_codegen_core package; it is compiled together with the generated model.
example_main = files('qualia_codegen_core.examples') / 'Linux' / 'main.cpp'
main_path = str(example_main.resolve())
print(main_path)

C:\Program Files\Python310\Lib\site-packages\qualia_codegen_core\examples\Linux\main.cpp


## Convert Keras Model to Qualia-CodeGen's internal representation

In [235]:
# Wrap the (softmax-free) Keras model and convert it to Qualia-CodeGen's graph.
keras_graph = KerasModelGraph(model)
modelgraph = keras_graph.convert()
print(modelgraph)

—————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
Inputs                                           | Layer                                            | Outputs                                          | Input shape                                      | Output shape                                    
—————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
                                                 | input_26                                         | max_pooling1d_132                                | (1, 16000, 1)                                    | ((1, 16000, 1),)                   

## Generate C code for the trained model with 32-bit floating-point representation

In [236]:
# Work on a copy so the annotations below do not leak into `modelgraph`.
float_modelgraph = copy.deepcopy(modelgraph)

# float32 annotation applied to every layer: no scale factor and no rounding,
# since these only matter for fixed-point quantization on integers.
float32_settings = dict(
    number_type=float,
    width=32,
    long_width=32,
    weights_scale_factor=0,
    output_scale_factor=0,
    weights_round_mode=RoundMode.NONE,
    output_round_mode=RoundMode.NONE,
)
for layer_node in float_modelgraph.nodes:
    # A fresh Quantization object per node, as nodes must not share one instance.
    layer_node.q = Quantization(**float32_settings)

float_converter = qualia_codegen_core.Converter(output_path=Path('birds_output_floating'))
float_res = float_converter.convert_model(float_modelgraph)

# Keep the returned source text as a single header next to the notebook.
with open('birds_model_floating.h', 'w') as header_file:
    header_file.write(float_res)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\conv1d
......vars
.........0
.........1
...layers\conv1d_1
......vars
.........0
.........1
...layers\conv1d_2
......vars
.........0
.........1
...layers\conv1d_3
......vars
.........0
.........1
...layers\dense
......vars
.........0
.........1
...layers\flatten
......vars
...layers\input_layer
......vars
...layers\max_pooling1d
......vars
...layers\max_pooling1d_1
......vars
...layers\max_pooling1d_2
......vars
...layers\max_pooling1d_3
......vars
...layers\max_pooling1d_4
......vars
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2024-04-20 15:58:19         5044
metadata.json                                  2024-04-20 15:58:19           64
variables.h5                                   2024-04-20 15:58:19        46712


Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2024-04-20 15:58:18         5044
metadata.json                                  2024-04-20 15:58:18           64
variables.h5                                   2024-04-20 15:58:18        46712
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers\conv1d
......vars
.........0
.........1
...layers\conv1d_1
......vars
.........0
.........1
...layers\conv1d_2
......vars
.........0
.........1
...layers\conv1d_3
......vars
.........0
.........1
...layers\dense
......vars
.........0
.........1
...layers\flatten
......vars
...layers\input_layer
......vars
...layers\max_pooling1d
......vars
...layers\max_pooling1d_1
......vars
...layers\max_pooling1d_2
......vars
...layers\max_pooling1d_3
......vars
...layers\max_pooling1d_4
......vars
...vars


## Compile the 32-bit floating-point C code for x86 and evaluate on small dataset

In [237]:
# Build the float32 model: compile the generated `birds_output_floating/model.c`
# together with the example `main.cpp` (its path is interpolated from the Python
# variable `main_path`) into the `birds_floating` executable. The generated
# `defines.h` is force-included and the generated `include/` directory is put on
# the include path.
!g++ -std=c++17 -Wall -Wextra -pedantic -Ofast -o birds_floating -include birds_output_floating/include/defines.h -Ibirds_output_floating/include birds_output_floating/model.c "{main_path}"

import os

# Run the compiled program with the appropriate executable based on the OS;
# it receives the exported input vectors and labels as CSV files.
if os.name == 'posix':
    !./birds_floating x_test_birds_250.csv y_test_birds_250.csv
else:
    !birds_floating.exe x_test_birds_250.csv y_test_birds_250.csv


In file included from birds_output_floating/model.c:15:
birds_output_floating/include/number.h: In function 'float scale_number_t_float(float, int, round_mode_t)':
   float number, int scale_factor, round_mode_t round_mode) {
                 ~~~~^~~~~~~~~~~~
   float number, int scale_factor, round_mode_t round_mode) {
                                   ~~~~~~~~~~~~~^~~~~~~~~~
birds_output_floating/include/number.h: In function 'float scale_and_clamp_to_number_t_float(float, int, round_mode_t)':
   float number, int scale_factor, round_mode_t round_mode) {
                 ~~~~^~~~~~~~~~~~
   float number, int scale_factor, round_mode_t round_mode) {
                                   ~~~~~~~~~~~~~^~~~~~~~~~
In file included from birds_output_floating/include/model.h:19,
                 from C:\Program Files\Python310\Lib\site-packages\qualia_codegen_core\examples\Linux\main.cpp:12:
birds_output_floating/include/number.h: In function 'float scale_number_t_float(float, int, round_mode

## Generate C code for the trained model with 16-bit fixed-point representation

In [275]:
# Work on a copy so the annotations below do not leak into `modelgraph`.
fixed_modelgraph = copy.deepcopy(modelgraph)

# Scale factor shared by weights and layer outputs. It is named once so the two
# cannot drift apart. With a 16-bit width this is presumably 13 fractional bits
# (not the "Q9.7" format the previous comment claimed).
# NOTE(review): the input data was scaled by 2**-FIXED_POINT (9) earlier in the
# notebook while 13 is used here — confirm that the mismatch is intended.
FIXED_SCALE_FACTOR = 13

# layer quantization annotations for int16 with scale factor 13
for node in fixed_modelgraph.nodes:
    node.q = Quantization(
            number_type=int,
            width=16,
            long_width=32,  # width of the wider "long" type; presumably for intermediate results
            weights_scale_factor=FIXED_SCALE_FACTOR,
            output_scale_factor=FIXED_SCALE_FACTOR,
            weights_round_mode=RoundMode.FLOOR,
            output_round_mode=RoundMode.FLOOR,
            )

fixed_res = qualia_codegen_core.Converter(output_path=Path('birds_output_fixed')).convert_model(fixed_modelgraph)

# Keep the returned source text as a single header next to the notebook.
with open('birds_model_fixed.h', 'w') as f:
    f.write(fixed_res)


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\conv1d
......vars
.........0
.........1
...layers\conv1d_1
......vars
.........0
.........1
...layers\conv1d_2
......vars
.........0
.........1
...layers\conv1d_3
......vars
.........0
.........1
...layers\dense
......vars
.........0
.........1
...layers\flatten
......vars
...layers\input_layer
......vars
...layers\max_pooling1d
......vars
...layers\max_pooling1d_1
......vars
...layers\max_pooling1d_2
......vars
...layers\max_pooling1d_3
......vars
...layers\max_pooling1d_4
......vars
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2024-04-20 16:05:19         5044
metadata.json                                  2024-04-20 16:05:19           64
variables.h5                                   2024-04-20 16:05:19        46712
Keras model archive loading:
File Name                                        

## Compile the 16-bit fixed-point C code for x86 and evaluate on small dataset

In [276]:
# Build the 16-bit fixed-point model: compile the generated
# `birds_output_fixed/model.c` together with the example `main.cpp` (its path is
# interpolated from the Python variable `main_path`) into the `birds_fixed`
# executable.
!g++ -std=c++17 -Wall -Wextra -pedantic -Ofast -o birds_fixed -include birds_output_fixed/include/defines.h -Ibirds_output_fixed/include birds_output_fixed/model.c "{main_path}"

# Run the compiled program with the appropriate executable based on the OS;
# it receives the exported input vectors and labels as CSV files.
if os.name == 'posix':
    !./birds_fixed x_test_birds_250.csv y_test_birds_250.csv
else:
    !birds_fixed.exe x_test_birds_250.csv y_test_birds_250.csv

Testing accuracy: 0.776
