# ML4IoT - HW2

Team 1: Homayoun Afshari (s308563), Marcelo Bastos Ferreira (s308964), Gustavo Nicoletti Rosa (s317672)

# Notebook Initialization

In [1]:
### notebook-wide settings
ROOT_FOLDER = '.'         # base folder under which saved_models/ and tflite_models/ are written
DATABASE_FOLDER = '/tmp'  # folder that holds the yn-train/ and yn-test/ audio folders
MODEL_NAME = 'model1'     # base name used for the saved model, the .tflite file and its .zip

In [2]:
import sys
import os
import shutil
import zipfile
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow_model_optimization as tfmot
import wandb
from time import time
from preprocessing import LABELS
from preprocessing import AudioReader
from preprocessing import MelSpectrogram
from preprocessing import MFCC

2024-01-01 15:45:34.852234: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 15:45:34.883710: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 15:45:34.884561: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# The Reference Model

In [3]:
### preprocessing settings used whenever the reference model is benchmarked
REF_PREPROCESSING_ARGS = {
    'sampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
    'num_mel_bins': 40,
    'lower_frequency': 20,
    'upper_frequency': 4000,
}

### output location of the reference TFLite model
# exist_ok=True replaces the check-then-create pattern, so an already existing
# folder is simply reused instead of being tested for first.
tflite_models_dir = f'{ROOT_FOLDER}/tflite_models'
os.makedirs(tflite_models_dir, exist_ok=True)
tflite_model_name = os.path.join(tflite_models_dir, 'ref_model.tflite')

### the reference model is only generated when its TFLite file is missing
if not os.path.exists(tflite_model_name):
    # NOTE: the model is never trained in this cell; it is exported with its
    # initial weights and only loaded by compare_to_reference() to time it.
    # The [49, 40, 1] input presumably matches the log-Mel spectrogram of a 1 s
    # clip produced with REF_PREPROCESSING_ARGS -- TODO confirm against MelSpectrogram.
    ref_model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=[49, 40, 1]),
        tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[2, 2], use_bias=False, padding='valid'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(units=2),
        tf.keras.layers.Softmax()
    ])

    ref_model.build()

    # the model is first stored as a SavedModel, then converted from that folder
    saved_model_dir = f'{ROOT_FOLDER}/saved_models/ref_model'
    os.makedirs(saved_model_dir, exist_ok=True)
    ref_model.save(saved_model_dir)

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    tflite_model = converter.convert()

    with open(tflite_model_name, 'wb') as fp:
        fp.write(tflite_model)

# Functions

In [4]:
def _median_latency(model_path, extract_features, audio, num_runs=100):
    """Return the median end-to-end latency (in seconds) of one TFLite model.

    Each run times feature extraction, the two `expand_dims` calls, copying the
    input into the interpreter, the inference itself and reading the output.

    Args:
        model_path: path of the `.tflite` file to load.
        extract_features: callable applied to `audio` to obtain the model input
            before the batch and channel axes are added.
        audio: the waveform tensor handed to `extract_features` on every run.
        num_runs: how many timed repetitions are performed.

    Returns:
        The median of the `num_runs` measured durations.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short durations; time() can jump and may be coarse.
    from time import perf_counter

    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    latencies = []
    for _ in range(num_runs):
        start = perf_counter()
        features = extract_features(audio)
        features = tf.expand_dims(features, 0)   # batch axis
        features = tf.expand_dims(features, -1)  # channel axis
        interpreter.set_tensor(input_index, features)
        interpreter.invoke()
        # the prediction is read (and timed) but its value is not needed here
        interpreter.get_tensor(output_index)
        latencies.append(perf_counter() - start)
    return np.median(latencies)


def compare_to_reference(tflite_model_name, PREPROCESSING_ARGS):
    """Measure how much latency a developed model saves w.r.t. the reference model.

    Both models are timed on the same random 16000-sample signal, each with its
    own preprocessing: log-Mel spectrogram built from REF_PREPROCESSING_ARGS for
    the reference model, MFCCs built from `PREPROCESSING_ARGS` for the developed
    one. Only latency is measured here, not accuracy.

    Args:
        tflite_model_name: file name (without the `.tflite` extension) of the
            developed model inside `{ROOT_FOLDER}/tflite_models`.
        PREPROCESSING_ARGS: keyword arguments forwarded to `MFCC`.

    Returns:
        The latency reduction in percent, computed on the median latencies:
        `100 * (reference - developed) / reference`. A positive value means the
        developed pipeline is faster than the reference one.
    """

    ### generating a random audio
    audio = tf.random.normal((16000,))

    ### reference model's latency
    mel_spec_processor = MelSpectrogram(**REF_PREPROCESSING_ARGS)
    median_ref_latency = _median_latency(
        f'{ROOT_FOLDER}/tflite_models/ref_model.tflite',
        mel_spec_processor.get_mel_spec,
        audio,
    )

    ### developed model's latency
    mfcc_processor = MFCC(**PREPROCESSING_ARGS)
    median_developed_model_latency = _median_latency(
        f'{ROOT_FOLDER}/tflite_models/{tflite_model_name}.tflite',
        mfcc_processor.get_mfccs,
        audio,
    )

    ### measuring the savings
    savings = 100 * (median_ref_latency - median_developed_model_latency) / median_ref_latency
    return savings

In [5]:
def _build_keyword_model(input_shape, OPTIMIZATION_ARGS):
    """Build the Keras classifier (before quantization-aware wrapping).

    The network is one strided 3x3 convolution followed by two further
    convolutional stages, global average pooling and two dense layers ending in
    a single sigmoid unit.

    Args:
        input_shape: shape of one input example (without the batch axis).
        OPTIMIZATION_ARGS: dict providing
            'use_depthwise' -- when true, each of the two later stages is a 3x3
                depthwise convolution followed by a 1x1 convolution; otherwise
                it is a regular 3x3 convolution,
            'alpha' -- multiplier applied to the 128 convolution filters,
            'beta' -- multiplier applied to the 100 units of the hidden dense layer.

    Returns:
        The `tf.keras.Sequential` model.
    """
    num_filters = int(128 * OPTIMIZATION_ARGS['alpha'])

    def middle_convolution():
        # layers of one of the two stride-1 stages, without its BatchNorm/ReLU
        if OPTIMIZATION_ARGS['use_depthwise']:
            return [
                tf.keras.layers.DepthwiseConv2D(kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
                tf.keras.layers.Conv2D(filters=num_filters, kernel_size=[1, 1], strides=[1, 1], use_bias=False),
            ]
        return [
            tf.keras.layers.Conv2D(filters=num_filters, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
        ]

    layers = [
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Conv2D(filters=num_filters, kernel_size=[3, 3], strides=[2, 2], use_bias=False, padding='valid'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
    ]
    for _ in range(2):
        layers += middle_convolution()
        layers += [tf.keras.layers.BatchNormalization(), tf.keras.layers.ReLU()]
    layers += [
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(int(100 * OPTIMIZATION_ARGS['beta'])),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(layers)


def develop_model(PREPROCESSING_ARGS, TRAINING_ARGS, OPTIMIZATION_ARGS):
    """Train, convert, evaluate and benchmark the MFCC-based classifier.

    The steps are: build the MFCC datasets from `{DATABASE_FOLDER}/yn-train` and
    `{DATABASE_FOLDER}/yn-test`, build the model, wrap it for quantization-aware
    training, train and evaluate it, store it under
    `{ROOT_FOLDER}/saved_models/{MODEL_NAME}`, convert it to
    `{ROOT_FOLDER}/tflite_models/{MODEL_NAME}.tflite`, measure the TFLite
    accuracy on the `.wav` files of the test folder, benchmark the latency
    against the reference model and finally zip the TFLite file.

    Args:
        PREPROCESSING_ARGS: keyword arguments forwarded to `MFCC`; its
            'sampling_rate' entry is also passed to `AudioReader`.
        TRAINING_ARGS: dict with 'batch_size', 'initial_learning_rate',
            'end_learning_rate' and 'epochs'.
        OPTIMIZATION_ARGS: dict with 'use_depthwise', 'alpha', 'beta' (model
            structure) and 'use_post_training_optimization' (enables the
            converter optimizations and the representative dataset).

    Returns:
        A 6-tuple:
        (training accuracy of the last epoch as reported by Keras,
         test accuracy of the Keras model as reported by `evaluate`,
         TFLite test accuracy in percent,
         latency savings in percent from `compare_to_reference`,
         size of the `.tflite` file in KB,
         size of the zipped `.tflite` file in KB).

    Raises:
        ValueError: if the test folder contains no `.wav` file, since the
            TFLite accuracy cannot be computed then.
    """

    ### preprocessing objects
    audio_reader = AudioReader(tf.int16, PREPROCESSING_ARGS['sampling_rate'])
    mfcc_processor = MFCC(**PREPROCESSING_ARGS)

    def prepare_for_training(feature, label):
        # adds the channel axis and replaces the label by the position at
        # which it matches LABELS
        feature = tf.expand_dims(feature, -1)
        label_id = tf.argmax(label == LABELS)
        return feature, label_id

    def load_examples(split):
        # unbatched (feature, label_id) pairs of every file in the given folder
        return (tf.data.Dataset.list_files(f'{DATABASE_FOLDER}/{split}/*')
                .map(audio_reader.get_audio_and_label)
                .map(mfcc_processor.get_mfccs_and_label)
                .map(prepare_for_training))

    ### datasets
    print('Loading the datasets...')
    train_batches = load_examples('yn-train').batch(TRAINING_ARGS['batch_size'])
    test_ds = load_examples('yn-test').batch(TRAINING_ARGS['batch_size'])

    ### defining the model
    print()
    print('Defining the model...')
    # The input shape is read from one batch BEFORE the dataset is cached:
    # reading a single batch from an already cached dataset leaves the cache
    # partially filled, which TensorFlow discards with a warning.
    example_batch, _ = next(iter(train_batches.take(1)))
    train_ds = train_batches.cache()
    model = _build_keyword_model(example_batch.shape[1:], OPTIMIZATION_ARGS)

    ### quantization aware model
    print()
    print('Applying QAT...')
    quant_aware_model = tfmot.quantization.keras.quantize_model(model)

    ### training the model
    print()
    print('Training the model...')
    # learning rate decays over the total number of training steps
    linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate = TRAINING_ARGS['initial_learning_rate'],
        end_learning_rate = TRAINING_ARGS['end_learning_rate'],
        decay_steps = len(train_ds) * TRAINING_ARGS['epochs'],
    )
    quant_aware_model.compile(
        loss = tf.losses.BinaryCrossentropy(from_logits=False),
        optimizer = tf.optimizers.Adam(learning_rate=linear_decay),
        metrics = [tf.metrics.BinaryAccuracy()]
    )
    history = quant_aware_model.fit(
        train_ds,
        epochs = TRAINING_ARGS['epochs']
    )
    pre_quant_training_accuracy = history.history['binary_accuracy'][-1] # accuracy of the last epoch

    ### testing the model
    print()
    print('Testing the model...')
    _, pre_quant_test_accuracy = quant_aware_model.evaluate(test_ds)

    ### storing the model
    print()
    print('Storing the model...')
    saved_model_path = f'{ROOT_FOLDER}/saved_models/{MODEL_NAME}'
    quant_aware_model.save(saved_model_path)

    ### converting the model
    print()
    print('Converting the model...')
    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
    if OPTIMIZATION_ARGS['use_post_training_optimization']:
        # The converter is given a representative dataset so that the
        # activations can be quantized (weight quantization does not need it).
        # take() comes before cache() so that the cached content is always
        # read completely.
        representative_train_ds = load_examples('yn-train').take(1000).cache()
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        def representative_data_gen():
            for input_value, _ in representative_train_ds:
                # the examples are unbatched, so the batch axis is added here
                yield [tf.expand_dims(input_value, axis=0)]
        converter.representative_dataset = representative_data_gen
    tflite_model = converter.convert()
    tflite_path = f'{ROOT_FOLDER}/tflite_models/{MODEL_NAME}.tflite'
    # the output folder is created here so that this function does not depend
    # on another cell having created it
    os.makedirs(os.path.dirname(tflite_path), exist_ok=True)
    with open(tflite_path, 'wb') as fp:
        fp.write(tflite_model)

    ### getting the final accuracy
    print()
    print('Getting the TFLite model accuracy...')
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    test_folder = f'{DATABASE_FOLDER}/yn-test'
    prediction_outcomes = []
    for current_file in os.listdir(test_folder):
        if os.path.splitext(current_file)[1] != '.wav':
            continue
        path = os.path.join(test_folder, current_file)
        audio, label = audio_reader.get_audio_and_label(path)
        mfccs = mfcc_processor.get_mfccs(audio)
        mfccs = tf.expand_dims(mfccs, 0)
        mfccs = tf.expand_dims(mfccs, -1)
        interpreter.set_tensor(input_details[0]['index'], mfccs)
        interpreter.invoke()
        output = interpreter.get_tensor(output_details[0]['index'])
        predicted_probability = output[0][0]
        # the sigmoid output is thresholded at 0.5; class 1 is LABELS[1]
        predicted_label = int(predicted_probability > 0.5)
        binary_label = int(label == LABELS[1])
        prediction_outcomes.append(binary_label == predicted_label)
    if not prediction_outcomes:
        raise ValueError(f'No .wav file found in {test_folder}; the TFLite accuracy cannot be computed.')
    accuracy = 100 * sum(prediction_outcomes) / len(prediction_outcomes)

    ### comparing the developed model to the reference model
    print()
    print('Benchmarking the developed TFLite model...')
    savings = compare_to_reference(MODEL_NAME, PREPROCESSING_ARGS)

    ### compressing and storing the tflite model
    print('Compressing TFLite model...')
    zip_path = f'{tflite_path}.zip'
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as fp:
        fp.write(tflite_path)

    ### getting the tflite model size (bytes -> KB)
    size_uncompressed = os.path.getsize(tflite_path) / 1024
    size_compressed = os.path.getsize(zip_path) / 1024

    ### returning the final results
    return pre_quant_training_accuracy, pre_quant_test_accuracy, accuracy, savings, size_uncompressed, size_compressed

# Training and Generating the TFLite Model

In [6]:
### hyper-parameters chosen from the experiments discussed in the report
PREPROCESSING_ARGS = dict(
    sampling_rate=16000,
    frame_length_in_s=0.064,
    frame_step_in_s=0.016,
    num_mel_bins=10,
    num_coefficients=13,
    lower_frequency=0,
    upper_frequency=4000,
)
TRAINING_ARGS = dict(
    batch_size=20,
    initial_learning_rate=1e-2,
    end_learning_rate=1e-5,
    epochs=20,
)
OPTIMIZATION_ARGS = dict(
    use_depthwise=True,
    use_post_training_optimization=True,
    alpha=0.5,
    beta=0.05,
)

### running the whole pipeline: training, conversion, evaluation and benchmark
(
    pre_quant_training_accuracy,
    pre_quant_test_accuracy,
    accuracy,
    savings,
    size_uncompressed,
    size_compressed,
) = develop_model(PREPROCESSING_ARGS, TRAINING_ARGS, OPTIMIZATION_ARGS)

### the result is valid only when all three constraints hold:
### accuracy above 98.9 %, savings above 35 % and zipped size below 25 KB
validity = bool(accuracy > 98.9 and savings > 35 and size_compressed < 25)

### reporting
print()
print(
    'Final Result...',
    f'  Model Name: {MODEL_NAME}',
    f'  Training Accuracy (Before QAT): {pre_quant_training_accuracy:.4f}',
    f'  Test Accuracy (Before QAT): {pre_quant_test_accuracy:.4f}',
    f'  Accuracy: {accuracy:.4f}%',
    f'  Savings: {savings:.4f}%',
    f'  Size (Uncompressed): {size_uncompressed:.4f}KB',
    f'  Size (Compressed): {size_compressed:.4f}KB',
    f'  Validity: {validity}',
    sep='\n',
)

Loading the datasets...
2024-01-01 15:45:37.936420: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX AVX2 FMA
2024-01-01 15:45:37.938108: W tensorflow_io/core/kernels/audio_video_mp3_kernels.cc:271] libmp3lame.so.0 or lame functions are not available

Defining the model...
2024-01-01 15:45:39.164242: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.

Applying QAT...

Training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Ep

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=abe8bec9-2977-4d96-847e-1817af901527' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>