# Multi-label sound tagging: prediction with a trained CNN (VGGish)

* [TensorRT Quick Start Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#trtexec)

In [1]:
# Colab line magic: request the TensorFlow 1.x runtime before TensorFlow is first imported
%tensorflow_version 1.x
# verify which tensorflow version was imported
import tensorflow as tf
print(tf.__version__)

TensorFlow 1.x selected.
1.15.2


In [2]:
# Mount google drive
# (the dataset, the model files and the TensorRT .deb package used below are read from paths under this mount point)
# NOTE: `files` is imported here but not used in the visible cells.
from google.colab import drive, files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# show nvidia GPU specifications (GPU model, driver / CUDA version, memory usage)
!nvidia-smi

Thu Apr 14 07:45:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Install dependencies

In [4]:
# Install the Python dependencies of this notebook; keras, oyaml and scipy are pinned, pycuda is not.
!pip install keras==2.3.0 oyaml==0.9 scipy==1.3.1 pycuda --quiet
# scikit-learn==0.21.3

In [5]:
# Force h5py 2.10.0 — presumably needed to read the Keras .h5 weights with this Keras/TF stack (TODO confirm).
!pip install h5py==2.10.0 --force-reinstall --quiet

import os, time, h5py
print("h5py version", h5py.__version__)
# If the imported h5py is not the pinned version, the freshly installed one is
# only picked up by a new process: terminate the kernel process immediately.
if h5py.__version__ != "2.10.0":
  print("You need to restart the kernel")
  os._exit(0)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lucid 0.3.10 requires umap-learn, which is not installed.
tensorflow 1.15.2 requires gast==0.2.2, but you have gast 0.5.3 which is incompatible.
lucid 0.3.10 requires numpy<=1.19, but you have numpy 1.21.6 which is incompatible.
kapre 0.3.7 requires tensorflow>=2.0.0, but you have tensorflow 1.15.2 which is incompatible.
google-colab 1.0.0 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
h5py version 2.10.0


### Import utilities

In [6]:
import sys

if not os.path.exists('mel_features.py'):
  !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/mel_features.py

if not os.path.exists('utils.py'):
  !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/utils.py

if not os.path.exists('vggish_params.py'):
  !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/vggish_params.py

In [7]:
import oyaml as yaml
import numpy as np
import mel_features

### Define paths

In [8]:
# Root of the SONYC-UST project folder on the mounted Google Drive
ust_data_dir = './drive/MyDrive/CS/sonyc-ust/'


def _ust_path(*parts):
    """Join path components below the project root folder."""
    return os.path.join(ust_data_dir, *parts)


# dataset folder and its metadata files
dataset_dir = _ust_path('data')
annotation_file = _ust_path('data', 'annotations.csv')
taxonomy_file = _ust_path('data', 'dcase-ust-taxonomy.yaml')

# feature and result folders
log_mel_spec_dir = _ust_path('log-mel-spectrograms')
output_training_dir = _ust_path('output_training')
output_prediction_dir = _ust_path('output_prediction')

# trained Keras model: architecture (JSON) and weights (HDF5)
model_architecture_file = _ust_path('model_architecture.json')
model_weights_file = _ust_path('best_model_weights.h5')

# audio clip used for the prediction demo, and a Drive location for the TensorRT engine
audio_sample_wav = _ust_path('data', 'audio-eval/00_010587.wav')
sonyc_engine_trt = _ust_path('sonyc_engine.trt')

In [9]:
# read the taxonomy definition (YAML) from disk
with open(taxonomy_file, 'r') as taxonomy_stream:
    taxonomy = yaml.load(taxonomy_stream, Loader=yaml.Loader)

# names of the coarse-level classes, in the order the taxonomy lists them
labels = list(taxonomy['coarse'].values())
labels

['engine',
 'machinery-impact',
 'non-machinery-impact',
 'powered-saw',
 'alert-signal',
 'music',
 'human-voice',
 'dog']

### Preprocessing input audio

In [10]:
import vggish_params
import resampy
from scipy.io import wavfile
import scipy.signal as sps
#import librosa

CLASSES = 8
# target sample rate, taken from the VGGish parameter module
sr = vggish_params.SAMPLE_RATE

# compute log-Mel spectrogram from input audio file
print("Reading input audio from file {}".format(audio_sample_wav))
old_sr, y = wavfile.read(audio_sample_wav)
# Convert audio to mono by averaging over the channel axis. wavfile.read
# returns a 2-D (samples, channels) array for multi-channel files; averaging
# handles any channel count, not only stereo.
if y.ndim > 1:
  y = y.mean(axis=1)
# resample to the target rate when the file was recorded at a different one
if sr != old_sr:
  number_of_samples = round(len(y) * float(sr) / old_sr)
  y = sps.resample(y, number_of_samples)
# y, old_sr = librosa.load(audio_sample_wav, mono=True, sr=None)
# if sr != old_sr:
#   y = resampy.resample(y, old_sr, sr)

# NOTE(review): samples are passed on in the scale returned by wavfile.read
# (no normalisation is applied here) — confirm this matches the training pipeline.
log_mel_spec = mel_features.waveform_to_log_mel_spectrogram(y, sr)
# add a leading batch axis and a trailing channel axis for the model input
log_mel_input = log_mel_spec[np.newaxis,:,:,np.newaxis]
print("Log-mel input shape", log_mel_input.shape)

Reading input audio from file ./drive/MyDrive/CS/sonyc-ust/data/audio-eval/00_010587.wav
Log-mel input shape (1, 998, 64, 1)


### Load keras model

In [11]:
from keras.models import model_from_json

# rebuild the network topology from its JSON description
with open(model_architecture_file, 'r') as architecture_stream:
    architecture_json = architecture_stream.read()
model = model_from_json(architecture_json)

# restore the trained weights into the rebuilt network
model.load_weights(model_weights_file)

model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



Using TensorFlow backend.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 998, 64, 1)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 998, 64, 64)       640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 499, 32, 64)       0         
_________________________________________________________________
conv2 (Conv2D)               (None, 499, 32, 128)      73856     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 249, 16, 128)      0         
_________________________________________________________________
conv3/conv3_1 (Conv2D)       (None, 249, 16, 256)      295168    
_________________________________________________________________
conv3/conv3_2 (Conv2D)       (None, 249, 16, 256)      5900

### Predict with Keras

In [12]:
#%%time
# reference prediction with the Keras model; `out` is printed per class in the next cell
out = model.predict(log_mel_input)




In [13]:
# print the score of every class for the first batch item, as a percentage
for ind, label in enumerate(labels):
  score = out[0][ind]
  print("{0}: {1:.1%}".format(label, score))

engine: 19.2%
machinery-impact: 61.3%
non-machinery-impact: 1.5%
powered-saw: 2.8%
alert-signal: 9.4%
music: 0.7%
human-voice: 3.9%
dog: 1.2%


### Create PB model

In [14]:
def keras_to_pb(model, output_filename, output_node_names=None):
    """
    Freeze a Keras model into a binary TensorFlow GraphDef (.pb) file.

    The variables of the Keras backend session graph are converted to
    constants and the resulting graph is written to `output_filename`.

    Args:
       model: The keras model.
       output_filename: The output .pb file name.
       output_node_names: List with the names of the output nodes of the
       network (if None, the function gets the last layer name as the
       output node).

    Returns:
       A tuple (in_name, output_node_names): the op name behind the output
       of the model's first layer, and the list of output node names that
       were used for freezing.
    """

    # Get names of input and output nodes. Tensor names look like
    # '<op name>:<index>'; only the op name is kept.
    in_name = model.layers[0].get_output_at(0).name.split(':')[0]

    if output_node_names is None:
        output_node_names = [model.layers[-1].get_output_at(0).name.split(':')[0]]

    sess = tf.compat.v1.keras.backend.get_session()

    frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
        sess,
        sess.graph_def,
        output_node_names)

    # The session is the one shared with the Keras backend, so it is
    # deliberately NOT closed here: closing it would leave the backend holding
    # a closed session and make `model` unusable after the export.

    # An empty log directory makes write_graph use output_filename as given.
    wkdir = ''
    tf.io.write_graph(frozen_graph_def, wkdir, output_filename, as_text=False)

    return in_name, output_node_names

In [15]:
# save TensorFlow model in pb format
in_tensor_name, out_tensor_names = keras_to_pb(model, 'keras_model_onnx.pb', None)
print(in_tensor_name, out_tensor_names[0])

Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 18 variables.
INFO:tensorflow:Converted 18 variables to const ops.
input_2 classifier/Softmax


### Exporting TF model to ONNX file

In [16]:
# install the ONNX tooling (versions not pinned) and report the tf2onnx version in use
!pip install onnx tf2onnx --quiet
import tf2onnx
print("tf2onnx version", tf2onnx.__version__)

tf2onnx version 1.9.3


In [17]:
# convert TensorFlow model to ONNX model with tf2onnx CLI
!python -m tf2onnx.convert --input keras_model_onnx.pb --output temp.onnx --inputs input_2:0 --outputs classifier/Softmax:0



2022-04-14 07:46:25,140 - INFO - Using tensorflow=1.15.2, onnx=1.11.0, tf2onnx=1.9.3/1190aa
2022-04-14 07:46:25,140 - INFO - Using opset <onnx, 9>
2022-04-14 07:46:33,473 - INFO - Computed 0 values for constant folding
2022-04-14 07:46:35,673 - INFO - Optimizing ONNX model
2022-04-14 07:46:35,916 - INFO - After optimization: Const -2 (24->22), Gather +1 (0->1), Identity -1 (1->0), Reshape +1 (1->2), Squeeze -1 (1->0), Transpose -17 (18->1), Unsqueeze -3 (3->0)
2022-04-14 07:46:36,033 - INFO - 
2022-04-14 07:46:36,033 - INFO - Successfully converted TensorFlow model keras_model_onnx.pb to ONNX
2022-04-14 07:46:36,034 - INFO - Model inputs: ['input_2:0']
2022-04-14 07:46:36,034 - INFO - Model outputs: ['classifier/Softmax:0']
2022-04-14 07:46:36,034 - INFO - ONNX model is saved at temp.onnx


In [18]:
import onnx

# load ONNX model
# (temp.onnx is the file written by the tf2onnx conversion cell)
onnx_model = onnx.load_model('temp.onnx')

In [19]:
# Run the ONNX checker on the model and report the outcome
validation_error = None
try:
    onnx.checker.check_model(onnx_model)
except onnx.checker.ValidationError as err:
    # keep the error so it can be reported below; the notebook carries on either way
    validation_error = err

if validation_error is None:
    print('The ONNX model is valid!')
else:
    print('The ONNX model is invalid: %s' % validation_error)

The ONNX model is valid!


In [20]:
# set an explicit batch size in the ONNX file
# by default, TensorFlow doesn’t set an explicit batch size
BATCH_SIZE = 1

inputs = onnx_model.graph.input
# The loop variable is named `graph_input` rather than `input`: a top-level
# variable called `input` would shadow the builtin in every later cell.
for graph_input in inputs:
    # dim[0] is the leading dimension of the input tensor shape
    dim1 = graph_input.type.tensor_type.shape.dim[0]
    dim1.dim_value = BATCH_SIZE

In [21]:
# save the final ONNX model in a file
# (the model written here carries the batch size set in the previous cell)
onnx_filename = "sonyc_model.onnx"
onnx.save_model(onnx_model, onnx_filename)

### Converting ONNX To a TensorRT Engine

To convert the ONNX model to a TensorRT engine, we use `trtexec`, a command-line tool that generates a serialized engine from the model.

In [22]:
# install tensorRT
# (Python package fetched from NVIDIA's package index; the version is not pinned)
!pip install --upgrade --index-url https://pypi.ngc.nvidia.com nvidia-tensorrt \
--quiet

In [23]:
# https://github.com/prratadiya/tensorrt-installation-colab/blob/master/TensorRT_7_0_0_installation.ipynb
# Installing trtexec
# Register the local TensorRT apt repository from the .deb package stored on
# Google Drive (the file has to be present there before running this cell).
!sudo dpkg -i "/content/drive/MyDrive/nv-tensorrt-repo-ubuntu1804-cuda11.6-trt8.4.0.6-ea-20220212_1-1_amd64.deb"

# refresh the package index, then install the TensorRT development libraries
!sudo apt-get update
!sudo apt-get install libnvinfer-dev libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python-libnvinfer python3-libnvinfer-dev

# NOTE(review): the apt-get install commands are run without -y — confirm they do not wait for confirmation on a fresh runtime.
!sudo apt-get install tensorrt

# check that TensorRT has been installed correctly
!dpkg -l | grep TensorRT

(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 157184 files and directories currently installed.)
Preparing to unpack .../nv-tensorrt-repo-ubuntu1804-cuda11.6-trt8.4.0.6-ea-20220212_1-1_amd64.deb ...
Unpacking nv-tensorrt-repo-ubuntu1804-cuda11.6-trt8.4.0.6-ea-20220212 (1-1) over (1-1) ...
Setting up nv-tensorrt-repo-ubuntu1804-cuda11.6-trt8.4.0.6-ea-20220212 (1-1) ...
Get:1 file:/var/nv-tensorrt-repo-ubuntu1804-cuda11.6-trt8.4.0.6-ea-20220212  InRelease
Ign:1 file:/var/nv-tens

In [24]:
# show CUDA version (=11.1)
#!nvcc --version

# show Ubuntu release (=18.04)
#!lsb_release -a

In [25]:
# build TensorRT engine
# --onnx / --saveEngine: input ONNX model and output serialized engine file
# --best: the build log reports this as "Precision: FP32+FP16+INT8"
# --buildOnly: presumably builds and saves the engine without running the timing benchmark (TODO confirm)
!/usr/src/tensorrt/bin/trtexec --onnx="/content/sonyc_model.onnx" --saveEngine="/content/sonyc_engine.trt" --best --buildOnly

&&&& RUNNING TensorRT.trtexec [TensorRT v8400] # /usr/src/tensorrt/bin/trtexec --onnx=/content/sonyc_model.onnx --saveEngine=/content/sonyc_engine.trt --best --buildOnly
[04/14/2022-07:47:13] [I] === Model Options ===
[04/14/2022-07:47:13] [I] Format: ONNX
[04/14/2022-07:47:13] [I] Model: /content/sonyc_model.onnx
[04/14/2022-07:47:13] [I] Output:
[04/14/2022-07:47:13] [I] === Build Options ===
[04/14/2022-07:47:13] [I] Max batch: explicit batch
[04/14/2022-07:47:13] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[04/14/2022-07:47:13] [I] minTiming: 1
[04/14/2022-07:47:13] [I] avgTiming: 8
[04/14/2022-07:47:13] [I] Precision: FP32+FP16+INT8
[04/14/2022-07:47:13] [I] LayerPrecisions: 
[04/14/2022-07:47:13] [I] Calibration: Dynamic
[04/14/2022-07:47:13] [I] Refit: Disabled
[04/14/2022-07:47:13] [I] Sparsity: Disabled
[04/14/2022-07:47:13] [I] Safe mode: Disabled
[04/14/2022-07:47:13] [I] DirectIO mode: Disabled
[04/14/2022-07:47:13] 

In [26]:
# load and test TensorRT engine
#!/usr/src/tensorrt/bin/trtexec --loadEngine="/content/sonyc_engine.trt" --exportTimes=trace.json

In [27]:
# prints timestamps and duration of input, compute, and output, in different forms
#!/usr/src/tensorrt/samples/trtexec/tracer.py trace.json

### Inference pipeline
[Another TensorRT runtime implementation](https://github.com/NVIDIA/TensorRT/blob/main/quickstart/SemanticSegmentation/tutorial-runtime.ipynb)

In [28]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

def allocate_buffers(engine, batch_size, data_type):
   """
   Allocate the host and device buffers for the engine's input and output.

   Each buffer holds batch_size times the volume of the corresponding
   binding shape; binding 0 is treated as the input and binding 1 as the
   output.

   Args:
      engine : The TensorRT engine object whose binding shapes are queried.
      batch_size : The batch size for execution time.
      data_type: The type of the data for input and output, for example trt.float32.

   Output:
      h_input_1: Input in the host.
      d_input_1: Input in the device.
      h_output_1: Output in the host.
      d_output_1: Output in the device.
      stream: CUDA stream.
   """

   #print('Matrix input shape: ', engine.get_binding_shape(0))
   #print('Matrix output shape: ', engine.get_binding_shape(1))

   # Buffer sizes follow the batch_size argument rather than a notebook-level
   # global, so the function behaves the same wherever it is called from.
   np_dtype = trt.nptype(data_type)
   input_size = batch_size * trt.volume(engine.get_binding_shape(0))
   output_size = batch_size * trt.volume(engine.get_binding_shape(1))

   # Determine dimensions and create page-locked memory buffers (which won't be swapped to disk) to hold host inputs/outputs.
   h_input_1 = cuda.pagelocked_empty(input_size, dtype=np_dtype)
   #print("h_input_1 shape:", h_input_1.shape)
   h_output = cuda.pagelocked_empty(output_size, dtype=np_dtype)
   #print("h_output_1 shape:", h_output.shape)
   # Allocate device memory for inputs and outputs.
   d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
   d_output = cuda.mem_alloc(h_output.nbytes)
   # Create a stream in which to copy inputs/outputs and run inference.
   stream = cuda.Stream()

   return h_input_1, d_input_1, h_output, d_output, stream

def load_audio_to_buffer(log_mel, pagelocked_buffer):
   """Flatten the log-mel input to 1-D and copy it into the given host buffer."""
   flat_log_mel = np.asarray(log_mel).reshape(-1)
   print("Preprocessed log-mel", flat_log_mel.shape)
   # in-place copy: the destination buffer keeps its own dtype and memory
   np.copyto(pagelocked_buffer, flat_log_mel)

def do_inference(engine, wav_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size=1):
   """
   Run one inference with the TensorRT engine and time it with CUDA events.

   Args:
      engine : The TensorRT engine object (an execution context is created from it)
      wav_1 : Input to the model; it is flattened and copied into h_input_1.
      h_input_1: Input in the host
      d_input_1: Input in the device
      h_output: Output in the host
      d_output: Output in the device
      stream: CUDA stream
      batch_size : Batch size for execution time

   Output:
      The host output buffer reshaped to (batch_size, number of values per
      batch item), i.e. one row of class scores per batch item.
   """
   #trt.init_libnvinfer_plugins(None,'')

   start = cuda.Event()
   end = cuda.Event()

   load_audio_to_buffer(wav_1, h_input_1)
   with engine.create_execution_context() as context:
       # start profiling
       start.record()

       # Transfer input data to the GPU.
       cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

       # Run inference.
       context.profiler = trt.Profiler()
       #context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])
       #context.execute_v2(bindings=[int(d_input_1), int(d_output)])
       context.execute_async_v2(bindings=[int(d_input_1), int(d_output)], stream_handle=stream.handle)

       # Transfer predictions back from the GPU.
       cuda.memcpy_dtoh_async(h_output, d_output, stream)

       # Synchronize the stream
       stream.synchronize()

       # end profiling
       end.record()
       end.synchronize()
       # the value is printed with an "ms" unit below, hence the variable name
       elapsed_ms = start.time_till(end)
       print("Time elapsed: {0:.3f}ms".format(elapsed_ms))

       # Return the host output. The number of columns is inferred from the
       # buffer size instead of hard-coding the class count.
       out = h_output.reshape((batch_size, -1))
       return out

def load_engine(engine_file_path):
    """Deserialize and return the TensorRT engine stored in the given plan file."""
    # register the TensorRT plugins before the engine is deserialized
    trt.init_libnvinfer_plugins(None, "")
    assert os.path.exists(engine_file_path)
    print("Reading engine from {}".format(engine_file_path))
    with open(engine_file_path, 'rb') as plan_file:
        with trt.Runtime(trt.Logger()) as runtime:
            serialized_engine = plan_file.read()
            return runtime.deserialize_cuda_engine(serialized_engine)

### Playing input audio used for inference

In [29]:
import IPython.display as ipd
# show an audio player for the WAV file that is classified in this notebook
ipd.Audio(audio_sample_wav, rate=sr)

### Predict with TensorRT engine

In [30]:
# Relative path: presumably the engine saved by trtexec as /content/sonyc_engine.trt,
# which assumes the working directory is /content (TODO confirm).
engine_filename = "sonyc_engine.trt"

# the engine is used as a context manager for the duration of buffer allocation and inference
with load_engine(engine_filename) as engine:
    # host/device buffers for the engine's input and output, float32 data
    h_input, d_input, h_output, d_output, stream = allocate_buffers(engine, BATCH_SIZE, trt.float32)
    # run the same log-mel input that was given to the Keras model
    out_trt = do_inference(engine, log_mel_input, h_input, d_input, h_output, d_output, stream, BATCH_SIZE)

Reading engine from sonyc_engine.trt
Preprocessed log-mel (63872,)
Time elapsed: 1922.407ms


In [31]:
# print the TensorRT score of every class for the first batch item, as a percentage
for ind, label in enumerate(labels):
  score = out_trt[0][ind]
  print("{0}: {1:.1%}".format(label, score))

engine: 19.1%
machinery-impact: 61.5%
non-machinery-impact: 1.5%
powered-saw: 2.8%
alert-signal: 9.3%
music: 0.7%
human-voice: 3.9%
dog: 1.2%
