# Inference experiments with YamNet

We will evaluate:
- regular YamNet (PyTorch, ONNX and OpenVino) from spectrograms
- End-to-end YamNet from waveform

In [1]:
import torch
import torchaudio
from nnAudio import features
from scipy.io import wavfile
from nnAudio import features
import torch

import openvino as ov

from torch_audioset.yamnet.model import yamnet as torch_yamnet
from torch_audioset.data.torch_input_processing import WaveformToInput as TorchTransform
import onnxruntime

import custom_yamnet_model

In [2]:
# wav_file = '/data/audio/loccus-asv-datasets/QA/ASR_evaluation/wav.16kHz/common_voice_en_538718_en.wav'
wav_file = '/data/audio/ITW-Music/snr10-inthewild-16khz-inthewild_10002.wav'
waveform, sample_rate = torchaudio.load(wav_file, normalize=True)

## Frontend validation

In [3]:
patches, spectrogram = TorchTransform().wavform_to_log_mel(waveform, 16000)

### PyTorch Frontend

In [4]:
pt_model = custom_yamnet_model.ModelFrontend()
patches_pt = pt_model(waveform)

STFT kernels created, time used = 0.0162 seconds


### ONNX Frontend

In [5]:
# ONNX    
session = onnxruntime.InferenceSession('yamnet_frontend.onnx', providers=['CPUExecutionProvider'])

ort_inputs = {session.get_inputs()[0].name: waveform.numpy()}
ort_outs = session.run([session.get_outputs()[0].name], ort_inputs)

patches_onnx = ort_outs[0]

### OpenVino Frontend

In [6]:
# OpenVino
core = ov.Core()
ov_model = core.read_model(model='yamnet_frontend.xml')        
compiled_model = ov.compile_model(ov_model, "CPU") #, config=config)
output_layer = compiled_model.output(0)

patches_ov = compiled_model(waveform)[output_layer]

## Inference not E2E (starting from spectrogram)

### PyTorch

In [7]:
# PyTorch
pt_model = torch_yamnet(pretrained=False)
pt_model.load_state_dict(torch.load('./yamnet.pth'))

with torch.no_grad():
    pt_model.eval()
    pred_pt_backend = pt_model(patches)

  pt_model.load_state_dict(torch.load('./yamnet.pth'))


### ONNX

In [8]:
# ONNX     
session = onnxruntime.InferenceSession('yamnet.onnx', providers=['CPUExecutionProvider'])

ort_inputs = {session.get_inputs()[0].name: patches.numpy()}
ort_outs = session.run([session.get_outputs()[0].name], ort_inputs)

pred_onnx_backend = ort_outs[0]

### OpenVino

In [9]:
# OpenVino
core = ov.Core()
ov_model = core.read_model(model='yamnet.xml')        
compiled_model = ov.compile_model(ov_model, "CPU") #, config=config)
output_layer = compiled_model.output(0)

pred_ov_backend = compiled_model(patches)[output_layer]

## Inference E2E

### ONNX E2E

In [10]:
# ONNX    
session = onnxruntime.InferenceSession('yamnet_e2e.onnx', providers=['CPUExecutionProvider'])

ort_inputs = {session.get_inputs()[0].name: waveform.numpy()}
ort_outs = session.run([session.get_outputs()[0].name], ort_inputs)

pred_e2e_onnx = ort_outs[0]

### OpenVino E2E

In [11]:
# OpenVino
core = ov.Core()
ov_model = core.read_model(model='yamnet_e2e.xml')        
compiled_model = ov.compile_model(ov_model, "CPU") #, config=config)
output_layer = compiled_model.output(0)

pred_e2e_ov = compiled_model(waveform)[output_layer]