# Inference experiments with YamNet

We will evaluate:
- regular YamNet (PyTorch, ONNX and OpenVino) from spectrograms
- End-to-end YamNet from waveform

In [1]:
import torch
import torchaudio
from nnAudio import features
from scipy.io import wavfile
from nnAudio import features
import torch

import openvino as ov

from torch_audioset.yamnet.model import yamnet as torch_yamnet
from torch_audioset.data.torch_input_processing import WaveformToInput as TorchTransform
import onnxruntime

In [None]:
wav_file = '/data/audio/loccus-asv-datasets/QA/ASR_evaluation/wav.16kHz/common_voice_en_538718_en.wav'
waveform, sample_rate = torchaudio.load(wav_file, normalize=True)

## Inference not E2E (starting from spectrogram)

In [6]:
patches, spectrogram = TorchTransform().wavform_to_log_mel(waveform, 16000)

### PyTorch

In [7]:
# PyTorch
pt_model = torch_yamnet(pretrained=False)
pt_model.load_state_dict(torch.load('./yamnet.pth'))

with torch.no_grad():
    pt_model.eval()
    pt_pred = pt_model(patches, to_prob=True)

  pt_model.load_state_dict(torch.load('./yamnet.pth'))


### ONNX

In [10]:
# ONNX
opts = onnxruntime.SessionOptions()
opts.inter_op_num_threads = 1
opts.intra_op_num_threads = 1      
session = onnxruntime.InferenceSession('yamnet.onnx', providers=['CPUExecutionProvider'], sess_options=opts)

ort_inputs = {session.get_inputs()[0].name: patches.numpy()}
ort_outs = session.run([session.get_outputs()[0].name], ort_inputs)

onnx_pred = ort_outs[0]

### OpenVino

In [12]:
# OpenVino
core = ov.Core()
ov_model = core.read_model(model='yamnet.xml')        
compiled_model = ov.compile_model(ov_model, "CPU") #, config=config)
output_layer = compiled_model.output(0)

ov_pred = compiled_model(patches)[output_layer]