# Benchmark wav2vec2 performance in onnx

In [None]:
# install librosa for getting duration of audio files
!pip install librosa

In [6]:
!pip uninstall  onnxruntime-gpu -y
!pip install onnxruntime

Found existing installation: onnxruntime-gpu 1.10.0
Uninstalling onnxruntime-gpu-1.10.0:
  Successfully uninstalled onnxruntime-gpu-1.10.0
Collecting onnxruntime
  Using cached onnxruntime-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
Installing collected packages: onnxruntime
Successfully installed onnxruntime-1.10.0


In [2]:
# create onnx modelsf
from convert_wav2vec2 import convert_wav2vec2_onnx

!rm -rf ./exports

model_id = "facebook/wav2vec2-base-960h"
convert_wav2vec2_onnx(model_id=model_id, optimize=True, quantize=True)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


save model at: /home/ubuntu/onnx-transformers/speech/exports/wav2vec2-base-960h.onnx
input_names:  ['input_values']
current input shape {'input_values': torch.Size([1, 219040])}
Using framework PyTorch: 1.10.0+cu102
output_names:  ['logits']
dynamic_axes:  {'input_values': {0: 'batch_size', 1: 'sequence'}, 'logits': {0: 'batch_size', 1: 'sequence'}}


  _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


optimized model saved at: /home/ubuntu/onnx-transformers/speech/exports/wav2vec2-base-960h.onnx




quantized model saved at: /home/ubuntu/onnx-transformers/speech/exports/wav2vec2-base-960h-q8.onnx
outpus are different


# Run ONNX CPU Predictions

In [3]:
import onnxruntime as ort
import numpy as np 
from transformers import Wav2Vec2Processor
from convert_wav2vec2 import get_sample,get_inuputs_from_audio
import time 
import librosa

sess_options = ort.SessionOptions()

# Set graph optimization level
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

model_path="exports/wav2vec2-base-960h-q8.onnx"
ort_session = ort.InferenceSession(model_path, sess_options)
processor = Wav2Vec2Processor.from_pretrained(model_id)


def onnx_asr(path,sess,processor):
    # get inputs
    onnx_inputs = get_inuputs_from_audio(path=path, processor=processor, tensor_type="np")
    # run inference
    st = time.time()
    logits = sess.run(None, onnx_inputs.data)[0]
    dur = time.time() - st
    # decode
    predicted_ids=np.argmax(logits, axis=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription, dur


In [5]:
print(f"running onnx inference with {model_id} on CPU")
sample = get_sample(1)
print(f"Duration of {sample} is {librosa.get_duration(filename=sample)}s")

trans , duration = onnx_asr(sample,ort_session,processor)
print(f"Prediction with ORT took {round(duration,2)}s")
print(f"Meaining: 1 second audio takes {round(round(duration,2)/librosa.get_duration(filename=sample),2)} seconds to predict")
print(f"transcript: \n{trans}")

running onnx inference with facebook/wav2vec2-base-960h on CPU
Duration of sample1.flac is 13.69s
Prediction with ORT took 1.25s
Meaining: 1 second audio takes 0.09 seconds to predict
transcript: 
GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKIN TO DAMP AUDIENCES IN GRATHTY SCHOOLROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SUN PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS


# Run Pytorch CPU Predictions

In [35]:
import torch
from transformers import Wav2Vec2Processor,Wav2Vec2ForCTC
from convert_wav2vec2 import get_sample,get_inuputs_from_audio
import time 
import librosa

model = Wav2Vec2ForCTC.from_pretrained(model_id)
processor = Wav2Vec2Processor.from_pretrained(model_id)

def pytorch_asr(path,model,processor):
    # get inputs
    inputs = get_inuputs_from_audio(path=path, processor=processor, tensor_type="pt")
    # run inference
    st = time.time()
    logits = model(**inputs)[0]
    dur = time.time() - st
    # decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription, dur


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
print(f"running pytorch inference with {model_id} on CPU")
sample = get_sample(1)
print(f"Duration of {sample} is {librosa.get_duration(filename=sample)}s")

trans , duration = pytorch_asr(sample,model,processor)
print(f"Prediction with Pytorch took {round(duration,2)}s")
print(f"Meaining: 1 second audio takes {round(round(duration,2)/librosa.get_duration(filename=sample),2)} seconds to predict")
print(f"transcript: \n{trans}")

running pytorch inference with facebook/wav2vec2-base-960h on CPU
Duration of sample1.flac is 13.69s
Prediction with Pytorch took 1.3s
Meaining: 1 second audio takes 0.09 seconds to predict
transcript: 
GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOL ROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS


# GPU test (Not tested yet)

In [4]:
!pip uninstall onnxruntime -y
!pip install onnxruntime-gpu



In [None]:
# create onnx modelsf
from convert_wav2vec2 import convert_wav2vec2_onnx

!rm -rf exports
model_id = "facebook/wav2vec2-base-960h"
convert_wav2vec2_onnx(model_id=model_id, optimize=True, quantize=False, use_gpu=True)

# Run ONNX GPU Predictions

In [None]:
import onnxruntime as ort
import numpy as np 
from transformers import Wav2Vec2Processor
from convert_wav2vec2 import get_sample,get_inuputs_from_audio
import time 
import librosa

sess_options = ort.SessionOptions()

# Set graph optimization level
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

model_path="exports/wav2vec2-base-960h-opt.onnx"
ort_session = ort.InferenceSession(model_path, sess_options,providers=['CUDAExecutionProvider'])
processor = Wav2Vec2Processor.from_pretrained(model_id)


def onnx_asr(path,sess,processor):
    # get inputs
    onnx_inputs = get_inuputs_from_audio(path=path, processor=processor, tensor_type="np")
    # run inference
    st = time.time()
    logits = sess.run(None, onnx_inputs.data)[0]
    dur = time.time() - st
    # decode
    predicted_ids=np.argmax(logits, axis=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription, dur
