# Whisper Embeddding Extraction

In [1]:
import torch
from transformers import AutoFeatureExtractor, WhisperModel

In [2]:
import sounddevice as sd
import soundfile as sf

In [3]:
import numpy as np

In [4]:
MODEL_SIZE = 'openai/whisper-base.en'

In [5]:
model = WhisperModel.from_pretrained(MODEL_SIZE)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_SIZE)

In [6]:
def get_normalize_audio(filename=''):
    '''
    Read an Audio file as int16 array
    Normalize it
    '''
    wave,info = sf.read(filename,dtype='int16')
    wave = wave / np.iinfo(np.int16).max
    return wave

In [15]:
def extract_embeddings(wave):
    inputs = feature_extractor(wave, return_tensors="pt") #extract features as pytorch
    input_features = inputs.input_features
    decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id #dummy decoding ids
    with torch.no_grad():
        last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state #passing input to models
    print(list(last_hidden_state.shape))
    return last_hidden_state

In [21]:
%%time
wave = get_normalize_audio('audios/2sec.wav')
extract_embeddings(wave)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[1, 2, 512]
CPU times: user 4.65 s, sys: 1.35 s, total: 6 s
Wall time: 1.52 s


tensor([[[-5.3620e-03, -5.7831e-01,  1.4868e+00,  ..., -3.2280e+00,
          -1.6686e+00,  1.3653e+00],
         [-8.6298e+00, -2.0907e+00,  1.8474e+00,  ..., -1.7293e+00,
          -7.2221e-01,  1.0072e+01]]])

In [25]:
%%time
wave = get_normalize_audio('audios/preamble_5sec_resample_with_pause.wav')
extract_embeddings(wave)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[1, 2, 512]
CPU times: user 5.19 s, sys: 1.24 s, total: 6.43 s
Wall time: 1.62 s


tensor([[[-0.2195,  1.4762,  4.0492,  ...,  0.7420,  1.9835,  2.9061],
         [-8.8873, -3.2836,  0.5441,  ..., -2.0624,  3.6385, 11.6977]]])

In [22]:
%%time
wave = get_normalize_audio('audios/10sec.wav')
extract_embeddings(wave)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[1, 2, 512]
CPU times: user 5.04 s, sys: 1.48 s, total: 6.52 s
Wall time: 1.63 s


tensor([[[-0.1290,  2.1790, -2.9553,  ...,  2.1216,  2.3728, -6.0154],
         [-6.5927, -4.3119,  0.0987,  ...,  1.2869, -0.7027,  7.3346]]])

In [23]:
%%time
wave = get_normalize_audio('audios/20sec.wav')
extract_embeddings(wave)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[1, 2, 512]
CPU times: user 4.55 s, sys: 1.42 s, total: 5.96 s
Wall time: 1.47 s


tensor([[[ -0.3863,   2.2555,   2.2305,  ...,   0.8704,   4.1336,   0.8985],
         [ -4.8217,  -3.9847,   1.2224,  ..., -10.3806,  -4.3587,  13.1559]]])

In [24]:
%%time
wave = get_normalize_audio('audios/40sec.wav')
extract_embeddings(wave)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[1, 2, 512]
CPU times: user 5.26 s, sys: 1.35 s, total: 6.6 s
Wall time: 1.66 s


tensor([[[-1.1567,  1.2513,  0.6916,  ...,  0.0371, -1.8902, -1.8424],
         [-5.6961, -3.2571,  0.1291,  ..., -3.5791, -0.2094,  5.0859]]])

# Whisper Transcript Generation

In [26]:
from transformers import AutoProcessor, WhisperForConditionalGeneration

In [28]:
processor = AutoProcessor.from_pretrained(MODEL_SIZE)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_SIZE)

In [29]:
def get_transcript(wave):

    '''
    Genrate Transcript
    '''
    inputs = processor(wave, return_tensors="pt")

    input_features = inputs.input_features

    generated_ids = model.generate(inputs=input_features)

    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription

In [31]:
%%time
wave = get_normalize_audio('audios/2sec.wav')
text = get_transcript(wave)
text

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 5.61 s, sys: 477 ms, total: 6.09 s
Wall time: 1.49 s


' Hello, hello, hello, hello.'

In [32]:
%%time
wave = get_normalize_audio('audios/10sec.wav')
text = get_transcript(wave)
text

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 6.25 s, sys: 521 ms, total: 6.78 s
Wall time: 1.64 s


' Once upon a time, I mean was very thirsty.'

In [38]:
%%time
wave = get_normalize_audio('audios/20sec.wav')
text = get_transcript(wave)
text

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 11.7 s, sys: 690 ms, total: 12.4 s
Wall time: 3.05 s


" Okay, I am reading a name from the watch and my watch is of golden color and I am hearing a wind and a hand-free and a jacket and blue loch is and my say is asking me to test his model and I speak English in front of the laptop and that's all I guess."

In [37]:
%%time
wave = get_normalize_audio('audios/preamble_5sec_resample_with_pause.wav')
text = get_transcript(wave)
text

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 6.96 s, sys: 688 ms, total: 7.64 s
Wall time: 1.93 s


' We, the people of the United States, in order to form a more perfect union, establish'

In [34]:
%%time
wave = get_normalize_audio('audios/40sec.wav')
text = get_transcript(wave)
text

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


CPU times: user 12.6 s, sys: 428 ms, total: 13.1 s
Wall time: 3.28 s


' I arrived here just 5 minutes ago and now I am seeing a chair in front of me and two laptops and a person with glasses and white hairs and another person working on the laptop and I am all seeing a heater and two umbrellas and a fan and there are two glasses, two different other glasses and I am seeing here a lot of books and'

Times in Seconds

| sec | Only Encoder | Encoder with Language Head |
| 2s  | 1.52s        | 1.49
| 8s  | 1.62s        | 1.64
| 10s | 1.65         | 1.93
| 20s | 1.97         | 2.5s
| 40s | 1.66         | 3.28