## Install required tools

In [1]:
# Installing whisper package
!pip install git+https://github.com/openai/whisper.git

#Insalloing datasets
!pip install datasets

# Cloning whisper repository
!git clone https://github.com/openai/whisper.git

# Downloading a sample audio file
!wget https://huggingface.co/datasets/osanseviero/dummy_ja_audio/resolve/main/result.flac

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-oqv_hgcp
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-oqv_hgcp
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=8c46d6f0aa183a3dccab28429179a710f4dfe904d82bb41ebfa7c5cff00225ca
  Stored in directory: /tmp/pip-ephem-wheel-cache-vxxuttad/wheels/c3/03/25/5e0ba78bc27a3a089f137c9f1d92fdfce16d06996c071a016c
Successfully built openai-whisper
Installing collec

## Generate Whisper TFLite model

In [None]:
import tensorflow as tf


# Importing necessary classes from transformers
from transformers import AutoProcessor, TFWhisperForConditionalGeneration, GenerationConfig

# Importing necessary functions from datasets
from datasets import load_dataset


# Token pairs handed to generation; this global is also read by
# GenerateModel.serving, which passes it as `forced_decoder_ids`.
# NOTE(review): Hugging Face documents `forced_decoder_ids` entries as
# [generation_index, token_id] pairs; both numbers in each pair here look like
# token ids — confirm these pairs have the intended effect.
force_token_map = [[50258, 50266], [50359, 50363]]

# Build a GenerationConfig carrying the pairs above.
# NOTE(review): `force_token_map` does not appear to be a documented
# GenerationConfig field (the documented one is `forced_decoder_ids`) —
# confirm this keyword is actually honoured by `generate`.
generation_config = GenerationConfig(force_token_map=force_token_map)

# Load the processor of the pretrained "openai/whisper-base" checkpoint
processor = AutoProcessor.from_pretrained("openai/whisper-base")

# Load the TensorFlow Whisper model from the same checkpoint
model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Load the validation split of the dummy LibriSpeech dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Turn the first sample's raw audio array into TensorFlow model inputs
inputs = processor(ds[0]["audio"]["array"], return_tensors="tf")
input_features = inputs.input_features

# Sanity check before export: generate token ids for the sample, decode the
# first sequence without special tokens, and print the transcription
generated_ids = model.generate(input_ids=input_features, generation_config=generation_config)
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)

# Module wrapper that exposes the model's generate() call as a traceable function
class GenerateModel(tf.Module):
  """Wrap a model so its `generate` call can be exported as a serving signature."""

  def __init__(self, model):
    """Keep a reference to `model`, the object whose `generate` is invoked."""
    super().__init__()
    self.model = model

  @tf.function(input_signature=[
      tf.TensorSpec(shape=(1, 80, 3000), dtype=tf.float32, name="input_ids"),
  ])
  def serving(self, input_ids):
    """Generate from `input_ids` and return the result under the "sequences" key.

    The input signature fixes the argument to a float32 tensor of shape
    (1, 80, 3000). The notebook-level `force_token_map` is forwarded to
    `generate` as `forced_decoder_ids`.
    """
    sequences = self.model.generate(input_ids, forced_decoder_ids=force_token_map)
    return {"sequences": sequences}

# Export the wrapper as a SavedModel, registering `serving` as the
# "serving_default" signature
saved_model_dir = '/content/tf'
generate_model = GenerateModel(model=model)
tf.saved_model.save(generate_model, saved_model_dir, signatures={"serving_default": generate_model.serving})

# Convert the SavedModel to a TFLite flatbuffer with the default optimizations.
# NOTE(review): no `target_spec.supported_ops` is set; if conversion reports
# unsupported ops, enabling Select TF ops may be required — confirm.
tflite_model_path = '/content/whisper-base.tflite'
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Write the converted model bytes to disk
with open(tflite_model_path, 'wb') as f:
    f.write(tflite_model)

## Run the inference on Whisper TFLite model

In [None]:
# Import necessary libraries
import whisper
import numpy as np
from timeit import default_timer as timer


# Path of the TFLite model written by the conversion step
tflite_model_path = '/content/whisper-base.tflite'

# Create an interpreter for the TFLite model.
# `tf` is the TensorFlow module imported in the conversion cell.
interpreter = tf.lite.Interpreter(tflite_model_path)

# Allocate memory for the interpreter's tensors
interpreter.allocate_tensors()

# Tensor indices of the model's first input and first output
input_tensor = interpreter.get_input_details()[0]['index']
output_tensor = interpreter.get_output_details()[0]['index']


# The timed span covers preprocessing, interpreter invocation and decoding
inference_start = timer()

# Calculate the log-mel spectrogram of the audio file
print('Calculating mel spectrogram...')
mel_from_file = whisper.audio.log_mel_spectrogram('/content/whisper/tests/jfk.flac')

# Pad or trim the spectrogram to whisper.audio.N_FRAMES frames
input_data = whisper.audio.pad_or_trim(mel_from_file, whisper.audio.N_FRAMES)

# Add a leading batch dimension to the input data
input_data = np.expand_dims(input_data, 0)

# Run the TFLite model using the interpreter
print("Invoking interpreter ...")
interpreter.set_tensor(input_tensor, input_data)
interpreter.invoke()

# Get the output data from the interpreter
output_data = interpreter.get_tensor(output_tensor)

# Print the output data
#print(output_data)

# Create a tokenizer to convert tokens to text
wtokenizer = whisper.tokenizer.get_tokenizer(True, language="ja")

# Convert tokens to text
print("Converting tokens ...")
for token in output_data:
    # Replace -100 with the end of text token
    token[token == -100] = wtokenizer.eot
    # The tiktoken-based whisper tokenizer forwards extra keyword arguments to
    # tiktoken's Encoding.decode, which does not accept `skip_special_tokens`
    # (TypeError). Whisper's special tokens start at `eot`, so drop every id at
    # or above it here and decode only the text tokens.
    text_ids = [int(t) for t in token if t < wtokenizer.eot]
    text = wtokenizer.decode(text_ids)
    print(text)

print("\nInference took {:.2f}s ".format(timer() - inference_start))

In [None]:
# List the current working directory in long format, including hidden entries
%ls -la