In [2]:
import os
import sys
# Ensure transformers module is accessible
#transformers_path = os.path.join(os.getcwd(), "transformers/src")
#sys.path.insert(0, str(transformers_path))
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "/home/azureuser/laurin/code/research/output/crisper_whisper_timestamp_finetuned"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
model.generation_config.median_filter_width=3
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=800,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
print(sample)
result = pipe(sample, return_timestamps="word")
print(result)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.09s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ]), 'sampling_rate': 16000}


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


{'text': 'Mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mister Quilter is manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind, he has grave doubts whether Sir Frederick Leighton is work is really Greek after all, and can discover in it but little of rocky Ithaca. Lynelle is pictures are a sort of up guards and Adam paintings, and Mason is exquisite. Idols are as national as a jingo poem. Mister Burkett Foster is landscapes smile at one much in the same way that Mister Carker used to flash his teeth, and Mister John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, Next man.', 'chunks': [{'text': 'Mister', 'timestamp': (0.0, 0.78)}, {'text': 'Quilter', 'timestamp': (0.88, 1.22)}, {'text': 'is', 'timestamp': (1.32, 1.38)

In [13]:
def timestamps_to_srt(timestamps):
  """
  Converts a list of timestamps with text data to SubRip (.srt) format string.

  Args:
      timestamps: A list of dictionaries containing 'text' and 'timestamp' keys.

  Returns:
      A string containing the subtitle data in SubRip format.
  """
  srt_content = ""
  # Counter for subtitle line numbers
  counter = 1
  for word in timestamps:
    start_time, end_time = word["timestamp"]
    # Format timestamps into hours:minutes:seconds.milliseconds format
    start_time_str = f"{int(start_time // 3600):02d}:{int(start_time // 60 % 60):02d}:{start_time % 60:.03f}"
    end_time_str = f"{int(end_time // 3600):02d}:{int(end_time // 60 % 60):02d}:{end_time % 60:.03f}"
    
    # Add subtitle line with counter, timings, and text
    srt_content += f"{counter}\n{start_time_str} --> {end_time_str}\n{word['text']}\n\n"
    counter += 1
  return srt_content


srt_string = timestamps_to_srt(result['chunks'])
srt_string

'1\n00:00:0.000 --> 00:00:0.780\nMister\n\n2\n00:00:0.880 --> 00:00:1.220\nQuilter\n\n3\n00:00:1.320 --> 00:00:1.380\nis\n\n4\n00:00:1.420 --> 00:00:1.520\nthe\n\n5\n00:00:1.620 --> 00:00:2.040\napostle\n\n6\n00:00:2.160 --> 00:00:2.240\nof\n\n7\n00:00:2.300 --> 00:00:2.320\nthe\n\n8\n00:00:2.400 --> 00:00:2.580\nmiddle\n\n9\n00:00:2.740 --> 00:00:3.220\nclasses,\n\n10\n00:00:3.320 --> 00:00:3.440\nand\n\n11\n00:00:3.480 --> 00:00:3.600\nwe\n\n12\n00:00:3.620 --> 00:00:3.660\nare\n\n13\n00:00:3.720 --> 00:00:4.040\nglad\n\n14\n00:00:4.100 --> 00:00:4.180\nto\n\n15\n00:00:4.260 --> 00:00:4.560\nwelcome\n\n16\n00:00:4.640 --> 00:00:4.820\nhis\n\n17\n00:00:4.900 --> 00:00:5.480\ngospel.\n\n18\n00:00:6.400 --> 00:00:6.640\nNor\n\n19\n00:00:6.720 --> 00:00:6.920\nis\n\n20\n00:00:6.980 --> 00:00:7.200\nMister\n\n21\n00:00:7.280 --> 00:00:7.600\nQuilter\n\n22\n00:00:7.620 --> 00:00:7.680\nis\n\n23\n00:00:7.780 --> 00:00:8.060\nmanner\n\n24\n00:00:8.220 --> 00:00:8.420\nless\n\n25\n00:00:8.520

In [7]:
sample = dataset[0]["audio"]

In [8]:
pipe(sample['array'], return_timestamps="word")

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


{'text': 'Mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mister Quilter is manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind, he has grave doubts whether Sir Frederick Leighton is work is really Greek after all, and can discover in it but little of rocky Ithaca. Lynelle is pictures are a sort of up guards and Adam paintings, and Mason is exquisite. Idols are as national as a jingo poem. Mister Burkett Foster is landscapes smile at one much in the same way that Mister Carker used to flash his teeth, and Mister John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, Next man.',
 'chunks': [{'text': 'Mister', 'timestamp': (0.0, 0.78)},
  {'text': 'Quilter', 'timestamp': (0.88, 1.22)},
  {'text': 'is', 'timestamp': (1.32, 

In [4]:
import streamlit as st
import torchaudio
import torch
from transformers import pipeline
from scipy.io import wavfile
import numpy as np
import gradio as gr
from transformers import pipeline
import numpy as np
import torchaudio
from torchaudio import transforms
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "/home/azureuser/laurin/code/research/output/crisper_whisper_timestamp_finetuned"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
model.generation_config.median_filter_width=3
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=800,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
# Initialize the transformer pipeline
pipe = pipeline("automatic-speech-recognition")

# Define the transcribe function
def transcribe(audio_bytes):
    sr, y = wavfile.read(audio_bytes)
    transform = torchaudio.transforms.Resample(sr, 16000)
    waveform = transform(torch.tensor(y).float())
    
    # Ensure waveform is a numpy array and run through the model
    transcription = pipe(waveform.numpy(), return_timestamps="word")
    
    # Extracting just the transcribed text for simplicity
    text = transcription['text'] if 'text' in transcription else "Transcription failed"
    return text

# Streamlit interface
st.title("Speech to Text Transcription")
st.write("Upload an audio file to transcribe it.")

# Upload audio file
audio_file = st.file_uploader("Upload an audio file", type=["wav"])

if audio_file is not None:
    # Display the audio player
    st.audio(audio_file)
    
    # Perform transcription
    transcription = transcribe(audio_file)
    
    # Display the transcription result
    st.write("Transcription:")
    st.write(transcription)


No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 55bb623 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebo