In [1]:
# Install the notebook's dependencies into the environment of the running kernel.
# `%pip` (rather than `!pip`) targets the kernel's own interpreter; the extras spec is
# quoted so the shell does not treat the brackets as a glob pattern, and `-q` keeps the
# installer log out of the saved notebook.
# NOTE(review): transformers is installed unpinned from the GitHub default branch, so a
# re-run may pick up a different commit — pin a release or commit once one is known to work.
%pip install -q --upgrade git+https://github.com/huggingface/transformers.git accelerate "datasets[audio]" gradio

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-l1lgx1kn
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-l1lgx1kn
  Resolved https://github.com/huggingface/transformers.git to commit deba7655e6e54fb885e79204dec9f767393dd2df
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.6/302.6 kB 6.1 MB/s eta 0:00:00
Collecting datasets[audio]
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 542.0/542.0 kB 10.0 MB/s eta 0:00:00
Collecting gradi

In [2]:
import os

import gradio as gr
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

# Device setup: use the first CUDA device with half precision when CUDA is
# available, otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the Whisper checkpoint for transcription and place it on the chosen device.
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor used below.
processor = AutoProcessor.from_pretrained(model_id)
# Ensure the sampling rate is set correctly
processor.feature_extractor.sampling_rate = 16000

# Speech-recognition pipeline shared by the Gradio callback: audio is handled in
# 30-second chunks, 16 per batch, and timestamps are requested in the result.
transcription_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)


# Gradio callback: run the shared pipeline twice over one audio input
def transcribe_and_process(audio):
    """Transcribe ``audio`` with the module-level ``transcription_pipe``.

    The pipeline is invoked twice on the same input: once with its default
    generation settings, and once with ``generate_kwargs={"language": "english"}``.

    Args:
        audio: Value forwarded unchanged to the pipeline. ``None`` is treated
            as "nothing was provided".

    Returns:
        A ``(first, second)`` tuple of strings:
        * both ``"text"`` results when the two pipeline calls succeed;
        * an upload prompt and ``""`` when ``audio`` is ``None``;
        * ``"An error occurred: ..."`` and ``""`` when any exception is raised
          while transcribing (the exception is reported, not re-raised).
    """
    if audio is not None:
        try:
            # Default pass
            default_text = transcription_pipe(audio)["text"]

            # Second pass with the language forced to English
            english_text = transcription_pipe(
                audio, generate_kwargs={"language": "english"}
            )["text"]
        except Exception as exc:
            return f"An error occurred: {exc}", ""
        return default_text, english_text

    return "Please upload an audio file.", ""

# Gradio interface setup: one audio input (passed to the callback as a file path)
# and two text outputs matching the callback's two return values.
inputs = gr.Audio(type="filepath", label="Upload Audio File")
outputs = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Processed Text"),
]

# Candidate example clips. Only files that are actually present are offered, so the
# app is never built with a reference to a missing file (e.g. on a fresh runtime
# where nothing has been uploaded yet). With no usable clip, `examples` is None and
# the interface is created without an examples section.
example_paths = ["ben.mp3"]  # Replace with the actual path to your audio file
examples = [[path] for path in example_paths if os.path.isfile(path)] or None

app = gr.Interface(
    fn=transcribe_and_process,
    inputs=inputs,
    outputs=outputs,
    title="Audio Transcription and Processing",
    examples=examples,
)
# share=True requests a temporary public URL for the app.
# NOTE(review): presumably anyone holding that link can submit audio while it is
# live — confirm this exposure is intended.
app.launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://936180d26553b9074c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


