In [42]:
!pip install transformers
!pip install -U datasets
!pip install soundfile # The soundfile module can read and write sound files
!pip install librosa
!pip install gradio
# Gradio is an open-source Python package that allows you to quickly build a demo or web application for your machine learning model, API, or any arbitrary Python function.



In [43]:
# NOTE: `logging` here is the transformers logging utility, not the stdlib module.
from transformers.utils import logging
# Only report errors from transformers, silencing warnings and info messages.
logging.set_verbosity_error()

In [44]:
from datasets import load_dataset

In [45]:
# Stream the "train.clean.100" split of LibriSpeech rather than loading it in one go.
dataset = load_dataset(
    "librispeech_asr",
    split="train.clean.100",
    streaming=True,          # iterate lazily over the data
    trust_remote_code=True,  # allow the dataset's remote loading code to run
)

In [46]:
# streaming=True: the dataset is loaded in chunks, allowing you to process it piece by piece.
# trust_remote_code=True: you state that you trust the code that is executed to load the dataset from a remote source.


In [47]:
# Grab the first example from the streamed dataset.
ex = next(iter(dataset))

In [48]:
# Peek at the first five examples; list() materialises the streamed subset for display.
dataset_head = dataset.take(5)
list(dataset_head)

[{'file': '374-180298-0000.flac',
  'audio': {'path': '374-180298-0000.flac',
   'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
          -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
   'sampling_rate': 16000},
  'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
  'speaker_id': 374,
  'chapter_id': 180298,
  'id': '374-180298-0000'},
 {'file': '374-180298-0001.flac',
  'audio': {'path': '374-180298-0001.flac',
   'array': array([-9.15527344e-05, -1.52587891e-04, -1.52587891e-04, ...,
          -2.13623047e-04, -1.83105469e-04, -2.74658203e-04]),
   'sampling_rate': 16000},
  'text': "MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN",
  'speaker_id': 374

In [49]:
# Show every field of the first example (file, audio, text, speaker_id, chapter_id, id).
print(ex)

{'file': '374-180298-0000.flac', 'audio': {'path': '374-180298-0000.flac', 'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
       -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]), 'sampling_rate': 16000}, 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED', 'speaker_id': 374, 'chapter_id': 180298, 'id': '374-180298-0000'}


In [50]:
from IPython.display import Audio as IPythonAudio

# Inline audio player for the first example, played at the clip's own sampling rate.
IPythonAudio(data=ex["audio"]["array"], rate=ex["audio"]["sampling_rate"])

Let's build the pipeline

In [51]:

from transformers import pipeline

In [52]:
# Speech-to-text pipeline backed by the "distil-whisper/distil-small.en" checkpoint.
asr = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)

In [53]:
# Sampling rate configured on the pipeline's feature extractor.
asr.feature_extractor.sampling_rate

16000

In [54]:
# Sampling rate of the dataset example, to compare with the feature extractor's rate.
ex['audio']['sampling_rate']

16000

In [55]:
# Only the raw array is passed, so no sampling rate travels with it; that is fine
# here because the clip and the feature extractor both report 16000.
asr(ex["audio"]["array"])

{'text': ' Chapter 16 I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I too agree to whatever Marguerite wished.'}

In [56]:

# Reference transcript from the dataset, for comparison with the model output.
print(ex["text"])

CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED


Gradio

In [57]:
import gradio as gr

In [58]:
import os


In [59]:
# sp = gr.Blocks()

In [60]:
def transcribe_speech(filepath):
    """Transcribe the audio file at `filepath` with the module-level `asr` pipeline.

    Args:
        filepath: Path to the audio file, or None when no audio was provided.

    Returns:
        The "text" field of the pipeline output, or "" (after issuing a
        Gradio warning) when `filepath` is None.
    """
    if filepath is not None:
        return asr(filepath)["text"]

    # Nothing to transcribe: surface a warning in the UI and return an empty result.
    gr.Warning("No Audio Found")
    return ""

In [61]:
# Transcription
# Microphone tab: the recorded audio is handed to transcribe_speech as a file path.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(type="filepath", sources="microphone"),
    outputs=gr.Textbox(lines=3, label="Transcription"),
    allow_flagging="never",  # flagging disabled for simplicity
)

In [62]:
# uploading
# Upload tab: the uploaded audio file is handed to transcribe_speech as a file path.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(type="filepath", sources="upload"),
    outputs=gr.Textbox(lines=3, label="Transcription"),
    allow_flagging="never",  # flagging disabled for simplicity
)

In [63]:
# Empty Blocks container that will host the tabbed demo.
demo = gr.Blocks()

In [64]:
# Mount the two interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[mic_transcribe, file_transcribe],
        tab_names=["Transcribe Microphone", "Transcribe Audio File"],
    )

# debug=True keeps the cell running so errors and logs are shown in the notebook.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://bf7c1480de2232a00b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://bf7c1480de2232a00b.gradio.live




In [65]:
# Shut down the server that demo.launch() started.
demo.close()

Closing server running on port: 7860


In [66]:
import soundfile as sf
import io # file-related input and output operations; NOTE(review): not used in the cells visible here

In [67]:
# Read the WAV file into the audio samples and the file's sampling rate.
# NOTE(review): absolute Colab path; the file must already exist under /content.
audio, sampling_rate = sf.read('/content/6_Channel_ID.wav') # example audio is taken from McGill University

In [68]:
sampling_rate # sampling rate of the file just read, to compare with the feature extractor's rate

44100

In [69]:
# Sampling rate on the pipeline's feature extractor, to compare with the file's rate.
asr.feature_extractor.sampling_rate

16000

In [70]:
# Demonstrates the failure: the pipeline expects single-channel audio, and this file is multi-channel.
asr(audio) # we have to down-mix the audio to mono first

ValueError: We expect a single channel audio input for AutomaticSpeechRecognitionPipeline

In [71]:
import librosa

Multi-channel (6 channels) to Mono

In [72]:
# Shape of the loaded audio; the second axis is the channel count.
audio.shape

(174752, 6)

In [73]:

import numpy as np

In [74]:
# Swap the axes so that channels come first, the layout handed to librosa next.
audio_tp = audio.T

In [75]:
# Confirm the axes were swapped: channels first, samples second.
audio_tp.shape

(6, 174752)

In [76]:
# Down-mix all channels into a single mono signal.
mono = librosa.to_mono(audio_tp)

In [77]:
# Listen to the down-mixed signal at the file's original sampling rate.
IPythonAudio(data=mono, rate=sampling_rate)

In [78]:
# `mono` is still at the file's sampling rate, which differs from the feature
# extractor's rate, and a bare array carries no sampling-rate information.
asr(mono)

{'text': ' For this, for this, for this, for this.'}

In [79]:
# Sampling rate of the audio file.
sampling_rate

44100

In [80]:
asr.feature_extractor.sampling_rate # differs from the file's rate, so the audio must be resampled

16000

In [81]:
# Resample the mono signal to the rate configured on the ASR feature extractor.
# The target is read from the pipeline instead of hard-coding 16000, so it
# cannot drift from the model's setting.
a_16KHz = librosa.resample(
    mono,
    orig_sr=sampling_rate,
    target_sr=asr.feature_extractor.sampling_rate,
)

In [82]:
# Transcribe the resampled audio and keep only the per-chunk results,
# each of which carries a timestamp pair and its text.
asr(
    a_16KHz,
    chunk_length_s=30, # 30 seconds
    batch_size=4,
    return_timestamps=True,  # needed for the "chunks" entry read below
)["chunks"]

[{'timestamp': (0.0, 3.84),
  'text': ' Front left, front right, center, back left, back.'}]

Now build the Gradio interface for long-form transcription

In [84]:
import gradio as gr
# Fresh Blocks container for the long-form demo; rebinding `demo` replaces the earlier one.
demo = gr.Blocks()

In [98]:
def transcribe_long_form(filepath):
    """Transcribe the audio file at `filepath` using chunked inference.

    The module-level `asr` pipeline is called with 30-second chunks, a batch
    size of 8 and at most 256 new tokens.

    Args:
        filepath: Path to the audio file, or None when no audio was provided.

    Returns:
        The "text" field of the pipeline output, or "" (after issuing a
        Gradio warning) when `filepath` is None.
    """
    if filepath is not None:
        result = asr(filepath, max_new_tokens=256, chunk_length_s=30, batch_size=8)
        return result["text"]

    # Nothing to transcribe: surface a warning in the UI and return an empty result.
    gr.Warning("No audio found!")
    return ""

In [99]:
# Build the microphone and upload interfaces from one template; they differ
# only in the audio source. The microphone interface is created first.
mic_transcribe, file_transcribe = (
    gr.Interface(
        fn=transcribe_long_form,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Textbox(label="Transcription", lines=3),
        allow_flagging="never",
    )
    for source in ("microphone", "upload")
)

In [100]:
# Mount the two long-form interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[mic_transcribe, file_transcribe],
        tab_names=["Transcribe Microphone", "Transcribe Audio File"],
    )

# share=True serves the app through a temporary public URL.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a5a8d74ac61cb3c2f2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [96]:
# Shut down the server that demo.launch() started.
demo.close()

Closing server running on port: 7860
