# Long Audio Transcription
Source: https://github.com/machinelearnear/long-audio-transcription-spanish

In [None]:
import time
from os.path import exists as path_exists
from pathlib import Path

In [None]:
# Make sure the folder that save_to_file() writes into exists. Using pathlib
# instead of a `!mkdir` shell escape keeps the cell runnable outside IPython
# and on any OS; exist_ok=True makes re-running the cell a no-op.
Path('transcripts').mkdir(exist_ok=True)

## Download audio from YouTube video

In [None]:
# Identifier of the YouTube video whose audio track is downloaded below.
YouTubeID = "gFFLJaQbLCM"
# Local file name the downloaded audio is stored under.
OutputFile = "test_audio_youtube.m4a"

In [None]:
# Download the audio only when the target file is not on disk yet, so that
# re-running the notebook does not fetch it again.
if not path_exists(OutputFile):
    # IPython shell escape: $OutputFile and $YouTubeID are expanded from the
    # Python variables of the same name before the command is executed.
    !youtube-dl -o $OutputFile $YouTubeID --extract-audio --restrict-filenames -f 'bestaudio[ext=m4a]'

## End to End Automatic Speech Recognition

In [None]:
# Model identifier handed to `pipeline(model=...)` further down.
model_name = "jonatasgrosman/wav2vec2-xls-r-1b-spanish"

In [None]:
def save_to_file(transcript, text=None):
    """Write a transcript to ``transcripts/transcribed_speech_{text}.txt``.

    Args:
        transcript: The text to write, or a dict with a ``'text'`` entry (the
            shape of the pipeline outputs used elsewhere in this notebook).
        text: Label for the transcription method. It is interpolated into the
            file name as-is, so the default produces ``..._None.txt``.

    Raises:
        KeyError: If ``transcript`` is a dict without a ``'text'`` key.
    """
    # Accept the raw pipeline result as well as the already-extracted string.
    if isinstance(transcript, dict):
        transcript = transcript['text']

    target = Path('transcripts') / f'transcribed_speech_{text}.txt'
    # Create the output folder on demand so saving does not fail when it is
    # missing.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as f:
        f.write(transcript)

### Option A: Process long audio file directly with `Pipelines`

In [None]:
from transformers import pipeline

In [None]:
# Build the inference pipeline from the model name alone (no task is passed
# explicitly). `pipe` is reused by every transcription option below.
pipe = pipeline(model=model_name)

In [None]:
# Option A: hand the whole audio file to the pipeline and time the call.
# NOTE: this processes the complete recording and can take a while.
stime = time.time()
# chunk_length_s / stride_length_s make the pipeline process the file in
# overlapping windows itself. The call returns a dict; keep only its 'text'
# entry so `transcript` is a plain string, as in the other options.
transcript = pipe(OutputFile, chunk_length_s=10, stride_length_s=(4, 2))['text']
print(f'total time: {time.time()-stime:.2f} seconds')

In [None]:
# Show the Option A transcript and store it under the 'hf_pipelines' label.
print(transcript)
save_to_file(transcript,text='hf_pipelines')

### Option B: Split audio files in chunks by timestamp (`PyDub`)

In [None]:
import torch
import pydub
import array
import numpy as np
from pydub.utils import mediainfo
from pydub import AudioSegment
from pydub.utils import get_array_type

In [None]:
# Load the downloaded audio file as a pydub segment.
pydub_speech = pydub.AudioSegment.from_file(OutputFile)

In [None]:
# Preview the first 15 seconds; pydub segments are sliced in milliseconds.
pydub_speech[:15*1000] # milliseconds

In [None]:
def audio_resampler(sound, sample_rate=16000):
    """Turn a pydub segment into a one-channel NumPy waveform of doubles.

    The segment is resampled to ``sample_rate``, only the first channel
    returned by ``split_to_mono()`` is kept, and its raw sample values are
    converted to ``np.double`` without any rescaling.

    Args:
        sound: Audio segment to convert.
        sample_rate: Target frame rate in Hz.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    resampled = sound.set_frame_rate(sample_rate)
    first_channel = resampled.split_to_mono()[0]
    # Look up the `array` type code that matches the sample width in bits.
    type_code = pydub.utils.get_array_type(first_channel.sample_width * 8)
    raw_samples = array.array(type_code, first_channel._data)

    return np.array(raw_samples, dtype=np.double), sample_rate

In [None]:
# Convert the pydub segment to a NumPy waveform at the resampler's default
# rate (16000 Hz).
speech, sample_rate = audio_resampler(pydub_speech)

In [None]:
# Option B: cut the waveform into pieces of at most 30 seconds and transcribe
# them one by one.
chunk_seconds = 30
# np.array_split needs a positive integer number of sections. Round up so no
# piece exceeds `chunk_seconds`, and use at least one section so audio shorter
# than a single chunk does not raise ValueError.
n_sections = max(1, int(np.ceil(len(speech) / (sample_rate * chunk_seconds))))

texts = []
for chunk in np.array_split(speech, n_sections)[:2]:  # only the first two chunks
    output = pipe(chunk)
    texts.append(output['text'])
    print(output)

transcript = ' '.join(texts).strip()

In [None]:
# Show the Option B transcript and store it under the 'pydub_timestamps' label.
print(transcript)
save_to_file(transcript,text='pydub_timestamps')

### Option C: Split audio files based on silence detection (`Librosa`/`PyDub`)

(1) `Librosa`

In [None]:
import librosa
from librosa import display

import matplotlib.pyplot as plt

In [None]:
# Load the audio with librosa, requesting a 16 kHz sampling rate. This
# overwrites the `speech` / `sample_rate` values produced in Option B.
speech, sample_rate = librosa.load(OutputFile,sr=16000)

In [None]:
# Plot the waveform of the first 30 seconds as a visual check of where the
# quiet sections are.
plt.figure()
librosa.display.waveshow(y=speech[:30*sample_rate], sr=sample_rate) # first 30 seconds
# Set the axis labels after drawing so they apply to the axes just drawn on.
plt.xlabel('Time (seconds)')
plt.ylabel('Amplitude')
plt.show()

In [None]:
# Find the non-silent intervals of the waveform, using a 50 dB threshold.
# Each entry is used below as a (start, end) pair of sample indices.
non_mute_sections_in_speech = librosa.effects.split(speech,top_db=50)

In [None]:
# Transcribe the first six non-silent intervals one at a time and join the
# recognised pieces into a single space-separated string.
texts = []
for chunk in non_mute_sections_in_speech[:6]:
    start, end = chunk
    speech_chunk = speech[start:end]
    output = pipe(speech_chunk)
    texts.append(output['text'])
    print(output)

transcript = ' '.join(texts).strip()

In [None]:
# Show the librosa-based transcript and store it under the 'librosa_silence' label.
print(transcript)
save_to_file(transcript,text='librosa_silence')

(2) `PyDub`

In [None]:
# Load the audio file as a pydub segment again, so this section can be run
# on its own.
pydub_speech = pydub.AudioSegment.from_file(OutputFile)

In [None]:
# Import the helper explicitly rather than relying on `import pydub` having
# loaded the `silence` submodule as a side effect.
from pydub.silence import split_on_silence

# Cut the recording wherever there are at least 500 ms quieter than 16 dB
# below the clip's overall level (`dBFS`).
chunks = split_on_silence(
    pydub_speech,
    min_silence_len=500,
    silence_thresh=pydub_speech.dBFS - 16,
    keep_silence=250,  # optional
)

# minimum chunk length
target_length = 20 * 1000  # 20 seconds (pydub lengths are in milliseconds)

# Merge consecutive pieces until each merged chunk reaches the target length.
# Starting from an empty list (instead of `chunks[0]`) keeps this working when
# no non-silent piece was found.
output_chunks = []
for chunk in chunks:
    if output_chunks and len(output_chunks[-1]) < target_length:
        output_chunks[-1] += chunk
    else:
        # the last output chunk is long enough (or there is none yet),
        # so start a new one
        output_chunks.append(chunk)

In [None]:
# Display the first merged chunk as the cell output.
output_chunks[0]

In [None]:
# Transcribe the first six merged chunks: convert each pydub chunk to a NumPy
# waveform, run the pipeline on it, and join the recognised pieces.
texts = []
for chunk in output_chunks[:6]:
    speech_chunk, sample_rate = audio_resampler(chunk)
    output = pipe(speech_chunk)
    texts.append(output['text'])
    print(output)

transcript = ' '.join(texts).strip()

In [None]:
# Show the pydub silence-based transcript and store it under the 'pydub_silence' label.
print(transcript)
save_to_file(transcript,text='pydub_silence')

## Compare results

In [15]:
from utils import *
from IPython.display import HTML, display

In [16]:
# Paths of the two saved transcripts to diff. Uncomment one of the
# alternative `compare` lines to diff against a different method.
base = "transcripts/transcribed_speech_hf_pipelines.txt"
compare = "transcripts/transcribed_speech_pydub_timestamps.txt"
# compare = "transcripts/transcribed_speech_pydub_silence.txt"
# compare = "transcripts/transcribed_speech_librosa_silence.txt"

In [17]:
# Read the first line of each transcript and render an HTML diff of the two.
# `with` closes the files, and readline() returns '' for an empty file instead
# of raising IndexError.
with open(base, 'r') as f:
    a = f.readline()[:1000]  # only the first 1000 characters of the base text
# Read from `compare`, the variable defined in the cell above.
with open(compare, 'r') as f:
    b = f.readline()
print(f'Original: {base} / Compare: {compare}')
# html_diffs is presumably provided by the star import from `utils` (verify).
display(HTML(html_diffs(a,b)))

Original: transcripts/transcribed_speech_pydub_silence.txt / Compare: transcripts/transcribed_speech_pydub_timestamps.txt


## References

- [Making automatic speech recognition work on large files with Wav2Vec2 in Transformers](https://huggingface.co/blog/asr-chunking)
- [Boosting Wav2Vec2 with n-grams in Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)
- https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish
- https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish
- https://huggingface.co/spaces/speech-recognition-community-v2/FinalLeaderboard

### Option D: Stream audio using `Librosa`

In [None]:
# import nltk
# nltk.download('punkt')

# def correct_sentence(input_text):
#     sentences = nltk.sent_tokenize(input_text)
#     return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))

# def asr_transcript(tokenizer, model, input_file):
#     transcript = ""
#     # Ensure that the sample rate is 16k
#     print(librosa.get_samplerate(input_file))

#     # Stream over 30 seconds chunks rather than load the full file
#     stream = librosa.stream(
#         input_file,
#         block_length=30,
#         frame_length=16000,
#         hop_length=16000
#     )

#     for speech in stream:
#         if len(speech.shape) > 1:
#             speech = speech[:, 0] + speech[:, 1]

#         input_values = tokenizer(speech, return_tensors="pt").input_values
#         logits = model(input_values).logits

#         predicted_ids = torch.argmax(logits, dim=-1)
#         transcription = tokenizer.decode(predicted_ids[0])
#         transcript += correct_sentence(transcription.lower())

#     return transcript