In [1]:
import sys
import os
from os.path import splitext, join
from collections import defaultdict

import librosa
import soundfile as sf
import torch


import gradio as gr
from gradio.components import Audio, Dropdown, Textbox, Image




import txtgrid_master.TextGrid_Master as tm
from core.utils import generate_file_basename, load_speech_file, zip_files
from config import output_dir, MAX_DUR, ASR_MODEL, SPEECH_LABEL, LM_PATH, device

from tasks.sd import pyannote_sd
from tasks.vad import pyannote_vad
from tasks.asr import wav2vec_asr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_asr(params=None):
    #TODO add to the asr class to read parameters and load the correct model? or a separate function

    try:
        asr_engine = wav2vec_asr.speech_recognition(ASR_MODEL, device= device, lm_model_path=LM_PATH) #This from parameters and has default one #If language not determine use language id
    except:
        print(f'Error loading asr model {ASR_MODEL}')
        raise "Error in loading asr model"
    
    return asr_engine



def load_sd(params=None):
    
    try:
        diarizer = pyannote_sd.speaker_diar(device=device)
    except Exception as e:
        print(f'Error loading speaker diarization model')
        raise f"Error in loading speaker diarization model {e}"
    
    return diarizer



vad_params = {
             'min_duration_off': 0.09791355693027545,
             'min_duration_on': 0.05537587440407595
             }

def load_vad(params=None):
    model_path = 'Models/VAD/pytorch_model.bin'
    try:
        vad_pipeline = pyannote_vad.speech_detection(model_path, params)
    except:
        print(f'Error loading voice activity detection model {model_path}')
        raise "Error in loading voice activity detection model"   
    return vad_pipeline

In [3]:
def process_file(speech_file, tasks=['SD', 'ASR'], parameters=None, progress=gr.Progress()):
    basename = generate_file_basename()
    
    speech, sr, duration = load_speech_file(speech_file)
    
    #This save a version of the speech file with 16k, mono, 16bit
    speech_file = join(output_dir,f'{basename}.wav')
    sf.write(speech_file, speech, sr)
    
    tasks = set(tasks)
    
    task_pipeline = []
    if len(tasks) == 1:
        if 'ASR' in tasks and duration > MAX_DUR: #Add VAD task to split the speech file by sil
            task_pipeline = ['VAD', 'ASR']
        else:
            task_pipeline = list(tasks)
    elif set(tasks) == set(['SD', 'ASR']):
        task_pipeline = ['SD', 'ASR']
    elif set(tasks) == set(['VAD', 'ASR']):
        task_pipeline = ['VAD', 'ASR']
    elif set(tasks) == set(['VAD', 'ASR', 'SD']): #If both SD, VAD and ASR then ASR will be applied on SD output
        task_pipeline = ['VAD', 'SD', 'ASR']
    else:
        task_pipeline = tasks #Only 'SD' and 'VAD' each one will be applied separetly

    #print(speech_file, tasks, duration)
    if 'ASR' in task_pipeline:
        asr_engine = load_asr()
    
    if 'SD' in task_pipeline:
        diarizer = load_sd()
        
    if 'VAD' in task_pipeline:
        vad_engine = load_vad(params=vad_params)
    
    out_textgrid = []
    
    i = 0
    for task in task_pipeline:
        progress(i/(len(task_pipeline)+1), desc=f"Applying {task}")
        i = i+1
        if task == 'ASR':
            texgrid_file = join(output_dir,f'{basename}_ASR.TextGrid')
            if not out_textgrid:
                dTiers_asr = asr_engine.process_speech(speech)
            else:
                input_textgrid = out_textgrid[-1]
                dTiers_asr = asr_engine.process_intervals(speech, input_textgrid, sr = sr, offset_sec=0, speech_label = SPEECH_LABEL)
            
            tm.WriteTxtGrdFromDict(texgrid_file,dTiers_asr,0,duration)
            out_textgrid.append(texgrid_file)
        
        elif task == 'VAD':
            rttm_file = join(output_dir,f'{basename}_VAD.rttm')
            texgrid_file = join(output_dir,f'{basename}_VAD.TextGrid')
            vad_engine.DoVAD(speech,sr)
            vad_engine.write_rttm(rttm_file)
            vad_engine.write_textgrid(texgrid_file, speech_label=SPEECH_LABEL)
            out_textgrid.append(texgrid_file)
        
        elif task == 'SD':
            rttm_file = join(output_dir,f'{basename}_SD.rttm')
            texgrid_file = join(output_dir,f'{basename}_SD.TextGrid')
            diarizer.diarize(speech=speech, sr=sr)
            diarizer.write_rttm(rttm_file)
            diarizer.write_textgrid(texgrid_file, speech_label=SPEECH_LABEL)
            out_textgrid.append(texgrid_file)
    
    
    output_zip_file = join(output_dir,f'{basename}_output.zip')
    
    progress(len(task_pipeline)/(len(task_pipeline)+1), desc=f"Create output archive")
    p = zip_files(out_textgrid, output_zip_file)
    
    progress(1, desc=p)
    
    return [p, gr.DownloadButton(label=f"Download output file", value=output_zip_file, visible=True), 
           gr.DownloadButton(label=f"Download speech file", value=speech_file, visible=True)]

In [4]:
#TODO: Progress bar
#TODO: VAD ########DONE
#TODO: Number of speakers
#TODO: Word alignment
#TODO: Download button  ######DONE
#TODO: Wrape in a docker #####DONE
#TODO: Test pyannote offline  ########DONE
#TODO: Logging of errors and info
#TODO: ASR with LM  ########DONE
#TODO: Process batch
#TODO: Kaldi ASR
#TODO: MMS ASR
#TODO: Parameters for ASR (hotwords, LM/noLM)



with gr.Blocks(theme=gr.themes.Soft()) as gui:

    record_audio = gr.Audio(sources=["microphone","upload"], type="filepath")

    tasks = gr.CheckboxGroup(choices=[("Speech to Text","ASR"), ("Speaker Separation","SD"),("Speech Detection","VAD")], label="Tasks", info="Apply the following tasks:")

    process = gr.Button("Process Audio")

    output_text = gr.Textbox(label='Progress', interactive=False)
    with gr.Row():
        with gr.Column():
            d1 = gr.DownloadButton("Download output", visible=False)
        with gr.Column():
            d2 = gr.DownloadButton("Download speech file", visible=False)

    process.click(process_file, inputs=[record_audio, tasks], outputs=[output_text, d1, d2])
    
    
     
gui.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




Some weights of the model checkpoint at /Users/z5173707/root/projects/FHS/alKhalilVox/Models/ASR/wav2vec2-large-xlsr-53-english/ were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /Users/z5173707/root/projects/FHS/alKhalilVox/Models/ASR/wav2vec2-large-xlsr-53-english/ and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

# Draft

In [5]:
import torch

In [6]:
torch.cuda.is_available()

False

In [55]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor

In [56]:
model_id = "facebook/mms-1b-all"

In [57]:
model = Wav2Vec2ForCTC.from_pretrained(model_id)

Some weights of the model checkpoint at facebook/mms-1b-all were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-all and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

In [58]:
processor = AutoProcessor.from_pretrained(model_id)

In [59]:
processor.tokenizer.set_target_lang("eng")

In [60]:
model.load_adapter("eng")

In [61]:
speech, sr, duration = load_speech_file('../tmp/file_6820_20240928_232234_530279.wav')

In [62]:
duration

19.26

In [63]:
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")

In [64]:
with torch.no_grad():
    outputs = model(**inputs).logits

In [65]:
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)

In [66]:
transcription

"okay now i will pretend i'm two this is the first speaker tis is a second speaer  first one will say hallo second one will say hallo back"

In [107]:
!pip install pyctcdecode==0.5.0



In [105]:
from pyctcdecode import build_ctcdecoder
import pyctcdecode

In [106]:
pyctcdecode

<module 'pyctcdecode' from '/Users/z5173707/root/projects/FHS/venv/python3.12.6/lib/python3.12/site-packages/pyctcdecode/__init__.py'>

In [18]:
lm_path = 'Models/ASR/LM/ngram/4gram_small.arpa.gz'
lm_path_unzip = 'Models/ASR/LM/ngram/4gram_small.arpa'

In [26]:
import gzip
import os, shutil
with gzip.open(lm_path, 'rb') as f_zipped:
    with open(lm_path_unzip, 'wb') as f_unzipped:
        shutil.copyfileobj(f_zipped, f_unzipped)

In [19]:
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

In [20]:
import kenlm

In [21]:
from pyctcdecode import build_ctcdecoder

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path=lm_path_unzip,
)


Loading the LM will be faster if you build a binary file.
Reading /Users/z5173707/root/projects/FHS/alKhalilVox/Models/ASR/LM/ngram/4gram_small.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?


In [22]:
text = decoder.decode(outputs)

ValueError: Input logits have 3 dimensions, but need 2: (time, vocabulary)

In [None]:
uppercase_lm_path = '3-gram.pruned.1e-7.arpa'
if not os.path.exists(uppercase_lm_path):
    with gzip.open(lm_gzip_path, 'rb') as f_zipped:
        with open(uppercase_lm_path, 'wb') as f_unzipped:
            shutil.copyfileobj(f_zipped, f_unzipped)
    print('Unzipped the 3-gram language model.')
else:
    print('Unzipped .arpa already exists.')

lm_path = 'lowercase_3-gram.pruned.1e-7.arpa'
if not os.path.exists(lm_path):
    with open(uppercase_lm_path, 'r') as f_upper:
        with open(lm_path, 'w') as f_lower:
            for line in f_upper:
                f_lower.write(line.lower())
print('Converted language model file to lowercase.')

In [15]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

In [77]:
model_id = 'Models/ASR/wav2vec2-large-xlsr-53-english/'

In [78]:
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

Some weights of the model checkpoint at Models/ASR/wav2vec2-large-xlsr-53-english/ were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Models/ASR/wav2vec2-large-xlsr-53-english/ and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You sho

In [79]:
from core.utils import generate_file_basename, load_speech_file, zip_files
from pyctcdecode import build_ctcdecoder

In [80]:
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

In [81]:
lm_path = 'Models/ASR/LM/ngram/4gram_big.arpa.gz'
lm_path_unzip = 'Models/ASR/LM/ngram/4gram_big.arpa'

In [82]:
import gzip
import os, shutil
with gzip.open(lm_path, 'rb') as f_zipped:
    with open(lm_path_unzip, 'wb') as f_unzipped:
        shutil.copyfileobj(f_zipped, f_unzipped)

In [98]:
from pyctcdecode import build_ctcdecoder
#TODO: Create .bin LM

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path=lm_path_unzip,  # either .arpa or .bin file
     alpha=0.5,  # tuned on a val set
    beta=1.0,  # tuned on a val set
)


Loading the LM will be faster if you build a binary file.
Reading /Users/z5173707/root/projects/FHS/alKhalilVox/Models/ASR/LM/ngram/4gram_big.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [84]:
speech, sr, duration = load_speech_file('../tmp/file_6820_20240928_232234_530279.wav')

In [85]:
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)

In [86]:
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

In [87]:
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

In [88]:
predicted_sentences

["okay now i will pretend i'm two this is the first speakerthis is a secondy speaker first one will say hellosecond one will say hello back"]

In [109]:
x = logits.squeeze()
x = x.cpu().detach().numpy()
hotwords = None #["hello", "second"]
text = decoder.decode(x,
                     hotwords=hotwords,
                     hotword_weight=10.0)

In [110]:
text

"okay now i will pretend i'm two this is the first speaker this is a second speaker first one will say hellosecond one will say hello back"

In [103]:
text

"okay now i will pretend i'm two this is the first speaker this is a second speaker first one will say hellosecond one will say hello back"

In [38]:
x = logits.squeeze()
x = x.cpu().detach().numpy()

In [35]:
import numpy as np

In [None]:
np.amax(x, axis=1, keepdims=True)

In [37]:
x.cpu().detach().numpy()

array([[ 12.993846 , -17.811337 , -17.980957 , ...,  -5.951513 ,
         -2.7360475,  -5.3542457],
       [ 12.287314 , -16.559212 , -16.688166 , ...,  -5.4883504,
         -2.484035 ,  -4.5250134],
       [ 12.447207 , -16.961746 , -17.092838 , ...,  -5.403464 ,
         -2.4596238,  -4.6880875],
       ...,
       [ 14.150415 , -20.84775  , -21.004705 , ...,  -6.004456 ,
         -3.8888366,  -6.075698 ],
       [ 13.587078 , -19.1908   , -19.345034 , ...,  -5.3483753,
         -3.1083999,  -5.3358965],
       [  1.1179023,  -8.682626 ,  -8.614253 , ...,  -3.0244646,
         -1.1476758,  -4.2867713]], dtype=float32)

In [45]:
import kenlm

kenlm_model = kenlm.Model(lm_path_unzip)

Loading the LM will be faster if you build a binary file.
Reading /Users/z5173707/root/projects/FHS/alKhalilVox/Models/ASR/LM/ngram/4gram_small.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [6]:
import pyannote.audio as p

In [7]:
p.__version__

'3.3.2'