In [1]:
from ctc_forced_aligner import (
    load_audio,
    load_alignment_model,
    generate_emissions,
    preprocess_text,
    get_alignments,
    get_spans,
    postprocess_results,
)
import torch
import torchaudio

# ISO 639 code handed to preprocess_text further down ("ms" = Malay).
language = "ms"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

batch_size = 16

# Half precision on CUDA, single precision on CPU.
alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype={"cuda": torch.float16, "cpu": torch.float32}[device],
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [2]:
ls *.parquet

anwar_ibrahim_chatbot-00000-of-00001.parquet
confidence-wav2vec2.parquet
husein_chatbot-00000-of-00001.parquet
husein_news-00000-of-00001.parquet
shafiqah_idayu_chatbot-00000-of-00001.parquet
test.parquet
wild_qa-00000-of-00001.parquet


In [3]:
import pandas as pd

# NOTE: despite the name, `df` is a list with one dict per parquet row
# (orient='records'), not a DataFrame; later cells index it as df[i][...].
df = (
    pd.read_parquet('anwar_ibrahim_chatbot-00000-of-00001.parquet')
    .to_dict(orient='records')
)

# Preview the first record to see which keys are available.
df[0]

{'original': 'Tidak bukan saya. Saya ialah program komputer yang direka bentuk untuk mensimulasikan perbualan dan membantu menjawab soalan.',
 'normalized': 'Tidak bukan saya . Saya ialah program komputer yang direka bentuk untuk mensimulasikan perbualan dan membantu menjawab soalan .',
 'audio_filename': 'anwar-ibrahim-chatbot-normalized/0.mp3'}

In [4]:
!mkdir verify-anwar-ibrahim

mkdir: cannot create directory ‘verify-anwar-ibrahim’: File exists


In [5]:
from tqdm import tqdm
import json
import os

# Force-align every transcript in `df` against its audio file and store the
# word-level timestamps as verify-anwar-ibrahim/<row index>.json.
# The loop is resumable: rows whose JSON file already exists are skipped.
for i in tqdm(range(len(df))):
    filename = os.path.join('verify-anwar-ibrahim', f'{i}.json')
    if os.path.exists(filename):
        continue

    # The audio is read from the "-enhanced" sibling of the directory recorded
    # in the parquet file.
    audio_filename = df[i]['audio_filename'].replace('normalized/', 'normalized-enhanced/')
    t = df[i]['normalized']
    new_wav, sr = torchaudio.load(audio_filename)
    # Keep only index 0 of the loaded tensor (first channel), resample to
    # 16 kHz, then match the model's own device and dtype. Hard-coding
    # float16 + .cuda() here would crash on the CPU fallback selected when
    # the model was loaded.
    audio_waveform = torchaudio.functional.resample(
        new_wav[0], orig_freq=sr, new_freq=16000
    ).to(device=alignment_model.device, dtype=alignment_model.dtype)
    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=1
    )
    tokens_starred, text_starred = preprocess_text(
        t,
        romanize=True,
        language=language,
    )
    segments, scores, blank_token = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )
    spans = get_spans(tokens_starred, segments, blank_token)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)

    # Write to a temporary file and rename it into place, so an interrupted
    # run never leaves a truncated JSON that the exists-check above would
    # treat as finished on the next run.
    partial_filename = filename + '.tmp'
    with open(partial_filename, 'w') as fopen:
        json.dump(word_timestamps, fopen)
    os.replace(partial_filename, filename)

100%|███████████████████████████████| 100696/100696 [00:00<00:00, 160375.30it/s]
