In [1]:
import os

# Expose only GPU index 2 to CUDA libraries loaded by this process.
os.environ.update({'CUDA_VISIBLE_DEVICES': '2'})

In [2]:
from ctc_forced_aligner import (
    load_audio,
    load_alignment_model,
    generate_emissions,
    preprocess_text,
    get_alignments,
    get_spans,
    postprocess_results,
)
import torch
import torchaudio

# Language code handed to `preprocess_text` in the alignment loop.
language = "ms"
# NOTE(review): `batch_size` is not referenced by any other visible cell —
# confirm whether it is still needed.
batch_size = 16

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision on GPU, full precision on CPU.
alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype={"cuda": torch.float16, "cpu": torch.float32}[device],
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
# List the parquet files present in the current working directory.
ls *.parquet

anwar_ibrahim_chatbot-00000-of-00001.parquet
confidence-wav2vec2.parquet
husein_chatbot-00000-of-00001.parquet
husein_news-00000-of-00001.parquet
shafiqah_idayu_chatbot-00000-of-00001.parquet
test.parquet
wild_qa-00000-of-00001.parquet


In [4]:
import pandas as pd

# Load the dataset and flatten it into a list of per-row dicts.
# NOTE: despite its name, `df` is a plain Python list, not a DataFrame.
df = (
    pd.read_parquet('husein_news-00000-of-00001.parquet')
    .to_dict(orient='records')
)

# Show the first record to inspect the available keys.
df[0]

{'original': "Sedangkan dalam bahasa Perancis, ''frire'' hanya bererti menggoreng di dalam minyak goreng yang banyak hingga terendam.",
 'normalized': "Sedangkan dalam bahasa Perancis , ' ' frira ' ' hanya bererti menggoreng di dalam minyak goreng yang banyak hingga terendam .",
 'audio_filename': 'generate-husein-news-normalized/0.mp3'}

In [6]:
!mkdir verify-husein-news

mkdir: cannot create directory ‘verify-husein-news’: File exists


In [7]:
from tqdm import tqdm
import json
import os

# For every record in `df`, force-align the normalized text against its
# enhanced audio file and store the resulting word timestamps as
# verify-husein-news/<row index>.json.
for i, row in enumerate(tqdm(df)):
    filename = os.path.join('verify-husein-news', f'{i}.json')
    # Resume support: rows that already have a result file are skipped.
    if os.path.exists(filename):
        continue

    # The audio is read from the "-enhanced" sibling of the directory
    # recorded in the dataset; rows without such a file are skipped.
    audio_filename = row['audio_filename'].replace('normalized/', 'normalized-enhanced/')
    if not os.path.exists(audio_filename):
        continue
    t = row['normalized']

    new_wav, sr = torchaudio.load(audio_filename)
    # Only the first channel is used, resampled to 16 kHz. The waveform is
    # moved to the model's own device and dtype instead of a hard-coded
    # float16 CUDA tensor, so the CPU/float32 fallback chosen when the model
    # was loaded keeps working.
    audio_waveform = torchaudio.functional.resample(
        new_wav[0], orig_freq=sr, new_freq=16000
    ).to(device=alignment_model.device, dtype=alignment_model.dtype)

    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=1
    )
    tokens_starred, text_starred = preprocess_text(
        t,
        romanize=True,
        language=language,
    )
    segments, scores, blank_token = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )
    spans = get_spans(tokens_starred, segments, blank_token)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)

    # Write to a temporary file and rename it into place. An interrupted run
    # therefore never leaves a truncated JSON at `filename`, which the
    # existence check above would otherwise treat as finished on resume.
    tmp_filename = filename + '.tmp'
    with open(tmp_filename, 'w') as fopen:
        json.dump(word_timestamps, fopen)
    os.replace(tmp_filename, filename)

 42%|█████████████▊                   | 25206/60094 [2:23:41<3:06:27,  3.12it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|███████████████████████████████████| 60094/60094 [5:10:02<00:00,  3.23it/s]
