In [None]:
import os

# Restrict this kernel to the GPU with index 1 and set TensorFlow's
# allow-growth flag. Both variables are set here, ahead of the framework
# imports in the cells below.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [None]:
import sys

# The module name (not a file path) is resolved against the current working
# directory -- presumably because __file__ is unavailable in a notebook.
# Climbing three levels from that path lands two directories above the
# working directory.
SOURCE_DIR = os.path.abspath(__name__)
for _level in range(3):
    SOURCE_DIR = os.path.dirname(SOURCE_DIR)

# Put that directory first on the import path so packages found there win
# over any other copy on the path.
sys.path.insert(0, SOURCE_DIR)

In [None]:
# Echo the resolved directory so the path that was prepended to sys.path can be checked.
SOURCE_DIR

In [None]:
import malaya_speech
from malaya_speech import Pipeline
from malaya_speech.utils.astype import float_to_int
# Echo the imported module object as the cell output -- presumably to confirm
# which copy of the package was picked up after the sys.path change.
malaya_speech

In [None]:
# Voice-activity-detection pipeline handed to the streaming helper later on.
# Each incoming frame is resampled from 22050 Hz to 16000 Hz, converted with
# float_to_int (without dividing by the max absolute value), and then passed
# to the WebRTC VAD model.
# The steps stay as lambdas on purpose: swapping them for named functions
# could change how the pipeline labels its nodes.
vad_model = malaya_speech.vad.webrtc()
p_vad = Pipeline()
pipeline = p_vad.map(
    lambda x: malaya_speech.resample(x, old_samplerate = 22050, new_samplerate = 16000)
).map(
    lambda x: float_to_int(x, divide_max_abs=False)
).map(vad_model)
p_vad.visualize()

In [None]:
# import json

# with open('filtered-youtube.json') as fopen:
#     youtubes = json.load(fopen)
# youtubes = youtubes[:len(youtubes) // 2]
# len(youtubes)

In [None]:
# Root directory that receives one sub-folder of audio segments plus one
# pickle per processed source file.
parent_dir = '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3'
# !rm -rf {parent_dir}
# Create the directory idempotently: a bare `mkdir` reports an error every
# time this cell is re-run after the directory exists, and requires a shell.
os.makedirs(parent_dir, exist_ok=True)

In [None]:
# Directory of marker files; one file named after each source that has been
# fully processed, so a later run can skip it.
parent_dir_done = '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3-done'
# !rm -rf {parent_dir_done}
# Create the directory idempotently: a bare `mkdir` reports an error every
# time this cell is re-run after the directory exists, and requires a shell.
os.makedirs(parent_dir_done, exist_ok=True)

In [None]:
from glob import glob
from unidecode import unidecode

# Every downloaded audio file, in sorted order.
youtubes_ = sorted(glob('/home/husein/ssd3/youtube/audio/*.mp3'))


def _is_done(f):
    """Return True when the "done" marker for audio path `f` already exists."""
    # The marker name must be built exactly as the processing loop builds it
    # when it writes the marker, including the '|' -> '_' replacement.
    # Without that replacement, finished files whose name contains '|' are
    # never recognised here and the pending count is inflated.
    new_f = unidecode(os.path.split(f)[1].replace('.mp3', '').replace(' ', '_').replace('|', '_'))
    # unidecode can emit '/' for some characters; keep the name a single path component.
    new_f = new_f.replace('/', '_')
    return os.path.exists(os.path.join(parent_dir_done, new_f))


# Files still waiting to be processed. Filtering a sorted list keeps it
# sorted, so no second sort is needed.
youtubes = [f for f in youtubes_ if not _is_done(f)]

In [None]:
# Number of source files that still have no "done" marker.
len(youtubes)

In [None]:
import whisper

# NOTE(review): the output directories in this notebook are named
# "...whisper-large-v3", yet the checkpoint loaded here is 'large-v2' --
# confirm which one is intended.
WHISPER_CHECKPOINT = 'large-v2'
model = whisper.load_model(WHISPER_CHECKPOINT)

In [None]:
from whisper.audio import (
    FRAMES_PER_SECOND,
    HOP_LENGTH,
    N_FRAMES,
    N_SAMPLES,
    SAMPLE_RATE,
    log_mel_spectrogram,
    pad_or_trim,
)

def detect_lang(x):
    """
    Run the loaded Whisper model's language detection on one audio segment.

    Parameters
    ----------
    x : array exposing ``astype``
        Audio samples for the segment. Presumably 16 kHz mono, since the ASR
        pipeline resamples before calling this -- TODO confirm.

    Returns
    -------
    probs
        The second value returned by ``model.detect_language`` for the
        segment (the language probabilities).
    """
    # Build the spectrogram from the segment that was passed in. Reading the
    # notebook-global `y` here would score the same example clip for every
    # segment instead of the audio being processed.
    mel = log_mel_spectrogram(x.astype('float32'), padding=N_SAMPLES)
    # Detection only looks at a single N_FRAMES-wide window; place it on the
    # same device as the model rather than assuming 'cuda'.
    mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device)
    _, probs = model.detect_language(mel_segment)
    return probs

In [None]:
# Load the bundled example recording at 16000 Hz; only the first element of
# the value returned by `load` is kept.
# NOTE(review): `y` is reassigned by a later cell (22050 Hz load) before the
# pipeline smoke test runs -- check whether this 16 kHz load is still needed.
example_wav = '/home/husein/dev/malaya-speech/speech/example-speaker/husein-zolkepli.wav'
y = malaya_speech.load(example_wav, sr = 16000)[0]

In [None]:
# Speech-to-text pipeline handed to the streaming helper later on.
# Step 1 resamples the incoming audio from 22050 Hz to 16000 Hz.
# Step 2 (node 'speech-to-text') returns a 3-tuple, evaluated in this order:
#   (detect_lang result, transcription forced to 'ms', transcription forced to 'en')
p_asr = Pipeline()
resample = p_asr.map(
    lambda x: malaya_speech.resample(x, old_samplerate = 22050, new_samplerate = 16000)
)
p = resample.map(
    lambda x: (detect_lang(x),) + tuple(
        model.transcribe(x.astype('float32'), task = 'transcribe', language = lang)
        for lang in ('ms', 'en')
    ),
    name = 'speech-to-text',
)
p_asr.visualize()

In [None]:
# Reload the example recording at 22050 Hz, the input rate the pipelines in
# this notebook resample from; only the first element of the value returned
# by `load` is kept.
example_wav = '/home/husein/dev/malaya-speech/speech/example-speaker/husein-zolkepli.wav'
y = malaya_speech.load(example_wav, sr = 22050)[0]

In [None]:
# Smoke test: run the ASR pipeline once on the example clip and show the
# 'speech-to-text' node's output, i.e. the (language detection result,
# 'ms' transcription, 'en' transcription) tuple built by that node.
p_asr(y)['speech-to-text']

In [None]:
from unidecode import unidecode
import torchaudio
import torch
import pickle
import random

In [None]:
# f = glob('/home/husein/youtube/audio/(LANGSUNG)*')[2]
# samples = malaya_speech.streaming.torchaudio.stream(f,
#                                                         vad_model = p_vad,
#                                                         asr_model = p_asr,
#                                                         segment_length = 441,
#                                                         realtime_print = True,
#                                                         sample_rate = 22050,
#                                                         min_length = 3.0,
#                                                         max_length = 15.0)

In [None]:
# len(samples)

In [None]:
# import IPython.display as ipd
# ipd.Audio(samples[1]['wav_data'], rate = 22050)

In [None]:
# samples[3]

In [None]:
from tqdm import tqdm

In [None]:
# Segment and transcribe every pending source file. For each file:
#   1. derive an ASCII, single-component name from the file name;
#   2. skip it when its "done" marker already exists;
#   3. stream it through the VAD and ASR pipelines;
#   4. save each returned segment as an mp3 and pickle the segment metadata;
#   5. write the "done" marker (also when no segment was returned).
for f in tqdm(youtubes):

    stem = os.path.split(f)[1]
    for old, new in (('.mp3', ''), (' ', '_'), ('|', '_')):
        stem = stem.replace(old, new)
    # unidecode can emit '/' for some characters; keep the name a single path component.
    new_f = unidecode(stem).replace('/', '_')
    f_done = os.path.join(parent_dir_done, new_f)

    # Finished by an earlier run.
    if os.path.exists(f_done):
        continue

    samples = malaya_speech.streaming.torchaudio.stream(
        f,
        mode_utterence = False,
        vad_model = p_vad,
        asr_model = p_asr,
        segment_length = 441,
        realtime_print = False,
        sample_rate = 22050,
        min_length = 30.0,
        max_length = 30.0,
    )

    if len(samples) > 0:

        parent_new_f = os.path.join(parent_dir, new_f)
        os.makedirs(parent_new_f, exist_ok=True)

        for i, sample in enumerate(samples):
            audio_path = os.path.join(parent_new_f, f'{i}.mp3')
            # Add a leading axis of size 1 in front of the samples before saving.
            waveform = torch.tensor(sample['wav_data'].astype('float32')).unsqueeze(0)
            torchaudio.save(audio_path, waveform, 22050, format='mp3')
            # Replace the in-memory audio with its file path so the pickle
            # stores the path rather than the samples.
            sample['wav_data'] = audio_path

        with open(f'{parent_new_f}.pkl', 'wb') as fopen:
            pickle.dump(samples, fopen)

    # Mark the source file as processed.
    with open(f_done, 'w') as fopen:
        fopen.write('done')