In [1]:
import json
import os
import numpy as np
import re
from tqdm import tqdm
from glob import glob
from collections import defaultdict
from datasets import Audio
import torchaudio
import torch

In [2]:
import yaml

with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)
    
config

{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [3]:
def is_outlier(x, p25, p75):
    """Check if value is an outlier."""
    lower = p25 - 1.5 * (p75 - p25)
    upper = p75 + 1.5 * (p75 - p25)
    return x <= lower or x >= upper


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Remove outlier from x."""
    p_bottom = np.percentile(x, p_bottom)
    p_top = np.percentile(x, p_top)

    indices_of_outliers = []
    for ind, value in enumerate(x):
        if is_outlier(value, p_bottom, p_top):
            indices_of_outliers.append(ind)

    x[indices_of_outliers] = 0.0
    x[indices_of_outliers] = np.max(x)
    return x

In [4]:
import malaya_speech
malaya_speech

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


<module 'malaya_speech' from '/home/ubuntu/.local/lib/python3.8/site-packages/malaya_speech/__init__.py'>

In [5]:
sr = 22050
reader = Audio(sampling_rate = sr)

In [6]:
import pickle
from glob import glob

files = sorted(glob('processed-harry-potter/*.pkl'))
len(files)

6

In [7]:
from malaya_speech.tts import load_text_ids

t = load_text_ids(is_lower = False)

In [8]:
vad = malaya_speech.vad.webrtc()

start_silent_trail = int(0.15 * config['sampling_rate'])
middle_silent_trail = int(0.3 * config['sampling_rate'])
end_silent_trail = int(0.2 * config['sampling_rate'])
process_middle_silent = True

In [9]:
directory = 'harry-potter-noisy'
!mkdir {directory}

mkdir: cannot create directory ‘harry-potter-noisy’: File exists


In [10]:
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
from torchaudio.transforms import Fade
import torchaudio

bundle = HDEMUCS_HIGH_MUSDB_PLUS

model = bundle.get_model()
device = 'cuda'
model.to(device)

sample_rate = bundle.sample_rate
segment = 10
overlap = 0.1
length = 10

def separate_sources(
        model,
        mix,
        segment=10.,
        overlap=0.1,
        device=None,
):
    """
    Apply model to a given mixture. Use fade, and add segments together in order to add model segment by segment.

    Args:
        segment (int): segment length in seconds
        device (torch.device, str, or None): if provided, device on which to
            execute the computation, otherwise `mix.device` is assumed.
            When `device` is different from `mix.device`, only local computations will
            be on `device`, while the entire tracks will be stored on `mix.device`.
    """
    if device is None:
        device = mix.device
    else:
        device = torch.device(device)

    batch, channels, length = mix.shape

    chunk_len = int(sample_rate * segment * (1 + overlap))
    start = 0
    end = chunk_len
    overlap_frames = overlap * sample_rate
    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape='linear')

    final = torch.zeros(batch, len(model.sources), channels, length, device=device)

    while start < length - overlap_frames:
        chunk = mix[:, :, start:end]
        with torch.no_grad():
            out = model.forward(chunk)
        out = fade(out)
        final[:, :, :, start:end] += out
        if start == 0:
            fade.fade_in_len = int(overlap_frames)
            start += int(chunk_len - overlap_frames)
        else:
            start += chunk_len
        end += chunk_len
        if end >= length:
            fade.fade_out_len = 0
    return final

def get_split(s):
    waveform, sample_rate = torchaudio.load(s)
    if sample_rate != bundle.sample_rate:
        y_ = malaya_speech.resample(waveform.numpy()[0], sample_rate, bundle.sample_rate)
        waveform = torch.Tensor(np.expand_dims(y_, 0))
    waveform = torch.concat([waveform, waveform])
    waveform = waveform.to(device)
    mixture = waveform
    
    ref = waveform.mean(0)
    waveform = (waveform - ref.mean()) / ref.std()

    sources = separate_sources(
        model,
        waveform[None],
        device=device,
        segment=segment,
        overlap=overlap,
    )[0]
    sources = sources * ref.std() + ref.mean()

    sources_list = model.sources
    sources = list(sources)

    audios = dict(zip(sources_list, sources))
    return audios, sample_rate

  0%|          | 0.00/319M [00:00<?, ?B/s]

In [17]:
splitted['vocals'].cpu().numpy()

array([[ 2.1948827e-04,  2.0105563e-04,  9.8269757e-06, ...,
        -2.2794618e-04, -1.7218920e-04, -2.3007867e-04],
       [ 2.0001913e-04,  2.6206029e-04, -7.3003066e-05, ...,
        -2.0393322e-04, -2.6642316e-04, -1.9584704e-04]], dtype=float32)

In [25]:
import soundfile as sf

audios, text_ids = [], []
index = 0

for f in files:
    with open(f, 'rb') as fopen:
        data = pickle.load(fopen)
    
    for i in tqdm(range(len(data))):
        lang = max(data[i]['asr_model'][0], key=data[i]['asr_model'][0].get)
        if lang not in {'ms', 'en'}:
            continue

        ms_score = [s['avg_logprob'] for s in data[i]['asr_model'][1]['segments']]
        en_score = [s['avg_logprob'] for s in data[i]['asr_model'][2]['segments']]

        if np.mean(ms_score) > np.mean(en_score):
            selected = data[i]['asr_model'][1]['segments']
        else:
            selected = data[i]['asr_model'][2]['segments']

        texts = [s['text'] for s in selected]
        text = ''.join(texts).strip()
        if len(text):
            n = t.normalize(text, trim_end_comma = False, add_fullstop = False)[0]
            # audio = reader.decode_example(reader.encode_example(data[i]['wav_data']))['array']
            splitted, _ = get_split(data[i]['wav_data'])
            audio = (splitted['vocals']).cpu().numpy()[0]
            #audio = audio[(audio < 0.2) & (audio > -0.2)]
            audio = malaya_speech.resample(audio, bundle.sample_rate, config['sampling_rate'])
            
            y_= malaya_speech.resample(audio, config['sampling_rate'], 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, config['sampling_rate']))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)
            r = []
            for no, g in enumerate(grouped_deep):
                if g[1]:
                    g = g[0].array
                else:
                    if no == 0:
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        g = g[0].array[:end_silent_trail]
                    else:
                        if process_middle_silent:
                            g = np.concatenate([g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]])
                        else:
                            g = g[0].array
                        
                r.append(g)
            audio = np.concatenate(r)
            audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
            new_f = f'{directory}/{index}.wav'
            sf.write(new_f, audio, 22050)
            audios.append(new_f)
            text_ids.append(n)
            index += 1
        
#         break
#     break

 41%|████████████████████████████████████████▉                                                          | 50/121 [00:12<00:19,  3.72it/s]bad escape \d at position 7
 46%|█████████████████████████████████████████████▊                                                     | 56/121 [00:13<00:13,  4.75it/s]bad escape \d at position 7
 92%|█████████████████████████████████████████████████████████████████████████████████████████▉        | 111/121 [00:25<00:01,  5.18it/s]bad escape \d at position 7
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 121/121 [00:27<00:00,  4.35it/s]
 36%|████████████████████████████████████                                                               | 51/140 [00:10<00:19,  4.55it/s]bad escape \d at position 7
 78%|████████████████████████████████████████████████████████████████████████████▎                     | 109/140 [00:23<00:07,  4.14it/s]bad escape \d at position 7
100%|████████████████████████████████

In [27]:
len(audios)

1135

In [29]:
import IPython.display as ipd
ipd.Audio(audios[-2])

In [30]:
text_ids[-2]

'Merepek lah jawab hormoni . Bukankah kamu telah membaca bukunya ? Lihatlah semua perkara menajubkan yang telah dilakukannya . Dia kata dia yang melakukannya .'

In [31]:
with open('harry-potter-processed.json', 'w') as fopen:
    json.dump({'audio': audios, 'text': text_ids}, fopen)