In [1]:
import os

# GPU-related environment settings, applied before the torch / malaya_speech
# imports that follow so those libraries see them when they initialise.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
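# Run once (uncomment) to download the HiFi-GAN config that a later cell opens as 'hifigan-config.json':
# !wget https://huggingface.co/huseinzol05/jik876-UNIVERSAL_V1/raw/main/config.json -O hifigan-config.json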

In [3]:
import malaya_speech
import torch
import json
from librosa.util import normalize
from torch.nn.utils.rnn import pad_sequence
from malaya_speech.augmentation.waveform import random_sampling
from malaya_speech.torch_model.hifivoice.env import AttrDict
from malaya_speech.torch_model.hifivoice.meldataset import mel_spectrogram, mel_normalize
from malaya_speech.torch_model.mediumvc.any2any import MagicModel

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [4]:
# Speaker-vector model used to embed every utterance in this notebook.
# NOTE(review): local_files_only=True presumably requires the checkpoint to be
# cached locally already — confirm before running on a fresh machine.
speaker_v = malaya_speech.speaker_vector.nemo(
    model = 'huseinzol05/nemo-titanet_large',
    local_files_only = True,
)

# Assigning to `_` keeps the returned objects from being echoed as cell output.
_ = speaker_v.cuda()
_ = speaker_v.eval()

In [5]:
# Read the HiFi-GAN JSON config and expose it as an AttrDict.
config_path = 'hifigan-config.json'
with open(config_path) as fopen:
    json_config = json.load(fopen)

config = AttrDict(json_config)

In [6]:
# The same reference recording is loaded twice: once at 22.05 kHz for the mel
# pipeline and once at malaya_speech.load's default rate for the speaker model.
speaker_wav = 'speech/example-speaker/husein-zolkepli.wav'

y, _ = malaya_speech.load(speaker_wav, sr = 22050)
# Random crop of the 22.05 kHz signal (length = 8000 in random_sampling's own unit).
y = random_sampling(y, 22050, length = 8000)

# NOTE(review): the default rate is presumably 16 kHz, judging by the name — confirm.
y_16k, _ = malaya_speech.load(speaker_wav)

# NOTE(review): `normalize` is librosa.util.normalize; check that its default
# scaling is what the voice-conversion model expects for speaker vectors.
spk_emb = normalize(speaker_v([y_16k])[0])

In [7]:
# Normalise the cropped waveform, scale it by 0.95 and add a leading batch axis.
audio = torch.FloatTensor(normalize(y) * 0.95).unsqueeze(0)

# Positional arguments for mel_spectrogram, pulled from the HiFi-GAN config in
# the exact order the function is called with.
mel_args = [
    config[key]
    for key in ("n_fft", "num_mels", "sampling_rate", "hop_size", "win_size", "fmin", "fmax")
]
mel = mel_spectrogram(audio, *mel_args, center=False)

# Drop the batch axis, swap the two remaining axes, then apply mel_normalize.
mel = mel_normalize(mel.squeeze(0).transpose(0, 1))

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [8]:
# Displayed as the cell result; the recorded output is torch.Size([484, 80]).
mel.shape

torch.Size([484, 80])

In [9]:
# NOTE(review): d_model = 192 equals the speaker-vector length shown later in
# this notebook (v.shape -> (192,)); presumably the two must match — confirm
# against MagicModel.
Generator = MagicModel(d_model = 192)

In [10]:
# Build a batch of one utterance: stacked speaker vectors, padded mels, and a
# boolean mask that is True on padded frames (frame index >= true length).
ori_mels = [mel]
spk_input_mels = torch.stack([torch.tensor(spk_emb)])

ori_lens = list(map(len, ori_mels))
overlap_lens = ori_lens

ori_mels = pad_sequence(ori_mels, batch_first=True)

frame_index = torch.arange(ori_mels.size(1))
mel_masks = torch.stack([frame_index >= n_frames for n_frames in ori_lens])

In [11]:
# Smoke-test the forward pass on the single-utterance batch built in the previous cell.
# NOTE(review): no checkpoint is loaded into `Generator` in the visible cells,
# so the output is presumably from untrained weights — only its shape is meaningful.
fake_mels = Generator(spk_input_mels,ori_mels,mel_masks)

In [12]:
# Displayed as the cell result; the recorded output is torch.Size([1, 484, 80]).
fake_mels.shape

torch.Size([1, 484, 80])

In [13]:
from glob import glob
from tqdm import tqdm
import random

# Root folder of the local speech corpora. Set the SPEECH_BAHASA_DIR environment
# variable to run on another machine; when it is unset the original path is used.
base_directory = os.environ.get('SPEECH_BAHASA_DIR', '/home/husein/ssd1/speech-bahasa')

In [14]:
# Collect the .wav files of three speakers, one sub-folder each under base_directory.
khalil, mas, husein = (
    glob(f'{base_directory}/{folder}/*.wav')
    for folder in ('tolong-sebut', 'sebut-perkataan-woman', 'sebut-perkataan-man')
)
# File counts per speaker, shown as the cell result.
len(khalil), len(mas), len(husein)

(565, 200, 698)

In [15]:
# Both audiobook corpora live in the `combined` folder under base_directory.
# The path is derived from base_directory (instead of repeating the literal)
# so these globs follow the same root as the neighbouring cells.
combined_dir = f'{base_directory}/combined'
salina = glob(f'{combined_dir}/salina*.wav')
pasentran = glob(f'{combined_dir}/dari-pasentran-ke-istana*.wav')

# Keep a random subset of 1500 files per speaker. random.sample raises
# ValueError if fewer than 1500 files matched.
salina = random.sample(salina, 1500)
pasentran = random.sample(pasentran, 1500)

In [16]:
import pandas as pd

In [17]:
# metadata.csv is pipe-separated with no header; the first column holds the
# file stem of each utterance.
df = pd.read_csv(f'{base_directory}/haqkiem/metadata.csv', header = None, sep = '|')
txts = df.values.tolist()

# Keep only the rows whose .wav file is actually present on disk.
candidate_paths = (f'{base_directory}/haqkiem/{row[0]}.wav' for row in txts)
haqkiem = [path for path in candidate_paths if os.path.exists(path)]

# Random subset of 1500 files; the count is shown as the cell result.
haqkiem = random.sample(haqkiem, 1500)
len(haqkiem)

1500

In [18]:
# 1500 random .mp3 files from each of the four ms-MY Wavenet voice folders,
# sampled in A, B, C, D order.
my_a, my_b, my_c, my_d = (
    random.sample(glob(f'/home/husein/ssd2/ms-MY-Wavenet-{voice}/*.mp3'), 1500)
    for voice in 'ABCD'
)

In [19]:
# 1500 random files from each of the two edge-tts news voice folders.
osman_files = glob('/home/husein/ssd2/osman-news-edge-tts-wav/*')
osman = random.sample(osman_files, 1500)

yasmin_files = glob('/home/husein/ssd2/yasmin-news-edge-tts-wav/*')
yasmin = random.sample(yasmin_files, 1500)

In [20]:
# Sanity check: embed a single clip with the speaker model and inspect the
# size of the resulting vector.
y, _ = malaya_speech.load(khalil[0])
v = speaker_v([y])[0]
# Displayed as the cell result; the recorded output is (192,).
v.shape

(192,)

In [21]:
# Speaker name -> list of audio file paths. The key becomes part of the output
# pickle file name in the embedding loop.
dicts = dict(
    khalil = khalil,
    mas = mas,
    husein = husein,
    salina = salina,
    pasentran = pasentran,
    haqkiem = haqkiem,
    my_a = my_a,
    my_b = my_b,
    my_c = my_c,
    my_d = my_d,
    osman = osman,
    yasmin = yasmin,
)

In [23]:
import pickle
from datasets import Audio

# `datasets` Audio feature configured with sampling_rate=16000; the embedding
# loop uses it to read each file.
# NOTE(review): this rebinds `audio`, which earlier in the notebook held the
# waveform tensor passed to mel_spectrogram.
audio = Audio(sampling_rate=16000)

In [25]:
def _embedding_record(path):
    """Decode one audio file and return its record for the pickle dump.

    The record holds the file path, a (speaker vector, 'speaker 0') tuple and a
    fixed 'very legit' placeholder under 'asr_model'.
    """
    # Round-trip through the `datasets` Audio feature to obtain the waveform array
    # (presumably at the 16 kHz configured on `audio` — confirm).
    decoded = audio.decode_example(audio.encode_example(path))
    waveform = decoded['array']
    return {
        'wav_data': path,
        'classification_model': (speaker_v([waveform])[0], 'speaker 0'),
        'asr_model': 'very legit',
    }


# One pickle per speaker, each containing the list of records for that speaker.
for speaker_name, files in dicts.items():
    records = [_embedding_record(path) for path in tqdm(files)]

    with open(f'random-embedding-{speaker_name}.pkl', 'wb') as fopen:
        pickle.dump(records, fopen)

100%|█████████████████████████████████████████| 565/565 [00:52<00:00, 10.70it/s]
100%|█████████████████████████████████████████| 200/200 [00:19<00:00, 10.49it/s]
100%|█████████████████████████████████████████| 698/698 [00:47<00:00, 14.84it/s]
100%|███████████████████████████████████████| 1500/1500 [01:07<00:00, 22.32it/s]
100%|███████████████████████████████████████| 1500/1500 [01:18<00:00, 19.04it/s]
100%|███████████████████████████████████████| 1500/1500 [03:41<00:00,  6.76it/s]
100%|███████████████████████████████████████| 1500/1500 [00:28<00:00, 52.34it/s]
100%|███████████████████████████████████████| 1500/1500 [00:27<00:00, 54.18it/s]
100%|███████████████████████████████████████| 1500/1500 [00:28<00:00, 52.43it/s]
100%|███████████████████████████████████████| 1500/1500 [00:27<00:00, 53.64it/s]
100%|███████████████████████████████████████| 1500/1500 [02:26<00:00, 10.22it/s]
100%|███████████████████████████████████████| 1500/1500 [02:31<00:00,  9.93it/s]
