In [1]:
import os
import time
import json
from pathlib import Path

import numpy as np
import librosa
from scipy.io.wavfile import write

# Sample rate (samples per second) passed to librosa.load and to the wav writer below.
RATE = 16000

In [2]:
# Filesystem locations used throughout the notebook.
NEMO_PATH = Path('/src/NeMo/')  # the generated manifests are written under this directory
NOISE_PATH = Path('/opt/storage/datasets/audio/noise/DNS-Challenge/datasets/noise/')  # searched recursively for *.wav
TARGET_PATH = Path('/opt/storage/datasets/audio/japanese/noised')  # destination of the mixed wav files

# Recursively collect every noise wav file. The result is sorted because
# glob() yields entries in filesystem order; without sorting, random sampling
# from this list could differ between machines even with a fixed RNG seed.
noise_path_list = sorted(NOISE_PATH.glob('**/*.wav'))

In [3]:
# Shared decoder instance, reused for every manifest line.
decoder = json.JSONDecoder()


def read_json(file):
    """Read a JSON-lines manifest and return its records as a list.

    Every non-blank line must start with one JSON value. Text that follows
    that value on the same line is ignored (behaviour of ``raw_decode``).
    Blank lines and leading/trailing whitespace are skipped.

    Args:
        file: Path (str or path-like) of the manifest to read.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line does not start with valid JSON.
        OSError: If the file cannot be opened.
    """
    res = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # raw_decode() neither skips leading whitespace nor accepts an
            # empty string, so strip first and ignore blank lines.
            line = line.strip()
            if line:
                res.append(decoder.raw_decode(line)[0])
    return res

In [4]:
# Corpus root directories; comment out any corpus that should be left out.
roots = [
    '/opt/storage/datasets/audio/japanese/JNAS',
    '/opt/storage/datasets/audio/japanese/jvs_ver1',
    '/opt/storage/datasets/audio/japanese/CSJ',
]

# Accumulated manifest entries across all selected corpora.
train_all = []
val_all = []

for r in roots:
    corpus_dir = Path(r)
    train = corpus_dir / 'mix_train_manifest.json'
    val = corpus_dir / 'mix_val_manifest.json'

    # Append this corpus' entries, train first, then validation.
    train_all += read_json(train)
    val_all += read_json(val)

In [5]:
# Seed NumPy's global RNG so the shuffle order (and every later np.random
# draw in this notebook) is reproducible on a fresh Restart & Run All.
SEED = 0
np.random.seed(SEED)
np.random.shuffle(train_all)

In [6]:
# Number of validation manifest entries collected into val_all.
len(val_all)

38712

In [7]:
def align_length(clean_wav, noise_wav):
    """Return a randomly positioned segment of ``noise_wav`` as long as ``clean_wav``.

    If the noise is shorter than the clean signal it is first repeated
    (``np.tile``) until it is long enough. Both inputs are presumably 1-D
    sample arrays (only ``len()`` and slicing are used here) -- TODO confirm
    for multi-channel input.

    Args:
        clean_wav: Sequence whose length defines the output length.
        noise_wav: Noise samples to cut the segment from.

    Returns:
        A slice of the (possibly tiled) noise with ``len(clean_wav)`` samples.

    Raises:
        ValueError: If ``noise_wav`` is empty while ``clean_wav`` is not.
    """
    clean_length = len(clean_wav)
    noise_length = len(noise_wav)
    if clean_length > noise_length:
        if noise_length == 0:
            raise ValueError('noise_wav is empty; cannot align it to clean_wav')
        # Repeat the noise enough times to cover the clean signal.
        noise_wav = np.tile(noise_wav, 1 + clean_length // noise_length)
        noise_length = len(noise_wav)
    # randint's upper bound is exclusive. The +1 keeps the call valid when both
    # lengths are equal (offset 0 only) and makes the last valid offset reachable.
    start = np.random.randint(low=0, high=noise_length - clean_length + 1)
    return noise_wav[start:start + clean_length]

def synthesize_wav(clean_wav, noise_wav, noise_thresh=(0.1, 0.4)):
    """Add noise to a clean signal with a randomly drawn mixing coefficient.

    The noise is first cut or tiled to the clean signal's length with
    ``align_length``, then scaled by a coefficient drawn uniformly from
    ``noise_thresh`` and added sample-wise. The sum is not rescaled or clipped.

    Args:
        clean_wav: Clean samples (array supporting ``+`` with the noise array).
        noise_wav: Noise samples of any length.
        noise_thresh: ``(low, high)`` bounds passed to ``np.random.uniform``
            for the noise coefficient.

    Returns:
        ``clean_wav + coeff * aligned_noise``, same length as ``clean_wav``.
    """
    noise_wav = align_length(clean_wav, noise_wav)
    assert len(noise_wav) == len(clean_wav)
    noise_coeff = np.random.uniform(*noise_thresh)
    return clean_wav + noise_coeff * noise_wav

def synthesize(clean_path, noise_path_list, num=5):
    """Create ``num`` noised variants of one clean recording.

    The clean file is loaded once at ``RATE``. ``num`` noise files are drawn
    from ``noise_path_list`` with ``np.random.choice`` (which samples with
    replacement by default, so a file may be picked more than once), and each
    is mixed into the clean signal with ``synthesize_wav``.

    Args:
        clean_path: Path of the clean audio file.
        noise_path_list: Candidate noise file paths; must not be empty.
        num: Number of noised variants to produce.

    Returns:
        list: ``num`` arrays, each as long as the loaded clean signal.
    """
    # `sr` is passed by keyword: newer librosa releases make it keyword-only,
    # so the positional form raises TypeError there.
    clean_wav, _ = librosa.load(clean_path, sr=RATE)
    noises = np.random.choice(noise_path_list, num)
    result = []
    for noise_path in noises:
        noise_wav, _ = librosa.load(noise_path, sr=RATE)
        result.append(synthesize_wav(clean_wav, noise_wav))
    return result

In [None]:
# Manifest entries for every noised training file that was written.
noised_train_all = []
# (index in train_all, error repr) for every utterance that had to be skipped.
failed_train = []

# Make sure the output directory exists; otherwise every write below fails.
TARGET_PATH.mkdir(parents=True, exist_ok=True)

for i, elem in enumerate(train_all):
    try:
        result = synthesize(elem['audio_filepath'], noise_path_list)
        for j, wav in enumerate(result):
            path = TARGET_PATH / f"file{i}_{j}.wav"
            write(path, RATE, wav)
            # Duration and transcript are copied unchanged from the clean entry.
            noised_train_all.append({
                'audio_filepath': str(path),
                'duration': elem['duration'],
                'text': elem['text']
            })
    except Exception as exc:
        # Best-effort: skip utterances that fail, but keep a record instead of
        # hiding the error. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop the loop.
        failed_train.append((i, repr(exc)))
        continue

print(f"train: {len(noised_train_all)} noised files written, {len(failed_train)} utterances skipped")

In [None]:
# Number of noised training entries produced so far. (noised_val_all is only
# defined in a later cell, so referencing it here fails on a fresh kernel.)
len(noised_train_all)

In [12]:
# Write the noised training manifest as JSON lines (one record per line).
# ensure_ascii=False emits non-ASCII text as raw characters, so the file
# encoding is pinned to UTF-8 instead of being left to the locale default.
with open(NEMO_PATH / 'examples/asr/conf/noised_mix_train_manifest-test.json', 'w', encoding='utf-8') as f:
    for metadata in noised_train_all:
        json.dump(metadata, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Manifest entries for every noised validation file that was written.
noised_val_all = []
# (index in val_all, error repr) for every utterance that had to be skipped.
failed_val = []

# Make sure the output directory exists; otherwise every write below fails.
TARGET_PATH.mkdir(parents=True, exist_ok=True)

for i, elem in enumerate(val_all):
    try:
        # One noised variant per validation utterance.
        result = synthesize(elem['audio_filepath'], noise_path_list, num=1)
        for j, wav in enumerate(result):
            path = TARGET_PATH / f"val_file{i}_{j}.wav"
            write(path, RATE, wav)
            # Duration and transcript are copied unchanged from the clean entry.
            noised_val_all.append({
                'audio_filepath': str(path),
                'duration': elem['duration'],
                'text': elem['text']
            })
    except Exception as exc:
        # Best-effort: skip utterances that fail, but keep a record instead of
        # hiding the error. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop the loop.
        failed_val.append((i, repr(exc)))
        continue

print(f"val: {len(noised_val_all)} noised files written, {len(failed_val)} utterances skipped")

In [12]:
# Write the noised validation manifest as JSON lines (one record per line).
# ensure_ascii=False emits non-ASCII text as raw characters, so the file
# encoding is pinned to UTF-8 instead of being left to the locale default.
with open(NEMO_PATH / 'examples/asr/conf/noised_mix_val_manifest-test.json', 'w', encoding='utf-8') as f:
    for metadata in noised_val_all:
        json.dump(metadata, f, ensure_ascii=False)
        f.write('\n')