In [1]:
import os
import json
import re

import librosa
from pathlib import Path
from scipy.io.wavfile import write

# Dataset root. The notebook changes into this directory, so the relative
# glob for transcripts and the relative manifest file names resolve here.
root = '/opt/storage/datasets/audio/japanese/jvs_ver1'
os.chdir(root)

# NOTE(review): RATE is not referenced in any of the visible cells —
# presumably a target sample rate in Hz; confirm it is still needed.
RATE = 16000

In [2]:
# Every transcript file below the current directory, in sorted path order.
# sorted() already returns a list, so no intermediate list() is needed.
transcript_files = sorted(Path('.').glob('**/*transcripts_utf8.txt'))

In [3]:
def num_there(s):
    """Return True if at least one element of ``s`` is a digit.

    Each element is checked with ``str.isdigit``; an empty ``s`` yields False.
    """
    for ch in s:
        if ch.isdigit():
            return True
    return False

In [4]:
# An utterance whose text contains any of these characters is dropped from the dataset.
remove_chars = ['A', '×', '×', '÷', '―', '△', '○', '々', '〇', '〒', '％', '＆', '＋', '／', '０', '１', '２', '３', '４', '５', '６', '７', '８', '９', '＝', '？', '?', 'Ａ', 'Ｂ', 'Ｃ', 'Ｄ', 'Ｅ', 'Ｆ', 'Ｇ', 'Ｈ', 'Ｉ', 'Ｊ', 'Ｋ', 'Ｌ', 'Ｍ', 'Ｎ', 'Ｏ', 'Ｐ', 'Ｑ', 'Ｒ', 'Ｓ', 'Ｔ', 'Ｕ', 'Ｖ', 'Ｗ', 'Ｘ', 'Ｙ', 'ａ', 'ｂ', 'ｅ', 'ｉ', 'ｌ', 'ｍ', 'ｎ', 'ｏ', 'ｐ', '～']
# These characters are stripped from the text: the cleaning loop replaces each one with the empty string.
ignore_chars =  ['「', '」', '『', '』', '、', '。', ',', '.', '，', '．']

In [5]:
# Build the train / validation manifest entries from every transcript file.
#
# For each '<utterance id>:<text>' line the matching wav file is looked up
# next to the transcript; utterances that are too short, contain digits, or
# contain any character from remove_chars are skipped, and the characters in
# ignore_chars are stripped from the remaining texts.

MIN_DURATION_SEC = 0.75  # utterances at or below this duration are skipped
VAL_EVERY = 20           # every 20th accepted utterance goes to validation

cnt = 0  # NOTE(review): never read or updated in the visible cells; kept in case another cell uses it
i = 0  # running count of accepted utterances; drives the train/val split
removed = 0  # utterances dropped because of digits or characters in remove_chars
train_metadata_set = []
val_metadata_set = []

for tf in transcript_files:
    # The file name says the transcripts are UTF-8, so decode them explicitly
    # instead of relying on the platform's default locale encoding.
    with open(tf, 'r', encoding='utf-8') as f:
        transcript = f.read().splitlines()

    # The file is fully read above, so the handle is already closed here.
    for line in transcript:  # each line is '<utterance id>:<text>'
        utt_id, _, raw_text = line.partition(':')
        wavfile = tf.parent / 'wav24kHz16bit' / (utt_id + '.wav')

        if not os.path.exists(wavfile):
            continue

        # Any further ':' inside the text is dropped, as in the original
        # "".join(line.split(':')[1:]).
        text = raw_text.replace(':', '')

        # NOTE(review): newer librosa releases rename the `filename` keyword
        # to `path`; confirm against the installed librosa version.
        duration = librosa.core.get_duration(filename=wavfile.absolute())
        if duration <= MIN_DURATION_SEC:
            # Short clips are skipped without being counted in `removed`.
            continue

        if num_there(text) or any(c in text for c in remove_chars):
            removed += 1
            continue

        for c in ignore_chars:
            text = text.replace(c, '')

        metadata = {
            "audio_filepath": str(wavfile.absolute()),
            "duration": duration,
            "text": text
        }
        i += 1
        if i % VAL_EVERY == 0:
            val_metadata_set.append(metadata)
        else:
            train_metadata_set.append(metadata)

In [6]:
# Write both manifests as JSON Lines: one JSON object per line.
# ensure_ascii=False emits the Japanese text as-is, so the files are opened
# with an explicit UTF-8 encoding rather than the locale default, which
# could raise UnicodeEncodeError or produce a non-UTF-8 file.
for manifest_name, metadata_set in (
    ('mix_train_manifest.json', train_metadata_set),
    ('mix_val_manifest.json', val_metadata_set),
):
    with open(manifest_name, 'w', encoding='utf-8') as f:
        for metadata in metadata_set:
            json.dump(metadata, f, ensure_ascii=False)
            f.write('\n')

In [7]:
# Show how many utterances were dropped because their text contained a digit
# or a character from remove_chars (short-duration skips are not counted).
removed

708