In [1]:
import os
import json
import re

import librosa
from pathlib import Path
from scipy.io.wavfile import write

# Dataset root. Defaults to the original location; set the JNAS_ROOT
# environment variable to run the notebook against a copy stored elsewhere.
root = os.environ.get('JNAS_ROOT') or '/opt/storage/datasets/audio/japanese/JNAS'
# Later cells use paths relative to the dataset root, so make it the cwd.
os.chdir(root)

# NOTE(review): presumably the audio sampling rate in Hz; it is only referenced
# by a disabled `write(path, RATE, audio)` call further down -- confirm.
RATE = 16000

In [2]:
# Every .txt transcript below Transcription/KANJI (searched recursively),
# in sorted path order.
transcript_files = sorted(Path('Transcription/KANJI').rglob('*.txt'))

In [3]:
# To start from a clean directory, run this manually first:  !rm -r clips
# Create the output directory for clips; a pre-existing directory is fine.
Path('clips').mkdir(parents=True, exist_ok=True)

In [4]:
def num_there(s):
    """Return True if at least one character of ``s`` is a digit.

    "Digit" follows ``str.isdigit``, so full-width digits such as '３' count
    as well, while kanji numerals such as '十' do not.
    """
    for ch in s:
        if ch.isdigit():
            return True
    return False

In [5]:
# A sentence containing any of these characters is dropped from the dataset.
remove_chars = [
    'A',
    # symbols and marks
    '×', '÷', '―', '△', '○', '々', '〇', '〒', '％', '＆', '＋', '／',
    # full-width digits
    '０', '１', '２', '３', '４', '５', '６', '７', '８', '９',
    # more symbols (full-width and ASCII question marks)
    '＝', '？', '?',
    # full-width upper-case letters
    'Ａ', 'Ｂ', 'Ｃ', 'Ｄ', 'Ｅ', 'Ｆ', 'Ｇ', 'Ｈ', 'Ｉ', 'Ｊ', 'Ｋ', 'Ｌ', 'Ｍ',
    'Ｎ', 'Ｏ', 'Ｐ', 'Ｑ', 'Ｒ', 'Ｓ', 'Ｔ', 'Ｕ', 'Ｖ', 'Ｗ', 'Ｘ', 'Ｙ',
    # full-width lower-case letters
    'ａ', 'ｂ', 'ｅ', 'ｉ', 'ｌ', 'ｍ', 'ｎ', 'ｏ', 'ｐ',
    '～',
]
# These characters are deleted from the text (replaced with the empty string).
ignore_chars = [
    # brackets
    '「', '」', '『', '』',
    # punctuation
    '、', '。', ',', '.', '，', '．',
]

In [6]:
# Walk every transcript file, pair each sentence with its wav file, filter out
# unusable sentences and split the rest into train / validation metadata.

cnt = 0  # sentences skipped because they contain a digit (an earlier note recorded 3591); digits are awkward to normalise, so they are ignored for now
exist_count = 0  # transcript lines for which a wav file was found
nonexist_count = 0  # transcript lines with no matching wav file
i = 0  # number of accepted utterances so far; drives the train/val split
train_metadata_set = []
val_metadata_set = []
removed = 0  # sentences dropped because they contain a char from remove_chars

for tf in transcript_files:
    mode = tf.parent.name
    speaker = tf.name.replace('_KAN.txt', '')
    # NOTE(review): the transcript is decoded with the locale's default
    # encoding -- confirm it matches the encoding of the corpus files.
    with open(tf, 'r') as f:
        transcript = f.read().splitlines()
    # The file is fully read above, so it is closed before the slow per-line work.

    for line in transcript:
        # A line looks like
        # 'BMP05E50 夜空を 赤い 灯 が 点滅しながら 旋回し 十分おきに 照明弾が ゆっくりと 落ちてくる'
        # i.e. the utterance id followed by space-separated words.
        speech_id, *words = line.split(' ')

        # Prefer the WAVES_DT recording and fall back to WAVES_HS.
        wavfile_dt = Path('WAVES_DT/') / speaker / mode / (speech_id + '_DT.wav')
        wavfile_hs = Path('WAVES_HS/') / speaker / mode / (speech_id + '_HS.wav')
        if os.path.exists(wavfile_dt):
            wavfile = wavfile_dt
        elif os.path.exists(wavfile_hs):
            wavfile = wavfile_hs
        else:
            nonexist_count += 1
            continue
        exist_count += 1

        text = "".join(words)  # e.g. '時間で勉強を計るのは悪い'

        # NOTE(review): newer librosa releases deprecate `filename=` in favour
        # of `path=`; kept as-is to stay compatible with the installed version.
        duration = librosa.core.get_duration(filename=wavfile.absolute())
        if duration <= 0.75:
            # Clips of 0.75 s or shorter are skipped.
            continue

        if num_there(text):
            cnt += 1
            continue

        if any(c in text for c in remove_chars):
            removed += 1
            continue

        for c in ignore_chars:
            text = text.replace(c, '')

        # The original wav is referenced in place; re-writing clips
        # (`write(path, RATE, audio)`) is disabled.
        metadata = {
            "audio_filepath": str(wavfile.absolute()),
            "duration": duration,
            "text": text
        }

        # Every 20th accepted utterance goes to validation, the rest to train.
        i += 1
        if i % 20 == 0:
            val_metadata_set.append(metadata)
        else:
            train_metadata_set.append(metadata)

In [7]:
# Number of sentences dropped because they contain a character from remove_chars.
removed

2202

In [9]:
# Write each metadata list as a JSON Lines manifest: one JSON object per line.
for manifest_path, metadata_set in (
    ('mix_train_manifest.json', train_metadata_set),
    ('mix_val_manifest.json', val_metadata_set),
):
    # ensure_ascii=False emits the Japanese text as raw characters, so the file
    # encoding is pinned to UTF-8 instead of depending on the locale default.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for metadata in metadata_set:
            json.dump(metadata, f, ensure_ascii=False)
            f.write('\n')