In [1]:
import os
import json
import re

import textgrid
import librosa
from pathlib import Path
from scipy.io.wavfile import write

# Corpus root directory. Defaults to the original storage location but can be
# overridden with the CSJ_ROOT environment variable so the notebook runs on
# machines where the dataset lives elsewhere.
root = os.environ.get('CSJ_ROOT', '/opt/storage/datasets/audio/japanese/CSJ')
# All relative paths used in this notebook (TRN/, clips/, manifest files)
# resolve against the corpus root after this call.
os.chdir(root)

# When True, each utterance is cut out of its source recording and written
# to clips/; when False only the manifest metadata is produced.
save_clips = False
# Sample rate (Hz) used when loading audio and writing clips.
RATE = 16000

In [2]:
# Recursively collect every *.trn transcript below TRN/Form2/ (relative to the
# current working directory) and keep them in sorted path order.
trn_root = Path('TRN/Form2/')
transcript_files = sorted(trn_root.rglob('*.trn'))

In [3]:
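# Optional reset (disabled): uncomment the two lines below to delete and
# recreate the `clips` output directory before regenerating clips.
#!rm -r clips
#os.makedirs('clips', exist_ok=True)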

In [6]:
# Utterances whose duration (seconds) is not strictly greater than this are dropped.
MIN_CLIP_SECONDS = 0.75
# Every N-th transcript file (by index) is routed to the validation set.
VAL_EVERY_N_FILES = 20

# Patterns for "(X ...)" annotations with X in F, D, A, O, K, M. The wrapper
# is removed and the enclosed text is kept ("remove auxiliary symbols" in the
# original comment). They are applied one after another in this exact order.
_UNWRAP_PATTERNS = [re.compile(r'\(' + tag + r'(.+?)\)') for tag in 'FDAOKM']


def _parse_utterance(line):
    """Parse one transcript line into ``(utt_id, start, end, text)``.

    Returns ``None`` for lines that should be skipped:
    comment lines (containing '%'), lines containing '(R', '<', '?' or ';',
    lines that still contain '(' after the F/D/A/O/K/M wrappers have been
    stripped, and lines without a ':' separator (e.g. blank lines).

    Raises ``ValueError`` if the part before ':' does not have the expected
    "<id> <start>-<end>" layout once its last two characters are removed.
    """
    # Ignore comment lines.
    if '%' in line:
        return None
    if '(R' in line or '<' in line or '?' in line or ';' in line:
        return None

    for pattern in _UNWRAP_PATTERNS:
        line = pattern.sub(r'\1', line)

    # Any annotation that survived the unwrapping above is not handled here.
    if '(' in line:
        return None

    # Split on the first ':' only, so a colon inside the text cannot break
    # the unpacking; a line without any ':' carries no utterance.
    header, sep, text = line.partition(':')
    if not sep:
        return None

    text = text.replace(')', '').replace('2', '二')
    # Drop the last two characters of the header (presumably a space plus a
    # channel marker -- TODO confirm against the TRN format).
    utt_id, span = header[:-2].split(' ')
    start, end = map(float, span.split('-'))
    return utt_id, start, end, text


train_metadata_set = []
val_metadata_set = []

if save_clips:
    # scipy's write() does not create parent directories.
    os.makedirs('clips', exist_ok=True)

for i, transcript_file in enumerate(transcript_files):
    wav = None
    if save_clips:
        # The audio is only needed for cutting clips, so decoding is skipped
        # otherwise. `sr` must be passed by keyword: newer librosa releases
        # reject it as a positional argument.
        wavfile = Path(root) / 'WAV' / transcript_file.parent.name / (transcript_file.stem + '.wav')
        wav, _ = librosa.load(wavfile, sr=RATE)

    # The train/validation split is made per transcript file, not per utterance.
    if i % VAL_EVERY_N_FILES == 0:
        target_set = val_metadata_set
    else:
        target_set = train_metadata_set

    with open(transcript_file, 'r', encoding='shift_jis') as f:
        transcript = f.read().splitlines()

    for line in transcript:
        parsed = _parse_utterance(line)
        if parsed is None:
            continue
        utt_id, start, end, text = parsed

        duration = end - start
        if duration <= MIN_CLIP_SECONDS:
            continue

        path = Path('clips') / (transcript_file.stem + '_' + utt_id + '.wav')
        target_set.append({
            "audio_filepath": str(path.absolute()),
            "duration": duration,
            "text": text
        })

        if save_clips:
            # The slice end is inclusive of the sample at `end`, hence the +1.
            audio = wav[int(RATE * start): int(RATE * end) + 1]
            write(path, RATE, audio)

In [7]:
def _write_manifest(manifest_path, metadata_set):
    """Write ``metadata_set`` to ``manifest_path`` as one JSON object per line.

    The file is opened explicitly as UTF-8: the entries are dumped with
    ``ensure_ascii=False``, so non-ASCII text is written verbatim and must not
    depend on the platform's default encoding.
    """
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for metadata in metadata_set:
            json.dump(metadata, f, ensure_ascii=False)
            f.write('\n')


_write_manifest('mix_train_manifest.json', train_metadata_set)
_write_manifest('mix_val_manifest.json', val_metadata_set)