In [1]:
# Directory that contains the CSJ TRN files
CSJ_TRN_DIR = "/autofs/diamond/share/corpus/CSJ/TRN/Form2"

In [2]:
import os

# Build the list of every .trn file found (recursively) under CSJ_TRN_DIR.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the processing order and the saved output
# reproducible from run to run.
trn_files = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk(CSJ_TRN_DIR)
    for file in files
    if file.endswith('.trn')
)

In [3]:
from csj_formatter import remove_tag_from_plain_tagged_string
from espnet_phoneme_tokenizer import pyopenjtalk_g2p_prosody

def read_and_format_trn_file(filename):
    """Parse one TRN file into a list of cleaned utterance records.

    Each line is stripped of trailing whitespace and split on single spaces
    into three fields: an utterance ID, a second field that is ignored, and
    the remainder. The first character of the remainder is stored as the
    channel; the text starts at its third character (the character in
    between is dropped).

    A line is skipped when:
      * it is blank or has fewer than three space-separated fields,
      * its text contains 'R' or '<' (presumably annotation tags that cannot
        be cleaned reliably -- TODO confirm against the CSJ tag spec),
      * ``remove_tag_from_plain_tagged_string`` raises ``ValueError``,
      * the cleaned text or the resulting phoneme string is empty.

    Args:
        filename: Path to a TRN file, read with the 'sjis' codec. The session
            ID is the part of the base name before the first '.'.

    Returns:
        A list of dicts with the keys ``session_id``, ``utt_id`` (int),
        ``channel``, ``text`` and ``phoneme_text`` (the items returned by
        ``pyopenjtalk_g2p_prosody`` joined with single spaces).

    Raises:
        ValueError: If the utterance ID of a line that passed every filter
            cannot be converted to ``int``.
    """
    # Session (lecture) ID
    session_id = os.path.basename(filename).split('.')[0]

    results = []
    with open(filename, 'r', encoding='sjis') as f:
        for line in f:
            fields = line.rstrip().split(' ', 2)
            if len(fields) < 3:
                # Blank or truncated line: there is no text to extract, and
                # unpacking it would abort the whole file with a ValueError.
                continue
            utt_id, _, text = fields
            channel = text[0]
            text = text[2:]

            if 'R' in text or '<' in text:
                continue

            try:
                formatted_text = remove_tag_from_plain_tagged_string(text)
            except ValueError:
                # The formatter rejected the line; drop this utterance.
                continue

            if not formatted_text:
                continue

            phoneme_text = ' '.join(pyopenjtalk_g2p_prosody(formatted_text))

            if not phoneme_text:
                continue

            results.append({
                'session_id': session_id,
                'utt_id': int(utt_id),
                'channel': channel,
                'text': formatted_text,
                'phoneme_text': phoneme_text,
            })

    return results

In [4]:
import tqdm

# Parse every TRN file and gather all utterance records into one flat list.
# The list is filled incrementally, so records collected so far remain
# available if the loop is interrupted.
results = []
for trn_path in tqdm.tqdm(trn_files):
    results += read_and_format_trn_file(trn_path)

  0%|          | 0/3302 [00:00<?, ?it/s]

  4%|▍         | 125/3302 [00:33<14:09,  3.74it/s]


KeyboardInterrupt: 

In [None]:
# Preview up to the first 11 records. Slicing, unlike indexing with a fixed
# range, does not raise IndexError when fewer records were collected.
for record in results[:11]:
    print(record)


{'session_id': 'A01F0055', 'utt_id': 5, 'channel': 'L', 'text': '発表します', 'phoneme_text': '^ h a [ cl py o o # sh i [ m a ] s u $'}
{'session_id': 'A01F0055', 'utt_id': 8, 'channel': 'L', 'text': '私共は', 'phoneme_text': '^ w a [ t a sh i ] d o m o w a $'}
{'session_id': 'A01F0055', 'utt_id': 9, 'channel': 'L', 'text': '乳児が音楽をどのように聞いているか', 'phoneme_text': '^ ny u ] u j i g a # o ] N g a k u o # d o ] n o # y o ] o n i # k i [ i t e # i [ r u ] k a $'}
{'session_id': 'A01F0055', 'utt_id': 10, 'channel': 'L', 'text': 'また聴取に発達齢差が見られるかを検討しております', 'phoneme_text': '^ m a [ t a # ch o ] o sh u n i # h a [ cl t a ts u y o w a i ] s a g a # m i [ r a r e ] r u k a o # k e [ N t o o # sh i [ t e # o [ r i m a ] s u $'}
{'session_id': 'A01F0055', 'utt_id': 11, 'channel': 'L', 'text': '本研究では旋律の調つまり長調ですとか短調の変化の', 'phoneme_text': '^ h o [ N k e ] N ky u u d e w a # s e [ N r i ts u n o # sh i ] r a b e # ts u [ m a r i ch o o ch o o d e ] s u t o k a # t a ] N ch o o n o # h e ] N k a n o $'}
{'session

In [None]:
# Save the contents of results to all.json
import json

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open() passed to json.dump leaves the handle open.
with open('all.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False)