### CSJ/ATR503ベースライン testデータセット作成

In [None]:
# Dataset mount point (must be an absolute path).
mount = "/home/hattori/dataset"
# Experiment name.
exp_name = "0731"
# n = 0.2  # validation split ratio (disabled)
# Dataset directory of this experiment, relative to the working directory.
exp_dir = f"./{exp_name}/datasets"

In [None]:
# eval1    (1272)  (filepath,char)
csj_eval1_csv=mount+'/CSJ_dataset/label/eval1/eval1.csv'
# eval2    (1292)  (filepath,char)
csj_eval2_csv=mount+'/CSJ_dataset/label/eval2/eval2.csv'
# eval3    (1382)  (filepath,char)
csj_eval3_csv=mount+'/CSJ_dataset/label/eval3/eval3.csv'
# atr503   (503)   (filepath,char,kana(atr),phone(atr))  
atr503_csv=mount+'/atr503/atr503.csv'

readme=mount+'/readme.txt'
!cat $readme

In [None]:
import numpy as np
import pandas as pd
import warnings
import datasets
# Ignore warnings from here on; no category or module restriction is given.
warnings.filterwarnings('ignore')

In [None]:
# Test datasets: each CSV is loaded on its own, exposed under a single "test" split.
_test_csv_files = (csj_eval1_csv, csj_eval2_csv, csj_eval3_csv, atr503_csv)
sets = [
    datasets.load_dataset("csv", data_files={'test': [csv_file]})
    for csv_file in _test_csv_files
]
# Named handles to the same objects that are stored in `sets`.
test_eval1, test_eval2, test_eval3, test_atr = sets

In [None]:
#show
from datasets import ClassLabel
import random
from IPython.display import display, HTML

def show_random(dataset, num_examples=10):
    """Display randomly chosen, distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of
            integer positions (it is accessed as ``dataset[picks]``).
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions in a single call, replacing the
    # manual "retry until unused" loop. A negative count is clamped to zero so
    # that such a call still just shows an empty table, as before.
    picks = random.sample(range(len(dataset)), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))
# Example: show_random(test_eval1['test'], 2)

In [None]:
# Rewrite the stored path so that it starts at the mount point.
def renamepath(batch):
    """Replace ``batch['path']`` with ``mount`` + the path minus its first character.

    The stored value is converted with ``str()`` before slicing. The dropped
    first character is presumably the leading "." of a relative path --
    confirm against the CSV contents.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    path_without_first_char = str(batch['path'])[1:]
    batch['path'] = mount + path_without_first_char
    return batch

In [None]:
# Apply renamepath to the "test" split of every loaded dataset.
for s in sets:
    s['test'] = s['test'].map(renamepath)

In [None]:
import re
# Punctuation and symbol characters that are replaced by a space during cleaning.
CHARS_TO_IGNORE = [
    ",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
    "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
    "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
    "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ", "「", "」", "　", "｡",
]
# Character class that matches any single character listed above.
chars_to_ignore_regex = "[" + re.escape("".join(CHARS_TO_IGNORE)) + "]"


def remove_special_characters(batch):
    """Store a cleaned, upper-cased copy of ``batch["char"]`` in ``batch["conv_char"]``.

    The value is converted with ``str()``, every character matched by
    ``chars_to_ignore_regex`` is replaced by a single space, and the result is
    upper-cased. ``batch["char"]`` itself is left untouched.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    raw_text = str(batch["char"])
    spaced_text = re.sub(chars_to_ignore_regex, ' ', raw_text)
    batch["conv_char"] = spaced_text.upper()
    return batch

In [None]:
# Add the cleaned "conv_char" column to the "test" split of every dataset.
for s in sets:
    s['test'] = s['test'].map(remove_special_characters)

In [None]:
import MeCab
import unidic
import romkan
#!python -m unidic download # fetch the latest dictionary
# Tagger created with default arguments; used by the MeCab-based conversion functions in this file.
mecab = MeCab.Tagger()

In [None]:
# Text -> kana. MeCab output does not always carry a kana field, so the two
# cases are handled separately.
def convert_char_to_kana(batch):
    """Analyse ``batch["conv_char"]`` with MeCab and store the result in ``batch["kana"]``.

    Each output line containing a tab after at least one character is treated
    as a token row. When the row has at least 10 comma-separated fields, field
    index 9 is used (presumably the UniDic pronunciation field -- confirm
    against the installed dictionary); otherwise the text before the first tab
    (the surface form) is used unchanged. The pieces are concatenated without
    separators.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    parsed = mecab.parse(batch["conv_char"])
    pieces = []
    for row in parsed.split("\n"):
        # Rows without a tab, or starting with one, carry no token.
        if row.find("\t") > 0:
            fields = row.split(',')
            if len(fields) >= 10:
                pieces.append(fields[9])
            else:
                pieces.append(row.split('\t')[0])
    batch["kana"] = "".join(pieces)
    return batch

In [None]:
# Add the "kana" column (derived from "conv_char") to every test split.
for s in sets:
    s['test'] = s['test'].map(convert_char_to_kana)

In [None]:
# Kana -> romaji.
def convert_kana_to_roman(batch):
    """Romanise ``batch["kana"]`` and store the result in ``batch["roman"]``.

    The kana string is first passed through MeCab again. For each token row
    (a line with a tab after at least one character), field index 9 of the
    comma-separated line is taken when at least 10 fields exist, otherwise the
    text before the first tab. The concatenated string is then converted with
    ``romkan.to_roma``.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    parsed = mecab.parse(batch["kana"])
    pieces = []
    for row in parsed.split("\n"):
        # Rows without a tab, or starting with one, carry no token.
        if row.find("\t") > 0:
            fields = row.split(',')
            if len(fields) >= 10:
                pieces.append(fields[9])
            else:
                pieces.append(row.split('\t')[0])
    batch["roman"] = romkan.to_roma("".join(pieces))
    return batch

In [None]:
# Add the "roman" column (derived from "kana") to every test split.
for s in sets:
    s['test'] = s['test'].map(convert_kana_to_roman)

In [None]:
# kana2phoneme (two approaches: go through MeCab or a similar tool first, or convert directly)
# openjtalk options: https://note.com/npaka/n/n6a5307cf8fe1

import pyopenjtalk
def convert_char2phone_openjtalk(batch):
    """Run ``pyopenjtalk.g2p`` on ``batch["conv_char"]`` and store it in ``batch["phone"]``.

    ``kana=False`` is passed to ``g2p``.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    batch["phone"] = pyopenjtalk.g2p(batch["conv_char"], kana=False)
    return batch

def convert_kana2phone(batch):
    """Run ``pyopenjtalk.g2p`` on ``batch["kana"]`` and store it in ``batch["phone"]``.

    ``kana=False`` is passed to ``g2p``. This writes the same ``"phone"`` key
    as ``convert_char2phone_openjtalk``; only the input column differs.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    batch["phone"] = pyopenjtalk.g2p(batch["kana"], kana=False)
    return batch

In [None]:
# Add the "phone" column to every test split; this call uses the variant that
# reads "conv_char" (text -> phonemes), not the kana-based one.
for s in sets:
    s['test'] = s['test'].map(convert_char2phone_openjtalk)

In [None]:
# Overlap measurement: collect the distinct symbols that occur in "phone".
def extract_all_chars_kana(batch):
    """Return the distinct characters found in ``batch["phone"]``.

    The elements of ``batch["phone"]`` are joined with single spaces (for a
    plain string this puts a space between every pair of characters) and the
    joined text is reduced to its set of characters. The work is character
    based: a multi-letter token contributes its individual letters.

    Returns:
        ``{"vocab": [chars]}`` where ``chars`` is a list of the distinct
        characters in unspecified order.
    """
    joined_text = " ".join(batch["phone"])
    distinct_chars = list(set(joined_text))
    return {"vocab": [distinct_chars]}

import itertools

# Per-example symbol lists for eval1, produced by extract_all_chars_kana.
vocab_train_kana = test_eval1.map(extract_all_chars_kana)
a = vocab_train_kana['test']['vocab']

# Each example's list is wrapped in an outer list, so flatten twice to reach
# the individual symbols, then de-duplicate.
b = list(itertools.chain.from_iterable(a))
c = list(itertools.chain.from_iterable(b))
d = set(c)

# Report how many distinct symbols there are, then show them as the cell output.
print(len(d))
d

In [None]:
# Save the "path" and "phone" columns of every test split as CSV under exp_dir.
import os

# Create the output directory first: nothing in this file creates it, and
# writing into a missing directory is expected to fail.
os.makedirs(exp_dir, exist_ok=True)

for out_name, dataset_dict in (
    ('test_eval1_phone.csv', test_eval1),
    ('test_eval2_phone.csv', test_eval2),
    ('test_eval3_phone.csv', test_eval3),
    ('test_atr503_phone.csv', test_atr),
):
    dataset_dict['test'].to_csv(exp_dir + '/' + out_name, columns=['path', 'phone'])