In [1]:
import json
import math
import os
import random
import subprocess
import wave

import requests
from tqdm import tqdm

In [2]:
# Constants
# Engine name -> local HTTP port the engine is reached on.
port_map = dict(voicebox="50021", sharevox="50025")

# Characters that are not in this list are likely to prohibit use of their
# voice for machine learning, so the notebook refuses to run for them.
# Allowed for machine learning:
allow_for_ml = [
    # VOICEVOX characters
    # Announcement by the maintainer: https://twitter.com/hiho_karuta/status/1553026175098028038
    "春日部つむぎ",  # Terms: https://tsumugi-official.studio.site/rule
    "波音リツ",  # Terms: https://www.canon-voice.com/terms/
    "冥鳴ひまり",  # Terms: https://meimeihimari.wixsite.com/himari/terms-of-use
    "剣崎雌雄",  # Terms: https://frontier.creatia.cc/fanclubs/413/posts/4507#fromHistory
    "櫻歌ミコ",  # Terms: https://voicevox35miko.studio.site/rule
    # Terms: https://316soramegu.wixsite.com/sayo-official/guideline
    # FAQ: https://316soramegu.wixsite.com/sayo-official/question
    "小夜/SAYO",
] + [
    # SHAREVOX characters. Terms: https://www.sharevox.app/characters
    "小春音アミ",
    "つくよみちゃん",
    "白痴ー",
    "Yくん/開発者",
]

# Characters for which the notebook stops with an error instead of running.
# MMVC obtained individual permission for these; the status for any other
# use is unknown.
allow_only_mmvc = [
    "ずんだもん",
    "四国めたん",
    "九州そら",
]

In [None]:
%%writefile config.json
{
  "host": "localhost",
  "output_dir": "output/つくよみちゃん/",
  "file_prefix": "",
  "speaker_name": "つくよみちゃん",
  "style": null,
  "target_speaker_name": null,
  "target_style": null,
  "morph_rate": null,
  "speed_mean": 1.0,
  "pitch_mean": 0.0,
  "intonation_mean": 1.0,
  "speed_random_strength": 0.0,
  "pitch_random_strength": 0.0,
  "intonation_random_strength": 0.0
}

In [4]:
# Download the ITA corpus transcripts.
# License: https://github.com/mmorise/ita-corpus/blob/main/LICENCE.txt
ITA_CORPUS_BASE_URL = "https://raw.githubusercontent.com/mmorise/ita-corpus/main/"
ITA_TRANSCRIPT_FILES = (
    "emotion_transcript_utf8.txt",
    "recitation_transcript_utf8.txt",
)


def _load_transcript(path):
    """Parse one transcript file into a list of script entries.

    Each non-blank line is split as ``<file>:<script>,<kana>``.

    Args:
        path: Path of the UTF-8 transcript file to read.

    Returns:
        A list of dicts with the keys ``"file"``, ``"script"`` and ``"kana"``
        (trailing whitespace removed from ``kana``).

    Raises:
        ValueError: If a non-blank line does not have exactly one ``,`` and
            one ``:`` in the expected positions.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # A trailing blank line would otherwise crash the tuple-unpack.
            if not raw_line.strip():
                continue
            head, kana = raw_line.split(",")
            file, script = head.split(":")
            entries.append({"file": file, "script": script, "kana": kana.rstrip()})
    return entries


for transcript_name in ITA_TRANSCRIPT_FILES:
    # -f makes curl fail on HTTP errors instead of saving the error page as the
    # transcript; check=True turns that failure into an exception right here.
    subprocess.run(
        ["curl", "-fsSO", ITA_CORPUS_BASE_URL + transcript_name],
        check=True,
    )

scripts = []
for transcript_name in ITA_TRANSCRIPT_FILES:
    scripts.extend(_load_transcript(transcript_name))

# Load the run configuration.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# Refuse to run for characters whose terms do not clearly allow ML use.
if config["speaker_name"] in allow_only_mmvc:
    raise Exception("このキャラクターはMMVCにおいてMMVC付属のデータセットでのみ学習可能です")
if config["speaker_name"] not in allow_for_ml:
    raise Exception("このキャラクターでの機械学習は許可されていない可能性があります。規約を確認してください")

# Create the output directory (no error if it already exists).
os.makedirs(config["output_dir"], exist_ok=True)



In [5]:
# Fetch the speaker list from every configured engine.
# speakers_info: speaker name -> info dict returned by /speakers, with the
#                extra keys "engine" and "default_style" added below.
# id_map:        (engine, style id) -> (speaker name, style name)
# id_map_r:      (speaker name, style name) -> (engine, style id)
speakers_info = {}
id_map = {}
id_map_r = {}
for engine, port in port_map.items():
    try:
        res = requests.get(
            "http://{}:{}/speakers".format(config["host"], port),
            timeout=10,
        )
    except requests.exceptions.RequestException:
        # An engine that is not running refuses the connection; skip it the
        # same way a non-OK response is skipped.
        continue
    if not res.ok:
        continue
    for info in res.json():
        info["engine"] = engine
        style_names = [s["name"] for s in info["styles"]]
        if "ノーマル" in style_names:
            info["default_style"] = "ノーマル"
        else:
            # Fall back to the style with the smallest id.
            info["default_style"] = min(info["styles"], key=lambda s: s["id"])["name"]
        speakers_info[info["name"]] = info

        for style in info["styles"]:
            id_map[(engine, style["id"])] = (info["name"], style["name"])
            id_map_r[(info["name"], style["name"])] = (engine, style["id"])

if not speakers_info:
    # Fail here with the real cause instead of a misleading
    # "wrong speaker_name" error in the next cell.
    raise Exception("どのエンジンにも接続できませんでした。VOICEVOX / SHAREVOX が起動しているか確認してください")

In [6]:
# Resolve the configured speaker (and optional morph target) to engine/style ids.

if config["speaker_name"] is None:
    raise Exception("speaker_nameを指定してください")
elif config["speaker_name"] not in speakers_info:
    raise Exception("speaker_nameが間違っています")
if config["style"] is None:
    # No style requested: use the speaker's default style.
    config["style"] = speakers_info[config["speaker_name"]]["default_style"]
key = (config["speaker_name"], config["style"])
if key not in id_map_r:
    raise Exception("styleが間違っています")
config["engine"], config["speaker"] = id_map_r[key]

if config["target_speaker_name"] is not None:
    if config["target_speaker_name"] not in speakers_info:
        raise Exception("target_speaker_nameが間違っています")
    if config["target_style"] is None:
        config["target_style"] = speakers_info[config["target_speaker_name"]]["default_style"]
    key = (config["target_speaker_name"], config["target_style"])
    if key not in id_map_r:
        raise Exception("target_styleが間違っています")
    config["target_engine"], config["target_speaker"] = id_map_r[key]
    # Morphing needs both voices on the same engine.
    if config["target_engine"] != config["engine"]:
        raise Exception("異なるエンジン間の合成はできません")

    # id_map is keyed by (engine, style id), so the lookup must use the pair,
    # not the bare style id.
    if (config["target_engine"], config["target_speaker"]) not in id_map:
        raise Exception("target_speakerが間違っています")

In [None]:
# Reference: https://qiita.com/hatt_takumi/items/d65c243294f250724c19
# Synthesize every corpus line with the configured speaker and save one WAV
# file per line into config["output_dir"].
engine_url = "http://{}:{}".format(
    config["host"],
    port_map[speakers_info[config["speaker_name"]]["engine"]],
)
headers = {'Content-Type': 'application/json'}

# One progress bar only: iterating it advances it, and the postfix shows the
# parameters used for the most recent line.
with tqdm(scripts) as pbar:
    for line in pbar:
        query = requests.post(
            engine_url + "/audio_query",
            params=(
                ("text", line["script"]),
                ("speaker", config["speaker"]),
            ),
        )
        # Stop on an engine error instead of synthesizing from an error body.
        query.raise_for_status()

        data = query.json()
        # Speed is randomized in log2 space around speed_mean, then clamped.
        data['speedScale'] = min(2., max(0.5, pow(2., math.log2(config["speed_mean"]) + random.gauss(mu=0, sigma=config["speed_random_strength"] / 3))))
        data['pitchScale'] = min(0.15, max(-0.15, config["pitch_mean"] + random.gauss(mu=0, sigma=config["pitch_random_strength"] * .05)))
        data['intonationScale'] = min(2., max(-1., config["intonation_mean"] + random.gauss(mu=0, sigma=config["intonation_random_strength"] / 3)))
        # No silence padding before/after the utterance, mono output.
        data['prePhonemeLength'] = 0.
        data['postPhonemeLength'] = 0.
        data["outputStereo"] = False

        if config["target_speaker_name"]:
            # Morphing has its own endpoint and must be given the resolved
            # target style id, not the base speaker again.
            endpoint = "/synthesis_morphing"
            params = (
                ("base_speaker", config["speaker"]),
                ("target_speaker", config["target_speaker"]),
                ("morph_rate", config["morph_rate"]),
            )
        else:
            endpoint = "/synthesis"
            params = (("speaker", config["speaker"]),)

        response = requests.post(
            engine_url + endpoint,
            headers=headers,
            params=params,
            data=json.dumps(data),
        )
        # Never write an error response to disk as if it were audio.
        response.raise_for_status()

        # os.path.join works whether or not output_dir ends with a separator.
        save_path = os.path.join(
            config["output_dir"],
            config["file_prefix"] + line["file"] + ".wav",
        )
        # The engine responds with a complete WAV file (audio/wav), so the
        # bytes are stored as-is; wrapping them in another WAV header would
        # turn the original header into audible samples.
        with open(save_path, "wb") as wav_file:
            wav_file.write(response.content)

        progress = "speed: {speedScale}, pitch: {pitchScale}, intonation: {intonationScale}".format(**data)
        pbar.set_postfix_str(progress)