
# Exporting the model components to ONNX

Lyodos 著

Version 1.0.0 (2024-07-14)

このノートブックでは、ClassicVC の PyTorch モデルを ONNX に書き出す方法を示す。

----

## 準備


In [None]:
%%time
from pathlib import Path

# Root directory for machine-learning data such as checkpoints and logs.
# Rewrite this folder name to match your own environment.
DATASET_ROOT_PATH = Path("/home/lyodos/study/dataset")

# Every ONNX file exported by this notebook is written under this directory.
proj_path = DATASET_ROOT_PATH.joinpath("checkpoints", "classic-vc")
proj_path.mkdir(parents=True, exist_ok=True)
print("Project directory:", str(proj_path))


----

## HarmoF0 pitch tracker の ONNX 化


In [None]:
import torch
import torchaudio
import sys
sys.path.append('../') # Put the ClassicVC repository root on the path so that the `model` package can be imported

from model.harmof0.pitch_tracker import BatchedPitchEnergyTracker

# `device` is handed to the tracker below but was never defined in this notebook,
# which raised a NameError. Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def pred_f0_len(length):
    """Return ``length // 160 + 1``.

    Presumably the number of f0 frames obtained from ``length`` input samples
    at the hop length of 160 configured below -- confirm against the tracker.
    """
    return length // 160 + 1

harmof0_tracker = BatchedPitchEnergyTracker(
    checkpoint_path = "../model/harmof0/checkpoints/mdb-stem-synth.pth", # Pretrained weights by the HarmoF0 authors (redistributed)
    fmin = 27.5, # Lowest frequency in Hz assumed for f0; corresponds to the lowest A on a piano.
    sample_rate = 16000,
    hop_length = 160, # Interval between f0 estimates: 160/16000 = 10 ms
    frame_len = 1024, # Length of each sliding window that is cut out
    frames_per_step = 1000, # Maximum number of segments fed into a single forward pass
    high_threshold = 0.8, 
    low_threshold = 0.1, 
    freq_bins_in = 88*4,
    bins_per_octave_in = 48,
    bins_per_octave_out = 48,
    device = device,
    compile = False,
    dry_run = 10, 
)

# Note from the original author: the network weights are already frozen at initialization time.

In [None]:

# Export to ONNX format

# `wav16` was used as the tracing input without ever being defined (NameError).
# Build a dummy waveform instead: 4 seconds at 16 kHz, laid out as (batch, samples)
# to match the dynamic axes declared below. It is created on the same device as the
# tracker's parameters when those are available, and on the CPU otherwise.
try:
    export_device = next(harmof0_tracker.parameters()).device
except (AttributeError, StopIteration):
    export_device = torch.device("cpu")
wav16 = torch.rand((1, 16000 * 4), dtype = torch.float32, device = export_device) * 0.2

torch.onnx.export(
    harmof0_tracker, 
    wav16,
    str(proj_path / "harmof0.onnx"), 
    input_names=['input'],
    output_names=['freq_t', 'act_t', 'energy_t', 'spec'],
    dynamic_axes = {
        # The input axis counts audio samples, not frames; give it its own symbolic
        # name so it is not declared equal to the frame axis of the outputs.
        'input': {0: 'batch', 1: 'samples'},
        'freq_t': {0: 'batch', 1: 'frames'},
        'act_t': {0: 'batch', 1: 'frames'},
        'energy_t': {0: 'batch', 1: 'frames'},
        'spec': {0: 'batch', 2: 'frames'}
    }
)



書き出した ONNX ファイルは、以下のように ONNX Runtime で推論セッションを作ったり、
他の言語からのバインディングを通じて呼び出したりできる。


In [None]:
import onnxruntime
import numpy as np
import librosa

sess = onnxruntime.InferenceSession(
    str(proj_path / "harmof0.onnx"), 
    providers  =[
        'CUDAExecutionProvider', 
        'CPUExecutionProvider',
    ],
)

print(len(sess.get_inputs()))
print(len(sess.get_outputs()))

input_name = sess.get_inputs()[0].name
input_shape = sess.get_inputs()[0].shape
input_type = sess.get_inputs()[0].type
print("Input name  :", input_name)
print("Input shape :", input_shape)
print("Input type  :", input_type)

output_name = sess.get_outputs()[0].name
output_shape = sess.get_outputs()[0].shape
output_type = sess.get_outputs()[0].type
print("Output name  :", output_name)
print("Output shape :", output_shape)
print("Output type  :", output_type)

audio_array, sr = librosa.load('../wavs/p225_003.wav', sr = 16000, mono = True)
if audio_array.ndim == 1:
    audio_array = audio_array[np.newaxis, :]
print(audio_array.shape)

%time freq_t, act_t, energy_t, spec = sess.run(['freq_t', 'act_t', 'energy_t', 'spec'], {"input": audio_array})
%time freq_t, act_t, energy_t, spec = sess.run(['freq_t', 'act_t', 'energy_t', 'spec'], {"input": audio_array})

print(tensor_x.shape, freq_t.shape, act_t.shape, energy_t.shape, spec.shape)

import matplotlib.pyplot as plt

plt.imshow(spec.squeeze(), origin = "lower")
plt.tight_layout()
plt.show()


----

## (Acoustic) Style Encoder の ONNX 化




In [None]:
from dataclasses import dataclass
import typing
from omegaconf import OmegaConf

from model.StyleTTS2.models import StyleEncoder


@dataclass
class StyleEncoderConfig:
    """Constructor arguments for `StyleEncoder`."""
    dim_in: int = 304
    style_dim: int = 128
    max_conv_dim: int = 512

style_encoder_cfg = OmegaConf.structured(StyleEncoderConfig())

style_encoder = StyleEncoder(
    dim_in = style_encoder_cfg.dim_in, # 304
    style_dim = style_encoder_cfg.style_dim, # 128
    max_conv_dim = style_encoder_cfg.max_conv_dim, # 512
)

# Place the trained weights at this location beforehand.
style_dict_path = "../weights/style_encoder.pth"

# Load onto the CPU: the encoder is never moved to another device and is exported
# with a CPU tensor, and the previously used `device` is not defined in this notebook.
# This also matches how the other state dicts in this notebook are loaded.
style_dict = torch.load(style_dict_path, map_location = "cpu")
style_encoder.load_state_dict(style_dict, strict = True)


In [None]:

# Random dummy input used only to trace the model. Axis 0 is the batch and axis 3
# the frames (see the dynamic axes below); 304 equals the encoder's `dim_in`.
tensor_x = torch.rand(1, 1, 304, 676, dtype=torch.float32)

style_encoder_onnx_path = proj_path / "style_encoder_304.onnx"

# Only the batch and frame axes of the input are declared variable-length.
style_encoder_dynamic_axes = {'input': {0: 'batch', 3: 'frames'}}

# Export to ONNX format
torch.onnx.export(
    style_encoder,
    tensor_x,
    str(style_encoder_onnx_path),
    opset_version=17,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes=style_encoder_dynamic_axes,
)


In [None]:

import onnxruntime

# 実は frame が 4 の倍数でないと途中で止まる
spec_array = np.zeros((1, 1, 304, 676), dtype = np.float32)

sess = onnxruntime.InferenceSession(
    str(proj_path / "style_encoder_304.onnx"), 
    providers  =[
        'CUDAExecutionProvider', 
        'CPUExecutionProvider',
    ],
)

%time style_onnx = sess.run(['output'], {"input": spec_array})[0]
%time style_onnx = sess.run(['output'], {"input": spec_array})[0]

print(style_onnx.shape)




----

## ContentVec の ONNX 化


In [None]:
from transformers import HubertConfig, HubertModel

# Build a HuBERT model with the default configuration; the ContentVec weights are loaded into it below.
CE = HubertModel(HubertConfig())

# Place the converted weights at this location beforehand.
contentvec_path = DATASET_ROOT_PATH.joinpath("checkpoints", "classic-vc", "contentvec_500_hubert.pth")

cpu_device = torch.device('cpu')
CE_dict = torch.load(str(contentvec_path), map_location=cpu_device)
CE.load_state_dict(CE_dict, strict=True)
CE.eval()


In [None]:

# Random dummy waveform used only to trace the model, laid out as (batch, samples)
# per the dynamic axes below.
tensor_x = torch.rand((1, 16000*4), dtype = torch.float32) * 0.2

# Destination of the exported model. This variable used to be computed and then
# ignored while the same path was rebuilt inline; it is now the single source of truth.
onnx_path = DATASET_ROOT_PATH / "checkpoints" / "classic-vc" / "hubert500.onnx"

# Export to ONNX format
torch.onnx.export(
    CE, 
    tensor_x, 
    str(onnx_path), 
    opset_version = 17,
    input_names=['input'],
    output_names=['last_hidden_state'],
    dynamic_axes = {
        'input': {0: 'batch', 1: 'samples'},
        'last_hidden_state': {0: 'batch', 1: 'frames'}
    }
)



----

## ProsodyPredictor の ONNX 化


In [None]:
from pathlib import Path
from dataclasses import dataclass
import typing
from omegaconf import OmegaConf

from model.StyleTTS2.models import F0NPredictorAll

# NOTE(review): the class name is spelled "Prodody" (sic); it is kept unchanged
# so that any existing reference to it continues to work.
@dataclass
class PrododyPredictorConfig:
    """Constructor arguments for `F0NPredictorAll`."""
    style_dim: int = 128
    hidden_dim: int = 768
    n_layer: int = 3
    dropout: float = 0.2

prosody_predictor_cfg = OmegaConf.structured(PrododyPredictorConfig())

# The config field names differ from the constructor's keyword names, so map them explicitly.
f0n_predictor_kwargs = {
    "style_dim": prosody_predictor_cfg.style_dim,
    "d_hid": prosody_predictor_cfg.hidden_dim,
    "nlayers": prosody_predictor_cfg.n_layer,
    "dropout": prosody_predictor_cfg.dropout,
}
f0n_predictor = F0NPredictorAll(**f0n_predictor_kwargs)

# Load the trained weights onto the CPU and require an exact key match.
f0n_dict_path = "../weights/f0n_predictor.pth"
f0n_dict = torch.load(f0n_dict_path, map_location="cpu")
f0n_predictor.load_state_dict(f0n_dict, strict=True)


In [None]:

# Random dummy inputs used only to trace the model.
content_tensor = torch.rand(1, 768, 270, dtype=torch.float32)
style_tensor = torch.rand(1, 128, dtype=torch.float32)

f0n_onnx_path = proj_path / "f0n_predictor_hubert500.onnx"

# Axes that may vary at inference time, keyed by input/output name.
f0n_dynamic_axes = {
    'content': {0: 'batch', 2: 'frames'},
    'style': {0: 'batch'},
    'pred_F0': {0: 'batch', 1: 'frames_double'},
    'pred_N': {0: 'batch', 1: 'frames_double'},
}

# Export to ONNX format
# NOTE(review): the original comment next to the opset said "does not work with 17",
# yet 17 is the value actually used -- confirm which opset is intended.
torch.onnx.export(
    f0n_predictor,
    (content_tensor, style_tensor),
    str(f0n_onnx_path),
    opset_version=17,
    input_names=['content', 'style'],
    output_names=['pred_F0', 'pred_N'],
    dynamic_axes=f0n_dynamic_axes,
)


---


## VC Decoder の ONNX 化



In [None]:
from pathlib import Path
from dataclasses import dataclass
import typing
from omegaconf import OmegaConf
import math

from model.StyleTTS2.hifigan import Decoder

# Upsampling factor of each decoder stage; the config defaults below are derived from it.
upsample_rate_list = [10, 4, 3, 2]

@dataclass
class DecoderConfig:
    """Constructor arguments for `Decoder`, plus values derived from the upsample rates."""
    sampling_rate: int = 24000
    dim_in: int = 768
    style_dim: int = 128
    # Tuples are used as defaults; the kernel size of each stage is twice its rate.
    upsample_rate_list: list = tuple(upsample_rate_list)
    upsample_kernel_list: list = tuple(rate * 2 for rate in upsample_rate_list)
    upsample_total: int = 2 * math.prod(upsample_rate_list)
    upsample_initial_channel: int = 512
    harmonic_num: int = 8

decoder_cfg = OmegaConf.structured(DecoderConfig())

# According to the original author, most models adopt these same resblock settings.
resblock_kernel_sizes = [3, 7, 11]
resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]

decoder = Decoder(
    sampling_rate=decoder_cfg.sampling_rate,
    dim_in=decoder_cfg.dim_in,
    style_dim=decoder_cfg.style_dim,
    resblock_kernel_sizes=resblock_kernel_sizes,
    resblock_dilation_sizes=resblock_dilation_sizes,
    upsample_rates=decoder_cfg.upsample_rate_list,
    upsample_initial_channel=decoder_cfg.upsample_initial_channel,
    upsample_kernel_sizes=decoder_cfg.upsample_kernel_list,
    harmonic_num=decoder_cfg.harmonic_num,
)

# Load the trained weights onto the CPU and require an exact key match.
decoder_dict_path = "../weights/decoder.pth"
decoder_dict = torch.load(decoder_dict_path, map_location="cpu")
decoder.load_state_dict(decoder_dict, strict=True)


In [None]:
# Random dummy inputs used only to trace the model. The pitch and energy tensors
# have twice as many frames (540) as the content tensor (270).
content_tensor = torch.rand(1, 768, 270, dtype=torch.float32)
pred_F0 = torch.rand(1, 540, dtype=torch.float32)
pred_N = torch.rand(1, 540, dtype=torch.float32)
style_tensor = torch.rand(1, 128, dtype=torch.float32)

decoder_onnx_path = proj_path / "decoder_24k.onnx"
decoder_example_inputs = (content_tensor, pred_F0, pred_N, style_tensor)

# Axes that may vary at inference time, keyed by input name.
decoder_dynamic_axes = {
    'content': {0: 'batch', 2: 'frames'},
    'pitch': {0: 'batch', 1: 'frames_double'},
    'energy': {0: 'batch', 1: 'frames_double'},
    'style': {0: 'batch'},
}

# Export to ONNX format
torch.onnx.export(
    decoder,
    decoder_example_inputs,
    str(decoder_onnx_path),
    opset_version=17,
    input_names=['content', 'pitch', 'energy', 'style'],
    output_names=['output'],
    dynamic_axes=decoder_dynamic_axes,
)
