<a name='1'></a>
## Prerequisites

In [None]:
%pip install -q datasets "transformers>=4.33.1" accelerate "openvino>=2023.1.0"
%git clone https://github.com/svc-develop-team/so-vits-svc -b 4.1-Stable
%cd so-vits-svc
%pip install --upgrade pip setuptools
%pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118

In [None]:
!wget -P pretrain/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -O checkpoint_best_legacy_500.pt
!wget -P logs/44k/ https://huggingface.co/Sucial/so-vits-svc4.1-sanwu/resolve/main/kmeans_10000.pt -O kmeans_10000.pt
!wget -P logs/44k/ https://huggingface.co/Sucial/so-vits-svc4.1-sanwu/resolve/main/sanwu_100800.pth -O sanwu_100800.pth
!wget -P config/ https://huggingface.co/Sucial/so-vits-svc4.1-sanwu/resolve/main/config.json -O config.json
!wget -P logs/44k/ https://huggingface.co/Sucial/so-vits-svc4.1-sanwu/resolve/main/feature_and_index.pkl -O feature_and_index.pkl
!wget -P logs/44k/ "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/G_30400.pth" -O G_30400.pth
!wget -P logs/44k/ "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/D_30400.pth" -O D_30400.pth

## Use the original model to run an inference

In [None]:
from inference.infer_tool import Svc

# Build the PyTorch voice-conversion model from the downloaded checkpoint and
# config, pinned to the CPU.
model = Svc(
    "logs/44k/G_30400.pth",
    "configs/config.json",
    device='cpu',
)

In [None]:
# Keyword arguments forwarded unchanged to `model.slice_inference`.
# NOTE(review): 'raw/p226_002.wav' is not downloaded by any cell visible here —
# confirm the file is provided before running.
kwarg = dict(
    raw_audio_path='raw/p226_002.wav',
    spk='sanwu',
    tran=0,
    slice_db=-40,
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
    pad_seconds=0.5,
    clip_seconds=0,
    lg_num=0,
    lgr_num=0.75,
    f0_predictor='pm',
    enhancer_adaptive_key=0,
    cr_threshold=0.05,
    k_step=100,
    use_spk_mix=False,
    second_encoding=False,
    loudness_envelope_adjustment=1,
)

# Reference run with the unmodified PyTorch model.
audio = model.slice_inference(**kwarg)

In [None]:
import IPython.display as ipd

# Render an audio player for the inference result; the playback rate is taken
# from the loaded model's `target_sample` attribute.
ipd.Audio(audio, rate=model.target_sample)

## Convert to OpenVINO IR model

In [None]:
import openvino as ov
import torch
from pathlib import Path


# Example inputs used only to drive the conversion; the values are random and
# every tensor shares the same trailing length.
n_frames = 457
dummy_c = torch.randn(1, 768, n_frames)
dummy_f0 = torch.randn(1, n_frames)
dummy_uv = torch.ones(1, n_frames)
dummy_g = torch.tensor([[0]])

# Point `forward` at `infer` so the conversion below exports the inference path.
model.net_g_ms.forward = model.net_g_ms.infer

core = ov.Core()

# Only these four inputs are supplied by name; any further `infer` arguments
# are not part of the example input.
net_g_kwargs = dict(c=dummy_c, f0=dummy_f0, uv=dummy_uv, g=dummy_g)
converted_model = ov.convert_model(model.net_g_ms, example_input=net_g_kwargs)

# Persist the IR next to the notebook so later cells can compile it by path.
net_g_model_xml_path = Path('models/ov_net_g_model.xml')
net_g_model_xml_path.parent.mkdir(parents=True, exist_ok=True)
ov.save_model(converted_model, net_g_model_xml_path)

In [None]:
import ipywidgets as widgets
import openvino as ov

core = ov.Core()

# Offer every device OpenVINO reports plus the "AUTO" entry, which is preselected.
device_choices = [*core.available_devices, "AUTO"]
device = widgets.Dropdown(
    description='Device:',
    options=device_choices,
    value='AUTO',
    disabled=False,
)

# Last expression of the cell: renders the dropdown.
device

In [None]:
class NetGModelWrapper:
    """Stand-in for `model.net_g_ms` that runs the compiled OpenVINO model.

    Only `infer` is provided, with the same keyword-only signature the cell
    below relies on when it swaps this object into `model.net_g_ms`.
    """

    def __init__(self, net_g_model_xml_path):
        """Compile the IR at `net_g_model_xml_path` for the device chosen in the dropdown."""
        super().__init__()
        self.net_g_model = core.compile_model(net_g_model_xml_path, device.value)

    def infer(self, c, *, f0, uv, g, noice_scale=0.35, seed=52468, predict_f0=False, vol=None):
        """Run the compiled model on `(c, f0, uv, g)` and return output 0 as a tensor.

        `noice_scale`, `seed`, `predict_f0` and `vol` are accepted so the call
        signature stays compatible, but they are not forwarded to the compiled
        model.
        """
        # NOTE(review): only the first model output is returned — confirm the
        # caller does not expect additional outputs from `infer`.
        results = self.net_g_model((c, f0, uv, g))[0]
        # Wrap the result in a tensor, as the other OpenVINO wrappers in this
        # notebook do, so downstream PyTorch code can consume it.
        return torch.tensor(results)

        

# Keyword arguments forwarded unchanged to `model.slice_inference`.
kwarg = dict(
    raw_audio_path='raw/p226_002.wav',
    spk='sanwu',
    tran=0,
    slice_db=-40,
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
    pad_seconds=0.5,
    clip_seconds=0,
    lg_num=0,
    lgr_num=0.75,
    f0_predictor='pm',
    enhancer_adaptive_key=0,
    cr_threshold=0.05,
    k_step=100,
    use_spk_mix=False,
    second_encoding=False,
    loudness_envelope_adjustment=1,
)

# Swap the PyTorch generator for the OpenVINO-backed wrapper, then run the
# same inference call through it.
model.net_g_ms = NetGModelWrapper(net_g_model_xml_path)
audio = model.slice_inference(**kwarg)

## Convert model by parts

In [None]:
# Start again from a fresh PyTorch model, since `model.net_g_ms` was replaced
# by a wrapper in the previous section.
model = Svc(
    "logs/44k/G_30400.pth",
    "configs/config.json",
    device='cpu',
)

In [None]:
#(x * x_mask).shape=torch.Size([1, 192, 457]), x_mask.shape=torch.Size([1, 1, 457])
# x.shape=torch.Size([1, 192, 457]), x_mask.shape=torch.Size([1, 1, 457]), f0_to_coarse(f0).shape=torch.Size([1, 457]), noice_scale=0.4
# x.dtype=torch.float32, x_mask.dtype=torch.float32, f0_to_coarse(f0).dtype=torch.int64, noice_scale=0.4

import openvino as ov
import torch
from pathlib import Path
from utils import f0_to_coarse


# Random example tensors; the shapes follow the ones recorded in the comments
# at the top of this cell.
dummy_x = torch.randn(1, 192, 457)
dummy_x_mask = torch.randn(1, 1, 457)
f0 = torch.randn(1, 457)
noice_scale = torch.tensor(0.4)

core = ov.Core()

# Only the inner `enc_` sub-module is converted, driven by (x, x_mask).
enc_p_example = (dummy_x, dummy_x_mask)
converted_enc_p_model = ov.convert_model(
    model.net_g_ms.enc_p.enc_,
    example_input=enc_p_example,
)

# Save the IR so it can be compiled by path later.
enc_p_model_xml_path = Path('models/ov_enc_p_model.xml')
enc_p_model_xml_path.parent.mkdir(parents=True, exist_ok=True)
ov.save_model(converted_enc_p_model, enc_p_model_xml_path)

In [None]:
# Smoke-test the saved encoder IR with the example tensors used for conversion.
compiled_enc_p_model = core.compile_model(enc_p_model_xml_path)
# All model inputs go in ONE tuple: a second positional argument would be
# interpreted as a call option rather than as the `x_mask` input.
compiled_enc_p_model((dummy_x, dummy_x_mask))

In [None]:
# z_p.shape=torch.Size([1, 192, 457]), c_mask.shape=torch.Size([1, 1, 457]), g.shape=torch.Size([1, 768, 1])
# Random example tensors for the flow sub-module; `reverse` is supplied as a
# boolean tensor alongside them.
dummy_z_p = torch.randn(1, 192, 457)
dummy_c_mask = torch.randn(1, 1, 457)
dummy_g = torch.randn(1, 768, 1)
dummy_reverse = torch.tensor(True)

flow_example = (dummy_z_p, dummy_c_mask, dummy_g, dummy_reverse)
converted_flow_model = ov.convert_model(
    model.net_g_ms.flow,
    example_input=flow_example,
)

# Save the IR so it can be compiled by path later.
flow_model_xml_path = Path('models/ov_flow_model.xml')
flow_model_xml_path.parent.mkdir(parents=True, exist_ok=True)
ov.save_model(converted_flow_model, flow_model_xml_path)

In [None]:
# (z * c_mask).shape=torch.Size([1, 192, 457]), g.shape=torch.Size([1, 768, 1]), f0.shape=torch.Size([1, 457])

# Random example tensors for the decoder sub-module.
dummy_z_c_mask = torch.randn(1, 192, 457)
dummy_g = torch.randn(1, 768, 1)
f0 = torch.randn(1, 457)

# Positional order used for the export: (z * c_mask, f0, g).
dec_example = (dummy_z_c_mask, f0, dummy_g)
converted_dec_model = ov.convert_model(
    model.net_g_ms.dec,
    example_input=dec_example,
)

# Save the IR so it can be compiled by path later.
dec_model_xml_path = Path('models/ov_dec_model.xml')
dec_model_xml_path.parent.mkdir(parents=True, exist_ok=True)
ov.save_model(converted_dec_model, dec_model_xml_path)
#compiled_dec_model = core.compile_model(dec_model_xml_path)

In [None]:
# Fresh PyTorch model whose sub-modules are replaced by OpenVINO wrappers below.
model = Svc(
    "logs/44k/G_30400.pth",
    "configs/config.json",
    device='cpu',
)

class EncPModelWrapper(torch.nn.Module):
    """Stand-in for `net_g_ms.enc_p.enc_` that runs the compiled OpenVINO encoder."""

    def __init__(self, model_path):
        """Compile the IR at `model_path` for the device chosen in the dropdown."""
        super().__init__()
        self.compiled_model = core.compile_model(model_path, device.value)

    def forward(self, x, x_mask):
        """Run the compiled encoder on `(x, x_mask)` and return output 0 as a tensor."""
        x = self.compiled_model((x, x_mask))[0]
        # Convert to a tensor, like the other wrappers in this cell, so the
        # surrounding PyTorch layers receive the type they operate on.
        return torch.tensor(x)


class FlowWrapper(torch.nn.Module):
    """Stand-in for `net_g_ms.flow` that runs the compiled OpenVINO flow model."""

    def __init__(self, model_path):
        """Compile the IR at `model_path` for the device chosen in the dropdown."""
        super().__init__()
        self.compiled_model = core.compile_model(model_path, device.value)

    def forward(self, x, x_mask, g=None, reverse=False):
        """Feed `(x, x_mask, g, reverse)` to the compiled model; return output 0 as a tensor."""
        flow_inputs = (x, x_mask, g, reverse)
        return torch.tensor(self.compiled_model(flow_inputs)[0])


class DecWrapper(torch.nn.Module):
    """Stand-in for `net_g_ms.dec` that runs the compiled OpenVINO decoder."""

    def __init__(self, model_path):
        """Compile the IR at `model_path` for the device chosen in the dropdown."""
        super().__init__()
        self.compiled_model = core.compile_model(model_path, device.value)

    def forward(self, z_c_mask, *, g, f0):
        """Feed `(z_c_mask, f0, g)` to the compiled model; return output 0 as a tensor."""
        # The compiled model takes f0 before g, although the keywords arrive as g, f0.
        dec_inputs = (z_c_mask, f0, g)
        return torch.tensor(self.compiled_model(dec_inputs)[0])


# Replace three sub-modules of the generator with their OpenVINO-backed wrappers.
net_g = model.net_g_ms
net_g.enc_p.enc_ = EncPModelWrapper(enc_p_model_xml_path)
net_g.flow = FlowWrapper(flow_model_xml_path)
net_g.dec = DecWrapper(dec_model_xml_path)

In [None]:
# Call the wrapped encoder's compiled model directly on the example tensors;
# as the cell's last expression, the result is displayed.
enc_wrapper = model.net_g_ms.enc_p.enc_
enc_wrapper.compiled_model((dummy_x, dummy_x_mask))

In [None]:
# Keyword arguments forwarded unchanged to `model.slice_inference`.
kwarg = dict(
    raw_audio_path='raw/p226_002.wav',
    spk='sanwu',
    tran=0,
    slice_db=-40,
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
    pad_seconds=0.5,
    clip_seconds=0,
    lg_num=0,
    lgr_num=0.75,
    f0_predictor='pm',
    enhancer_adaptive_key=0,
    cr_threshold=0.05,
    k_step=100,
    use_spk_mix=False,
    second_encoding=False,
    loudness_envelope_adjustment=1,
)

# Run inference on the model whose sub-modules were swapped for OpenVINO wrappers.
audio = model.slice_inference(**kwarg)