In [1]:
import librosa
import soundfile as sf
import torch
# from facodec import FACodecEncoderV2, FACodecDecoderV2
from huggingface_hub import hf_hub_download
from zact.models.facodec import FACodecEncoder, FACodecDecoder


def load_audio(wav_path, sr: int = 16000) -> torch.Tensor:
    """Load an audio file as a float tensor with two leading singleton axes.

    Args:
        wav_path: Location of the audio file; forwarded unchanged to
            ``librosa.load``.
        sr: Sample rate ``librosa.load`` resamples to. Defaults to 16000,
            the value that used to be hard-coded here.

    Returns:
        A ``torch.float32`` tensor holding the loaded samples with two
        size-1 dimensions prepended (``(1, 1, num_samples)`` for the mono
        signal ``librosa.load`` produces by default).
    """
    # librosa.load returns (samples, sample_rate); only the samples are used.
    samples, _ = librosa.load(wav_path, sr=sr)
    # Indexing with [None, None] prepends both singleton axes in one step.
    return torch.from_numpy(samples).float()[None, None]


# Encoder hyper-parameters; load_state_dict below requires them to match the
# architecture stored in the downloaded checkpoint.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

# Decoder hyper-parameters; up_ratios is the encoder's list reversed.
# NOTE(review): the module repr printed for this cell shows Embedding(1024, 8)
# codebooks, so the codebook_size_* values of 10 look like log2 sizes --
# confirm against FACodecDecoder.
fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

encoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_encoder.bin")
decoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_decoder.bin")

# weights_only=True restricts unpickling to tensors and plain containers, so a
# downloaded file cannot execute arbitrary code; it also silences the
# torch.load FutureWarning. map_location='cpu' lets the load succeed on
# machines without the GPU the checkpoint was saved from; the modules are
# moved to the target device afterwards.
fa_encoder.load_state_dict(torch.load(encoder_ckpt, map_location='cpu', weights_only=True))
fa_decoder.load_state_dict(torch.load(decoder_ckpt, map_location='cpu', weights_only=True))

# Inference only: switch off training-time behaviour.
fa_encoder.eval()
fa_decoder.eval()

# Kept as a plain string because later cells store it in OmegaConf configs.
# Falls back to CPU so the notebook still runs without a CUDA device.
accelerator = 'cuda:0' if torch.cuda.is_available() else 'cpu'
fa_encoder.to(accelerator)
fa_decoder.to(accelerator)

  WeightNorm.apply(module, name, dim)
  fa_encoder.load_state_dict(torch.load(encoder_ckpt))
  fa_decoder.load_state_dict(torch.load(decoder_ckpt))


FACodecDecoder(
  (quantizer): ModuleList(
    (0): ResidualVQ(
      (layers): ModuleList(
        (0): FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (1): ResidualVQ(
      (layers): ModuleList(
        (0-1): 2 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (2): ResidualVQ(
      (layers): ModuleList(
        (0-2): 3 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
  )
  (model): Sequential(
    (0): Conv1d(256, 

In [2]:
from zact import ZACT
import soundfile as sf
from omegaconf import OmegaConf

# Placeholder paths: point these at the trained ZACT checkpoint and its config.
ckpt_path = 'path/to/ckpt.pt'

# Codec config: both sub-configs are told which device to use.
codec_cfg = OmegaConf.load('./configs/codec.yaml')
codec_cfg['encoder']['device'] = accelerator
codec_cfg['decoder']['device'] = accelerator

# Model config: same device override for the two sub-modules.
cfg = OmegaConf.load('path/to/config.yaml')
cfg['flow_matching']['device'] = accelerator
cfg['codes_generator']['device'] = accelerator

model = ZACT.from_pretrained(
    cfg=cfg,
    ckpt_path=ckpt_path,
    device=accelerator,
    training_mode=False,
)
# Explicit eval mode, mirroring what is done for the codec models: the printed
# architecture contains Dropout layers, which must be inactive for inference.
# Harmless if from_pretrained already did this.
model.eval()
model.to(accelerator)

  ckpt = torch.load(ckpt_path, map_location=device)


ZACT(
  (codes_generator): CodesGenerator(
    (encoder): Encoder(
      (src_word_emb): Embedding(361, 256, padding_idx=0)
      (layer_stack): ModuleList(
        (0-1): 2 x FFTBlock(
          (slf_attn): MultiHeadAttention(
            (w_qs): Linear(in_features=256, out_features=256, bias=True)
            (w_ks): Linear(in_features=256, out_features=256, bias=True)
            (w_vs): Linear(in_features=256, out_features=256, bias=True)
            (attention): ScaledDotProductAttention(
              (softmax): Softmax(dim=2)
            )
            (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
            (fc): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (pos_ffn): PositionwiseFeedForward(
            (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
            (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
            (layer_norm): LayerNorm

In [3]:
import torch
import numpy as np
from omegaconf import DictConfig


def synthesize(
    self, 
    text: str,
    acoustic_prompt: str | np.ndarray | torch.Tensor,
    sr: int = 16000,
    codec_cfg: DictConfig = None,
    codec_encoder: torch.nn.Module = None,
    codec_decoder: torch.nn.Module = None,
    temperature: float = 0.02,
    lexicon_path: str = None,
    cleaners: str = ['english_cleaners'],
    ):
    
    if codec_encoder is None or codec_decoder is None:
        if codec_cfg is None:
            raise ValueError('The codec_encoder or codec_decoder is set to None. To initialize the codec encoder or decoder, you need to provide a codec_cfg of type omegaconf.DictConfig.')
        codec_cfg['encoder'] = accelerator
        codec_cfg['decoder'] = accelerator
        codec_encoder, codec_decoder = self._get_codec_models(codec_cfg)
        
    # process acoustic prompt
    acoustic_prompt = self._preprocess_acoustic_prompt(acoustic_prompt, sr)
    enc_out = codec_encoder(acoustic_prompt)
    _, prompt, _, _, timbre = codec_decoder(enc_out, eval_vq=False, vq=True)
    prompt = prompt.permute(1, 0, 2)
    
    # process phoneme
    phonemes, _, _ = self._preprocess_english(text, lexicon_path, cleaners)
    phonemes = phonemes.to(accelerator)
    codes_generator_outputs = self.codes_generator(
        texts=phonemes,
        src_lens=torch.zeros(phonemes.size(0), device=accelerator) + phonemes.size(1),
        max_src_len=phonemes.shape[-1],
    )
    prior = codes_generator_outputs[0]
    
    # flow matching euler solving
    logits = self.flow_matching.sampling(
        prior=prior,
        x_len=torch.zeros(prior.size(0), device=accelerator) + prior.size(2),
        x_max_len=prior.size(2),
        prompts=prompt,
        temperature=temperature,
    )['logits']
    
    codes = logits.softmax(1).argmax(1)
    codes = codes.permute(1, 0, 2)
    embs = codec_decoder.vq2emb(codes)
    wav = codec_decoder.inference(embs, timbre)
    wav = wav[0][0].detach().cpu().numpy()
    
    return wav

In [106]:
from IPython.display import Audio

# Demo inputs (placeholders): the sentence to synthesize, and the recording
# that is later passed to synthesize() as the acoustic prompt.
target_text: str = 'Something you want model to say'
prompt_audio: str = 'path/to/prompt/audio/file.wav'

In [107]:
# Show an inline player for the prompt recording so it can be listened to
# before synthesis.
Audio(filename=prompt_audio)

In [109]:
# synthesize() is a free function written like a method, so the loaded model
# is passed as its first (`self`) argument. Both codec models are supplied,
# so the function does not build new ones from codec_cfg.
synthesized_speech = synthesize(
    model,
    text=target_text,
    acoustic_prompt=prompt_audio,
    codec_cfg=codec_cfg,
    codec_encoder=fa_encoder,
    codec_decoder=fa_decoder,
)

In [None]:
# Single source for the output sample rate so the file written to disk and the
# inline player can never disagree.
OUTPUT_SAMPLE_RATE = 16000

# Save the result (placeholder path), then show an inline player for it.
sf.write('path/to/output/filename.wav', synthesized_speech, OUTPUT_SAMPLE_RATE)
Audio(data=synthesized_speech, rate=OUTPUT_SAMPLE_RATE)