# Textless Speech Resynthesis using Conditional Flow Matching and HuBERT units

In [None]:
!pip install -r requirements/requirements.txt

In [None]:
!git clone https://github.com/facebookresearch/textlesslib.git src/textlesslib
!git clone https://huggingface.co/spaces/sarulab-speech/UTMOS-demo src/utmos

!patch src/utmos/lightning_module.py src/patch/utmos_lightning_module.patch

In [None]:
%cd src/textlesslib
!pip install -e .
%cd -

In [None]:
!wget -t 0 -c -P data https://www.openslr.org/resources/141/test_clean.tar.gz
!tar zxf data/test_clean.tar.gz -C data

In [None]:
import torch
import torchaudio
from fairseq.data.dictionary import Dictionary
from IPython.display import Audio
from textless.data.speech_encoder import SpeechEncoder

from src.flow_matching.models import ConditionalFlowMatchingWithHifiGan

In [None]:
# Register fairseq's Dictionary as a safe global for torch's deserialization
# before any checkpoint is loaded.
torch.serialization.add_safe_globals([Dictionary])

# Waveform-to-unit encoder, selected by model/quantizer name, then moved to the GPU.
encoder = SpeechEncoder.by_name(
    dense_model_name="mhubert-base-vp_mls_cv_8lang",
    quantizer_model_name="kmeans-expresso",
    vocab_size=2000,
    deduplicate=False,
    need_f0=False,
)
encoder = encoder.cuda()

# Unit-to-waveform decoder; the pretrained model is downloaded from the Hugging Face Hub.
decoder = ConditionalFlowMatchingWithHifiGan.from_pretrained("ryota-komatsu/flow_matching_with_hifigan")
decoder = decoder.cuda()

In [None]:
# Path of the input utterance inside the extracted test-clean split.
wav_path = (
    "data/LibriTTS_R/test-clean/121/121726/"
    "121_121726_000004_000003.wav"
)

Load a waveform and resample it to 16 kHz

In [None]:
# Read the audio file together with its native sample rate, then convert the
# signal from that rate to 16 kHz.
waveform, sr = torchaudio.load(wav_path)
waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)

Encode the waveform into pseudo-phonetic units

In [None]:
# Encode the GPU copy of the waveform and keep only the "units" entry of the result.
units = (
    encoder(waveform.cuda())["units"]
    .unsqueeze(0)  # add a leading axis (presumably the batch dimension)
    + 1  # shift every id up by one: 0 is reserved for padding
)

Resynthesize speech from the units

In [None]:
# Decode the units, take the first item of the decoder output, and move it to
# host memory.
audio_values = decoder(units)[0].cpu()

Original speech

In [None]:
# Audio player for the input waveform at its 16 kHz sample rate.
Audio(data=waveform, rate=16000)

Resynthesized (sampled) speech

In [None]:
# Audio player for the decoder output, played back at 16 kHz.
Audio(data=audio_values, rate=16000)