In [1]:
import sys
from pathlib import Path

repo_dir = Path("OpenVoice")

if not repo_dir.exists():
    !git clone https://github.com/myshell-ai/OpenVoice
    orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
    english_path = Path("OpenVoice/openvoice/text/english.py")

    english_path.rename(orig_english_path)

    with orig_english_path.open("r") as f:
        data = f.read()
        data = data.replace("unidecode", "anyascii")
        with english_path.open("w") as out_f:
            out_f.write(data)
# append to sys.path so that modules from the repo could be imported
sys.path.append(str(repo_dir))

%pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
"cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3"

Cloning into 'OpenVoice'...
remote: Enumerating objects: 435, done.[K
remote: Counting objects: 100% (235/235), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 435 (delta 176), reused 126 (delta 125), pack-reused 200 (from 1)[K
Receiving objects: 100% (435/435), 3.82 MiB | 13.27 MiB/s, done.
Resolving deltas: 100% (219/219), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies

In [2]:
import os
import torch
import openvino as ov
import ipywidgets as widgets
from IPython.display import Audio

core = ov.Core()

from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
import openvoice.se_extractor as se_extractor

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [3]:
CKPT_BASE_PATH = 'checkpoints'

en_suffix = f'{CKPT_BASE_PATH}/base_speakers/EN'
zh_suffix = f'{CKPT_BASE_PATH}/base_speakers/ZH'
converter_suffix = f'{CKPT_BASE_PATH}/converter'

In [4]:
enable_chinese_lang = False

In [5]:
def download_from_hf_hub(filename, local_dir='./'):
    from huggingface_hub import hf_hub_download
    os.makedirs(local_dir, exist_ok=True)
    hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)

download_from_hf_hub(f'{converter_suffix}/checkpoint.pth')
download_from_hf_hub(f'{converter_suffix}/config.json')
download_from_hf_hub(f'{en_suffix}/checkpoint.pth')
download_from_hf_hub(f'{en_suffix}/config.json')

download_from_hf_hub(f'{en_suffix}/en_default_se.pth')
download_from_hf_hub(f'{en_suffix}/en_style_se.pth')

if enable_chinese_lang:
    download_from_hf_hub(f'{zh_suffix}/checkpoint.pth')
    download_from_hf_hub(f'{zh_suffix}/config.json')
    download_from_hf_hub(f'{zh_suffix}/zh_default_se.pth')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


checkpoint.pth:   0%|          | 0.00/131M [00:00<?, ?B/s]

checkpoints/converter/config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

checkpoint.pth:   0%|          | 0.00/160M [00:00<?, ?B/s]

checkpoints/base_speakers/EN/config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

en_default_se.pth:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

en_style_se.pth:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

In [6]:
pt_device = "cpu"

en_base_speaker_tts = BaseSpeakerTTS(f'{en_suffix}/config.json', device=pt_device)
en_base_speaker_tts.load_ckpt(f'{en_suffix}/checkpoint.pth')

tone_color_converter = ToneColorConverter(f'{converter_suffix}/config.json', device=pt_device)
tone_color_converter.load_ckpt(f'{converter_suffix}/checkpoint.pth')

if enable_chinese_lang:
    zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_suffix}/config.json', device=pt_device)
    zh_base_speaker_tts.load_ckpt(f'{zh_suffix}/checkpoint.pth')
else:
    zh_base_speaker_tts = None



Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'
missing/unexpected keys: [] []


(…)BERP_none0.30_mean1.81_std1.81.model.pkl:   0%|          | 0.00/10.0M [00:00<?, ?B/s]

Loaded checkpoint 'checkpoints/converter/checkpoint.pth'
missing/unexpected keys: [] []


In [7]:
source_se = torch.load(f'{en_suffix }/en_default_se.pth').to(pt_device)

In [17]:
reference_speaker = '/content/OpenVoice/resources/demo_speaker1.mp3' # This is the voice you want to clone

In [18]:
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)

OpenVoice version: v1


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip


[(0.174, 2.706), (3.502, 7.634), (9.71, 12.978), (14.83, 16.85), (17.518, 20.082), (24.654, 26.162), (33.614, 40.498), (45.486, 54.45), (55.534, 57.426), (59.758, 61.042), (74.446, 78.162)]
after vad: dur = 38.76401360544218


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [21]:
OUTPUT_DIR = 'outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [22]:
save_path = f'{OUTPUT_DIR}/output_en_default.wav'

# Run the base speaker tts
text = "This audio is generated by OpenVoice."
src_path = f'{OUTPUT_DIR}/tmp.wav'
en_base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

 > Text splitted to sentences.
This audio is generated by OpenVoice.
ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.
 length:45
 length:45


In [23]:
source_se = torch.load(f'{en_suffix }/en_style_se.pth').to(pt_device)
save_path = f'{OUTPUT_DIR}/output_whispering.wav'

# Run the base speaker tts
text = "This audio is generated by OpenVoice."
src_path = f'{OUTPUT_DIR}/tmp.wav'
en_base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

 > Text splitted to sentences.
This audio is generated by OpenVoice.
ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.
 length:45
 length:45


In [24]:
enable_chinese_lang = True

In [28]:
CKPT_BASE_PATH= '/content/checkpoints/base_speakers/ZH'
en_base_speaker_tts = BaseSpeakerTTS(f'{CKPT_BASE_PATH}/config.json', device=pt_device)
en_base_speaker_tts.load_ckpt(f'{en_suffix}/checkpoint.pth')

source_se = torch.load(f'{CKPT_BASE_PATH}/zh_default_se.pth').to(pt_device)
save_path = f'{OUTPUT_DIR}/output_chinese.wav'

# Run the base speaker tts
text = "今天天气真好，我们一起出去吃饭吧。"
src_path = f'{OUTPUT_DIR}/tmp.wav'
en_base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...


Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'
missing/unexpected keys: [] []
 > Text splitted to sentences.
今天天气真好, 我们一起出去吃饭吧.


Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.323 seconds.
DEBUG:jieba:Loading model cost 1.323 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑,  wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.
 length:85
 length:85
