In [None]:
import torch
import os
import sys
# Make the vendored OpenVoice checkout importable.
# `__file__` is not defined inside a Jupyter kernel, so fall back to the
# current working directory (typically the notebook's folder) when it is missing.
try:
    _base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _base_dir = os.getcwd()
_openvoice_dir = os.path.join(_base_dir, 'OpenVoice')
# Guard against duplicate entries when this cell is re-run.
if _openvoice_dir not in sys.path:
    sys.path.append(_openvoice_dir)
from OpenVoice.openvoice import se_extractor
from OpenVoice.openvoice.api import BaseSpeakerTTS, ToneColorConverter
import pygame as pg
import io

In [5]:
# --- Paths (relative to the notebook's working directory) ---
ckpt_base = "OpenVoice/checkpoints/base_speakers/EN"        # base speaker TTS checkpoint folder
ckpt_converter = "OpenVoice/checkpoints/converter"          # tone-color converter checkpoint folder
output_dir = "OpenVoice/outputs"                            # generated audio is written here
reference_speaker = "OpenVoice/resources/demo_speaker2.mp3" # voice sample to be cloned

# --- Text to synthesize (multi-line; continuation lines keep their 4-space indent) ---
text = """Energy Secretary Chris Wright, who made millions in the fracking industry, commissioned the report. In a preface, he did not deny that climate change exists.
    “Climate change is real, and it deserves attention,” he wrote. “But it is not the greatest threat facing humanity. That distinction belongs to global energy poverty.”
    In other words, Wright sees more damage to humans from cutting back on carbon emissions.
    That is a minority view in the scientific community, which has a much, much larger body of peer reviewed studies that raise the alarm about climate change. Most notably, the Intergovernmental Panel on Climate Change issues peer-reviewed reports with hundreds of authors from around world. The Trump administration has barred US government scientists from taking part in the next installment, due out in 2029."""
# Pick the compute device. `is_available` must be *called*: the bare function
# object is always truthy and would select "cuda" even on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
# Base speaker TTS model: built from its config, then weights loaded from the checkpoint.
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

# Tone-color converter: same config + checkpoint pattern.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Source speaker embedding of the base voice.
# `map_location` remaps storages while loading, so a tensor that was saved from a
# GPU can still be loaded on a CPU-only machine (otherwise torch.load fails
# before `.to(device)` is ever reached).
source_se = torch.load(f'{ckpt_base}/en_default_se.pth', map_location=device).to(device)

In [None]:
# Derive the target speaker embedding from the reference recording.
# Returns the embedding together with a name for the processed audio.
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    target_dir='processed',
    vad=True,
)

In [None]:
# Make sure the output folder exists before any audio is written into it;
# on a fresh checkout the directory is missing and the write would fail.
os.makedirs(output_dir, exist_ok=True)

save_path = f'{output_dir}/output_en_default.wav'  # final, tone-converted audio
src_path = f'{output_dir}/tmp.wav'                 # intermediate base-speaker audio

# Step 1: synthesize `text` with the base speaker; `src_path` is passed as the output path.
# NOTE(review): the "excited" style is paired with `en_default_se.pth` as the source
# embedding — confirm whether a style-specific embedding should be used instead.
buffer_ouput = base_speaker_tts.tts(text, src_path, speaker="excited", language='English', speed=0.9)

# Step 2: convert the tone color of the intermediate audio from the source
# embedding to the target embedding and write the result to `save_path`.
encode_message = "@Myshell"  # passed as `message` (presumably an embedded watermark — confirm)
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

In [None]:
# Play the synthesized audio through pygame's music mixer.
pg.init()
pg.mixer.init()
# NOTE(review): this assumes `tts()` returned a file-like WAV buffer; if the
# OpenVoice build in use returns None when an output path is given, load
# the file written to disk instead — confirm.
pg.mixer.music.load(buffer_ouput, "wav")
pg.mixer.music.play()
# Block until playback finishes. Sleep between polls instead of spinning on
# `pass`, which would pin a CPU core for the whole duration of the clip.
while pg.mixer.music.get_busy():
    pg.time.wait(100)

***The TTS model runs OK; now we test the LLM model.***

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as f

***Testing the MiniLM-L6 model from sentence-transformers; it is used for sentence encoding only.***

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Model result whose first element holds the token
            embeddings (assumed shape: batch x tokens x hidden — confirm).
        attention_mask: Mask with one entry per token; positions with 0 are
            excluded from the average.

    Returns:
        Tensor with the token axis (dim 1) reduced to a masked mean.
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * mask, 1)
    # Clamp the divisor so a fully masked sequence does not divide by zero.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_count

# Hub identifier of the encoder and the local folder used as its download cache.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
cache_dir = 'MiniLM-L6'

# Batch of sentences to embed.
sentences = ['Each sentence is converted']

# Load the tokenizer and the encoder for `model_name`, caching files under `cache_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)

# Tokenize the batch into padded / truncated PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass; gradients are not needed for inference.
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean over tokens, followed by L2 normalization of each row.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# The sentence embeddings come back as a 2-D tensor.
print("Sentence embeddings:", sentence_embeddings, sep="\n")

# Round-trip the first sentence's token ids back to text as a sanity check.
decoded_text = tokenizer.decode(encoded_input['input_ids'][0], skip_special_tokens=True)
print(f"Original sentence: {decoded_text}")


  return torch.load(checkpoint_file, map_location="cpu")


Sentence embeddings:
tensor([[ 8.6439e-02,  1.0276e-01,  5.3946e-03,  2.0444e-03, -9.9633e-03,
          2.5386e-02,  4.9288e-02, -3.0627e-02,  6.8725e-02,  1.0137e-02,
          7.7540e-02, -9.0081e-02,  6.1062e-03, -5.6990e-02,  1.4171e-02,
          2.8049e-02, -8.6846e-02,  7.6440e-02, -1.0349e-01, -6.7744e-02,
          6.9995e-02,  8.4425e-02, -7.2491e-03,  1.0477e-02,  1.3402e-02,
          6.7758e-02, -9.4209e-02, -3.7169e-02,  5.2262e-02, -3.1085e-02,
         -9.6341e-02,  1.5772e-02,  2.5787e-02,  7.8525e-02,  7.8995e-02,
          1.9152e-02,  1.6436e-02,  3.1009e-03,  3.8131e-02,  2.3709e-02,
          1.0539e-02, -4.4064e-02,  4.4174e-02, -2.5873e-02,  6.1538e-02,
         -4.0543e-02, -8.6414e-02,  3.1972e-02, -8.9067e-04, -2.4444e-02,
         -9.1972e-02,  2.3394e-02, -8.3029e-02,  4.4151e-02, -2.4969e-02,
          6.2302e-02, -1.3036e-03,  7.5140e-02,  2.4638e-02, -6.4724e-02,
         -1.1773e-01,  3.8339e-02, -9.1177e-02,  6.3545e-02,  7.6274e-02,
         -8.8024e