In [1]:
import sys

sys.path.append("/root/autodl-tmp/I-AM/CosyVoice")
sys.path.append("/root/autodl-tmp/I-AM/CosyVoice/third_party/Matcha-TTS")

import time
import yaml
from tqdm import tqdm
import torchaudio, torch
from contextlib import nullcontext
from concurrent.futures import ThreadPoolExecutor, as_completed
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

2025-01-10 17:33:54,085 - modelscope - INFO - PyTorch version 2.5.1 Found.
2025-01-10 17:33:54,090 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2025-01-10 17:33:54,126 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 2f5eeacd95c207bb2bc0f708cda0b4fd and a total number of 980 components indexed
  from .autonotebook import tqdm as notebook_tqdm


failed to import ttsfrd, use WeTextProcessing instead


In [2]:
class CosyVoice2TTS:
    """Zero-shot CosyVoice2 narration mixed over a looping background track.

    The class reads a YAML script made of text segments, synthesizes each
    segment with a voice prompt, inserts a pause after every segment, lays
    the result over background music with fade-in/fade-out, and writes the
    mix to disk.
    """

    def __init__(self, model_path, prompts_config_path):
        """Load the CosyVoice2 model and the prompts configuration.

        Args:
            model_path: Path handed to ``CosyVoice2``.
            prompts_config_path: YAML file. This class reads
                ``prompts.speech[<voice>]``, ``prompts.text[<voice>]`` and
                ``background_music[<name>]`` from it.
        """
        self.cosyvoice = CosyVoice2(
            model_path,
            load_jit=True,
            load_onnx=False,
            load_trt=False
        )
        with open(prompts_config_path, 'r', encoding='utf-8') as file:
            self.prompts_config = yaml.safe_load(file)

    def generate_audio(
        self,
        texts_path,
        voice_type,
        background_music_type,
        output_path,
        speed=1.0,
        stream=False,
        music_volume=0.2,
        music_extension_duration=30,
        fade_in_duration=3,
        fade_out_duration=10,
        max_retries=3
    ):
        """Synthesize a script, mix it with background music and save it.

        Args:
            texts_path: YAML file whose ``sequences`` entry is a list of
                items with a ``text`` and a ``duration`` (pause in seconds
                inserted after the segment).
            voice_type: Key selecting the prompt audio and prompt text.
            background_music_type: Key selecting the background music file.
            output_path: Destination passed to ``torchaudio.save``.
            speed: Forwarded to ``inference_zero_shot``.
            stream: Forwarded to ``inference_zero_shot``.
            music_volume: Gain applied to the background music before mixing.
            music_extension_duration: Seconds of music-only tail.
            fade_in_duration: Seconds of music-only lead-in, also the length
                of the music fade-in.
            fade_out_duration: Length in seconds of the music fade-out.
            max_retries: Maximum number of passes over the script. Each pass
                resumes at the first segment that has not been generated yet.

        Raises:
            RuntimeError: If some segment is still missing once the passes
                are used up.
        """
        with open(texts_path, 'r', encoding='utf-8') as file:
            text_segments = yaml.safe_load(file)['sequences']

        prompts = self.prompts_config['prompts']
        prompt_speech_16k = load_wav(prompts['speech'][voice_type], 16000)
        prompt_text = prompts['text'][voice_type]

        total_len = len(text_segments)
        # Kept across attempts so that finished segments are never redone.
        audio_segments = [None] * total_len
        next_idx = 0
        attempt = 0

        while next_idx < total_len and attempt < max_retries:
            attempt += 1
            print(f"Attempt {attempt} of {max_retries}")

            # The context manager closes the bar even if an exception escapes.
            with tqdm(
                total=total_len,
                initial=next_idx,
                desc="Generating audio segments"
            ) as pbar:
                while next_idx < total_len:
                    audio = self._generate_single(
                        text_segments[next_idx]['text'],
                        prompt_text,
                        prompt_speech_16k,
                        speed=speed,
                        stream=stream
                    )
                    if audio is None:
                        print(f"\nFailed to generate audio for segment {next_idx}, retrying...")
                        break

                    audio_segments[next_idx] = audio
                    next_idx += 1
                    pbar.update(1)

        if next_idx < total_len:
            raise RuntimeError("Failed to generate all audio segments after maximum retries")

        print("Combining audio segments with background music...")

        combined_audio = self._combined_audios(
            text_segments,
            audio_segments,
            background_music_type,
            music_extension_duration,
            fade_in_duration,
            fade_out_duration,
            music_volume
        )

        print(f"Saving final audio to {output_path}")

        torchaudio.save(
            output_path,
            combined_audio,
            self.cosyvoice.sample_rate
        )

        print("Audio generation completed!")

    def _generate_single(self, text, prompt_text, prompt_speech_16k, speed=1.0, stream=False):
        """Synthesize one text segment.

        ``inference_zero_shot`` is consumed as a generator; every yielded
        ``tts_speech`` chunk is kept and the chunks are joined along the
        sample axis, so nothing is lost when more than one chunk is yielded.

        Returns:
            The synthesized waveform, or ``None`` if synthesis raised or
            produced no audio (the error is printed).
        """
        try:
            chunks = []
            with torch.inference_mode(), torch.amp.autocast('cuda'):
                for output in self.cosyvoice.inference_zero_shot(
                    text,
                    prompt_text,
                    prompt_speech_16k,
                    speed=speed,
                    stream=stream
                ):
                    chunks.append(output['tts_speech'])

            if not chunks:
                raise RuntimeError("model yielded no audio")
            if len(chunks) == 1:
                return chunks[0]
            # Chunks are concatenated on dim=1, the same axis the mixer uses.
            return torch.cat(chunks, dim=1)
        except Exception as e:
            # Best effort: report and let the caller decide whether to retry.
            print(f"Error generating audio for text: {text}")
            print(str(e))
            return None

    def _combined_audios(
        self,
        text_segments,
        audio_segments,
        background_music_type,
        music_extension_duration=30,
        fade_in_duration=3,
        fade_out_duration=10,
        music_volume=0.2
    ):
        """Lay the narration over background music.

        The narration track is: ``fade_in_duration`` of silence, then each
        audio segment followed by its ``duration`` of silence, then
        ``music_extension_duration`` of silence. The music is down-mixed to
        mono, resampled to the model rate, trimmed or looped to the same
        length, faded, scaled by ``music_volume`` and added.

        Returns:
            The mixed waveform, rescaled to a peak of 1 only if it would
            otherwise exceed 1.
        """
        sample_rate = self.cosyvoice.sample_rate

        # Collect the pieces first and concatenate once.
        pieces = [self._generate_silence(fade_in_duration)]
        for segment, audio in zip(text_segments, audio_segments):
            pieces.append(audio)
            pieces.append(self._generate_silence(segment['duration']))
        pieces.append(self._generate_silence(music_extension_duration))
        combined_audio = torch.cat(pieces, dim=1)

        background_music, bg_sample_rate = torchaudio.load(
            self.prompts_config['background_music'][background_music_type]
        )

        # Down-mix multi-channel music to a single channel.
        if background_music.shape[0] > 1:
            background_music = torch.mean(background_music, dim=0, keepdim=True)

        if bg_sample_rate != sample_rate:
            resampler = torchaudio.transforms.Resample(bg_sample_rate, sample_rate)
            background_music = resampler(background_music)

        # Match the music length to the narration length.
        background_music = self._fit_music_length(background_music, combined_audio.shape[1])
        background_music = self._apply_fades(
            background_music,
            self._seconds_to_samples(fade_in_duration),
            self._seconds_to_samples(fade_out_duration)
        )

        # Mix narration with the attenuated music.
        final_audio = combined_audio + background_music * music_volume

        # Avoid clipping: rescale only when the peak exceeds full scale.
        if final_audio.numel() > 0:
            peak = torch.max(torch.abs(final_audio))
            if peak > 1:
                final_audio = final_audio / peak

        return final_audio

    def _fit_music_length(self, music, target_length):
        """Trim or loop ``music`` so that it has ``target_length`` samples."""
        current_length = music.shape[1]
        if current_length == target_length:
            return music
        if current_length > target_length:
            # Music is longer: keep only the part that is needed.
            return music[:, :target_length]
        if current_length == 0:
            raise ValueError("Background music is empty and cannot be looped")
        # Music is shorter: loop it (ceil division) and trim the excess.
        num_repeats = (target_length + current_length - 1) // current_length
        return music.repeat(1, num_repeats)[:, :target_length]

    def _apply_fades(self, music, fade_in_samples, fade_out_samples):
        """Return a copy of ``music`` with linear fades on channel 0.

        Fade lengths are clamped to the track length, and a non-positive
        length skips that fade (``x[-0:]`` would otherwise select the whole
        track).
        """
        faded = music.clone()
        total = faded.shape[1]
        fade_in_samples = min(fade_in_samples, total)
        fade_out_samples = min(fade_out_samples, total)

        if fade_in_samples > 0:
            faded[0, :fade_in_samples] *= self._create_fade_curve(
                fade_in_samples, fade_in_samples, fade_in=True
            )
        if fade_out_samples > 0:
            faded[0, -fade_out_samples:] *= self._create_fade_curve(
                fade_out_samples, fade_out_samples, fade_in=False
            )
        return faded

    def _seconds_to_samples(self, seconds):
        """Convert seconds (int or float) to a whole number of samples."""
        return int(round(seconds * self.cosyvoice.sample_rate))

    def _generate_silence(self, silence_duration):
        """Return a ``(1, n)`` zero tensor lasting ``silence_duration`` seconds."""
        return torch.zeros(1, self._seconds_to_samples(silence_duration))

    def _create_fade_curve(self, length, fade_length, fade_in=True):
        """Return a linear ramp with ``fade_length`` points.

        The ramp goes 0 -> 1 when ``fade_in`` is true, otherwise 1 -> 0.
        ``length`` is unused and kept only for signature compatibility.
        """
        if fade_in:
            return torch.linspace(0, 1, fade_length)
        return torch.linspace(1, 0, fade_length)



In [3]:
# Locations of the pretrained model and of the zero-shot prompts configuration.
MODEL_PATH = '/root/autodl-fs/cosyvoice/pretrained_models/CosyVoice2-0.5B'
PROMPTS_CONFIG_PATH = '/root/autodl-tmp/I-AM/project/backend/agents/prompts/prompts_zero_shot.yaml'

# Build the synthesizer once; the following cell reuses `tts`.
tts = CosyVoice2TTS(model_path=MODEL_PATH, prompts_config_path=PROMPTS_CONFIG_PATH)

  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
2025-01-10 17:34:04,388 INFO input frame rate=25
  WeightNorm.apply(module, name, dim)
  torchaudio.set_audio_backend('soundfile')
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[1;31m2025-01-10 17:34:05.949812702 [E:onnxruntime:Default, provider_bridge_ort.cc:1744 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1426 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.8: cannot open shared object file: No such file or directory
[m
[0;93m2025-01-10 17:34:05.949831408 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:870 CreateExecutionProviderInstance] Failed to create CUDA

In [4]:

# Script to narrate and the file the finished mix is written to.
sleep_texts_path = "/root/autodl-tmp/I-AM/project/backend/agents/prompts/texts/sleep_texts.yaml"
sleep_output_path = "/root/autodl-tmp/I-AM/project/backend/agents/jupyter/output/temp/sleep.wav"

# Synthesize with the "male1" voice prompt over the "bmusic_01" track.
tts.generate_audio(
    texts_path=sleep_texts_path,
    voice_type="male1",
    background_music_type="bmusic_01",
    output_path=sleep_output_path,
)

Attempt 1 of 3


Generating audio segments:   0%|          | 0/12 [00:00<?, ?it/s]

2025-01-10 17:34:14,047 INFO synthesis text 亲爱的朋友,请找一个舒适的地方躺下或坐下,闭上眼睛[breath],开始我们的宁静之旅。
2025-01-10 17:34:22,677 INFO yield speech len 11.6, rtf 0.7439236599823524
100%|██████████| 1/1 [00:09<00:00,  9.52s/it]
Generating audio segments:   8%|▊         | 1/12 [00:09<01:45,  9.55s/it]2025-01-10 17:34:23,590 INFO synthesis text 首先,深深地,吸一口气[breath],感受空气缓缓进入你的鼻腔,充满你的肺部。
2025-01-10 17:34:33,141 INFO yield speech len 10.68, rtf 0.894291771485118
100%|██████████| 1/1 [00:10<00:00, 10.43s/it]
Generating audio segments:  17%|█▋        | 2/12 [00:20<01:40, 10.09s/it]2025-01-10 17:34:34,059 INFO synthesis text 然后,慢慢地,呼出[breath],感受所有的紧张和压力[quick_breath]随着呼气离开你的身体。
2025-01-10 17:34:43,120 INFO yield speech len 9.12, rtf 0.9935234983762106
100%|██████████| 1/1 [00:09<00:00,  9.95s/it]
Generating audio segments:  25%|██▌       | 3/12 [00:29<01:30, 10.04s/it]2025-01-10 17:34:44,100 INFO synthesis text 再来一次,深深地,吸气[breath],慢慢地,呼气[breath]。让每一次呼吸[quick_breath]都带你进入更深层的放松状态。
2025-01-10 17:34:57,242 INFO yie

Combining audio segments with background music...
Saving final audio to /root/autodl-tmp/I-AM/project/backend/agents/jupyter/output/temp/sleep.wav
Audio generation completed!
