In [1]:
import sys

sys.path.append("/root/autodl-tmp/I-AM/CosyVoice")
sys.path.append("/root/autodl-tmp/I-AM/CosyVoice/third_party/Matcha-TTS")

import time
import yaml
from tqdm import tqdm
import torchaudio, torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

2025-04-20 18:41:52,511 - modelscope - INFO - PyTorch version 2.6.0 Found.
2025-04-20 18:41:52,513 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2025-04-20 18:41:52,550 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 2f5eeacd95c207bb2bc0f708cda0b4fd and a total number of 980 components indexed
  from .autonotebook import tqdm as notebook_tqdm


failed to import ttsfrd, use WeTextProcessing instead


In [2]:
class CosyVoice2TTS:
    """Zero-shot CosyVoice2 text-to-speech pipeline with background music.

    Speech is synthesised segment by segment, the segments are joined with
    silences, and the result is mixed over a (looped) background music track.
    """

    def __init__(self, model_path, prompts_config_path):
        """Load the CosyVoice2 model and the prompt configuration.

        Args:
            model_path: Path handed to ``CosyVoice2`` as the model location.
            prompts_config_path: YAML file. The methods of this class read
                ``prompts.speech.<voice_type>``, ``prompts.text.<voice_type>``
                and ``background_music.<background_music_type>`` from it.
        """
        self.cosyvoice = CosyVoice2(
            model_path,
            load_jit=True,
            load_onnx=False,
            load_trt=False
        )
        with open(prompts_config_path, 'r', encoding='utf-8') as file:
            self.prompts_config = yaml.safe_load(file)

    def generate_audio(
        self,
        texts_path,
        voice_type,
        background_music_type,
        output_path,
        speed=1.0,
        stream=False,
        music_volume=0.2,
        music_extension_duration=30,
        fade_in_duration=3,
        fade_out_duration=10,
        max_retries=3
    ):
        """Synthesise every text segment, mix in background music and save it.

        Args:
            texts_path: YAML file with a top-level ``sequences`` list; each
                item provides ``id``, ``text`` and ``duration`` (the pause
                that follows the segment, in seconds).
            voice_type: Key selecting the prompt speech/text in the config.
            background_music_type: Key selecting the background music file.
            output_path: Destination passed to ``torchaudio.save``.
            speed: Speed factor forwarded to ``inference_zero_shot``.
            stream: Streaming flag forwarded to ``inference_zero_shot``.
            music_volume: Gain applied to the background music before mixing.
            music_extension_duration: Seconds of music-only tail.
            fade_in_duration: Seconds of leading silence and music fade-in.
            fade_out_duration: Seconds of music fade-out at the end.
            max_retries: How many times the whole sequence is attempted.

        Raises:
            RuntimeError: If no attempt produced audio for every segment.
        """
        with open(texts_path, 'r', encoding='utf-8') as file:
            text_segments = yaml.safe_load(file)['sequences']

        prompt_speech_16k = load_wav(self.prompts_config['prompts']['speech'][voice_type], 16000)
        prompt_text = self.prompts_config['prompts']['text'][voice_type]

        audio_segments = None
        for attempt in range(1, max_retries + 1):
            print(f"Attempt {attempt} of {max_retries}")
            audio_segments = self._generate_all_segments(
                text_segments,
                prompt_text,
                prompt_speech_16k,
                speed,
                stream
            )
            if audio_segments is not None:
                break

        # Also covers max_retries <= 0, where no attempt is made at all.
        if audio_segments is None:
            raise RuntimeError("Failed to generate all audio segments after maximum retries")

        print("Combining audio segments with background music...")

        combined_audio = self._combined_audios(
            text_segments,
            audio_segments,
            background_music_type,
            music_extension_duration,
            fade_in_duration,
            fade_out_duration,
            music_volume
        )

        print(f"Saving final audio to {output_path}")

        torchaudio.save(
            output_path,
            combined_audio,
            self.cosyvoice.sample_rate
        )

        print("Audio generation completed!")

    def _generate_all_segments(self, text_segments, prompt_text, prompt_speech_16k, speed, stream):
        """Run one attempt over all segments; return the audio list or None on failure."""
        audio_segments = [None] * len(text_segments)

        # The context manager closes the progress bar on every exit path.
        with tqdm(total=len(text_segments), desc="Generating audio segments") as pbar:
            for sentence in text_segments:
                audio = self._generate_single(
                    sentence['text'],
                    prompt_text,
                    prompt_speech_16k,
                    speed=speed,
                    stream=stream
                )
                if audio is None:
                    print(f"\nFailed to generate audio for segment {sentence['id']}, retrying entire sequence...")
                    return None

                # NOTE(review): 'id' is used directly as the list index, while the
                # pause after each slot is later looked up by list position; this
                # presumably expects ids to be 0-based and in file order — TODO confirm.
                audio_segments[sentence['id']] = audio
                pbar.update(1)

        if any(audio is None for audio in audio_segments):
            raise ValueError("Segment ids must be unique so that every audio slot is filled")
        return audio_segments

    def _generate_single(self, text, prompt_text, prompt_speech_16k, speed=1.0, stream=False):
        """Synthesise one text segment.

        Every item yielded by ``inference_zero_shot`` is collected and the
        pieces are joined along the time axis (dim 1), so nothing is dropped
        when the model yields more than one chunk.

        Returns:
            The speech tensor, or None if synthesis raised or produced nothing
            (errors are printed so the caller can retry).
        """
        try:
            with torch.inference_mode(), torch.amp.autocast('cuda'):
                chunks = [
                    output['tts_speech']
                    for output in self.cosyvoice.inference_zero_shot(
                        text,
                        prompt_text,
                        prompt_speech_16k,
                        speed=speed,
                        stream=stream
                    )
                ]
            if not chunks:
                raise RuntimeError("inference_zero_shot yielded no audio")
            if len(chunks) == 1:
                return chunks[0]
            return torch.cat(chunks, dim=1)
        except Exception as e:
            # Deliberate best-effort: the caller treats None as "retry".
            print(f"Error generating audio for text: {text}")
            print(str(e))
            return None

    def _combined_audios(
        self,
        text_segments,
        audio_segments,
        background_music_type,
        music_extension_duration=30,
        fade_in_duration=3,
        fade_out_duration=10,
        music_volume=0.2
    ):
        """Join speech segments with pauses and mix them over background music.

        Layout: ``fade_in_duration`` of silence, then each segment followed by
        its ``duration`` pause, then ``music_extension_duration`` of silence.
        The music is down-mixed to mono, resampled to the model sample rate,
        looped or cut to the same length, faded, scaled by ``music_volume``
        and added. The mix is peak-normalised only if it exceeds 1.0.

        Returns:
            Tensor with the final mixed audio.
        """
        # Collect the pieces first and concatenate once instead of re-copying
        # the growing tensor on every iteration.
        pieces = [self._generate_silence(fade_in_duration)]
        for i, audio in enumerate(audio_segments):
            pieces.append(audio)
            pieces.append(self._generate_silence(text_segments[i]['duration']))
        pieces.append(self._generate_silence(music_extension_duration))
        combined_audio = torch.cat(pieces, dim=1)

        background_music, bg_sample_rate = torchaudio.load(self.prompts_config['background_music'][background_music_type])

        # Down-mix multi-channel music to a single channel.
        if background_music.shape[0] > 1:
            background_music = torch.mean(background_music, dim=0, keepdim=True)

        if bg_sample_rate != self.cosyvoice.sample_rate:
            resampler = torchaudio.transforms.Resample(bg_sample_rate, self.cosyvoice.sample_rate)
            background_music = resampler(background_music)

        # Match the music length to the speech track, then shape its envelope.
        background_music = self._fit_length(background_music, combined_audio.shape[1])
        background_music = self._apply_fades(background_music, fade_in_duration, fade_out_duration)
        background_music = background_music * music_volume

        # Mix speech and background music.
        final_audio = combined_audio + background_music

        # Avoid clipping: rescale only when the peak exceeds full scale.
        if final_audio.numel() > 0:
            peak = torch.max(torch.abs(final_audio))
            if peak > 1:
                final_audio = final_audio / peak

        return final_audio

    def _fit_length(self, background_music, target_length):
        """Loop or truncate the music so it has exactly ``target_length`` samples."""
        music_length = background_music.shape[1]
        if music_length < target_length:
            if music_length == 0:
                raise ValueError("Background music contains no samples")
            # Ceiling division: enough repeats to cover the target.
            num_repeats = (target_length + music_length - 1) // music_length
            background_music = background_music.repeat(1, num_repeats)
        return background_music[:, :target_length]

    def _apply_fades(self, background_music, fade_in_duration, fade_out_duration):
        """Return a copy of the music with linear fade-in and fade-out applied.

        Fade lengths are clamped to the track length, and a zero-length fade is
        skipped (a ``[-0:]`` slice would otherwise select the whole track).
        """
        total_samples = background_music.shape[1]
        fade_in_samples = min(self._duration_to_samples(fade_in_duration), total_samples)
        fade_out_samples = min(self._duration_to_samples(fade_out_duration), total_samples)

        # Work on a copy so the caller's tensor is not modified in place.
        background_music = background_music.clone()
        if fade_in_samples > 0:
            fade_in_curve = self._create_fade_curve(fade_in_samples, fade_in_samples, fade_in=True)
            background_music[:, :fade_in_samples] *= fade_in_curve
        if fade_out_samples > 0:
            fade_out_curve = self._create_fade_curve(fade_out_samples, fade_out_samples, fade_in=False)
            background_music[:, -fade_out_samples:] *= fade_out_curve
        return background_music

    def _duration_to_samples(self, duration):
        """Convert a duration in seconds (int or float) to a non-negative sample count."""
        return max(0, int(round(duration * self.cosyvoice.sample_rate)))

    def _generate_silence(self, silence_duration):
        """Return a (1, samples) tensor of zeros lasting ``silence_duration`` seconds."""
        return torch.zeros(1, self._duration_to_samples(silence_duration))

    def _create_fade_curve(self, length, fade_length, fade_in=True):
        """Return a linear ramp of ``fade_length`` points (0 to 1 for fade-in, 1 to 0 otherwise).

        ``length`` is accepted for interface compatibility but is not used.
        """
        if fade_in:
            return torch.linspace(0, 1, fade_length)
        else:
            return torch.linspace(1, 0, fade_length)



In [4]:
# Build the TTS pipeline: the constructor loads the CosyVoice2 model from
# model_path and parses the YAML config at prompts_config_path.
tts = CosyVoice2TTS(
    model_path='/root/autodl-fs/cosyvoice/pretrained_models/CosyVoice2-0.5B',
    prompts_config_path='/root/autodl-tmp/I-AM/project/backend/config/meditation.yaml'
)

2025-04-20 18:43:20,220 INFO input frame rate=25
[1;31m2025-04-20 18:43:21.356582103 [E:onnxruntime:Default, provider_bridge_ort.cc:1744 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1426 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.8: cannot open shared object file: No such file or directory
[m
[0;93m2025-04-20 18:43:21.356604054 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:870 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Please reference https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirementsto ensure all dependencies are met.[m
2025-04-20 18:43:21,920 WETEXT INFO found existing fst: /root/miniconda3/envs/manifest_app/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
2025-04-20 18:43:21,920 WETEXT INFO found existing fst: /root/miniconda3/envs/man

In [None]:

# Synthesise the script in exam.yaml with the "male1" prompt voice, mix it over
# the "bmusic_01" background track and save the result as a WAV file.
# All other generate_audio options keep their default values.
tts.generate_audio(
    texts_path="/root/autodl-tmp/I-AM/project/backend/agents/jupyter/output/script/exam.yaml",
    voice_type="male1",
    background_music_type="bmusic_01",
    output_path="/root/autodl-tmp/I-AM/project/backend/agents/jupyter/output/temp/exam.wav"
)

In [None]:
# 拼接音频文件

import glob
import os
import re
from pydub import AudioSegment


def natural_sort_key(s):
    """Return a key that sorts strings in natural (human) order.

    The string is split into alternating text pieces and runs of ASCII
    digits. Digit runs are compared as integers and text pieces
    case-insensitively, so "f2.wav" sorts before "f10.wav".

    Args:
        s: The string (e.g. a file path) to build a key for.

    Returns:
        A list alternating lower-cased text (even positions) and ints
        (odd positions).
    """
    # Because the pattern has a capturing group, re.split keeps the digit runs:
    # even indices are text (possibly empty), odd indices are the [0-9]+ matches.
    parts = re.split('([0-9]+)', s)

    # Decide by position rather than str.isdigit(): isdigit() is also true for
    # characters such as "²" or "٣" that the regex never splits out, which would
    # either make int() fail or put an int into a text slot of the key.
    return [
        int(part) if index % 2 else part.lower()
        for index, part in enumerate(parts)
    ]

def combine_wav_files_with_pause_and_music(base_path, output_path, pause_durations, background_music_path, music_extension_duration=15000, crossfade_duration=100, fade_out_duration=5000, fade_in_duration=3000):
    """
    Concatenate the WAV files of a directory with pauses between them and mix
    the result over background music.

    The files matching ``*.wav`` in ``base_path`` are joined in natural sort
    order after 5 seconds of leading silence; each file is followed by its
    pause. The background music is lowered by 15 dB, looped or cut to the
    length of the speech track, faded in and out, and overlaid.

    Args:
        base_path: Directory containing the input WAV files.
        output_path: Path of the WAV file to write.
        pause_durations: Pause after each file, in milliseconds; needs at
            least one entry per input file.
        background_music_path: Path of the background music file.
        music_extension_duration: Milliseconds the music keeps playing after
            the last pause.
        crossfade_duration: Crossfade in milliseconds when appending a file.
        fade_out_duration: Fade-out length of the music, in milliseconds.
        fade_in_duration: Fade-in length of the music, in milliseconds. This
            used to be read from a module-level variable of the same name;
            the default matches the value used by the notebook.

    Raises:
        ValueError: If ``pause_durations`` has fewer entries than there are
            input files, or the background music is empty.
    """
    wav_files = glob.glob(os.path.join(base_path, "*.wav"))

    # Nothing to combine: report it instead of exporting a music-only file.
    if not wav_files:
        print(f"在 {base_path} 中未找到任何 WAV 文件")
        return

    # Validate up front so the failure does not happen midway through the loop.
    if len(pause_durations) < len(wav_files):
        raise ValueError(
            f"pause_durations 仅有 {len(pause_durations)} 项，但找到了 {len(wav_files)} 个 WAV 文件"
        )

    wav_files.sort(key=natural_sort_key)

    # glob can return dangling symlinks, so the existence check is kept.
    for wav_file in wav_files:
        if not os.path.exists(wav_file):
            print(f"文件 {wav_file} 不存在")
            return

    print(f"找到以下文件: ")
    for wav_file in wav_files:
        print(wav_file)

    # Start with 5 seconds of silence.
    combined = AudioSegment.silent(duration=5000)

    # Append each audio file followed by its pause.
    for wav_file, pause_duration in zip(wav_files, pause_durations):
        next_segment = AudioSegment.from_wav(wav_file)

        # A crossfade longer than either side makes pydub raise, so clamp it.
        crossfade = min(crossfade_duration, len(combined), len(next_segment))
        combined = combined.append(next_segment, crossfade=crossfade)

        combined = combined + AudioSegment.silent(duration=pause_duration)

    # Extra silence so the background music outlasts the speech.
    combined = combined + AudioSegment.silent(duration=music_extension_duration)

    background_music = AudioSegment.from_file(background_music_path)
    if len(background_music) == 0:
        raise ValueError(f"背景音乐文件为空: {background_music_path}")

    # Lower the music volume (value in dB).
    background_music = background_music - 15

    # Loop the music if it is too short, then cut it to the speech length.
    if len(background_music) < len(combined):
        background_music = background_music * ((len(combined) // len(background_music)) + 1)
    background_music = background_music[:len(combined)]

    # Apply fade-in and fade-out to the music.
    background_music = background_music.fade_in(fade_in_duration).fade_out(fade_out_duration)

    # Overlay the music on the speech track.
    combined_with_music = combined.overlay(background_music)

    # os.path.dirname() is "" for a bare file name, which os.makedirs rejects.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export the mixed result.
    combined_with_music.export(output_path, format="wav")
    print(f"已合并所有音频并添加背景音乐到：{output_path}")

# Usage example
base_path = "output/tts/sleep/female2"
output_path = "output/tts/sleep/female2_combined_with_bmusic_03.wav"
# NOTE(review): love_texts_silence_durations is not defined in any cell shown
# here; it must already exist in the kernel or this cell raises NameError —
# TODO define it (or load it) explicitly before this line.
pause_durations = love_texts_silence_durations
background_music_path = "/root/autodl-tmp/I-AM/project/backend/agents/prompts/background_music/bmusic_03.wav"
music_extension_duration = 30000  # extend the background music by 30 seconds
fade_in_duration = 3000  # NOTE(review): not passed as an argument in the call below
fade_out_duration = 10000  # fade-out length: 10 seconds

combine_wav_files_with_pause_and_music(
    base_path,
    output_path,
    pause_durations,
    background_music_path,
    music_extension_duration,
    fade_out_duration=fade_out_duration
)