In [1]:
import torch
from pathlib import Path
import librosa
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf

class CustomVoiceNarrator:
    """Text-to-speech narrator that conditions SpeechT5 on a speaker embedding.

    Construction loads the pretrained ``microsoft/speecht5_tts`` processor and
    model plus the ``microsoft/speecht5_hifigan`` vocoder, moves the model and
    vocoder to CUDA when available, and then obtains a speaker embedding:
    either the one supplied by the caller or one derived from the ``*.wav``
    files found directly inside ``voice_samples_dir``.
    """

    # Sample rate (Hz) used both when loading the voice samples and when
    # writing the synthesized waveform, so the two can never drift apart.
    SAMPLE_RATE = 16000

    def __init__(self, voice_samples_dir, speaker_embedding=None):
        """Load the models and prepare the speaker embedding.

        Args:
            voice_samples_dir: Directory containing ``*.wav`` voice samples.
            speaker_embedding: Optional precomputed speaker embedding
                (a ``torch.Tensor``). When given, the voice samples are not
                processed and this tensor is used as-is by ``narrate``.
                NOTE(review): ``generate_speech`` presumably expects shape
                ``(1, speaker_embedding_dim)`` -- confirm against the
                SpeechT5 documentation.

        Raises:
            RuntimeError: If no embedding is supplied and the directory
                contains no ``*.wav`` files.
        """
        self.voice_samples_dir = Path(voice_samples_dir)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Initialize models
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

        # Only derive an embedding from the samples when the caller did not
        # bring their own.
        if speaker_embedding is None:
            speaker_embedding = self._create_speaker_embedding()
        self.speaker_embedding = speaker_embedding

    def _create_speaker_embedding(self):
        """Average per-file embeddings of every ``*.wav`` sample into one tensor.

        Returns:
            The mean, over all sample files, of the per-file embedding.

        Raises:
            RuntimeError: If the sample directory contains no ``*.wav`` files.
        """
        # Sorted so the processing order does not depend on how the
        # filesystem happens to enumerate the directory.
        sample_paths = sorted(self.voice_samples_dir.glob("*.wav"))
        if not sample_paths:
            # Without this guard torch.stack([]) fails with an opaque
            # "expects a non-empty TensorList" error.
            raise RuntimeError(
                f"No .wav voice samples found in '{self.voice_samples_dir}'"
            )

        embeddings = []
        for audio_file in sample_paths:
            # Load at the shared sample rate and strip leading/trailing silence
            speech, sr = librosa.load(str(audio_file), sr=self.SAMPLE_RATE)
            speech = librosa.effects.trim(speech)[0]

            # Convert to tensor
            inputs = self.processor(
                audio=speech,
                sampling_rate=sr,
                return_tensors="pt",
            )

            # NOTE(review): `get_encoder_hidden_states` does not appear in the
            # documented SpeechT5ForTextToSpeech API, so this call is expected
            # to raise AttributeError. The SpeechT5 model card obtains speaker
            # embeddings from a separate x-vector speaker encoder instead.
            # Confirm and replace; until then, pass `speaker_embedding=` to
            # the constructor to bypass this path.
            with torch.no_grad():
                embedding = self.model.get_encoder_hidden_states(
                    inputs["input_values"].to(self.device)
                ).last_hidden_state.mean(dim=1)
            embeddings.append(embedding)

        # Average all embeddings
        return torch.mean(torch.stack(embeddings), dim=0)

    def narrate(self, text, output_path):
        """Synthesize ``text`` in the custom voice and write it to ``output_path``.

        Args:
            text: The text to speak.
            output_path: Destination audio file, written at ``SAMPLE_RATE`` Hz.

        Returns:
            ``output_path``, unchanged, for convenient chaining.
        """
        # Preprocess text
        inputs = self.processor(text=text, return_tensors="pt")

        # Generate speech; passing the vocoder makes generate_speech return a
        # waveform rather than a spectrogram.
        speech = self.model.generate_speech(
            inputs["input_ids"].to(self.device),
            self.speaker_embedding.to(self.device),
            vocoder=self.vocoder,
        )

        # Save the generated speech
        sf.write(output_path, speech.cpu().numpy(), samplerate=self.SAMPLE_RATE)

        return output_path

# Demo: synthesize one sentence in the custom voice when run as a script
if __name__ == "__main__":
    # What to say and where to write the resulting audio
    text = "Hello, this is a test of the custom voice narrator."
    output_path = "generated_speech.wav"

    # Build the narrator from the samples in ./voice_samples, then speak
    narrator = CustomVoiceNarrator("voice_samples")
    narrator.narrate(text=text, output_path=output_path)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


AttributeError: 'SpeechT5ForTextToSpeech' object has no attribute 'get_speaker_embeddings'