# Please run on Colab with a T4 GPU

In [None]:
%%capture
!pip install TTS
!pip install torch
!pip install audiocraft

## Music generation class

In [None]:
import torch
from transformers import pipeline
from TTS.api import TTS
from IPython.display import Audio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import wave

class GenerateAudio():
    """
    Generates narrated speech plus matching background music for a piece of text.

    The text is synthesised to speech, its sentiment is classified with a zero-shot
    classifier, and background music of the same length as the speech is generated
    from a prompt built out of that sentiment label.

    Attributes:
    device (str): The device to run the models on ('cuda' if GPU is available, 'cpu' otherwise).
    text (str): The input text to generate speech and classify sentiment.
    """

    def __init__(self, text):
        """
        Stores the input text and selects the device the models will run on.

        Parameters:
        text (str): The input text for generating speech and sentiment analysis.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.text = text

    def _generate_sentiment(self, candidate_labels=None):
        """
        Classifies the sentiment of the input text using a zero-shot classification model.

        Parameters:
        candidate_labels (list): Sentiment labels to choose from. When None (the default),
                                  ["happy", "sad", "scary"] is used.

        Returns:
        str: The candidate label that received the highest classification score.
        """
        # A None sentinel avoids sharing one mutable default list between calls.
        if candidate_labels is None:
            candidate_labels = ["happy", "sad", "scary"]

        classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=self.device,  # Run on GPU if available
        )
        result = classifier(self.text, candidate_labels)

        # Pick the best label explicitly rather than relying on the result ordering.
        best_label, _ = max(zip(result['labels'], result['scores']), key=lambda pair: pair[1])
        return best_label

    def _generate_speech(self, outfile='tts_output.wav'):
        """
        Generates speech from the input text using a Tacotron 2 TTS model and saves it to a file.

        Parameters:
        outfile (str): The path to save the generated speech audio file. Default is 'tts_output.wav'.

        Returns:
        str: The path to the generated speech audio file.
        """
        tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC").to(self.device)
        tts_model.tts_to_file(text=self.text, file_path=outfile)
        return outfile

    def _generate_background_music(self, label, duration, outfile="bg_audio"):
        """
        Generates background music based on a given label and duration using the MusicGen model.

        Parameters:
        label (str): The sentiment label used to guide the generation of background music.
        duration (float): The duration (in seconds) for which the background music should play.
        outfile (str): Where to save the music. A trailing '.wav' is optional; the file is
                       always written with a '.wav' suffix. Default is 'bg_audio'.

        Returns:
        str: The path of the audio file that was actually written (including '.wav').
        """
        model = MusicGen.get_pretrained('small', device=self.device)
        model.set_generation_params(duration=duration)  # Duration of the generated waveform in seconds
        output = model.generate(
            descriptions=[f'{label} + Orchestral Background Music']
        )

        # audio_write appends the extension itself, so hand it the bare stem.
        if outfile.endswith('.wav'):
            outfile = outfile[:-4]

        # Move the waveform to the CPU before writing, and return the path reported by
        # audio_write so callers receive a file name that really exists on disk.
        written_path = audio_write(
            stem_name=outfile,
            wav=output[0].cpu(),
            sample_rate=model.sample_rate,
        )
        return str(written_path)

    def _find_duration(self, filePath):
        """
        Calculates the duration of a WAV file from its frame count and sample rate.

        Parameters:
        filePath (str): The path to the audio file.

        Returns:
        float: The duration of the audio file in seconds.
        """
        with wave.open(filePath, 'rb') as audio_file:
            sample_rate = audio_file.getframerate()  # Sample rate (Hz)
            num_frames = audio_file.getnframes()     # Total number of frames
        return num_frames / float(sample_rate)

    def create_audio_files(self, ttsPath, bgMusicPath):
        """
        Creates speech and background music audio files based on the sentiment of the input text.

        The text's sentiment is classified, the speech is synthesised and measured, and
        background music of the same duration is generated from the sentiment label.

        Parameters:
        ttsPath (str): The path to save the generated speech audio file.
        bgMusicPath (str): The path to save the generated background music audio file.

        Returns:
        tuple: A tuple containing the paths to the generated speech and background music files.
        """
        sentiment = self._generate_sentiment()
        ttsPath = self._generate_speech(ttsPath)
        duration = self._find_duration(ttsPath)
        bgMusicPath = self._generate_background_music(sentiment, duration, outfile=bgMusicPath)

        return ttsPath, bgMusicPath



In [None]:
# Quick end-to-end check of the class: narrate one sentence and generate matching music.
obj = GenerateAudio(text="The dark forest gave me chills.")
obj.create_audio_files(ttsPath="tts_test.wav", bgMusicPath="bg_audio_test.wav")

## Replace story text

In [None]:
from transformers import pipeline
from audiocraft.models import MusicGen
from pydub import AudioSegment
import soundfile as sf
from TTS.api import TTS
import torch

# Pick the compute device once; the helper functions below read this module-level value.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def generate_sentiment(text, candidate_labels=None):
    """
    Classify the mood of `text` with a zero-shot classification model.

    Parameters:
    text (str): The text to classify.
    candidate_labels (list): Labels to choose from. When None (the default),
        ["happy", "sad", "scary", "adventurous", "calm", "cheerful"] is used.

    Returns:
    str: The first label in the classifier's result, which the pipeline is
         expected to order from most to least likely.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if candidate_labels is None:
        candidate_labels = ["happy", "sad", "scary", "adventurous", "calm", "cheerful"]

    # Use the module-level device so this also works on CPU-only hosts
    # (a hard-coded device=0 requires a GPU).
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels)
    return result['labels'][0]

def generate_audio(text, outfile="tts_output.wav"):
    """
    Synthesise `text` to speech with a Tacotron 2 TTS model and save it as a file.

    Parameters:
    text (str): The text to speak.
    outfile (str): The path to save the generated speech. Default is 'tts_output.wav'.

    Returns:
    str: The path to the generated speech audio file (the same value as `outfile`).
    """
    # Move the model to the selected device, matching GenerateAudio._generate_speech;
    # without this the synthesis runs on the CPU even when a GPU is present.
    tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC").to(device)
    tts_model.tts_to_file(text=text, file_path=outfile)
    return outfile

def generate_background_music(label, duration, file_name="background_music.wav"):
    """
    Generate background music for a mood label with MusicGen and write it to a file.

    Parameters:
    label (str): The mood label inserted into the text prompt given to the model.
    duration (float): The length of the generated music in seconds.
    file_name (str): The output path. Default is 'background_music.wav'.

    Returns:
    str: The path to the written audio file (the same value as `file_name`).
    """
    # Ask the loader to place the model on the chosen device; assigning
    # `model.device` afterwards would only rebind an attribute, not move weights.
    model = MusicGen.get_pretrained('facebook/musicgen-small', device=device)
    model.set_generation_params(use_sampling=True, top_k=250, duration=duration)
    outputs = model.generate(descriptions=[f"{label} + Background Music"], progress=True)

    # Take the first (only) item of the batch and drop singleton dimensions.
    audio = outputs[0].cpu().numpy().squeeze()

    # Use the model's own sample rate rather than a hard-coded constant.
    sf.write(file_name, audio.T, samplerate=model.sample_rate)
    return file_name

def crossfade_music_segments(sentiment_labels, segment_durations, crossfade_duration=2000):
    """
    Generate one music clip per (label, duration) pair and join them with crossfades.

    Parameters:
    sentiment_labels (list): The mood label for each segment.
    segment_durations (list): The music length in seconds for each segment,
        paired position by position with `sentiment_labels`.
    crossfade_duration (int): The requested crossfade between consecutive clips,
        in milliseconds. It is shortened when either clip is shorter than this.
        Default is 2000.

    Returns:
    str: The path of the combined file, always 'combined_music.wav'.

    Raises:
    ValueError: If there is no (label, duration) pair to generate music for.
    """
    plan = list(zip(sentiment_labels, segment_durations))
    if not plan:
        # Without this check the export below would fail on None with an
        # unhelpful AttributeError.
        raise ValueError("at least one sentiment label and duration is required")

    combined_music = None
    for label, duration in plan:
        music_file = generate_background_music(label, duration)
        music_segment = AudioSegment.from_wav(music_file)

        if combined_music is None:
            combined_music = music_segment
            continue

        # pydub rejects a crossfade longer than either clip, so clamp it.
        fade = min(crossfade_duration, len(combined_music), len(music_segment))
        combined_music = combined_music.append(music_segment, crossfade=fade)

    combined_music.export("combined_music.wav", format="wav")
    return "combined_music.wav"

def overlay_tts_on_music(tts_file, music_file, output_file):
    """
    Mix a speech track over a music bed and export the result as a WAV file.

    The music is made 10 dB quieter and the speech 5 dB louder, the music is
    looped or trimmed to the exact length of the speech, and the speech is then
    overlaid from the start of the music.

    Parameters:
    tts_file (str): Path to the speech WAV file.
    music_file (str): Path to the background music WAV file.
    output_file (str): Path where the mixed audio is written.
    """
    # Load both tracks and rebalance their levels so the voice sits on top.
    speech = AudioSegment.from_wav(tts_file) + 5
    backing = AudioSegment.from_wav(music_file) - 10

    speech_len = len(speech)
    backing_len = len(backing)

    if backing_len < speech_len:
        # Too short: repeat the bed enough times to cover the speech, then cut.
        repeats = speech_len // backing_len + 1
        backing = (backing * repeats)[:speech_len]
    elif backing_len > speech_len:
        # Too long: keep only the part that plays under the speech.
        backing = backing[:speech_len]

    mixed = backing.overlay(speech, position=0)
    mixed.export(output_file, format="wav")
    print(f"Overlayed audio saved to {output_file}")

def generate_story_audio(story_text, segment_length=500, *, music_seconds_per_segment=10,
                         output_file="final_story_audio.wav"):
    """
    Narrate a story over mood-matched background music and save the mix.

    The story is cut into fixed-size character segments, each segment's mood is
    classified, one music clip is generated per segment and the clips are
    crossfaded together. The whole story is then synthesised to speech and
    overlaid on the combined music.

    Parameters:
    story_text (str): The story to narrate.
    segment_length (int): Number of characters per sentiment segment. Default is 500.
    music_seconds_per_segment (float): Length of the music clip generated for
        each segment, in seconds. Default is 10.
    output_file (str): Path of the final mixed audio. Default is 'final_story_audio.wav'.

    Raises:
    ValueError: If `segment_length` is not positive or `story_text` is empty/blank.
    """
    # Fail early with a clear message instead of deep inside the audio pipeline.
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive integer")
    if not story_text or not story_text.strip():
        raise ValueError("story_text must contain text to narrate")

    # Split the story into segments
    segments = [story_text[i:i + segment_length] for i in range(0, len(story_text), segment_length)]

    # Analyze sentiment for each segment
    sentiment_labels = [generate_sentiment(segment) for segment in segments]

    # Generate music for each segment and crossfade
    segment_durations = [music_seconds_per_segment] * len(segments)
    combined_music_file = crossfade_music_segments(sentiment_labels, segment_durations)

    # Generate TTS for the entire story
    tts_file = generate_audio(story_text)

    # Overlay TTS on the combined music
    overlay_tts_on_music(tts_file, combined_music_file, output_file)
    print(f"Final audio saved to {output_file}")
import re

def clean_text(text):
    """
    Normalise typography and spacing in `text` before it is sent to TTS.

    Steps, in order:
    1. Curly quotes become straight quotes.
    2. Em dashes become ' - ' and en dashes become '-'.
    3. Every run of whitespace (including newlines) becomes a single space.
    4. Whitespace before '?', '.', '!' or ',' is removed.
    5. A space is inserted after '?', '.', '!' or ',' only when a letter follows
       directly, so closing quotes ('?"') and decimals ('3.14') are left intact.

    Parameters:
    text (str): The raw text.

    Returns:
    str: The cleaned text with no leading or trailing whitespace.
    """
    straight_quotes = str.maketrans({"“": '"', "”": '"', "‘": "'", "’": "'"})
    text = text.translate(straight_quotes)

    # Replace dashes before collapsing whitespace so the padding added around an
    # em dash cannot leave double spaces behind.
    text = text.replace("—", " - ").replace("–", "-")
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub(r"\s+([?.!,])", r"\1", text)
    # [^\W\d_] matches a letter: word characters minus digits and underscore.
    text = re.sub(r"([?.!,])(?=[^\W\d_])", r"\1 ", text)

    return text.strip()

story_text = """
HIGH above the city, on a tall column, stood the statue of the Happy Prince.  He was gilded all over with thin leaves of fine gold, for eyes he had two bright sapphires, and a large red ruby glowed on his sword-hilt.
He was very much admired indeed.  “He is as beautiful as a weathercock,” remarked one of the Town Councillors who wished to gain a reputation for having artistic tastes; “only not quite so useful,” he added, fearing lest people should think him unpractical, which he really was not.
“Why can’t you be like the Happy Prince?” asked a sensible mother of her little boy who was crying for the moon.  “The Happy Prince never dreams of crying for anything.”
“I am glad there is some one in the world who is quite happy,” muttered a disappointed man as he gazed at the wonderful statue.
“He looks just like an angel,” said the Charity Children as they came out of the cathedral in their bright scarlet cloaks and their clean white pinafores.
“How do you know?” said the Mathematical Master, “you have never seen one.”
“Ah! but we have, in our dreams,” answered the children; and the Mathematical Master frowned and looked very severe, for he did not approve of children dreaming.
One night there flew over the city a little Swallow.  His friends had gone away to Egypt six weeks before, but he had stayed behind, for he was in love with the most beautiful Reed.  He had met her early in the spring as he was flying down the river after a big yellow moth, and had been so attracted by her slender waist that he had stopped to talk to her.
“Shall I love you?” said the Swallow, who liked to come to the point at once, and the Reed made him a low bow.  So he flew round and round her, touching the water with his wings, and making silver ripples.  This was his courtship, and it lasted all through the summer.
“It is a ridiculous attachment,” twittered the other Swallows; “she has no money, and far too many relations”; and indeed the river was quite full of Reeds.  Then, when the autumn came they all flew away.
After they had gone he felt lonely, and began to tire of his lady-love.
"""
# Normalise the story's quotes, dashes and spacing before synthesis.
cleaned_text = clean_text(story_text)

# Run the full pipeline: per-segment sentiment, crossfaded background music,
# narration of the whole story, and the final mix written to disk.
generate_story_audio(cleaned_text)
