In [2]:
##########    AFTER ENTERING PROMT, WAIT FOR 7-10 MINUTES FOR FINAL OUTPUT    ##########


#####################################
##### INSTALLING LIBRARIES USED #####
#####################################

!pip install transformers accelerate
!pip install moviepy diffusers torch pillow
!pip install sentencepiece datasets[audio] soundfile





#####################################
##### IMPORTING LIBRARIES USED ######
#####################################

import torch
import numpy as np
import cv2
import re
import soundfile as sf
from transformers import GPT2LMHeadModel, GPT2Tokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from diffusers import StableDiffusionPipeline
from moviepy.editor import ImageClip, concatenate_videoclips, VideoFileClip, AudioFileClip, ImageSequenceClip





########################################
##### GENERATING STORY FROM PROMPT #####
########################################

model_name="maheshkrishnam/promt_to_story"

# Load the model and tokenizer from Hugging Face
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("maheshkrishnam/promt_to_story")
model = AutoModelForCausalLM.from_pretrained("maheshkrishnam/promt_to_story")


def generate_story(prompt, model, tokenizer, max_length=200, temperature=0.7):
    # Encode the prompt text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode the generated text
    story = tokenizer.decode(output[0], skip_special_tokens=True)
    return story

# Generate a story
prompt = input("Enter promt here : ")
story = generate_story(prompt, model, tokenizer)
print(story)





#######################################
##### TRIMING EXTRA PART OF STORY #####
#######################################

def remove_incomplete_last_line(text):
    # Remove all newlines and make it one continuous paragraph
    text = text.replace('\n', ' ')

    # Making the story shorter than the length of 500 characters
    for i in range(500, -1, -1):
        if text[i] == '.':
            return text[:i + 1]

    return text

story = remove_incomplete_last_line(story)
story





#######################################
##### GENERATING AUDIO FROM STORY #####
#######################################

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# restriction of 600 characters for this tts model
inputs = processor(text=story, return_tensors="pt")

# load xvector containing speaker's voice characteristics from a dataset
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

sf.write("story.wav", speech.numpy(), samplerate=16000)





##########################################################
##### GENERATING VIDEO FROM TEXT WITH AUDIO INGRATED #####
##########################################################

# Initialize Stable Diffusion model
model_id = "runwayml/stable-diffusion-v1-5"
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to(device)

# Define the story
story = story.replace(',', '.')
delimiters = ['.']

# Create a regular expression pattern from the delimiters
pattern = '|'.join(map(re.escape, delimiters))

# Split the text using the pattern
scenes = re.split(pattern, story)

# Scene Generation
def generate_image_from_text(text, pipe, device):
    with torch.autocast(device):
        image = pipe(text).images[0]

    return np.array(image)

# Generate images for each scene
images = []
for scene in scenes:
    if scene.strip():  # Skip empty scenes
        image = generate_image_from_text(scene.strip(), pipe, device)
        images.append(image)

# Animating Images with transitions
def create_video(images, scene_durations, output_video_path, fps=30):
    clips = []
    for img, duration in zip(images, scene_durations):
        img_clip = ImageClip(img).set_duration(duration)

        # Apply zoom and pan effect smoothly
        zoom_factor = 1.1
        img_clip = img_clip.resize(lambda t: 1 + (zoom_factor - 1) * t / img_clip.duration)
        img_clip = img_clip.set_position(('center', 'center'))

        clips.append(img_clip)

    video = concatenate_videoclips(clips, method="compose")
    video.write_videofile(output_video_path, fps=fps, codec='libx264')

    return video.duration

# Calculate scene durations based on the length of the story
scene_durations = [6] * len(images)  # Adjust durations according to story flow

# Adjusting durations to reach desired video length of 25-30 seconds
total_duration = sum(scene_durations)
target_duration = 25  # target video duration in seconds
factor = target_duration / total_duration
scene_durations = [duration * factor for duration in scene_durations]

# Ensure the total duration matches or slightly exceeds the target duration
current_duration = sum(scene_durations)
if current_duration < target_duration:
    scene_durations[-1] += target_duration - current_duration

# Generate the video without subtitles
output_video_path = 'story_video.mp4'
video_duration = create_video(images, scene_durations, output_video_path)

# Adding background music
def add_background_music(video_path, output_path, audio_path):
    video_clip = VideoFileClip(video_path)
    audio_clip = AudioFileClip(audio_path).subclip(0, video_duration)
    video_clip = video_clip.set_audio(audio_clip)
    video_clip.write_videofile(output_path, codec='libx264', audio_codec='aac')

# Add background music
background_music_path = 'story.wav'  # Path to background music file
output_video_with_music_path = 'final_video.mp4'
add_background_music(output_video_path, output_video_with_music_path, background_music_path)
print(f"Video with audio saved at {output_video_with_music_path}")

# Removing extra files
!rm story.wav story_video.mp4


Enter promt here : A group of children discover a dead body.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A group of children discover a dead body.

The king of the country, who was watching the procession, suddenly
heard the sound of footsteps, and at first he did not know what it was. He
saw the bodies lying on the ground, but he could not see them, for
the bodies were lying where they had been, like dead people. The king
was terrified, as it seemed to him that some of them had crept into the
grave. But he made no objections; the children would not listen to
him. And so he waited patiently until at length the next morning the king was
caught by a terrible noise, which sounded like thunder. This terrible sound
soon became clearer and clearer, till at last the dead child was found,
and the frightened king called to his servants to come to the spot. There were
no guards at all, only the servants who came to pick him up. When the poor
king saw that he was safe


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  return F.conv2d(input, weight, bias, self.stride,



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Moviepy - Building video story_video.mp4.
Moviepy - Writing video story_video.mp4





Moviepy - Done !
Moviepy - video ready story_video.mp4
Moviepy - Building video final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video final_video.mp4





Moviepy - Done !
Moviepy - video ready final_video.mp4
Video with audio saved at final_video.mp4
