With this I want to be able to provide a text script, generate TTS audio for it, and then get a transcript of that audio using the Whisper API.

Can't Whisper return an SRT directly? If so, maybe stop there?

Ideally, I'd like to create a video file that has the TTS audio in it as well as the transcript displaying as subtitles, with a transparent background.


The TTS text could support inline instructions to add delays or maybe effects.
It could also have a syntax for marking a block of text that should be turned into a single output.
	Maybe there can be options to customize everything in the TTS text (extract the elements with an LLM?)


Transparent video can work as a WebM, right?

Search: "python transparent webm"
https://stackoverflow.com/questions/72409283/opencv-how-to-create-webm-with-a-transparent-background
	no answers
https://github.com/Zulko/moviepy/issues/2082
	https://codepen.io/mortenjust/pen/BaLrjzm
		shows that webm does work to show transparency


My intent is for this to make it easier to add AI TTS to a video synced to subtitles. This could be useful for making educational videos, or for making videos that are more accessible to people with hearing impairments.

Need:
- add lemonfox client with whisper endpoint

Now that I think about it, this would also serve my idea for making videos instead of actual comics like in the main notebook.

We'd still be using narration and would want subtitles.

There are cases where you might want the subtitles baked in, which the transparent WebM would work for. Other times you just want them as a separate file, such as an SRT. These could be options that we automate further up in the pipeline.

In [4]:
import json
import subprocess
from PIL import Image, ImageDraw, ImageFont
import os
from src.ai import elevenlabs, lemonfox
from src.utils import saveB64Audio
from datetime import datetime

def create_project_folder():
    """Create a timestamped output folder in the current working directory.

    The folder is named ``project_<YYYYMMDD>_<HHMMSS>`` from the local time
    at the moment of the call. A folder that already exists under that name
    is reused rather than treated as an error.

    Returns:
        The folder name, relative to the current working directory.
    """
    now = datetime.now()
    project_dir = f"project_{now.strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(project_dir, exist_ok=True)
    return project_dir

def create_text_frame(text, w, h, frame_number, font_size, font_color, output_folder):
    """Render one subtitle frame on a transparent canvas and save it as a PNG.

    The text is centred horizontally and placed so that its measured height
    ends 40 px above the bottom edge of the frame. The image is written to
    ``<output_folder>/frame_<frame_number, zero-padded to 6 digits>.png``.

    Args:
        text: Subtitle text to draw.
        w: Frame width in pixels.
        h: Frame height in pixels.
        frame_number: Index used to build the output file name.
        font_size: Size passed to the TrueType font loader.
        font_color: Fill colour passed to the drawing call.
        output_folder: Directory the PNG is written into.

    Note:
        The font is requested by the file name "Arial.ttf"; presumably this
        fails on machines where Pillow cannot locate that font -- verify on
        the target system.
    """
    canvas = Image.new('RGBA', (w, h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)
    typeface = ImageFont.truetype("Arial.ttf", font_size)

    # Measure the rendered text so it can be centred.
    bbox = pen.textbbox((0, 0), text, font=typeface)
    text_w = bbox[2] - bbox[0]
    text_h = bbox[3] - bbox[1]

    x = (w - text_w) / 2
    y = h - text_h - 40  # keep a 40 px gap below the text
    pen.text((x, y), text, font=typeface, fill=font_color)

    frame_path = os.path.join(output_folder, f'frame_{frame_number:06d}.png')
    canvas.save(frame_path)

def _decode_srt_payload(raw):
    """Return plain SRT text from the transcription endpoint's raw response.

    The response may arrive wrapped in double quotes with escaped newlines,
    i.e. shaped like a JSON string literal. Such a payload is decoded as
    JSON so every escape (``\\n``, ``\\"``, ``\\uXXXX``) is handled. The
    wrapper is only removed when BOTH quotes are present, so a subtitle that
    legitimately ends with a quotation mark is left intact.
    """
    text = raw.strip()
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        try:
            return json.loads(text).strip()
        except ValueError:
            # Quoted but not valid JSON (e.g. raw newlines inside): just drop
            # the wrapper and fall through to the literal "\n" replacement.
            text = text[1:-1]
    return text.replace('\\n', '\n').strip()


def _render_subtitle_webm(project_folder, audio_path, width, height, fps, font_size, font_color):
    """Render transcript segments as transparent frames and mux them with the audio.

    Returns the path of the written ``output.webm``.
    """
    # Fetch the segment-level transcript and keep a copy next to the audio.
    transcript = lemonfox.getTranscript(audio_path)
    transcript_json = transcript.model_dump_json()
    transcript_path = os.path.join(project_folder, 'transcript.json')
    with open(transcript_path, 'w', encoding='utf-8') as f:
        f.write(transcript_json)
    segments = json.loads(transcript_json)['segments']

    def frame_path(index):
        return os.path.join(project_folder, f'frame_{index:06d}.png')

    # One blank image is enough: it is saved under many names.
    blank_frame = Image.new('RGBA', (width, height), (0, 0, 0, 0))
    output_path = os.path.join(project_folder, 'output.webm')
    frame_number = 0
    try:
        for segment in segments:
            start, end = segment['start'], segment['end']
            text = segment['text']

            # Frame time is derived from the frame index instead of summing
            # 1/fps repeatedly, so rounding error cannot accumulate.
            while frame_number / fps < start:
                blank_frame.save(frame_path(frame_number))
                frame_number += 1

            while frame_number / fps < end:
                create_text_frame(text, width, height, frame_number, font_size, font_color, project_folder)
                frame_number += 1

        if frame_number == 0:
            raise ValueError("Transcript produced no frames; cannot render a video")

        ffmpeg_command = [
            'ffmpeg',
            '-framerate', str(fps),
            '-i', frame_path(0).replace('frame_000000.png', 'frame_%06d.png'),
            '-i', audio_path,
            '-c:v', 'libvpx-vp9',
            # Request the alpha-capable pixel format explicitly so the
            # transparent background is kept in the WebM.
            '-pix_fmt', 'yuva420p',
            '-crf', '30',
            '-b:v', '0',
            '-b:a', '128k',
            '-c:a', 'libopus',
            '-shortest',
            output_path
        ]
        # check=True: a failed encode raises instead of being reported as success.
        subprocess.run(ffmpeg_command, check=True)
    finally:
        # Frames are temporary; remove them whether or not encoding succeeded.
        for i in range(frame_number):
            os.remove(frame_path(i))

    return output_path


def create_video(script, width=1280, height=720, fps=30, font_size=40, font_color=(255, 255, 255, 255), *, render_webm=False):
    """Generate narration audio and subtitles for ``script``.

    Always writes ``audio.mp3`` and ``transcript.srt`` into a new timestamped
    project folder. With ``render_webm=True`` it additionally writes
    ``transcript.json`` and ``output.webm``: a transparent-background video
    with the subtitles burned in and the narration as its audio track.

    Args:
        script: Text to synthesize into speech.
        width: Video frame width in pixels (used only when rendering).
        height: Video frame height in pixels (used only when rendering).
        fps: Video frame rate (used only when rendering).
        font_size: Subtitle font size (used only when rendering).
        font_color: Subtitle fill colour (used only when rendering).
        render_webm: When False (default) stop after writing the SRT file,
            which is what this function did before the flag existed.

    Returns:
        ``None`` when ``render_webm`` is False; otherwise a tuple
        ``(project_folder, output_path)``.

    Raises:
        ValueError: If rendering is requested but the transcript yields no frames.
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    project_folder = create_project_folder()

    # Save audio
    audio = elevenlabs.getSpeechB64(script)
    audio_path = os.path.join(project_folder, 'audio.mp3')
    saveB64Audio(audio, audio_path)

    # The prompt lists product names that appear in the narration.
    transcript_srt = lemonfox.getTranscript(audio_path, outformat='srt', prompt="ElevenLabs, LemonFox, Kdenlive")
    transcript_srt = _decode_srt_payload(transcript_srt)

    transcript_srt_path = os.path.join(project_folder, 'transcript.srt')
    # Explicit UTF-8 so non-ASCII subtitles do not depend on the OS locale.
    with open(transcript_srt_path, 'w', encoding='utf-8') as f:
        f.write(transcript_srt)

    if not render_webm:
        # SRT-only mode: the subtitles are used as a separate file.
        return None

    output_path = _render_subtitle_webm(project_folder, audio_path, width, height, fps, font_size, font_color)

    print(f"Video created successfully in {project_folder}")
    return project_folder, output_path

# Example usage: run the pipeline on a short sample narration.
script = """
This should sound better -- using the nicer voice model here. It costs more, so I use the other model for the speed and lower price.

We're using ElevenLabs' cheaper model for the TTS, and LemonFox's Whisper API to get the transcript.

Kdenlive was used to bring the two together to create a video. Now there just needs to be some visuals for a full video.
"""

# 1280x720 at 30 fps, white text, with a larger font size of 60.
create_video(script, width=1280, height=720, fps=30, font_size=60, font_color=(255, 255, 255, 255))