This notebook contains two methods for generating a video from a text input that is read aloud using text-to-speech (TTS).

The first method adds subtitles to the video, displaying one or more lines at a time. The second method adds word-synced captions to the video using a modified `autocaption` module.

In [None]:
# notes on where I'm trying to go with this
# can provide:
# url which will be extracted to get input text (optionally formatted using LLM)
# file or text which will be used as input text
# video url, which skips the text/TTS step and adds subtitles to that video

# so the process involves:

# with TTS audio, generate

In [None]:
import os, sys

sys.path.append(os.path.abspath('..'))

from src.utils import ensuredir, displayVideo, get_text_from_url
from src.enums import ElevenLabsTTSModel, ElevenLabsTTSVoice
from src.tools.tts import make_tts
from src.tools.stt import create_transcript
from src.video.moviepy import create_video_with_subtitles, create_video_from_audio

# --- TTS settings -----------------------------------------------------------
# change the model and/or voice if desired
# TODO support openai tts
tts_model = ElevenLabsTTSModel.Multilingual_v2.value
tts_voice = ElevenLabsTTSVoice.Brian.value  # TODO add more voices to enum

# --- Input selection --------------------------------------------------------
# choose 1 of the below inputs
# When several are filled in, the input cell below resolves them in this order:
# input_video (skips TTS entirely), then input_file, then input_url, and
# finally input_text.
input_text = """"""
input_file = ''
input_url = 'http://example.com'
reformat_url_text = False  # run the extracted url text through format_tts_text
input_video = '' # TODO

# --- Output -----------------------------------------------------------------
# Passed to make_tts as the output file for the generated audio.
output_file = 'example.mp3'

method = 'subtitle' # 'subtitle' or 'autocaption'

# Fail fast on a typo: an unknown method would otherwise skip both generation
# cells silently and only surface later as a NameError in the display cell.
if method not in ('subtitle', 'autocaption'):
	raise ValueError(f"method must be 'subtitle' or 'autocaption', got {method!r}")

In [None]:
from typing import Literal

# Resolve the configured inputs into what the generation cells consume:
#   - text/file/url -> `input_type` + `input_src` (handed to make_tts later)
#   - video         -> `input_video` rewritten to a local .mp4 path
if not input_video:
	# Precedence: file, then url, then the literal text.
	input_type: Literal['text', 'file', 'url'] = 'text'
	input_src = input_text
	if input_file:
		input_type = 'file'
		input_src = input_file
	elif input_url:
		input_type = 'url'
		input_src = input_url

	# Only fetch the page when the url is the input that was actually selected
	# (previously this also ran when input_file took precedence).
	if input_type == 'url':
		res = get_text_from_url(input_url)
		if reformat_url_text:
			from src.tools.format_tts import format_tts_text
			res = format_tts_text(res)
			# Pass the reformatted text on as plain text; otherwise TTS would
			# still read the raw url and the reformatting would be discarded.
			input_type = 'text'
			input_src = res
		print('url text:\n' + res)
else:
	import ffmpeg
	import yt_dlp

	if os.path.isfile(input_video):
		# Already a local file (for example when this cell is re-run after a
		# previous download), so there is nothing to fetch.
		downloaded_path = input_video
	else:
		p = 'output/tmp-video/'
		ensuredir(p)
		with yt_dlp.YoutubeDL({
			'outtmpl': f'{p}%(id)s.%(ext)s',
		}) as ydl:
			# One call both downloads and returns the metadata needed to
			# work out the resulting file name.
			info = ydl.extract_info(input_video, download=True)
			downloaded_path = ydl.prepare_filename(info)
	assert isinstance(downloaded_path, str)

	# convert to mp4
	# splitext only strips the final extension, so dots elsewhere in the path
	# (directory names, video ids) no longer truncate it.
	stem, ext = os.path.splitext(downloaded_path)
	if ext.lower() == '.mp4':
		# ffmpeg cannot read and write the same file; no conversion needed.
		input_video = downloaded_path
	else:
		ffmpeg.input(downloaded_path).output(f'{stem}.mp4').run(overwrite_output=True)
		input_video = f'{stem}.mp4'

In [None]:
# Method 1: render subtitles from a transcript of the media.
if method == 'subtitle':
	if input_video:
		media_path = input_video
		# Write next to the source under a distinct name (same convention as
		# the autocaption cell). The plain "<stem>.mp4" name would be the input
		# file itself, so the renderer would overwrite what it is reading.
		video_output_file = f"{os.path.splitext(media_path)[0]}-subtitled.mp4"
	else:
		# No video supplied: synthesize speech from the resolved text input.
		media_path = make_tts(input_src, input_type, output_file, model=tts_model, voice=tts_voice)
		video_output_file = f"{os.path.splitext(media_path)[0]}.mp4"

	transcript_path = create_transcript(media_path)

	ensuredir(video_output_file)
	# NOTE(review): 1280 and 720 are presumably the frame width/height and the
	# False a rendering flag -- confirm against create_video_with_subtitles.
	create_video_with_subtitles(media_path, transcript_path, video_output_file, False, 1280, 720)
	print(f'Video saved to {video_output_file}')

In [None]:
# Method 2: word-synced captions via the modified `autocaption` module.
if method == 'autocaption':
	# Imported here so the 'subtitle' method works without autocaption installed.
	from autocaption.predict import VideoCaptioner
	import shutil

	if input_video:
		media_path = input_video
		video_output_file = f"{os.path.splitext(media_path)[0]}-subtitled.mp4"
		caption_input = media_path
	else:
		# generate tts + video with audio
		media_path = make_tts(input_src, input_type, output_file, model=tts_model, voice=tts_voice)
		video_output_file = f"{os.path.splitext(media_path)[0]}.mp4"
		create_video_from_audio(media_path, video_output_file)
		# Caption the video that was just built. Previously the audio file was
		# passed instead, which left the generated video unused.
		# NOTE(review): assumes add_captions expects a video path -- confirm
		# against the modified autocaption module.
		caption_input = video_output_file

	vc = VideoCaptioner()
	output_paths = vc.add_captions(caption_input, kerning=-2)
	if not output_paths:
		raise RuntimeError(f'VideoCaptioner produced no output for {caption_input}')
	# Move the first captioned file to the final output location.
	shutil.move(output_paths[0], video_output_file)

# for this, still generate the tts
# create video with audio and no subtitles
# pass tts video to vc
# add audio back to output video

In [None]:
# Preview the rendered video inline.
# The name is looked up rather than referenced directly: it only exists once
# one of the generation cells has run, and a direct reference would raise
# NameError otherwise.
rendered_video = globals().get('video_output_file')
if rendered_video:
	displayVideo(path=rendered_video)
else:
	print("No video to display: run the 'subtitle' or 'autocaption' cell first.")

In [None]:
# TODO
# youtube video as input
# add captions to video

# video_input = 'test2.mp4'
# vc = VideoCaptioner()
# outputs = vc.add_captions(video_input, kerning=-2)
# o = outputs[0]
# shutil.move(o, 'test3.mp4')

In [None]:
# TODO
# - save transcript as word-timestamp json instead of srt, construct and write to srt to control split points
# - display paragraph(s) of text at a time on screen,
#   highlight segments as they're spoken
# - generate background image for the video

# idea for an autonomous version
# a script format that defines the video
# - list info at the beginning (audio, background color, etc)
# - each line after contains a timestamp and an action of some sort
#   - display text
#   - display image
# an llm could output this script and the app generates the video from it