In [None]:
from dotenv import load_dotenv
from openai import OpenAI
from elevenlabs.client import AsyncElevenLabs
import asyncio
import json
import os
import argparse
import concurrent.futures

from manga_extraction import extract_all_pages_as_images, save_important_pages, split_volume_into_parts, save_all_pages, extract_panels, scale_base64_image
from vision_analysis import analyze_images_with_gpt4_vision, detect_important_pages, get_important_panels, VISION_PRICE_PER_TOKEN 
from prompts import DRAMATIC_PROMPT, BASIC_PROMPT, BASIC_PROMPT_WITH_CONTEXT,  BASIC_INSTRUCTIONS, KEY_PAGE_IDENTIFICATION_INSTRUCTIONS, KEY_PANEL_IDENTIFICATION_PROMPT, KEY_PANEL_IDENTIFICATION_INSTRUCTIONS
from citation_processing import extract_text_and_citations, extract_script
from movie_director import make_movie
load_dotenv()  # Load environment variables from .env file

In [None]:
# Series and volume to process. Both values are interpolated into the input
# path "{manga}/v{volume_number}/v{volume_number}.pdf" when pages are extracted.
manga = "naruto"
volume_number = 10

The goal of this block is to extract a small-scale (fit within a 256px x 256px bounding box) array of pngs corresponding to the pages in the volume. 
Additionally, an unscaled array of images is extracted as well for full-res images. 

Later on, the small-scale images will be sent to GPT-Vision, and the full-resolution (unscaled) images will be used for panel extraction to get beautiful high-res panels for the video.

In [None]:
# API clients: OpenAI for the GPT calls (no key is passed explicitly here) and
# ElevenLabs for narration, whose key is read from the environment.
client = OpenAI()
narration_client = AsyncElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

# Render the volume PDF once and keep both renditions returned by the helper.
print("Extracting all pages from the volume...")
volume_pdf_path = f"{manga}/v{volume_number}/v{volume_number}.pdf"
volume_scaled_and_unscaled = extract_all_pages_as_images(volume_pdf_path)
volume, volume_unscaled = (
    volume_scaled_and_unscaled[rendition] for rendition in ("scaled", "full")
)
print("Total pages in volume:", len(volume))

The goal of this block is to use the `profile-reference.pdf` and `chapter-reference.pdf` provided by the user as examples in order to identify so-called `important pages`:
1) Profile pages: These are pages that contain information about the characters within the volume. Usually most mangas have this towards the beginning of the volume. They show a number of characters, what they look like, their names, and occasionally a small description of the character. This is super useful for GPT-Vision to identify and discern the characters in following steps.
2) Chapter pages: Pretty self-explanatory, these are the pages that contain the chapter number and title. Manga typically get creative with what chapter pages look like: some have a full page spread, some have a small box in the corner, some have a full page spread with a small box in the corner. So it's helpful to have a chapter-reference.pdf to use as an example of what to look for in this specific manga.

In a nutshell, this block will process every single page of the volume through GPT-Vision (with parallelized calls to maximize speed) and ask it to tell us if it thinks a page is a profile page or a chapter page. The final results are gathered at the end and extracted as a list of ints in `profile_pages` and `chapter_pages` where each int represents the page index in the volume.

In [None]:
profile_reference = extract_all_pages_as_images(f"{manga}/profile-reference.pdf")["scaled"]
chapter_reference = extract_all_pages_as_images(f"{manga}/chapter-reference.pdf")["scaled"]

profile_pages = []
chapter_pages = []

important_page_tokens = 0

batch_size = 20

print("Identifying important pages in the volume...")


def process_batch(start_idx, pages):
    """Ask GPT-Vision which pages of one batch are profile or chapter pages.

    Args:
        start_idx: Index in ``volume`` of the first page in ``pages``.
        pages: The slice of scaled page images to classify.

    Returns:
        ``(start_idx, response)`` so the caller can translate batch-relative
        indices back into volume indices.
    """
    # NOTE(review): KEY_PAGE_IDENTIFICATION_INSTRUCTIONS is passed for both
    # trailing arguments; confirm against detect_important_pages whether the
    # first one was meant to be a separate prompt.
    response = detect_important_pages(
        profile_reference, chapter_reference, pages, client,
        KEY_PAGE_IDENTIFICATION_INSTRUCTIONS, KEY_PAGE_IDENTIFICATION_INSTRUCTIONS)
    return start_idx, response


# The API calls are I/O-bound, so fan the batches out over a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(process_batch, batch_start, volume[batch_start:batch_start + batch_size])
        for batch_start in range(0, len(volume), batch_size)
    ]

    for future in concurrent.futures.as_completed(futures):
        start_idx, response = future.result()
        end_index = start_idx + batch_size - 1
        print(f"Processing pages {start_idx} to {min(end_index, len(volume)-1)}")

        ip = response["parsed_response"]
        print(json.dumps(ip, indent=2))

        # The model's answer is not guaranteed to be well-formed; anything
        # that is not a list of entries is treated as "nothing found".
        if not isinstance(ip, list):
            print("Unexpected response shape; skipping this batch.")
            ip = []

        for page in ip:
            if not isinstance(page, dict):
                continue

            image_index = page.get("image_index")
            if isinstance(image_index, str) and image_index.strip().isdecimal():
                image_index = int(image_index.strip())
            if isinstance(image_index, bool) or not isinstance(image_index, int):
                continue

            # Only keep indices that point at a real page: these lists are
            # later used to index into `volume`, where an out-of-range value
            # would raise and a negative one would silently wrap around.
            page_number = image_index + start_idx
            if not 0 <= page_number < len(volume):
                print("Ignoring out-of-range page index:", page_number)
                continue

            page_type = page.get("type")
            if page_type == "profile":
                profile_pages.append(page_number)
            elif page_type == "chapter":
                chapter_pages.append(page_number)

        important_page_tokens += response["total_tokens"]

# Batches complete in arbitrary order, so restore page order afterwards.
profile_pages.sort()
chapter_pages.sort()

print("Total tokens to extract profiles and chapters:", important_page_tokens)
print("\n__________\n")
print("Profile pages:", profile_pages)
print("Chapter pages:", chapter_pages)

This step is totally optional, but helps developers see what the code did while extracting the `important pages` and saves the GPT-vision identified chapter pages and profile pages into corresponding folders for QA.

In [None]:
# Optional QA step: report the page count, then write the pages that were
# flagged as profile/chapter pages to disk for manual inspection.
print(len(volume))
print("\n__________\n")
print("Saving important pages to disk for QA...")
save_important_pages(
    volume,
    profile_pages,
    chapter_pages,
    manga,
    volume_number,
)

Now that we have the chapter start pages and the profile pages, the goal of this block is to begin the summarization or "script-writing" process. The goal is to have a comprehensive summary of everything that happened in the volume, and importantly, to have **PAGE CITATIONS** sprinkled throughout the summary. This is important because we want to be able to reference the relevant pages and panels later on, and it's important to know where the information in the summary came from.

But we can't summarize a whole volume in one GPT-vision prompt (unless the manga volume is tiny), so we split the volume into reasonable-sized chunks for GPT-Vision to process. We also don't want to split the volume in the middle of a chapter, so we use the chapter start pages to split in an intelligent way. 

The `NUMBER_OF_JOBS` variable determines how many segments the volume will be split into; and the `split_volume_into_parts()` function will do its best to split the volume into `NUMBER_OF_JOBS` parts while respecting chapter boundaries. The result is `jobs` which is a messy object that contains an array of `scaled_images` arrays and an array of `unscaled_images` arrays, where each array is one segment of the volume. 

The first job (or segment) is sent to GPT-Vision along with the profile page(s) to summarize and get citations. The prompt for the first job is basically the same as the rest of the jobs, except the subsequent jobs will use previous job summaries as context, so the prompt will be slightly different. We basically end up snowballing into a full volume summary this way, sequentially (no parallel calls, as we depend on previous summaries to build the next summary).

GPT Vision will do a great job summarizing all the segments with proper character identification, and we gather all the citations as we go and parse them out of the text and map the citations to the actual unscaled page images from the volume. These unscaled page images will later be run through the `panel_extractor` to get beautiful high-res panels for the video.

The final result of this block is a `movie_script` object which contains all the information needed to create the video. It has all of the split up text linked to the unscaled page images that the text refers to. For convenience we also create a `narration_script` string which is a concatenation of all the text segments (so a pure summary), and we log it for the user to see.

In [None]:
# Profile pages are sent with every request so the model can name characters.
character_profiles = [volume[page_index] for page_index in profile_pages]
NUMBER_OF_JOBS = 7

# Split the volume into chunks; the scaled and unscaled lists are indexed in
# parallel below (job i pairs with jobs_unscaled[i]).
split_result = split_volume_into_parts(volume, volume_unscaled, chapter_pages, NUMBER_OF_JOBS)
parts = split_result["parts"]
jobs_unscaled = split_result["unscaled_images"]
jobs = split_result["scaled_images"]

section_break = "\n\n\n_____________\n\n\n"

# The first job has no prior context, so it uses the plain prompt.
response = analyze_images_with_gpt4_vision(character_profiles, jobs[0], client, BASIC_PROMPT, BASIC_INSTRUCTIONS)
recap = response.choices[0].message.content
tokens = response.usage.total_tokens
movie_script = extract_text_and_citations(recap, jobs[0], jobs_unscaled[0])

print(section_break)
print(recap)

# Every later job gets the running recap prepended as context, which is why
# these calls have to happen one after another.
for i, job in enumerate(jobs[1:], start=1):
    prompt_with_context = recap + "\n-----\n" + BASIC_PROMPT_WITH_CONTEXT
    response = analyze_images_with_gpt4_vision(character_profiles, job, client, prompt_with_context, BASIC_INSTRUCTIONS)
    job_summary = response.choices[0].message.content
    recap = recap + "\n\n" + job_summary
    tokens += response.usage.total_tokens
    print(section_break)
    print(job_summary)
    movie_script = movie_script + extract_text_and_citations(job_summary, job, jobs_unscaled[i])

for _ in range(3):
    print(section_break)

# Plain narration text derived from the script segments.
narration_script = extract_script(movie_script)
print(narration_script)
print("\n___________\n")

This block is responsible for getting all the cited page images from `movie_script`, creating an array of extracted panel images for each cited page image, and incorporating them into the `movie_script` object. This is done by running `panel_extractor` on each cited page image, and then adding the extracted panel images to the `movie_script` object.

In [None]:
# Run panel extraction over the script. The return value is discarded, so the
# results are presumably attached to the movie_script segments in place (later
# cells read segment["panels"]) — confirm in manga_extraction.
extract_panels(movie_script)

This block's goal is to identify the `important_panels` for each segment of `movie_script`. In the previous block, we extracted all of the panels from the cited page images, and now we want to ask GPT Vision, given the text that corresponds to the cited page image(s), which panels are the most important to show in the video while that text is being narrated. GPT-Vision does a surprisingly good job at this with limited context, and we can do these GPT Vision requests in parallel to maximize speed. There's all sorts of sketchy error handling, and if GPT Vision screws up (there is no JSON mode for GPT-Vision unfortunately, so we have to run its response through GPT-3.5 with JSON mode enabled as a followup step, and even then there are sometimes problems), we just skip that segment (and use the full page image in the video later on).

Once we're done doing this, we do some napkin math to calculate the total costs of GPT tokens and Elevenlabs tokens used in the process, and we log it for the user to see.

In [None]:
# Debug overview of every script segment: its text plus how many extracted
# panels and cited page images it carries.
print("number of segments:", len(movie_script))
for i, segment in enumerate(movie_script):
    print("segment", i, ": ", segment["text"])
    # "panels" is optional on a segment (the panel-selection step in this
    # notebook also checks for it), so a missing key counts as zero panels
    # instead of raising KeyError.
    panels_by_page = segment["panels"] if "panels" in segment else {}
    all_panels_base64 = [panel for sublist in panels_by_page.values() for panel in sublist]
    print(len(all_panels_base64))
    print("number of panels:", len(all_panels_base64))
    print("number of images:", len(segment["images"]))

def process_segment(segment_tuple):
    """Select the panels GPT-Vision deems most important for one segment.

    Args:
        segment_tuple: ``(i, segment)`` pair. ``i`` is the segment's position
            in ``movie_script``; ``segment`` is a mapping with ``"text"``,
            ``"images"`` and, optionally, ``"panels"`` (page position ->
            list of panel images).

    Returns:
        ``(i, important, total_tokens)`` where ``important`` is the list of
        chosen full-resolution panel images (empty when the model's answer
        is unusable) and ``total_tokens`` is taken from the response.
    """
    i, segment = segment_tuple
    panels_by_page = segment["panels"] if "panels" in segment else {}

    # Candidate images, in page order: a page's extracted panels when there
    # is an entry for it, otherwise the whole page.
    panels = []
    for j, page in enumerate(segment["images"]):
        if j in panels_by_page:
            panels.extend(panels_by_page[j])
        else:
            panels.append(page)

    scaled_panels = [scale_base64_image(p) for p in panels]

    response = get_important_panels(
        profile_reference, scaled_panels, client,
        segment["text"] + "\n________\n" + KEY_PANEL_IDENTIFICATION_PROMPT,
        KEY_PANEL_IDENTIFICATION_INSTRUCTIONS)

    important_panels = response["parsed_response"]
    # Anything other than a list of indices is treated as "no selection".
    if not isinstance(important_panels, list):
        important_panels = []

    ip = []
    for number in important_panels:
        # isdecimal() (unlike isdigit()) only accepts characters int() can
        # parse, so e.g. a superscript digit cannot raise ValueError here.
        if isinstance(number, str) and number.isdecimal():
            number = int(number)
        # bool is a subclass of int; True/False are not panel indices.
        if isinstance(number, bool) or not isinstance(number, int):
            continue
        # Reject negative values as well: they would otherwise wrap around
        # and pick a panel counted from the end of the list.
        if 0 <= number < len(panels):
            ip.append(panels[number])

    return i, ip, response["total_tokens"]

# Initialize variables
# Running total of tokens spent on panel selection.
panel_tokens = 0
important_panels_info = {}

# The per-segment requests are independent, so run them on a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_segment, (i, segment)) for i, segment in enumerate(movie_script)]

    # Results arrive in completion order; the returned index says which
    # segment each one belongs to.
    for future in concurrent.futures.as_completed(futures):
        # The per-segment count gets its own name so it does not overwrite
        # `tokens`, which the cost report below reads as the summarization
        # total.
        i, ip, segment_tokens = future.result()
        if ip:
            print("Important panels for segment", i, "exist.")
        else:
            print("No important panels for segment", i)
        movie_script[i]["important_panels"] = ip
        panel_tokens += segment_tokens


# Rough cost estimate for the whole run.
ELEVENLABS_PRICE_PER_CHARACTER = 0.0003
print("Tokens for extracting profiles and chapters:", important_page_tokens, " | ", "${:,.4f}".format(VISION_PRICE_PER_TOKEN * important_page_tokens))
print("Tokens for summarization:", tokens,  " | ", "${:,.4f}".format(VISION_PRICE_PER_TOKEN * tokens))
print("Tokens for extracting important panels:", panel_tokens, " | ", "${:,.4f}".format(VISION_PRICE_PER_TOKEN * panel_tokens))
total_gpt_tokens = important_page_tokens + tokens + panel_tokens
print("Total GPT tokens:", total_gpt_tokens,  " | ", "${:,.4f}".format(VISION_PRICE_PER_TOKEN * (total_gpt_tokens)))
print("Total elevenlabs characters:", len(narration_script), " | ", "${:,.4f}".format(ELEVENLABS_PRICE_PER_CHARACTER * (len(narration_script))))
print("GRAND TOTAL COST"," | ", "${:,.4f}".format(VISION_PRICE_PER_TOKEN * (total_gpt_tokens) + ELEVENLABS_PRICE_PER_CHARACTER * (len(narration_script))))

This block is responsible for making the final movie and saving it to disk. This `make_movie` function does the Elevenlabs narration by narrating all of the movie_script segments (and we parallelize as much as possible to save time), and we end up getting an array of audio clips of the narrated segments. Each segment will now have a corresponding audio clip, the text that was narrated, and the important panels that were identified for that segment, as well as the full page image(s) that were cited. Naturally, we have everything we need to make a movie, and we use moviepy to make it! So we place the audio clips one by one, and get the time length of each audio clip, and split all the images that correspond to the audio clip evenly among that time. So if there is a 10s audio clip, and 5 images, each image will be displayed for 2 seconds in order. Finally, the video is saved to disk and all the temporary files are cleaned up.

In [None]:
# Narrate the script and render the final video. Top-level `await` works in a
# notebook (IPython) cell; a plain Python script would need asyncio.run(...).
await make_movie(movie_script, manga, volume_number, narration_client)