## Process
1. Upload photos, get metadata if possible (location, date, etc.)
2. Send photos through a vision model to get descriptions, and attach each description to its image in an object (e.g. {file: str, description: str})
3. Prompt model with photo objects, output should be an object with file name, transcript text chunk and duration
4. Run through opencv to create slideshow
5. Create audio with TTS model
6. Combine audio and video with moviepy

In [1]:
# get metadata for images
from PIL import Image
from PIL.ExifTags import TAGS
import os

def get_basic_info(image_path):
    """Extract filename, datetime, and size in MB from image.

    Args:
        image_path (str): Path to the image file.

    Returns:
        dict: ``{'filename', 'datetime', 'size_mb'}`` where ``filename`` is the
            base name of ``image_path``, ``datetime`` is the raw value of the
            EXIF ``DateTime`` tag (``None`` when the image has no EXIF data or
            no such tag) and ``size_mb`` is the file size in MB rounded to two
            decimals.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the EXIF data has been read, instead of lingering for every photo in a
    # directory-sized loop.
    with Image.open(image_path) as image:
        # Not every image class exposes _getexif(), hence the hasattr guard.
        exif = image._getexif() if hasattr(image, '_getexif') else None

    # Get file size in MB
    size_mb = os.path.getsize(image_path) / (1024 * 1024)

    # Get datetime if available. The local is called `taken_at` so it does not
    # shadow the standard-library `datetime` name used elsewhere in the notebook.
    taken_at = None
    if exif:
        for tag_id, value in exif.items():
            if TAGS.get(tag_id, tag_id) == 'DateTime':
                taken_at = value
                break  # tag ids are unique keys, so there is nothing more to find

    return {
        'filename': os.path.basename(image_path),
        'datetime': taken_at,
        'size_mb': round(size_mb, 2)
    }

In [2]:
# set openai api key
# NOTE: the value assigned below is a placeholder string, not a real key.
# Replace it with your own key before running the cells that call OpenAI
# (and avoid committing the real value).
os.environ['OPENAI_API_KEY'] = "OPENAI_API_KEY"

In [3]:
# Send through vision model to get descriptions
from openai import OpenAI
import base64

def get_photo_description(photo_path):
    """
    Get a description of a photo using GPT-4o-mini

    The image is sent inline as a base64 data URL whose MIME type is guessed
    from the file extension, falling back to JPEG when it cannot be determined.

    Args:
        photo_path (str): Path to the photo file

    Returns:
        str: Description of the photo, or None if the API call raised
            (the error is printed instead of propagated)
    """
    # Local import keeps this cell runnable on its own.
    import mimetypes

    client = OpenAI()

    # Encode the image to base64
    with open(photo_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Label the payload with its real type: the notebook also collects
    # .png/.gif/.bmp files, so a hard-coded image/jpeg would mislabel them.
    mime_type, _ = mimetypes.guess_type(photo_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"  # previous behaviour, kept as the fallback

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            # Tried having it output longer content and told it it was for a sentimental/romantic movie but it just tried way too hard lol
            messages=[
                {
                    "role": "user",
                    "content": [
                        # new prompt, haven't tried it yet but want to
                        {"type": "text", "text": "Please describe this photo in a concise way, add cute little details about what is happening but still keep it brief"},
                        # old prompt {"type": "text", "text": "Please describe this photo in a concise way. The woman in the photo is my wife, Payton, the dog is Summit, and the man is me, Michael."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: one failed photo should not abort the batch.
        print(f"Error getting description for {photo_path}: {str(e)}")
        return None

# Now let's modify the format_photos function to include descriptions
def format_photos(photo_paths):
    """Build one record per photo combining its metadata and its description.

    Args:
        photo_paths: Iterable of image file paths.

    Returns:
        list: Dicts with the keys 'image_path', 'date_time', 'size' and
            'description', in input order. A photo whose processing raises is
            reported on stdout and left out of the result.
    """
    records = []

    for photo_path in photo_paths:
        try:
            info = get_basic_info(photo_path)
            caption = get_photo_description(photo_path)
            records.append({
                'image_path': photo_path,
                'date_time': info['datetime'],
                'size': info['size_mb'],
                'description': caption,
            })
        except Exception as exc:
            print(f"Error processing {photo_path}: {str(exc)}")

    return records

In [None]:
import os

# Assumes you have a /photos directory with photos to read
photos_dir = 'photos'
# os.listdir() returns entries in arbitrary order; sorting makes the photo list
# (and everything derived from it) reproducible between runs and machines.
photos = [os.path.join(photos_dir, f) for f in sorted(os.listdir(photos_dir))
          if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]

# One vision-model request is made per photo here.
formatted_photos = format_photos(photos)
formatted_photos

In [91]:
# Create transcript by using structured output of images and descriptions as segments
# gpt-4o is much better at this than gpt-4o-mini: it includes more photos and the poem is better overall
from pydantic import BaseModel
from typing import List
from openai import OpenAI
import json

# One narrated slide of the slideshow. This model is part of the schema passed
# as response_format to the chat model, so it is documented with comments only.
class SlideSegment(BaseModel):
    image_number: int  # shown as the segment label when the transcript is printed
    duration_seconds: float  # how long the slide stays on screen, in seconds
    text: str  # narration line for this slide; also the input sent to TTS
    image_path: str  # path of the photo to show; expected to match a formatted_photos entry

# Top-level structured output requested from the model: the list of slides.
class SlideshowTranscript(BaseModel):
    segments: List[SlideSegment]  # iterated in order by the slideshow and TTS steps

def generate_slideshow_transcript(formatted_photos):
    """
    Generate a slideshow transcript from formatted photos using GPT-4o

    Args:
        formatted_photos (list): List of dictionaries containing photo information
            (the keys 'image_path', 'date_time' and 'description' are read)

    Returns:
        SlideshowTranscript: Parsed structured output; its ``segments`` list holds
            one SlideSegment per narrated slide. The image paths come from the
            model, so check them with find_missing_paths before rendering.
    """
    # Sort photos by date for chronological ordering. Photos without EXIF data
    # carry date_time=None, and comparing None with a str raises TypeError, so
    # sort on an (is_missing, value) key that places undated photos last while
    # leaving the order of dated photos exactly as before.
    sorted_photos = sorted(
        formatted_photos,
        key=lambda x: (x['date_time'] is None, x['date_time'] or ''),
    )

    # Create a focused context with paths and descriptions
    photo_context = [
        {
            'path': photo['image_path'],
            'date': photo['date_time'],
            'description': photo['description']
        } for photo in sorted_photos
    ]

    # Extract just paths so the prompt can list them explicitly
    valid_paths = [p['path'] for p in photo_context]
    client = OpenAI()

    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""You are a professional narrator creating a sentimental slideshow for my wife, Payton.
                AVAILABLE IMAGES:
                Below is a chronological list of photos with their descriptions. Use these descriptions to craft accurate, meaningful segments:

                {json.dumps(photo_context, indent=2)}

                CRITICAL RULES:
                1. ONLY use image paths from this list - no exceptions: {valid_paths}
                2. Base your narrative on the actual content described in each image
                3. Follow the chronological order of the photos
                4. Keep each segment's duration between 2-5 seconds
                5. Reference specific details from the image descriptions in your narrative
                6. Don't make assumptions about events not shown in the photos

                Your task is to create a rhyming poem that:
                - Starts with "The story of us goes like this..."
                - Uses modern, relatable language (think Taylor Swift style)
                - Makes natural transitions between images
                - Incorporates specific details from the image descriptions
                - Tells the story through what's actually visible in each photo per the descriptions

                For each segment:
                - Choose an exact image path from the list
                - Set an appropriate duration (2-5 seconds)
                - Write a rhyming line that matches the photo's description
                - Focus on the actual elements described in the photo

                Remember: If you need a specific type of image for your narrative but can't find one that matches in the descriptions, adjust your narrative to fit what's actually available. Never invent scenes or modify image paths."""
            },
            {
                "role": "user",
                "content": "Create a slideshow narrative that follows these photos chronologically, using their descriptions to tell an accurate and meaningful story. Every image_path must match exactly from the provided list."
            }
        ],
        response_format=SlideshowTranscript
    )

    return response.choices[0].message.parsed



In [92]:
# Sends the photo descriptions to the model; the result's .segments list drives
# the slideshow, TTS and display cells below.
transcript_segments = generate_slideshow_transcript(formatted_photos)


In [93]:
# Sometimes the transcript references a photo that doesn't exist. This function finds those paths so we can replace them or run the generation again.
def find_missing_paths(formatted_photos, transcript_segments):
    """
    Report transcript image paths that have no matching photo record.

    Args:
        formatted_photos (list): Photo dictionaries; only 'image_path' is read
        transcript_segments (SlideshowTranscript): Transcript whose segments
            each carry an image_path

    Returns:
        set: Paths referenced by the transcript that are absent from
            formatted_photos (empty when every reference is valid)
    """
    known_paths = {photo['image_path'] for photo in formatted_photos}
    return {
        segment.image_path
        for segment in transcript_segments.segments
        if segment.image_path not in known_paths
    }

In [None]:
# An empty set means every segment points at a real photo; anything listed here
# is a path that is not in formatted_photos.
missing_paths = find_missing_paths(formatted_photos, transcript_segments)
missing_paths

In [None]:
# Display results: one block per segment, followed by two blank lines
for segment in transcript_segments.segments:
    print("\n".join((
        f"Segment {segment.image_number}",
        "-" * 50,
        f"Image Path: {segment.image_path}",
        f"Duration: {segment.duration_seconds} seconds",
        f"Text: {segment.text}",
        "\n",
    )))

In [97]:
# Create slideshow with opencv
import cv2
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm
import time

def resize_with_padding(image, target_width, target_height):
    """Scale `image` to fit inside the target frame and pad the rest with black.

    The aspect ratio is preserved. An image wider than the target is fitted to
    the target width and gets bars above and below (letterbox); otherwise it is
    fitted to the target height and gets bars left and right (pillarbox).

    Args:
        image: Image array whose shape[0] is the height and shape[1] the width.
        target_width (int): Width of the output frame in pixels.
        target_height (int): Height of the output frame in pixels.

    Returns:
        The resized image with constant black borders added so that it measures
        target_width x target_height.
    """
    # Get current and target aspect ratios
    target_aspect = target_width / target_height
    image_aspect = image.shape[1] / image.shape[0]

    if image_aspect > target_aspect:
        # Image is wider than target: fit to width
        new_width = target_width
        # int() truncates, so an extreme panorama could yield 0 here and
        # cv2.resize rejects a zero-sized output; keep at least one pixel.
        new_height = max(1, int(target_width / image_aspect))
    else:
        # Image is taller than target: fit to height
        new_height = target_height
        new_width = max(1, int(target_height * image_aspect))

    resized = cv2.resize(image, (new_width, new_height))

    # Split the leftover space evenly; an odd pixel goes to the bottom/right.
    # One of the two pairs is always zero because one dimension already
    # matches the target.
    top_padding = (target_height - new_height) // 2
    bottom_padding = target_height - new_height - top_padding
    left_padding = (target_width - new_width) // 2
    right_padding = target_width - new_width - left_padding

    return cv2.copyMakeBorder(
        resized,
        top_padding,
        bottom_padding,
        left_padding,
        right_padding,
        cv2.BORDER_CONSTANT,
        value=[0, 0, 0]
    )

def create_slideshow(json_data, output_path='slideshow.mp4', fps=30, target_width=1920, target_height=1080):
    """Render transcript segments into a slideshow video with OpenCV.

    Each segment's image is resized/padded to the target frame and written for
    ``int(duration_seconds * fps)`` frames. No audio is added here.

    Args:
        json_data (dict): Must contain 'segments', a list of dicts with the
            keys 'image_path' and 'duration_seconds'.
        output_path (str): Destination video file.
        fps (int): Frames per second of the output video.
        target_width (int): Output frame width in pixels.
        target_height (int): Output frame height in pixels.

    Raises:
        RuntimeError: If the video writer cannot be opened for `output_path`.
    """
    start_time = time.time()
    segments = json_data['segments']

    # Calculate total frames
    total_frames = sum(int(segment['duration_seconds'] * fps) for segment in segments)

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (target_width, target_height))
    if not out.isOpened():
        # Fail loudly instead of silently writing nothing and reporting success.
        out.release()
        raise RuntimeError(f"Could not open video writer for {output_path}")

    try:
        # Process each segment with progress bar
        with tqdm(total=total_frames, desc="Creating slideshow") as pbar:
            for segment in segments:
                n_frames = int(segment['duration_seconds'] * fps)

                img = cv2.imread(segment['image_path'])
                if img is None:
                    # NOTE: skipping a segment shortens the video by that
                    # segment's duration relative to its narration.
                    print(f"Warning: Could not read image {segment['image_path']}")
                    pbar.update(n_frames)  # keep the bar consistent with total_frames
                    continue

                # Resize image and add black bars while maintaining aspect ratio
                img = resize_with_padding(img, target_width, target_height)

                # Write the image for duration * fps frames
                for _ in range(n_frames):
                    out.write(img)
                    pbar.update(1)
    finally:
        # Always finalize the file, even if a frame fails part-way through.
        out.release()

    elapsed_time = time.time() - start_time
    print(f"\nSlideshow created in {elapsed_time:.2f} seconds")
    print(f"Output saved to: {output_path}")


In [None]:

# Convert Pydantic model to compatible dictionary format
slideshow_dict = {'segments': []}
for segment in transcript_segments.segments:
    slideshow_dict['segments'].append({
        'image_path': segment.image_path,
        'duration_seconds': segment.duration_seconds,
        'text': segment.text,
    })

# Use existing create_slideshow function
create_slideshow(slideshow_dict)

In [104]:
# Create audio with TTS model
from pathlib import Path
from openai import OpenAI
from pydub import AudioSegment
import os

# Module-level client used by create_tts_for_segments below. No key is passed
# here, so it presumably relies on the OPENAI_API_KEY environment variable set
# earlier in the notebook.
client = OpenAI()

def create_tts_for_segments(transcript_segments, output_dir="tts_segments"):
    """
    Narrate every transcript segment with TTS and stitch the clips into one track.

    Each segment's text is synthesised to ``<output_dir>/segment_<i>.mp3``. A clip
    shorter than its segment's duration is padded with trailing silence; a longer
    clip is kept as-is and a warning is printed.

    Args:
        transcript_segments (SlideshowTranscript): Transcript with timing and text information
        output_dir (str): Directory to save individual audio files (created if missing)

    Returns:
        str: Path to the combined audio file
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Start from an empty track and append one clip per segment.
    narration = AudioSegment.silent(duration=0)

    for i, segment in enumerate(transcript_segments.segments):
        clip_path = f"{output_dir}/segment_{i}.mp3"

        # Synthesise the narration line and write it straight to disk
        client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=segment.text
        ).stream_to_file(clip_path)

        clip = AudioSegment.from_mp3(clip_path)

        # Compare in milliseconds: positive shortfall means the clip is too short
        wanted_ms = int(segment.duration_seconds * 1000)
        actual_ms = len(clip)
        shortfall_ms = wanted_ms - actual_ms

        if shortfall_ms > 0:
            # Pad with trailing silence so the clip lasts as long as its slide
            clip = clip + AudioSegment.silent(duration=shortfall_ms)
        elif shortfall_ms < 0:
            # Overlong clips are not trimmed, only reported
            print(f"Warning: Audio for segment {i} is longer than segment duration")
            print(f"Audio: {actual_ms/1000:.2f}s, Segment: {segment.duration_seconds:.2f}s")

        narration += clip

    # Write the stitched track next to the per-segment files
    combined_file_path = f"{output_dir}/combined_narration.mp3"
    narration.export(combined_file_path, format="mp3")

    print(f"Total audio duration: {len(narration)/1000:.2f} seconds")

    return combined_file_path

In [None]:
# Usage
# Makes one TTS request per segment, then writes the combined narration track.
narration_path = create_tts_for_segments(transcript_segments)
print(f"Combined narration saved to: {narration_path}")

In [121]:
# Combine video and audio with moviepy
from moviepy import VideoFileClip, AudioFileClip, CompositeAudioClip

def combine_video_and_narration(video_path, narration_path, output_path):
    """
    Combine video with narration audio using CompositeAudioClip

    A warning is printed when the narration and the video differ in length;
    neither is trimmed or padded here.

    Args:
        video_path (str): Path to the video file
        narration_path (str): Path to the narration audio file
        output_path (str): Path for the output video
    """
    # Pre-bind both names so the cleanup below cannot hit an unbound variable
    # (and mask the real error) when opening either file fails.
    videoclip = None
    audioclip = None
    try:
        # Load video and audio clips
        videoclip = VideoFileClip(video_path)
        audioclip = AudioFileClip(narration_path)

        # Ensure narration duration matches video duration
        if audioclip.duration > videoclip.duration:
            print(f"Warning: Narration ({audioclip.duration:.2f}s) is longer than video ({videoclip.duration:.2f}s)")
        elif audioclip.duration < videoclip.duration:
            print(f"Warning: Narration ({audioclip.duration:.2f}s) is shorter than video ({videoclip.duration:.2f}s)")

        # Create composite audio and set it to video
        new_audioclip = CompositeAudioClip([audioclip])
        videoclip.audio = new_audioclip

        # Write the final video with higher quality settings
        videoclip.write_videofile(output_path,
                                  codec='libx264',
                                  audio_codec='aac',
                                  bitrate='8000k',  # Increased video bitrate
                                  audio_bitrate='320k',  # Increased audio bitrate
                                  preset='slow',  # Better compression, but slower encoding
                                  threads=4)  # Use 4 threads for faster encoding

    finally:
        # Clean up only the clips that were actually opened
        for clip in (videoclip, audioclip):
            if clip is not None:
                clip.close()

In [None]:
# These paths are the default outputs of create_slideshow (slideshow.mp4) and
# create_tts_for_segments (tts_segments/combined_narration.mp3).
video_path = "slideshow.mp4"
narration_path = "tts_segments/combined_narration.mp3"
output_path = "final_slideshow_with_narration.mp4"

combine_video_and_narration(video_path, narration_path, output_path)
print(f"Final video saved to: {output_path}")

# Archived/Helpers

In [None]:
# reads the transcript.json file and recreates the data models if u don't want to run the function again
import json
from pydantic import BaseModel
from typing import List

# Recreate the data models
# Same fields as the SlideSegment model used when generating the transcript;
# redefined so this cell can run without executing the generation cell.
class SlideSegment(BaseModel):
    image_number: int  # segment label
    duration_seconds: float  # time the slide stays on screen, in seconds
    text: str  # narration line for the slide
    image_path: str  # path of the photo to show

# Container model for the saved transcript: a list of slide segments.
class SlideshowTranscript(BaseModel):
    segments: List[SlideSegment]
# Load a previously saved transcript from transcript.json in the working directory.
with open('transcript.json', 'r') as f:
    transcript_dict = json.load(f)

# Convert back to Pydantic model (validates that every segment has the expected fields)
transcript_segments = SlideshowTranscript(**transcript_dict)

In [5]:
# save formatted photos if u don't want to send to openai again
import json
from datetime import datetime

# Custom JSON encoder to handle datetime objects
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that writes datetime objects as ISO 8601 strings."""

    def default(self, obj):
        """Return the ISO string for a datetime; defer everything else to the base class."""
        if not isinstance(obj, datetime):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return obj.isoformat()

# Save formatted photos to JSON
def save_photos_to_json(photos, output_file='formatted_photos_11_26_24_1.json'):
    """Save the photo records to `output_file` as indented JSON.

    Args:
        photos (list): Photo dictionaries; datetime values are written as ISO
            strings via DateTimeEncoder.
        output_file (str): Destination path.

    Failures are reported on stdout rather than raised.
    """
    try:
        # Serialize before opening the file: opening with 'w' truncates it, so
        # a failed dump would otherwise wipe a previously saved cache.
        payload = json.dumps(photos, cls=DateTimeEncoder, indent=2)
        with open(output_file, 'w') as f:
            f.write(payload)
        print(f"Successfully saved photos to {output_file}")
    except Exception as e:
        print(f"Error saving photos to JSON: {str(e)}")

# Use it (writes to the function's default output_file)
save_photos_to_json(formatted_photos)


Successfully saved photos to formatted_photos_11_26_24_1.json


In [78]:
# method to replace missing photos in the transcript worked to avoid errors but story line made no sense
def clean_slideshow_transcript(formatted_photos, transcript_segments):
    """
    Makes sure that the image exists, if not, it replaces the image with the most appropriate photo

    When every segment already references a known photo the transcript is
    returned unchanged and no request is made. Otherwise the model is asked to
    rewrite the segments; its answer is returned as-is and is not re-validated.

    Args:
        formatted_photos (list): List of dictionaries containing photo information
        transcript_segments (SlideshowTranscript): Transcript with timing and image information

    Returns:
        SlideshowTranscript: The original transcript, or the model's corrected version
    """
    # Find missing paths
    missing_paths = find_missing_paths(formatted_photos, transcript_segments)

    if not missing_paths:
        return transcript_segments

    # .dict() is deprecated in pydantic v2 in favour of .model_dump(); use the
    # new name when the installed version provides it.
    dump_segments = getattr(transcript_segments, "model_dump", None) or transcript_segments.dict
    current_segments = dump_segments()

    client = OpenAI()

    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": """You are doing quality control for a slideshow creation system. 
                Your job is to fix segments that reference non-existent images by replacing them with the most appropriate 
                existing image based on the context and descriptions."""
            },
            {
                "role": "user",
                "content": f"""These images don't exist in our source data: {missing_paths}

                Please update the transcript segments to use the most appropriate existing image for each missing image.
                Consider the context of the segment's text and the descriptions of available images.

                Available photos and their descriptions:
                {json.dumps(formatted_photos, indent=2)}

                Current transcript segments:
                {json.dumps(current_segments, indent=2)}"""
            }
        ],
        response_format=SlideshowTranscript
    )

    return response.choices[0].message.parsed

In [7]:
# uses moviepy, which took way too long and was kinda overkill lol
from moviepy import ImageClip, concatenate_videoclips, AudioClip
import os

def create_slideshow(transcript_segments):
    """
    Build a slideshow clip from transcript segments using moviepy (archived approach)

    NOTE: this reuses the name create_slideshow with a different signature than
    the OpenCV version in this notebook; whichever cell ran last wins.

    Args:
        transcript_segments (SlideshowTranscript): Transcript with timing and image information

    Returns:
        The concatenated clip; nothing is written to disk by this function.
    """
    # One still clip per segment, shown for that segment's duration.
    # (Centering each clip with set_position('center') was tried and left disabled.)
    still_clips = [
        ImageClip(segment.image_path, duration=segment.duration_seconds)
        for segment in transcript_segments.segments
    ]

    # Join the stills back to back
    return concatenate_videoclips(still_clips, method="compose")

    
    # Write the video file
    # final_video.write_videofile(
    #     "slideshow.mov",  # Changed extension to .mov
    #     fps=10,          # Slideshow so can be slow
    #     codec='libx264', 
    #     preset='medium',
    #     audio=False,
    # )

# Use it
# `result` holds the clip returned by create_slideshow; the write_videofile call
# that would save it is commented out above.
result =create_slideshow(transcript_segments)