In [None]:
# Requires Python 3.12.3
#%pip freeze > requirements.txt

# Setup

In [None]:
import cv2
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import tempfile
import os

from transformers import CLIPFeatureExtractor
from diffusers import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
import torch
from pathlib import Path

import numpy as np
import datetime
import random
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM


In [None]:
# Sanity check: as the last expression of the cell, this displays whether
# PyTorch can see a CUDA device (True/False).
torch.cuda.is_available()

In [None]:
# Working folders, resolved against the directory the kernel was started in.
# NOTE(review): the names `assts_dir` / `propmt_dir` look like typos of
# "assets" / "prompt"; they are kept as-is because other cells may use them.
assts_dir = Path.cwd() / "assets"
propmt_dir = Path.cwd() / "Prompts"
# NOTE(review): this path is relative (no leading "/"), so it only resolves when
# the working directory is the filesystem root — gvfs mounts presumably live
# under /run/user/<uid>/gvfs; confirm. It also embeds an account name and
# drive IDs; consider reading it from configuration instead.
GDrive_dir = Path("run/user/1000/gvfs/google-drive:host=gmail.com,user=aiartstudio.ai/0AOT4cSJ5oKlpUk9PVA/1cBJcIkDKKJziO4CPcNyoIBUOQ6n_MshJ")


# Classes

In [None]:

class ConceptPromptGenerator:
    """
    Turns a (concept, category) pair into a Stable Diffusion prompt, a negative
    prompt and a short fact by few-shot prompting a causal language model.
    """

    def __init__(self, model_name="microsoft/Phi-4-reasoning-plus", max_new_tokens=300, device=None):
        """
        Load the tokenizer and model and wrap them in a text-generation pipeline.

        Args:
            model_name (str): Hugging Face model identifier to load.
            max_new_tokens (int): Upper bound on the tokens generated per `generate` call.
            device (str, optional): Device to place the model on. Defaults to "cuda"
                                    when torch reports CUDA as available, otherwise "cpu".
        """
        self.device = device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): trust_remote_code=True runs code shipped with the model
        # repository — confirm the repository is trusted.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map=self.device,
            trust_remote_code=True
        )

        # NOTE(review): presumably redundant because device_map already placed
        # the model on self.device — confirm before removing.
        self.model.to(self.device)
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,

        )
        self.max_new_tokens = max_new_tokens

    def generate(self, concept: str, category: str) -> dict:
        """
        Generate a prompt, negative prompt and fact for one concept.

        Args:
            concept (str): The subject to describe (e.g. "dog").
            category (str): The category the concept belongs to (e.g. "animal").

        Returns:
            dict: Keys "Prompt", "Negative Prompt" and "Fact". A field the model
                  did not produce is left as an empty string.
        """
        few_shot = (
            "You are an expert in AI art prompting and factual summaries.\n"
            "Given a concept and its category, generate:\n"
            "1. A vivid prompt for Stable Diffusion 1.5\n"
            "2. A negative prompt to avoid rendering issues\n"
            "3. A short (around 150 characters) catchy, interesting and educational fact about the concept\n\n"
            "Example:\n"
            "Concept: dog\n"
            "Category: animal\n"
            "Prompt: a happy golden retriever playing in a field, photorealistic, warm sunlight, detailed fur, 4k, realistic anatomy\n"
            "Negative Prompt: blurry, extra limbs, distorted, low quality, overexposed, unrealistic eyes\n"
            "Fact: Dogs bark to communicate\n\n"
            f"Concept: {concept}\n"
            f"Category: {category}\n"
            "Prompt:"
        )

        generated = self.generator(
            few_shot,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )[0]["generated_text"]

        # Isolate the model's continuation. Splitting on "Concept: <concept>"
        # would match the few-shot example first whenever the requested
        # concept equals the example's ("dog"), so the echoed prompt is
        # stripped by position instead.
        if generated.startswith(few_shot):
            completion = generated[len(few_shot):]
        else:
            # Either only the continuation was returned, or the prompt was
            # echoed with changes; fall back to the last request header.
            # rpartition yields the whole text when the header is absent.
            header = f"Concept: {concept}\nCategory: {category}\nPrompt:"
            completion = generated.rpartition(header)[2]

        return self._parse_completion(completion)

    @staticmethod
    def _parse_completion(completion: str) -> dict:
        """Extract the three labelled fields from the text that follows the final "Prompt:"."""
        result = {"Prompt": "", "Negative Prompt": "", "Fact": ""}
        # The few-shot prompt ends with "Prompt:", so the continuation starts
        # in the middle of that line; restore the label before parsing.
        for raw_line in ("Prompt:" + completion).splitlines():
            line = raw_line.strip()
            if line.startswith("Concept:"):
                # The model started inventing another example; stop so its
                # fields cannot leak into this result.
                break
            for label in result:
                prefix = label + ":"
                if line.startswith(prefix):
                    # Keep the first value seen for each field.
                    if not result[label]:
                        result[label] = line[len(prefix):].strip()
                    break
            if all(result.values()):
                break
        return result
    

In [None]:

class VideoEditor:
    def __init__(self, fps=30, fourcc='mp4v', frame_size=(640, 480)):
        """
        Open a temporary video file for writing and initialise timeline state.

        Args:
            fps (int | float): Frame rate of the output video.
            fourcc (str): Four-character codec code passed to cv2.VideoWriter_fourcc.
            frame_size (tuple): (width, height) every frame is resized to.

        Raises:
            RuntimeError: If OpenCV cannot open the temporary file for writing.
        """
        self.fps = fps
        self.frame_size = frame_size
        self.fourcc = cv2.VideoWriter_fourcc(*fourcc)
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name that another process could claim first. The
        # descriptor is closed right away because OpenCV reopens the path.
        temp_fd, self.temp_video_path = tempfile.mkstemp(suffix='.mp4')
        os.close(temp_fd)
        self.video_writer = cv2.VideoWriter(self.temp_video_path, self.fourcc, self.fps, self.frame_size)
        if not self.video_writer.isOpened():
            # Fail early: an unopened writer would leave the temporary file empty.
            self.video_writer.release()
            os.remove(self.temp_video_path)
            raise RuntimeError(
                f"Could not open a video writer for codec {fourcc!r} at {self.temp_video_path}."
            )
        self.current_time = 0  # in seconds, tracks the actual current duration of the video content
        self.audio_clips = []  # list of (audio_clip, start_offset_on_video) tuples

        # Default settings for time text overlay
        self.default_show_time_text = False
        self.default_time_text_position = (50, 50)  # Default top-left corner
        self.default_time_text_color = (0, 255, 0)  # Default Green (B, G, R)
        self.default_time_text_font_scale = 1.0
        self.default_time_text_thickness = 2
        self.default_time_text_font = cv2.FONT_HERSHEY_SIMPLEX
        self.default_time_display_format = "HH:MM:SS.MS"

        # Global display-time offset. While None, the displayed time follows
        # the actual video current_time. When set, it is the base for every
        # later segment that does not pass its own display_time_start_offset.
        self.global_display_time_offset_start = None
        # Running displayed time, used while the global offset is active.
        self.display_time_counter = 0.0

    def set_global_display_time_offset_start(self, offset_time):
        """
        Choose the base time shown by the on-frame clock for later segments.

        Segments added afterwards with the time overlay enabled, and without
        their own display_time_start_offset, count up from this value.

        Args:
            offset_time (float): Starting time in seconds to display, or None
                                 to go back to showing the actual video time.

        Raises:
            TypeError: If offset_time is neither a number nor None.
            ValueError: If offset_time is negative.
        """
        if offset_time is None:
            # Disable the override and reset the running display counter.
            self.global_display_time_offset_start = None
            self.display_time_counter = 0.0
            return

        if not isinstance(offset_time, (int, float)):
            raise TypeError("offset_time must be a number (int or float) or None.")
        if offset_time < 0:
            raise ValueError("offset_time cannot be negative.")

        self.global_display_time_offset_start = offset_time
        # The running counter restarts from the new base.
        self.display_time_counter = offset_time


    def set_default_time_display_format(self, format_string):
        """
        Store the time template used when a call does not pass its own
        time_display_format.

        Args:
            format_string (str): Template built from the placeholders HH (hours),
                                 MM (minutes), SS (seconds) and MS (sub-second part).
                                 The upper-case and lower-case spellings
                                 (e.g. "hh", "mm", "ss", "ms") are both accepted.
                                 Examples: "HH:MM:SS.MS", "ss.ms", "MM:SS".
        """
        self.default_time_display_format = format_string

    def _resize_frame(self, frame):
        """
        Return `frame` scaled to this editor's (width, height) frame_size.
        """
        target_size = self.frame_size
        return cv2.resize(frame, target_size)

    def _convert_to_cv2(self, image):
        """
        Convert a supported image input to a 3-channel BGR frame of frame_size.

        Args:
            image: A file path (str or os.PathLike), a Pillow Image, or an
                   OpenCV image (NumPy array).

        Returns:
            np.ndarray: The image resized to the video's frame_size.

        Raises:
            FileNotFoundError: If a path is given and OpenCV cannot read it.
            ValueError: If `image` is none of the supported types.
        """
        if isinstance(image, (str, os.PathLike)):
            # cv2.imread needs a plain string, so pathlib.Path is unwrapped.
            img = cv2.imread(os.fspath(image))
            if img is None:
                raise FileNotFoundError(f"Image file not found or could not be read: {image}")
        elif isinstance(image, Image.Image):
            # Normalise to RGB first: grayscale ("L") and palette ("P")
            # images would otherwise make the RGB->BGR conversion fail.
            img = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
        elif isinstance(image, np.ndarray):
            img = image
            # The rest of the class assumes 3-channel BGR frames (np.vstack
            # in add_text_below_image, the colour video writer).
            if img.ndim == 2:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            elif img.ndim == 3 and img.shape[2] == 4:
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
        else:
            raise ValueError("Unsupported image type. Must be a file path (string), Pillow Image, or NumPy array.")
        return self._resize_frame(img)

    def _draw_time_on_frame(self, frame, time_in_sec, position, color, font_scale, thickness, font, time_format):
        """
        Draw a formatted timestamp onto a video frame (in place) and return it.

        Args:
            frame (np.ndarray): The OpenCV image frame; it is modified in place.
            time_in_sec (float): The time in seconds to display.
            position (tuple): (x, y) coordinates for the text.
            color (tuple): (B, G, R) color for the text.
            font_scale (float): Font scale factor.
            thickness (int): Line thickness for the text.
            font (int): OpenCV font type (e.g., cv2.FONT_HERSHEY_SIMPLEX).
            time_format (str): Template using the placeholders HH, MM, SS and MS
                               (or their lower-case spellings). MS is rendered
                               as a single digit: tenths of a second.

        Returns:
            np.ndarray: The same frame object with the text drawn on it.
        """
        # Work in whole tenths of a second. The epsilon absorbs binary
        # float error: 2.3 - 2 evaluates to 0.2999999999999998, which
        # plain truncation would display as ".2" instead of ".3".
        total_tenths = int(time_in_sec * 10 + 1e-9)
        total_seconds_int, tenths = divmod(total_tenths, 10)
        seconds = total_seconds_int % 60
        minutes = (total_seconds_int // 60) % 60
        hours = total_seconds_int // 3600

        # Replaced in insertion order; "MM" and "SS" are substituted before
        # "MS", so "MMSS" is not misread as containing "MS".
        replacements = {
            "HH": f"{hours:02}", "hh": f"{hours:02}",
            "MM": f"{minutes:02}", "mm": f"{minutes:02}",
            "SS": f"{seconds:02}", "ss": f"{seconds:02}",
            "MS": f"{tenths:01}", "ms": f"{tenths:01}"
        }

        time_str = time_format
        for placeholder, value in replacements.items():
            time_str = time_str.replace(placeholder, value)

        cv2.putText(frame, time_str, position, font, font_scale, color, thickness, cv2.LINE_AA)
        return frame

    def add_image(self, image, duration_sec,
                  show_time_text=None, time_text_position=None, time_text_color=None,
                  time_text_font_scale=None, time_text_thickness=None, time_text_font=None,
                  display_time_start_offset=None, time_display_format=None):
        """
        Adds a single image to the video for a specified duration, with optional time display.

        Args:
            image: The image to add. Can be a file path (string), a Pillow Image object,
                   or an OpenCV image (NumPy array).
            duration_sec (float): The duration (in seconds) for which the image should be displayed.
                                  The number of frames written is fps * duration_sec, rounded down.
            show_time_text (bool, optional): Whether to show the current time on the image.
                                              Defaults to self.default_show_time_text.
            time_text_position (tuple, optional): (x, y) coordinates for the text.
                                                  Defaults to self.default_time_text_position.
            time_text_color (tuple, optional): (B, G, R) color for the text.
                                               Defaults to self.default_time_text_color.
            time_text_font_scale (float, optional): Font scale factor.
                                                    Defaults to self.default_time_text_font_scale.
            time_text_thickness (int, optional): Line thickness for the text.
                                                 Defaults to self.default_time_text_thickness.
            time_text_font (int, optional): OpenCV font type.
                                            Defaults to self.default_time_text_font.
            display_time_start_offset (float, optional): The starting time (in seconds) to display
                                                         on the video for this segment.
                                                         Precedence: local param > global param > actual video current time.
            time_display_format (str, optional): The format for the time string (e.g., "HH:MM:SS.MS").
                                                 Precedence: local param > default class param.

        Raises:
            ValueError: If duration_sec is negative.
        """
        if duration_sec < 0:
            # A negative duration would write no frames yet rewind current_time.
            raise ValueError("duration_sec cannot be negative.")

        frame = self._convert_to_cv2(image)
        # The epsilon keeps a product that is mathematically whole (e.g. 9)
        # but evaluates to 8.999999999999995 from losing a frame.
        frame_count = int(self.fps * duration_sec + 1e-9)

        # Resolve text display parameters
        _show_text = self.default_show_time_text if show_time_text is None else show_time_text
        _position = self.default_time_text_position if time_text_position is None else time_text_position
        _color = self.default_time_text_color if time_text_color is None else time_text_color
        _font_scale = self.default_time_text_font_scale if time_text_font_scale is None else time_text_font_scale
        _thickness = self.default_time_text_thickness if time_text_thickness is None else time_text_thickness
        _font = self.default_time_text_font if time_text_font is None else time_text_font
        _time_format = self.default_time_display_format if time_display_format is None else time_display_format

        # Determine the base time for display based on precedence
        if display_time_start_offset is not None:
            # Local override takes highest precedence
            _base_display_time_for_this_segment = display_time_start_offset
        elif self.global_display_time_offset_start is not None:
            # Global override applies if no local override. Use the continuous display_time_counter.
            _base_display_time_for_this_segment = self.display_time_counter
        else:
            # Default fallback: actual video time
            _base_display_time_for_this_segment = self.current_time

        if _show_text:
            for i in range(frame_count):
                current_display_time = _base_display_time_for_this_segment + (i / self.fps)
                # Draw on a copy so the timestamp of one frame does not
                # bleed into the next.
                stamped = self._draw_time_on_frame(
                    frame.copy(), current_display_time, _position, _color, _font_scale, _thickness, _font, _time_format
                )
                self.video_writer.write(stamped)
        else:
            # Without an overlay every frame is identical, so no per-frame
            # copy is needed.
            for _ in range(frame_count):
                self.video_writer.write(frame)

        # Always update the actual video time
        self.current_time += duration_sec

        # Only update display_time_counter if global offset is active AND no local offset was used
        if display_time_start_offset is None and self.global_display_time_offset_start is not None:
            self.display_time_counter += duration_sec


    def add_images_from_list(self, images, total_duration_sec,
                             show_time_text=None, time_text_position=None, time_text_color=None,
                             time_text_font_scale=None, time_text_thickness=None, time_text_font=None,
                             display_time_start_offset=None, time_display_format=None):
        """
        Adds several images to the video, distributing them evenly over a total duration,
        with optional time display on each image.

        Args:
            images: A directory path (str or os.PathLike) whose image files are added in
                    sorted filename order, or a list/tuple of image file paths, of OpenCV
                    images (NumPy arrays), or of Pillow images.
            total_duration_sec (float): The total duration (in seconds) that these images
                                        should occupy in the video.
            show_time_text (bool, optional): Whether to show the current time on the image.
                                              Defaults to self.default_show_time_text.
            time_text_position (tuple, optional): (x, y) coordinates for the text.
                                                  Defaults to self.default_time_text_position.
            time_text_color (tuple, optional): (B, G, R) color for the text.
                                               Defaults to self.default_time_text_color.
            time_text_font_scale (float, optional): Font scale factor.
                                                    Defaults to self.default_time_text_font_scale.
            time_text_thickness (int, optional): Line thickness for the text.
                                                 Defaults to self.default_time_text_thickness.
            time_text_font (int, optional): OpenCV font type.
                                            Defaults to self.default_time_text_font.
            display_time_start_offset (float, optional): The starting time (in seconds) to display
                                                         on the video for this batch.
                                                         Precedence: local param > global param > actual video current time.
            time_display_format (str, optional): The format for the time string (e.g., "HH:MM:SS.MS").
                                                 Precedence: local param > default class param.

        Raises:
            ValueError: If `images` is a path that is not a directory, a sequence mixing
                        paths with in-memory images, or an unsupported type.
        """
        image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

        if isinstance(images, (str, os.PathLike)):
            directory = os.fspath(images)
            if not os.path.isdir(directory):
                # Previously reported as an "unsupported type", which hid the real cause.
                raise ValueError(f"'images' path is not an existing directory: {directory}")
            image_list = [
                os.path.join(directory, filename)
                for filename in sorted(os.listdir(directory))
                if filename.lower().endswith(image_extensions)
            ]
        elif isinstance(images, (list, tuple)):
            # Unwrap pathlib.Path entries to plain strings for add_image.
            image_list = [os.fspath(item) if isinstance(item, os.PathLike) else item for item in images]
            only_paths = all(isinstance(item, str) for item in image_list)
            only_images = all(isinstance(item, (np.ndarray, Image.Image)) for item in image_list)
            if not (only_paths or only_images):
                raise ValueError("List must contain only strings (paths), OpenCV images, or Pillow images.")
        else:
            raise ValueError("Unsupported 'images' type. Must be a directory path (string), a list of paths, a list of OpenCV images, or a list of Pillow images.")

        if not image_list:
            print("No images found to add.")
            return

        single_image_duration = total_duration_sec / len(image_list)

        for i, img in enumerate(image_list):
            # With an explicit batch offset, each image starts where the
            # previous one ended. Otherwise None is passed so add_image uses
            # the global offset or actual video time and keeps counting.
            if display_time_start_offset is not None:
                offset_for_this_image = display_time_start_offset + (i * single_image_duration)
            else:
                offset_for_this_image = None

            self.add_image(img, single_image_duration,
                           show_time_text=show_time_text,
                           time_text_position=time_text_position,
                           time_text_color=time_text_color,
                           time_text_font_scale=time_text_font_scale,
                           time_text_thickness=time_text_thickness,
                           time_text_font=time_text_font,
                           display_time_start_offset=offset_for_this_image,
                           time_display_format=time_display_format)

    def add_text_below_image(self, image, text, duration_sec,
                             text_box_height_ratio=0.2,
                             background_color=(0, 0, 0),
                             text_color=(255, 255, 255),
                             text_horizontal_alignment="center",
                             text_vertical_alignment="center",
                             font_scale=0.7, thickness=2, font=cv2.FONT_HERSHEY_SIMPLEX,
                             show_time_text=None, time_text_position=None, time_text_color=None,
                             time_text_font_scale=None, time_text_thickness=None, time_text_font=None,
                             display_time_start_offset=None, time_display_format=None):
        """
        Adds an image with text displayed in a box below it for a given duration.
        The combined content fits within the video's frame_size.

        Text that fits the frame width (minus a 10 px margin on each side) is drawn on
        one line. Wider text is word-wrapped onto several lines so it is not cut off at
        the frame edge. Lines that do not fit the box height are still clipped.

        Args:
            image: The input image (path, Pillow, or NumPy array).
            text (str): The text string to display.
            duration_sec (float): The duration (in seconds) for which this combined frame should be displayed.
            text_box_height_ratio (float): Ratio (0.0 to 1.0) of the frame height allocated to the text box.
            background_color (tuple): (B, G, R) color for the text box background.
            text_color (tuple): (B, G, R) color for the text.
            text_horizontal_alignment (str): "center" or "right"; any other value aligns left.
            text_vertical_alignment (str): "center" or "bottom"; any other value aligns to the top.
            font_scale (float): Font scale factor for the text.
            thickness (int): Line thickness for the text.
            font (int): OpenCV font type (e.g., cv2.FONT_HERSHEY_SIMPLEX).
            show_time_text, time_text_position, time_text_color, time_text_font_scale,
            time_text_thickness, time_text_font, display_time_start_offset,
            time_display_format: Optional time-overlay settings, forwarded unchanged
                                 to add_image (see its documentation).

        Raises:
            ValueError: If text_box_height_ratio is outside [0.0, 1.0] or leaves no room
                        for the image.
        """
        if not (0.0 <= text_box_height_ratio <= 1.0):
            raise ValueError("text_box_height_ratio must be between 0.0 and 1.0.")

        original_img_frame = self._convert_to_cv2(image)

        video_width, video_height = self.frame_size
        text_box_height = int(video_height * text_box_height_ratio)
        image_display_height = video_height - text_box_height

        if image_display_height <= 0:
            raise ValueError("Image display height is zero or negative. Reduce text_box_height_ratio or increase frame_size.")

        resized_img_for_top = cv2.resize(original_img_frame, (video_width, image_display_height))
        text_section = np.full((text_box_height, video_width, 3), background_color, dtype=np.uint8)

        margin = 10  # same padding the left/right/top/bottom alignments use

        def _text_width(candidate):
            return cv2.getTextSize(candidate, font, font_scale, thickness)[0][0]

        (full_width, text_height_actual), baseline = cv2.getTextSize(text, font, font_scale, thickness)

        # Split into lines. Text that already fits is kept verbatim on one line.
        max_line_width = video_width - 2 * margin
        if full_width <= max_line_width:
            lines = [text]
        else:
            lines = []
            current_line = ""
            for word in text.split():
                candidate = word if not current_line else f"{current_line} {word}"
                if current_line and _text_width(candidate) > max_line_width:
                    lines.append(current_line)
                    current_line = word
                else:
                    # A single over-long word stays on its own line.
                    current_line = candidate
            if current_line:
                lines.append(current_line)
            if not lines:
                lines = [text]

        # Baseline-to-baseline distance between wrapped lines.
        line_step = text_height_actual + baseline + 4
        extra_height = (len(lines) - 1) * line_step

        # Baseline of the first line; a single line reduces to the
        # original one-line placement.
        vertical = text_vertical_alignment.lower()
        if vertical == "center":
            first_baseline_y = (text_box_height + text_height_actual - extra_height) // 2
        elif vertical == "bottom":
            first_baseline_y = text_box_height - margin - extra_height
        else:  # Default to "top"
            first_baseline_y = text_height_actual + margin

        horizontal = text_horizontal_alignment.lower()
        for line_index, line in enumerate(lines):
            line_width = full_width if len(lines) == 1 else _text_width(line)
            if horizontal == "center":
                text_x = (video_width - line_width) // 2
            elif horizontal == "right":
                text_x = video_width - line_width - margin
            else:  # Default to "left"
                text_x = margin
            text_y = first_baseline_y + line_index * line_step
            cv2.putText(text_section, line, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)

        final_frame = np.vstack((resized_img_for_top, text_section))

        # The composed frame already has frame_size, so add_image's resize
        # should leave it unchanged. add_image then handles frame writing,
        # the time overlay and the timeline counters.
        self.add_image(final_frame, duration_sec,
                       show_time_text=show_time_text,
                       time_text_position=time_text_position,
                       time_text_color=time_text_color,
                       time_text_font_scale=time_text_font_scale,
                       time_text_thickness=time_text_thickness,
                       time_text_font=time_text_font,
                       display_time_start_offset=display_time_start_offset,
                       time_display_format=time_display_format)

    def add_video(self, video_path):
        """
        Appends another video clip to the current video, keeping its original duration.

        Frames are written at this editor's fps. If the source has a different frame
        rate, its frames are repeated or dropped so the clip keeps its playback speed.
        The timeline advances by the number of frames actually written.

        Args:
            video_path (str): Path to the video file to be added.

        Raises:
            FileNotFoundError: If the video cannot be opened.
        """
        cap = cv2.VideoCapture(os.fspath(video_path))
        if not cap.isOpened():
            cap.release()
            raise FileNotFoundError(f"Video file not found or could not be opened: {video_path}")

        frames_read = 0
        frames_written = 0
        try:
            video_fps = cap.get(cv2.CAP_PROP_FPS)
            # Fall back to the editor's fps when the container reports no
            # usable rate (0, negative, or NaN).
            if not video_fps or video_fps <= 0 or video_fps != video_fps:
                video_fps = self.fps
            output_frames_per_source_frame = self.fps / video_fps

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frames_read += 1
                # Output frames that should exist once this source frame
                # has played. With equal rates this is one write per frame.
                target_total = int(round(frames_read * output_frames_per_source_frame))
                repeats = target_total - frames_written
                if repeats <= 0:
                    continue  # source is faster than the output: drop this frame
                frame = self._resize_frame(frame)
                for _ in range(repeats):
                    self.video_writer.write(frame)
                frames_written = target_total
        finally:
            # Release the capture even if reading or writing raised.
            cap.release()

        # Derived from what was written, not from container metadata, so
        # current_time matches the file and later audio offsets line up.
        duration = frames_written / self.fps
        self.current_time += duration
        # add_video has no display_time_start_offset parameter, so the
        # display counter advances whenever the global offset is active.
        if self.global_display_time_offset_start is not None:
            self.display_time_counter += duration


    def add_audio(self, audio_path, audio_clip_start=None, audio_clip_end=None, video_start_offset=None):
        """
        Adds an audio clip to the video timeline. The clip is only recorded
        here; it is mixed into the video when save() is called.

        Args:
            audio_path (str): Path to the audio file.
            audio_clip_start (float, optional): The start time (in seconds) within the audio file itself.
                                                Defaults to 0 (beginning of the audio file).
            audio_clip_end (float, optional): The end time (in seconds) within the audio file itself.
                                              Defaults to the end of the audio clip.
            video_start_offset (float, optional): The time (in seconds) on the video timeline where this
                                                  audio should start. If None, it starts at the current
                                                  end time of the video (`self.current_time`).

        Raises:
            ValueError: If audio_clip_start is greater than audio_clip_end, or if the
                        audio file cannot be loaded.
        """
        # Validate the range before opening the file, so a bad range does
        # not leave an opened audio reader behind.
        if audio_clip_start is not None and audio_clip_end is not None and audio_clip_start > audio_clip_end:
            raise ValueError("audio_clip_start cannot be greater than audio_clip_end.")

        try:
            audio_clip = AudioFileClip(audio_path)
        except Exception as e:
            # Chain the original error so its traceback stays visible.
            raise ValueError(f"Could not load audio file {audio_path}: {e}") from e

        if audio_clip_start is not None or audio_clip_end is not None:
            start_subclip = audio_clip_start if audio_clip_start is not None else 0
            end_subclip = audio_clip_end if audio_clip_end is not None else audio_clip.duration

            try:
                audio_clip = audio_clip.subclip(start_subclip, end_subclip)
            except Exception:
                # Release the reader if trimming is rejected.
                audio_clip.close()
                raise

        offset_on_video = video_start_offset if video_start_offset is not None else self.current_time

        self.audio_clips.append((audio_clip, offset_on_video))

    def get_video_duration(self):
        """
        Report how many seconds of content have been added so far
        (the value of `self.current_time`).
        """
        elapsed_seconds = self.current_time
        return elapsed_seconds

    def save(self, output_path):
        """
        Finalizes the video, mixes in any audio added with add_audio, and writes the
        result to `output_path`.

        On success the temporary video file is deleted and the audio readers are
        closed, so the editor cannot be saved a second time. If writing fails, the
        temporary file and audio clips are kept so save() can be retried.

        Args:
            output_path (str): Destination file for the encoded video.
        """
        self.video_writer.release()

        final_clip = VideoFileClip(self.temp_video_path)
        try:
            if self.audio_clips:
                # Place every clip at its offset on the video timeline.
                all_audios = [audio.set_start(offset) for audio, offset in self.audio_clips]
                final_clip = final_clip.set_audio(CompositeAudioClip(all_audios))

            print(f"Saving video to {output_path}...")
            final_clip.write_videofile(output_path, codec='libx264', audio_codec='aac')
        finally:
            # Close the clip even when encoding fails.
            final_clip.close()

        # Reached only after a successful write.
        for audio, _ in self.audio_clips:
            audio.close()
        os.remove(self.temp_video_path)
        print("Video saved successfully and temporary file removed.")



In [None]:
class SD15ImageGenerator:
    """
    Wrapper around a Stable Diffusion 1.5 pipeline that also records the
    decoded image at every denoising step.
    """

    def __init__(self, model_id="runwayml/stable-diffusion-v1-5", use_cuda=True, num_inference_steps=25):
        """
        Initialize the Stable Diffusion 1.5 pipeline and inference settings.

        Args:
            model_id (str): Hugging Face identifier of the Stable Diffusion checkpoint.
            use_cuda (bool): Use the GPU when torch reports CUDA as available.
            num_inference_steps (int): Number of denoising steps per generated image.
        """
        self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
        self.num_inference_steps = num_inference_steps
        self.intermediate_images = []

        # Load the safety checker and feature extractor
        # You might need to specify the subfolder if they are not at the top level of the model_id
        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
        feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
        # Default negative prompt, used when generate_image receives none.
        self.negative_prompt = """deformed, distorted, disfigured, bad anatomy, ugly, tiling, poorly drawn hands, poorly drawn face, 
                                  out of frame, low quality, jpeg artifacts, duplicate, morbid, mutilated, extra fingers, mutated hands,  mutation, blurry, dehydrated, 
                                  bad proportions, extra limbs, cloned face, gross proportions, malformed limbs, missing arms, missing legs, extra hands, fused fingers, wrong hand, 
                                  long neck, worst quality, watermark, signature, text, error, cropped, username, logo, lowres, oversaturated, washed out, 
                                  cloned, bad composition, crosseyed , squint, lazy eye , bad eyes, wrong eyes, missing teeth, bad teeth, ugly teeth, open mouth, too many teeth,
                                  extra tongue, wrong mouth, ugly mouth, bad mouth, bad nose, ugly nose, wrong nose, missing nose, bad ear, ugly ear, wrong ear, missing ear,
                                  extra ear, double ear, three ears, mutated ear, long ear, short ear, big ear, small ear,
                                  bad hair, ugly hair, wrong hair, missing hair, bad skin, ugly skin, wrong skin, 
                                  missing skin, extra skin, mutated skin, bad clothing, ugly clothing, wrong clothing,missing clothing, mutated clothing, 
                                  big clothing, small clothing, bad background, ugly background, wrong background, bad lighting, ugly lighting, wrong lighting"""
        self.pipe = StableDiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
             safety_checker=safety_checker,
            feature_extractor=feature_extractor # Don't forget the feature_extractor
        ).to(self.device)

    def _capture_step(self, step, timestep, latents):
        """
        Pipeline callback: decode the current latents and store the result as a PIL image.

        Only the first image of the batch is kept. `step` and `timestep` are
        accepted because the callback signature requires them; they are unused.
        """
        # Decode latent to image at this step
        with torch.no_grad():
            image = self.pipe.vae.decode(latents / self.pipe.vae.config.scaling_factor).sample
            # Map the decoder output from [-1, 1] to [0, 1] before converting to 8-bit.
            image = (image / 2 + 0.5).clamp(0, 1)
            image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
            image_pil = Image.fromarray((image * 255).astype("uint8"))
            self.intermediate_images.append(image_pil)

    def generate_image(self, prompt, negative_prompt=None, guidance_scale=7.5):
        """
        Generate an image and collect the intermediate steps.

        Args:
            prompt (str): Subject description; fixed quality keywords are added around it.
            negative_prompt (str, optional): Defaults to self.negative_prompt.
            guidance_scale (float): Guidance scale passed to the pipeline.

        Returns:
            list: PIL images, one per callback invocation (every step).
        """
        self.intermediate_images = []
        negative_prompt = negative_prompt or self.negative_prompt

        # NOTE(review): `callback` / `callback_steps` are presumably deprecated in
        # newer diffusers releases in favour of `callback_on_step_end` — confirm
        # against the pinned version.
        with torch.autocast(self.device) if self.device == "cuda" else torch.no_grad():
            _ = self.pipe(
                prompt="high resolution image of: "+prompt +" ,8K, best quality, masterpiece, photorealistic, ultra-detailed, sharp focus",
                negative_prompt=negative_prompt,
                guidance_scale=guidance_scale,
                num_inference_steps=self.num_inference_steps,
                callback=self._capture_step,
                callback_steps=1  # capture every step
            )

        return self.intermediate_images

    def save_image(self, image: "Image.Image", output_path: str):
        """
        Save a single PIL image to the specified path, creating the parent
        directory when the path has one.
        """
        parent_dir = os.path.dirname(output_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so the directory is only created when present.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        image.save(output_path)
        print(f"Image saved to {output_path}")

    def save_images(self, images, directory="generated"):
        """
        Save a list of images to the given directory as step_00.png, step_01.png, ...
        """
        os.makedirs(directory, exist_ok=True)
        for i, img in enumerate(images):
            path = os.path.join(directory, f"step_{i:02d}.png")
            img.save(path)
        print(f"Saved {len(images)} images to '{directory}/'")


# Functions


In [None]:


def generate_evenly_distributed_values(data):
    """
    Flatten a list of (count, start, end) specs into one list of floats.

    Every spec contributes ``count`` evenly spaced values running from
    ``start`` to ``end`` with both endpoints included (``np.linspace``).
    Specs whose count is not positive contribute nothing.

    Args:
        data (list): Tuples of (count, start, end).

    Returns:
        list: All generated values, concatenated in the order of the specs.
    """
    return [
        value
        for count, lower, upper in data
        if count > 0
        for value in np.linspace(lower, upper, count).tolist()
    ]

# Example Usage:
# data1 = [(5, 0, 10), (3, 100, 102)]
# result1 = generate_evenly_distributed_values(data1)
# print(f"Result for data1: {result1}")
# # Expected output for data1: [0.0, 2.5, 5.0, 7.5, 10.0, 100.0, 101.0, 102.0]

# data2 = [(1, 5, 5), (4, -2, 2)]
# result2 = generate_evenly_distributed_values(data2)
# print(f"Result for data2: {result2}")
# # Expected output for data2: [5.0, -2.0, -0.6666666666666666, 0.6666666666666666, 2.0]

# data3 = []
# result3 = generate_evenly_distributed_values(data3)
# print(f"Result for data3: {result3}")
# # Expected output for data3: []

# data4 = [(0, 1, 10)]
# result4 = generate_evenly_distributed_values(data4)
# print(f"Result for data4: {result4}")
# # Expected output for data4: []


In [None]:

def generate_filename_by_datetime(postfix: str, extension: str) -> str:
    """
    Build a filename from the current local date/time, a postfix and an extension.

    The result has the form 'YYYY-MM-DD-HH-MM-SS_<postfix>.<extension>'
    (24-hour clock, every field zero-padded).

    Args:
        postfix (str): Label placed after the timestamp, separated by an underscore.
        extension (str): The file extension (e.g., 'mp4', 'txt', 'jpg').
                         A leading dot is tolerated and ignored.

    Returns:
        str: The generated filename.
    """
    timestamp_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # Accept '.mp4' as well as 'mp4' so the result never contains a double dot.
    extension = extension.lstrip(".")

    return f"{timestamp_str}_{postfix}.{extension}"


In [None]:

def add_text_to_image(
    image,
    text: str,
    org: tuple[int | None, int | None] = (10, 30),  # (x, baseline y) of the first line; None centers that axis
    font_face: int = cv2.FONT_HERSHEY_SIMPLEX,
    font_scale: float = 1.0,
    color: tuple[int, int, int] = (0, 0, 0),  # BGR color (Black by default)
    thickness: int = 2,
    background_color: tuple[int, int, int] = (255, 255, 255), # White background for new canvas
    text_background_color: tuple[int, int, int] = (255, 255, 255), # White background for text by default
    text_background_transparency: float = 0.8, # Blend weight of the text box (0.8 = 80% opaque)
    padding_x: int = 20, # Horizontal padding for text background
    padding_y: int = 20, # Vertical padding for text background
    wordwrap: bool = False # If True, wraps text to fit image width
) -> np.ndarray:
    """
    Draw text (optionally word-wrapped) on an image, growing the canvas if needed.

    The text block is drawn on top of a blended background box. When the
    padded box would fall outside the image, a larger canvas filled with
    ``background_color`` is created and the original image is pasted into it.
    The input is never modified: the function always draws on its own copy.

    Args:
        image: The input image. Can be an OpenCV (numpy.ndarray) or Pillow (PIL.Image.Image) image.
        text (str): The text string to add.
        org (tuple[int | None, int | None]): (x, y) of the bottom-left corner of the first line,
                               where y is the text baseline. Defaults to (10, 30).
                               If x or y is None, the block is centered in that direction.
        font_face (int): Font type. See cv2.FONT_HERSHEY_* for options.
                         Defaults to cv2.FONT_HERSHEY_SIMPLEX.
        font_scale (float): Font scale factor multiplied by the font-specific base size.
                            Defaults to 1.0.
        color (tuple[int, int, int]): Text color in BGR format. Defaults to (0, 0, 0) (Black).
        thickness (int): Thickness of the text lines. Defaults to 2.
        background_color (tuple[int, int, int]): Color used to fill the extended canvas if the image
                                                  needs to grow. Defaults to (255, 255, 255) (White).
        text_background_color (tuple[int, int, int]): Color of the box behind the text, in BGR.
                                                       Defaults to (255, 255, 255) (White).
        text_background_transparency (float): Blend weight of the text box, despite the name:
                                              0.0 draws no box, 1.0 draws a fully opaque box.
                                              Defaults to 0.8.
        padding_x (int): Horizontal padding around the text inside the box. Defaults to 20 pixels.
        padding_y (int): Vertical padding around the text inside the box. Defaults to 20 pixels.
        wordwrap (bool): If True, wraps text at spaces so lines fit the width available in the
                         original image. A single word that is too wide stays on its own line.
                         Defaults to False.

    Returns:
        numpy.ndarray: A new image with the added text, in OpenCV (BGR) format.

    Raises:
        TypeError: If ``image`` is neither a PIL image nor a NumPy array.
    """
    img_cv = _as_bgr_canvas(image)
    h_orig, w_orig = img_cv.shape[:2]

    # Line pitch comes from a fixed sample string so every line advances by the same amount.
    (_, text_h_single_line), _ = cv2.getTextSize("Tg", font_face, font_scale, thickness)
    line_spacing = int(text_h_single_line * 0.5)
    line_advance = text_h_single_line + line_spacing

    # --- Split the text into lines ---
    if wordwrap:
        if org[0] is None:  # Centered: both sides keep their padding.
            available_width = w_orig - (2 * padding_x)
        else:  # Fixed x: wrap between that x and the right edge.
            available_width = w_orig - org[0] - padding_x
        available_width = max(10, available_width)  # Guard against zero/negative widths.
        line_texts = _wrap_text_lines(text, available_width, font_face, font_scale, thickness)
    else:
        line_texts = [text]

    # (line_text, width, height_above_baseline, depth_below_baseline) per line.
    wrapped_lines_info = []
    for line_text in line_texts:
        (line_w, line_h), line_baseline = cv2.getTextSize(line_text, font_face, font_scale, thickness)
        wrapped_lines_info.append((line_text, line_w, line_h, line_baseline))

    # Wrapping an empty string yields no lines; there is nothing to draw.
    if not wrapped_lines_info:
        return img_cv

    max_overall_text_width = max(info[1] for info in wrapped_lines_info)

    # cv2.getTextSize reports the height ABOVE the baseline and, separately, the depth
    # below it. The block therefore spans from `first_line_height` above the first
    # baseline to `last_line_depth` below the last baseline.
    first_line_height = wrapped_lines_info[0][2]
    last_line_depth = wrapped_lines_info[-1][3]
    total_text_block_content_height = (
        first_line_height + (len(wrapped_lines_info) - 1) * line_advance + last_line_depth
    )

    # --- Desired block position relative to the ORIGINAL image ---
    if org[0] is not None:
        block_x = org[0]
    else:
        block_x = int((w_orig - max_overall_text_width) / 2)
    if org[1] is not None:  # org[1] is the baseline of the first line.
        block_y_top = org[1] - first_line_height
    else:
        block_y_top = int((h_orig - total_text_block_content_height) / 2)

    # --- Canvas extension ---
    # Left/top overflow shifts the original image by an offset; right/bottom overflow
    # only enlarges the canvas.
    offset_x_canvas = max(0, padding_x - block_x)
    offset_y_canvas = max(0, padding_y - block_y_top)
    new_w = max(w_orig + offset_x_canvas,
                block_x + offset_x_canvas + max_overall_text_width + padding_x)
    new_h = max(h_orig + offset_y_canvas,
                block_y_top + offset_y_canvas + total_text_block_content_height + padding_y)

    if new_w > w_orig or new_h > h_orig:
        extended_canvas = np.full((new_h, new_w, 3), background_color, dtype=np.uint8)
        extended_canvas[offset_y_canvas : offset_y_canvas + h_orig,
                        offset_x_canvas : offset_x_canvas + w_orig] = img_cv
        img_cv = extended_canvas

    # --- Final block position on the (possibly extended) canvas ---
    if org[0] is None:
        block_x = int((img_cv.shape[1] - max_overall_text_width) / 2)
    else:
        block_x += offset_x_canvas
    if org[1] is None:
        block_y_top = int((img_cv.shape[0] - total_text_block_content_height) / 2)
    else:
        block_y_top += offset_y_canvas

    # --- Background box, blended over the image ---
    if text_background_transparency > 0:
        x1_bg = max(0, block_x - padding_x)
        y1_bg = max(0, block_y_top - padding_y)
        x2_bg = min(img_cv.shape[1], block_x + max_overall_text_width + padding_x)
        y2_bg = min(img_cv.shape[0], block_y_top + total_text_block_content_height + padding_y)

        if x2_bg > x1_bg and y2_bg > y1_bg:
            overlay = img_cv.copy()
            cv2.rectangle(overlay, (x1_bg, y1_bg), (x2_bg, y2_bg), text_background_color, -1)
            # Outside the rectangle overlay == img_cv, so only the box area changes.
            alpha = text_background_transparency
            cv2.addWeighted(overlay, alpha, img_cv, 1 - alpha, 0, img_cv)

    # --- Text, one baseline per line ---
    first_baseline_y = block_y_top + first_line_height
    for line_index, (line_text, line_w, _, _) in enumerate(wrapped_lines_info):
        if org[0] is None:
            # Center each line individually on the final canvas.
            line_x = int((img_cv.shape[1] - line_w) / 2)
        else:
            line_x = block_x
        cv2.putText(img_cv, line_text, (line_x, first_baseline_y + line_index * line_advance),
                    font_face, font_scale, color, thickness, cv2.LINE_AA)

    return img_cv


def _as_bgr_canvas(image) -> np.ndarray:
    """Return a 3-channel BGR array for ``image`` that does not share memory with it."""
    if isinstance(image, Image.Image):
        pixels = np.array(image)
        if pixels.ndim == 2:  # Grayscale image
            return cv2.cvtColor(pixels, cv2.COLOR_GRAY2BGR)
        if pixels.shape[2] == 4:  # RGBA image
            return cv2.cvtColor(pixels, cv2.COLOR_RGBA2BGR)
        return cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    if isinstance(image, np.ndarray):
        if image.ndim == 2:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        if image.shape[2] == 4:
            # NOTE(review): 4-channel arrays are treated as RGBA, not OpenCV's BGRA — confirm intent.
            return cv2.cvtColor(image, cv2.COLOR_RGBA2BGR)
        # Copy so drawing never alters the caller's array.
        return image.copy()
    raise TypeError("Input image must be a PIL Image or a NumPy array (OpenCV format).")


def _wrap_text_lines(text, max_width, font_face, font_scale, thickness):
    """Greedily split ``text`` at spaces into lines whose rendered width fits ``max_width``."""
    lines = []
    line_words = []
    for word in text.split(' '):
        candidate = " ".join(line_words + [word]).strip()
        (candidate_w, _), _ = cv2.getTextSize(candidate, font_face, font_scale, thickness)
        if candidate_w > max_width and line_words:
            # The word does not fit: close the current line and start a new one with it.
            lines.append(" ".join(line_words))
            line_words = [word]
        else:
            line_words.append(word)
    remainder = " ".join(line_words)
    if remainder:
        lines.append(remainder)
    return lines


In [None]:

def add_text_to_image_old(
    image,
    text: str,
    org: tuple[int | None, int | None] = (10, 30),  # Bottom-left corner of the text string
    font_face: int = cv2.FONT_HERSHEY_SIMPLEX,
    font_scale: float = 1.0,
    color: tuple[int, int, int] = (0, 0, 0),  # BGR color (Black by default)
    thickness: int = 2,
    background_color: tuple[int, int, int] = (255, 255, 255), # White background for new canvas
    text_background_color: tuple[int, int, int] = (255, 255, 255), # White background for text by default
    text_background_transparency: float = 0.8, # Blend weight of the text box (0.8 = 80% opaque)
    padding_x: int = 5, # Horizontal padding for text background
    padding_y: int = 5 # Vertical padding for text background
) -> np.ndarray:
    """
    Legacy single-line variant: draw one line of text, growing the canvas if needed.

    Unlike ``add_text_to_image`` this version has no word wrapping. The input
    is never modified: the function always draws on its own copy.

    Args:
        image: The input image. Can be an OpenCV (numpy.ndarray) or Pillow (PIL.Image.Image) image.
        text (str): The text string to add.
        org (tuple[int | None, int | None]): (x, y) of the bottom-left corner of the text, where y
                               is the text baseline. Defaults to (10, 30). If x or y is None, the
                               text is centered in that direction on the original image.
        font_face (int): Font type. See cv2.FONT_HERSHEY_* for options.
                         Defaults to cv2.FONT_HERSHEY_SIMPLEX.
        font_scale (float): Font scale factor multiplied by the font-specific base size.
                            Defaults to 1.0.
        color (tuple[int, int, int]): Text color in BGR format. Defaults to (0, 0, 0) (Black).
        thickness (int): Thickness of the text lines. Defaults to 2.
        background_color (tuple[int, int, int]): Color used to fill the extended canvas if the image
                                                  needs to grow. Defaults to (255, 255, 255) (White).
        text_background_color (tuple[int, int, int]): Color of the box behind the text, in BGR.
                                                       Defaults to (255, 255, 255) (White).
        text_background_transparency (float): Blend weight of the text box, despite the name:
                                              0.0 draws no box, 1.0 draws a fully opaque box.
                                              Defaults to 0.8.
        padding_x (int): Horizontal padding around the text inside the box. Defaults to 5 pixels.
        padding_y (int): Vertical padding around the text inside the box. Defaults to 5 pixels.

    Returns:
        numpy.ndarray: A new image with the added text, in OpenCV (BGR) format.

    Raises:
        TypeError: If ``image`` is neither a PIL image nor a NumPy array.
    """

    # 1. Normalize the input to a private 3-channel BGR array.
    if isinstance(image, Image.Image):
        img_np = np.array(image)
        if img_np.ndim == 2: # Grayscale image
            img_cv = cv2.cvtColor(img_np, cv2.COLOR_GRAY2BGR)
        elif img_np.shape[2] == 4: # RGBA image
            img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
        else: # RGB image
            img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    elif isinstance(image, np.ndarray):
        if image.ndim == 2:
            img_cv = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        elif image.shape[2] == 4: # Handle RGBA if passed as numpy array
            img_cv = cv2.cvtColor(image, cv2.COLOR_RGBA2BGR)
        else:
            # Copy so drawing never alters the caller's array.
            img_cv = image.copy()
    else:
        raise TypeError("Input image must be a PIL Image or a NumPy array (OpenCV format).")

    h_orig, w_orig = img_cv.shape[:2]

    # 2. Measure the text: text_h is the height above the baseline,
    #    baseline is the depth below it.
    (text_w, text_h), baseline = cv2.getTextSize(text, font_face, font_scale, thickness)

    # Text origin on the original image; None means "center on that axis".
    # Vertical centering puts the baseline at h/2 + text_h/2.
    org_x = org[0] if org[0] is not None else int((w_orig - text_w) / 2)
    org_y = org[1] if org[1] is not None else int((h_orig + text_h) / 2)

    # 3. Canvas extension: left/top overflow shifts the original image by an
    #    offset; right/bottom overflow only enlarges the canvas.
    offset_x = max(0, padding_x - org_x)
    offset_y = max(0, text_h + padding_y - org_y)
    org_x += offset_x
    org_y += offset_y
    new_w = max(w_orig + offset_x, org_x + text_w + padding_x)
    new_h = max(h_orig + offset_y, org_y + baseline + padding_y)

    # 4. Create the larger canvas if needed and paste the original image into it.
    if new_w > w_orig or new_h > h_orig:
        new_image_canvas = np.full((new_h, new_w, 3), background_color, dtype=np.uint8)
        new_image_canvas[offset_y : offset_y + h_orig, offset_x : offset_x + w_orig] = img_cv
        img_cv = new_image_canvas

    adjusted_org = (org_x, org_y)

    # 5. Blend the background box behind the text.
    if text_background_transparency > 0:
        x1_bg = max(0, adjusted_org[0] - padding_x)
        y1_bg = max(0, adjusted_org[1] - text_h - padding_y)
        x2_bg = min(img_cv.shape[1], adjusted_org[0] + text_w + padding_x)
        y2_bg = min(img_cv.shape[0], adjusted_org[1] + baseline + padding_y)

        if x2_bg > x1_bg and y2_bg > y1_bg: # Only draw if the bounding box is valid
            overlay = img_cv.copy()
            cv2.rectangle(overlay, (x1_bg, y1_bg), (x2_bg, y2_bg), text_background_color, -1) # -1 fills the rectangle
            # Outside the rectangle overlay == img_cv, so only the box area changes.
            alpha = text_background_transparency
            cv2.addWeighted(overlay, alpha, img_cv, 1 - alpha, 0, img_cv)

    # 6. Draw the text itself.
    cv2.putText(img_cv, text, adjusted_org, font_face, font_scale, color, thickness, cv2.LINE_AA)

    return img_cv


In [None]:

def get_random_prompt(folder_path: str) -> tuple[str, str] | None:
    """
    Finds all .txt files in a specified folder, reads all lines from them,
    and returns a single random line along with its originating filename (without extension).

    Args:
        folder_path (str): The path to the folder to search for .txt files.

    Returns:
        tuple[str, str] | None: A tuple containing the filename (without extension)
                                and the randomly selected line, or None if no .txt
                                files are found or if they are empty.
    """
    # all_lines will now store tuples of (filename_without_extension, line_content)
    all_lines = []
    
    # Check if the provided path is a valid directory
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found or is not a directory.")
        return None

    # Iterate over all files in the specified folder
    for filename in os.listdir(folder_path):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        # Check if it's a file and has a .txt extension
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Read all lines from the current .txt file
                    lines_from_file = f.readlines()
                    
                    # Get the filename without extension
                    filename_without_ext = os.path.splitext(filename)[0]

                    # Extend the main list with tuples of (filename_without_ext, cleaned_line)
                    # Only add non-empty lines after stripping whitespace
                    for line in lines_from_file:
                        if line[0] == '#':
                            continue
                        cleaned_line = line.strip()
                        if cleaned_line: # Only add if the line is not empty after stripping
                            all_lines.append((filename_without_ext, cleaned_line))
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
                continue # Continue to the next file even if one fails

    # Check if any lines were collected
    if not all_lines:
        print(f"No .txt files found or all found files are empty in '{folder_path}'.")
        return None
    #print(*all_lines, sep='\n' )
    # Select and return a random (filename, line) tuple from the collected lines
    return random.choice(all_lines)


# run

In [None]:
# Image generator configured for 100 inference steps per image;
# gen_one_video() reads it as a global.
generator = SD15ImageGenerator(num_inference_steps=100)

# Prompt/fact generator, explicitly placed on the CPU; also read as a global
# by gen_one_video().
conceptPromptGenerator=ConceptPromptGenerator(device="cpu" )



In [None]:

# Per-frame display durations used by gen_one_video() (passed to editor.add_image;
# presumably seconds — confirm against VideoEditor).
# Each tuple is (count, start, end): 30 frames at 1/30, 30 ramping from 1/30 up to
# speed_stop, 30 held at speed_stop, then 9 ramping back down to 1/30.
speed_stop=0.5
# Earlier three-segment profile, kept for reference:
#speed_distribution = [(40, 1/30, speed_stop), (40, speed_stop, speed_stop), (19, speed_stop, 1/30)]
speed_distribution = [(30, 1/30, 1/30), (30,1/30, speed_stop),(30, speed_stop, speed_stop), (9, speed_stop, 1/30)]
result1 = generate_evenly_distributed_values(speed_distribution)
# One extra entry brings the list to 30 + 30 + 30 + 9 + 1 = 100 values; gen_one_video()
# warns when len(result1) differs from the number of generated images.
# NOTE(review): the value 5.202 is not explained anywhere visible — presumably a
# longer hold on the last frame; confirm.
result1.append(5.202)
print(f"Result for data1: {result1}")
print(f"len {len(result1)}, sum {sum(result1)}" )
# The printed length should be 100 for the distribution above.

In [None]:
def gen_one_video():
    """
    Render one "guess what the AI is painting" video and save it under videos/.

    Uses these notebook globals: ``propmt_dir``, ``assts_dir``, ``result1``
    (per-frame durations), ``generator`` and ``conceptPromptGenerator``.

    Steps:
        1. Pick a random (category, subject) pair from the prompt folder.
        2. Ask the prompt generator for a dict with the keys 'Prompt',
           'Negative Prompt' and 'Fact'.
        3. Generate the intermediate images and show each one, captioned, for
           its duration from ``result1``.
        4. Append a reveal frame (5) and a fact frame (10), add the audio track
           and save the file as 'videos/<timestamp>_<category>.mp4'.

    Raises:
        RuntimeError: If no prompt could be loaded or no image was generated.
    """
    subscribe_msg = "Think you're a fast problem-solver? Subscribe for fresh puzzles and fascinating facts! Cracked the code? Prove it! Share your answer AND your solve time in the comments below!"
    subscribe_position = (None, 580)  # Horizontally centered, baseline at y=580
    # BGR caption colors, alternated on the parity of the elapsed whole-number time.
    caption_colors = ((0, 0, 255), (255, 0, 255))

    picked = get_random_prompt(propmt_dir)
    if picked is None:
        # Fail with a clear message instead of a tuple-unpacking TypeError.
        raise RuntimeError(f"No prompt available in '{propmt_dir}'; cannot generate a video.")
    category, subject = picked
    print(category, subject)

    prompt_dic = conceptPromptGenerator.generate(subject, category)
    print(prompt_dic)
    images = generator.generate_image(prompt_dic['Prompt'], prompt_dic['Negative Prompt'], guidance_scale=7.5)
    if not images:
        raise RuntimeError("Image generation returned no frames; cannot build a video.")

    editor = VideoEditor(fps=30, frame_size=(512, 512))
    if len(result1) != len(images):
        print("time_duration and number of images are not matched.")

    logo = Image.open(str(assts_dir/"aiartstudio_logo.png"))
    editor.add_image(logo, 3)  # Add logo for 3 seconds

    editor.set_global_display_time_offset_start(0)  # Set global offset to 0 seconds
    total_duration = 0
    # zip() stops at the shorter sequence, so a length mismatch (warned about above)
    # cannot raise an IndexError halfway through the video.
    for step_image, duration in zip(images, result1):
        frame = add_text_to_image(step_image, "Guess what AI painting now?!", org=(25, 540), font_scale=0.8)
        total_duration += duration
        caption_color = caption_colors[int(total_duration) % 2]
        frame = add_text_to_image(frame, subscribe_msg, org=subscribe_position, font_scale=0.8,
                                  color=caption_color, wordwrap=True)
        editor.add_image(frame, duration, show_time_text=True, time_display_format="SS.MS",
                         time_text_position=(400, 373), time_text_font_scale=0.8)

    # Reveal frame: final image plus the subject name.
    img_final = add_text_to_image(images[-1], subscribe_msg, org=subscribe_position, font_scale=0.8,
                                  color=(0, 0, 255), wordwrap=True)
    # Work on a copy so the fact frame below starts from a clean img_final.
    img = add_text_to_image(img_final.copy(), "Time Up, AI draw:", org=(None, 250))
    img = add_text_to_image(img, subject, org=(None, 350), color=(0, 0, 255), font_scale=1.2,
                            thickness=3, wordwrap=True)
    editor.add_image(img, 5, show_time_text=False)

    # Fact frame: final image plus the generated fact.
    img = add_text_to_image(img_final, "Fact:", org=(None, 50))
    img = add_text_to_image(img, prompt_dic['Fact'], org=(None, 150), color=(0, 0, 255), font_scale=1.2,
                            thickness=3, wordwrap=True)
    editor.add_image(img, 10, show_time_text=False)

    # Add audio starting at the beginning of the video, cut to the video length.
    editor.add_audio(str(assts_dir/"Long Distance.mp3"), audio_clip_end=editor.get_video_duration(),
                     video_start_offset=0)

    fileName = generate_filename_by_datetime(category, "mp4")
    # Make sure the output folder exists before saving.
    os.makedirs("videos", exist_ok=True)
    editor.save(f"videos/{fileName}")

#gen_one_video()

In [None]:
# Batch run: render ten videos back to back.
video_count = 10
for video_index in range(video_count):
    gen_one_video()
