In [24]:
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    StableDiffusionImg2ImgPipeline
)
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os
import imageio.v2 as imageio

In [25]:
# --- Setup device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [26]:
# --- Load ControlNet for scribble ---
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to(device)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [27]:
from google.colab import files
# Upload image
uploaded_files = files.upload()
uploaded_filename = list(uploaded_files.keys())[0]
print(f"Uploaded file: {uploaded_filename}")
sketch = Image.open(uploaded_filename).convert("RGB")

Saving boy_sketch.jpg to boy_sketch (2).jpg
Uploaded file: boy_sketch (2).jpg


In [33]:
# Input prompt base
prompt_base = input("Enter the prompt base (e.g., 'anime style boy'): ")

Enter the prompt base (e.g., 'anime style boy'): anime style boy


In [30]:
# --- Prompt and image setup ---
# sketch_path = "/content/boy_sketch.jpg"
# sketch = Image.open(sketch_path).convert("RGB")
# prompt_base = "anime style boy"

env_start = "in a sunny forest"
env_end = "in a rainy forest"

In [31]:
# --- Output folder
os.makedirs("frames", exist_ok=True)

In [35]:
def is_mostly_black(img, threshold=0.95):
    # Convert to grayscale and check percentage of near-black pixels
    gray = img.convert("L")
    pixels = gray.getdata()
    black_pixels = sum(1 for p in pixels if p < 10)
    return (black_pixels / len(pixels)) > threshold

In [36]:
def generate_first_frame(sketch_img, prompt, pipe, max_retries=5):
    print(f"Generating first frame with prompt: {prompt}")
    for attempt in range(max_retries):
        seed = 42 + attempt  # change seed on retries
        generator = torch.manual_seed(seed)
        result = pipe(prompt=prompt, image=sketch_img, num_inference_steps=30, generator=generator)
        img = result.images[0]
        if not is_mostly_black(img):
            print(f"Generated frame successfully on attempt {attempt + 1}")
            return img
        else:
            print(f"Attempt {attempt + 1} generated a black image, retrying...")
    print("Warning: Max retries reached. Returning last generated image (may be black).")
    return img

In [37]:
first_frame = generate_first_frame(sketch, f"{prompt_base}, {env_start}", pipe)
first_frame.save("frames/frame_000.png")

Generating first frame with prompt: anime style boy, in a sunny forest


  0%|          | 0/30 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Attempt 1 generated a black image, retrying...


  0%|          | 0/30 [00:00<?, ?it/s]

Generated frame successfully on attempt 2


In [9]:
# Clear pipeline and free memory
del pipe, controlnet
torch.cuda.empty_cache()

In [10]:
# --- Load Img2Img pipeline with LoRA ---
img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to(device)

img2img_pipe.load_lora_weights("aionthegrind/anime-lora")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

anime-lora-model.safetensors:   0%|          | 0.00/89.7M [00:00<?, ?B/s]



In [11]:
# --- Setup CLIP for similarity check ---
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def clip_similarity(img1, img2):
    inputs = clip_processor(images=[img1, img2], return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        similarity = torch.cosine_similarity(features[0:1], features[1:2]).item()
    return similarity


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [12]:
# --- Smooth prompt blending ---
def blend_prompt(alpha):
    return (
        f"{prompt_base}, anime screencap, "
        f"{env_start}:{1 - alpha:.2f}, {env_end}:{alpha:.2f}"
    )

In [13]:
# --- Retry logic function ---
def generate_frame_with_retries(
    img2img_pipe, prev_frame, prompt, base_seed, frame_idx, max_retries, threshold
):
    retries = 0
    while retries <= max_retries:
        seed = base_seed + frame_idx + retries * 1000  # change seed on retry
        generator = torch.manual_seed(seed)

        new_frame = img2img_pipe(
            prompt=prompt,
            image=prev_frame,
            strength=0.65,
            guidance_scale=7.5,
            num_inference_steps=25,
            generator=generator,
        ).images[0]

        sim = clip_similarity(prev_frame, new_frame)
        if sim >= threshold:
            return new_frame, sim
        else:
            print(f"Retry {retries+1}/{max_retries}: similarity={sim:.3f} < {threshold} for frame {frame_idx}")
            retries += 1

    print(f"Warning: Low similarity persisted after {max_retries} retries at frame {frame_idx}")
    return new_frame, sim

In [14]:
# --- Simple temporal smoothing ---
def temporal_smoothing(img_prev, img_new, alpha=0.7):
    img_prev = img_prev.convert("RGBA")
    img_new = img_new.convert("RGBA")
    blended = Image.blend(img_new, img_prev, alpha=alpha)
    return blended.convert("RGB")

In [15]:
# --- Frame generation loop ---
frames = [first_frame]
num_frames = 40
base_seed = 42
similarity_threshold = 0.85
max_retries = 3

print("Generating frames with retry logic and temporal smoothing...")

for i in range(1, num_frames):
    alpha = i / num_frames
    blended_prompt = blend_prompt(alpha)

    new_frame, similarity = generate_frame_with_retries(
        img2img_pipe, frames[-1], blended_prompt, base_seed, i, max_retries, similarity_threshold
    )

    smoothed_frame = temporal_smoothing(frames[-1], new_frame, alpha=0.7)

    smoothed_frame.save(f"frames/frame_{i:03d}.png")
    frames.append(smoothed_frame)

    print(f"Frame {i} saved with similarity {similarity:.4f}")

Generating frames with retry logic and temporal smoothing...


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 1 saved with similarity 0.9398


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 2 saved with similarity 0.9537


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 3 saved with similarity 0.9453


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 4 saved with similarity 0.9632


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 5 saved with similarity 0.9618


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 6 saved with similarity 0.9602


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 7 saved with similarity 0.9535


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 8 saved with similarity 0.9475


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 9 saved with similarity 0.9563


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 10 saved with similarity 0.9588


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 11 saved with similarity 0.9627


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 12 saved with similarity 0.9591


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 13 saved with similarity 0.9563


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 14 saved with similarity 0.9599


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 15 saved with similarity 0.9494


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 16 saved with similarity 0.9558


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 17 saved with similarity 0.9623


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 18 saved with similarity 0.9613


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 19 saved with similarity 0.9737


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 20 saved with similarity 0.9615


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 21 saved with similarity 0.9396


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 22 saved with similarity 0.9529


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 23 saved with similarity 0.9553


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 24 saved with similarity 0.9581


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 25 saved with similarity 0.9472


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 26 saved with similarity 0.9797


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 27 saved with similarity 0.9478


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 28 saved with similarity 0.9616


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 29 saved with similarity 0.9604


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 30 saved with similarity 0.9579


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 31 saved with similarity 0.9560


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 32 saved with similarity 0.9545


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 33 saved with similarity 0.9415


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 34 saved with similarity 0.9436


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 35 saved with similarity 0.9567


  0%|          | 0/16 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Retry 1/3: similarity=0.418 < 0.85 for frame 36


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 36 saved with similarity 0.9556


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 37 saved with similarity 0.9676


  0%|          | 0/16 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Retry 1/3: similarity=0.414 < 0.85 for frame 38


  0%|          | 0/16 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Retry 2/3: similarity=0.414 < 0.85 for frame 38


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 38 saved with similarity 0.9681


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 39 saved with similarity 0.9667


In [16]:
# --- Final CLIP similarity between first and last frame ---
final_similarity = clip_similarity(frames[0], frames[-1])
print(f"CLIP similarity between first and last frame: {final_similarity:.4f}")

CLIP similarity between first and last frame: 0.8156


In [17]:
# --- Create video from frames ---
video_path = "output_animation.mp4"
fps = 8
print(f"Creating video at {fps} fps...")

with imageio.get_writer(video_path, fps=fps) as writer:
    for i in range(len(frames)):
        img = imageio.imread(f"frames/frame_{i:03d}.png")
        writer.append_data(img)

Creating video at 8 fps...


In [18]:
# --- Display video in notebook ---
from IPython.display import HTML
from base64 import b64encode
mp4 = open(video_path, 'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

HTML(f"""
<video width=600 controls>
  <source src="{data_url}" type="video/mp4">
</video>
""")