In [19]:
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    StableDiffusionImg2ImgPipeline
)
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os
import imageio.v2 as imageio

In [None]:
# --- Setup device ---
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# --- Load ControlNet for scribble ---
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# --- Prompt and image setup ---
sketch_path = "/content/boy_sketch.jpg"
sketch = Image.open(sketch_path).convert("RGB")

prompt_base = "anime style boy, vibrant colors, highly detailed"
env_start = "in a sunny forest"
env_end = "in a rainy forest"

In [None]:
# --- Output folder
os.makedirs("frames", exist_ok=True)

In [None]:
# --- Generate first frame ---
def generate_first_frame(sketch_img, prompt):
    print(f"Generating first frame with prompt: {prompt}")
    result = pipe(prompt=prompt, image=sketch_img, num_inference_steps=30)
    return result.images[0]

first_frame = generate_first_frame(sketch, f"{prompt_base}, {env_start}")
first_frame.save("frames/frame_000.png")

Generating first frame with prompt: anime style boy, vibrant colors, highly detailed, in a sunny forest


  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Clear pipeline and free memory
del pipe, controlnet
torch.cuda.empty_cache()

In [25]:
# --- Load Img2Img pipeline with LoRA ---
img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to(device)
img2img_pipe.load_lora_weights("aionthegrind/anime-lora")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [None]:
# --- Setup CLIP for similarity check ---
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def clip_similarity(img1, img2):
    inputs = clip_processor(images=[img1, img2], return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        similarity = torch.cosine_similarity(features[0:1], features[1:2]).item()
    return similarity


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
# --- Smooth prompt blending ---
def blend_prompt(alpha):
    return (
        f"{prompt_base}, anime screencap, "
        f"{env_start}:{1 - alpha:.2f}, {env_end}:{alpha:.2f}"
    )

In [None]:
# --- Frame generation loop with CLIP consistency check ---
frames = [first_frame]
num_frames = 40
base_seed = 42

print("Generating frames with identity consistency...")

for i in range(1, num_frames):
    alpha = i / num_frames
    blended_prompt = blend_prompt(alpha)
    generator = torch.manual_seed(base_seed + i)

    # Generate new frame based on last frame
    new_frame = img2img_pipe(
        prompt=blended_prompt,
        image=frames[-1],
        strength=0.65,
        guidance_scale=7.5,
        num_inference_steps=25,
        generator=generator
    ).images[0]

    # Optional: Enforce minimum similarity to previous frame
    similarity = clip_similarity(frames[-1], new_frame)
    if similarity < 0.85:
        print(f"Low similarity ({similarity:.3f}) detected at frame {i}, regenerating...")
        # You can choose to retry with different seed or prompt tweaks here
        # For now, just keep the new frame and log the similarity

    new_frame.save(f"frames/frame_{i:03d}.png")
    frames.append(new_frame)

Generating frames with identity consistency...


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [20]:
# --- Final CLIP similarity between first and last frame ---
final_similarity = clip_similarity(frames[0], frames[-1])
print(f"CLIP similarity between first and last frame: {final_similarity:.4f}")

CLIP similarity between first and last frame: 0.8316


In [21]:
# --- Create video from frames ---
video_path = "output_animation.mp4"
fps = 8
print(f"Creating video at {fps} fps...")

with imageio.get_writer(video_path, fps=fps) as writer:
    for i in range(len(frames)):
        img = imageio.imread(f"frames/frame_{i:03d}.png")
        writer.append_data(img)

Creating video at 8 fps...


In [23]:
# --- Display video in notebook ---
from IPython.display import HTML
from base64 import b64encode
mp4 = open(video_path, 'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

HTML(f"""
<video width=600 controls>
  <source src="{data_url}" type="video/mp4">
</video>
""")