In [38]:
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    StableDiffusionImg2ImgPipeline
)
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os
import imageio.v2 as imageio

In [39]:
# --- Setup device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [40]:
# --- Load ControlNet for sketch-to-image ---
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to(device)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [41]:
from google.colab import files
uploaded_files = files.upload()
uploaded_filename = list(uploaded_files.keys())[0]
print(f"Uploaded file: {uploaded_filename}")
sketch = Image.open(uploaded_filename).convert("RGB")

Saving boy_sketch.jpg to boy_sketch (3).jpg
Uploaded file: boy_sketch (3).jpg


In [42]:
prompt_base = input("Enter the prompt base (e.g., 'anime style boy'): ")
env_start = "in a sunny forest"
env_end = "in a rainy forest"

Enter the prompt base (e.g., 'anime style boy'): anime style boy


In [43]:
# --- Output folder
os.makedirs("frames", exist_ok=True)

In [44]:
def is_mostly_black(img, threshold=0.95):
    gray = img.convert("L")
    pixels = gray.getdata()
    black = sum(1 for p in pixels if p < 10)
    return (black / len(pixels)) > threshold

In [45]:
def generate_first_frame(sketch_img, prompt, pipe, max_retries=5):
    print(f"Generating first frame with prompt: {prompt}")
    for attempt in range(max_retries):
        seed = 42 + attempt
        generator = torch.manual_seed(seed)
        result = pipe(prompt=prompt, image=sketch_img, num_inference_steps=30, generator=generator)
        img = result.images[0]
        if not is_mostly_black(img):
            print(f"Generated successfully on attempt {attempt + 1}")
            return img
        print(f"Attempt {attempt + 1} failed (mostly black)")
    print("Max retries reached; returning last frame.")
    return img

In [46]:
first_frame = generate_first_frame(sketch, f"{prompt_base}, {env_start}", pipe)
first_frame.save("frames/frame_000.png")

Generating first frame with prompt: anime style boy, in a sunny forest


  0%|          | 0/30 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Attempt 1 failed (mostly black)


  0%|          | 0/30 [00:00<?, ?it/s]

Generated successfully on attempt 2


In [47]:
del pipe, controlnet
torch.cuda.empty_cache()

In [48]:
img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to(device)
img2img_pipe.load_lora_weights("aionthegrind/anime-lora")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [49]:
# --- CLIP for identity embeddings ---
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_clip_embedding(image):
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb

In [50]:

# Reference embedding from first frame
identity_embedding = get_clip_embedding(first_frame)

def clip_identity_similarity(img, ref_emb):
    new_emb = get_clip_embedding(img)
    sim = torch.cosine_similarity(new_emb, ref_emb).item()
    return sim

In [51]:
# --- Prompt blending & frame generation ---
def blend_prompt(alpha):
    return (
        f"{prompt_base}, anime screencap, "
        f"{env_start}:{1-alpha:.2f}, {env_end}:{alpha:.2f}"
    )

In [52]:
def generate_frame_with_identity(
    img2img_pipe, prev_frame, prompt, base_seed, frame_idx, max_retries, threshold
):
    retries = 0
    while retries <= max_retries:
        seed = base_seed + frame_idx + retries * 1000
        generator = torch.manual_seed(seed)
        new_frame = img2img_pipe(
            prompt=prompt,
            image=prev_frame,
            strength=0.65,
            guidance_scale=7.5,
            num_inference_steps=25,
            generator=generator,
        ).images[0]

        sim_id = clip_identity_similarity(new_frame, identity_embedding)
        if sim_id >= threshold:
            return new_frame, sim_id
        else:
            print(f"Retry {retries+1}/{max_retries}: identity_sim={sim_id:.3f} < {threshold}")
            retries += 1

    print(f"Max retries used; final identity_sim={sim_id:.3f}")
    return new_frame, sim_id

In [53]:
def temporal_smoothing(img_prev, img_new, alpha=0.7):
    img_prev = img_prev.convert("RGBA")
    img_new = img_new.convert("RGBA")
    blended = Image.blend(img_new, img_prev, alpha=alpha)
    return blended.convert("RGB")

In [54]:
# --- Simple temporal smoothing ---
def temporal_smoothing(img_prev, img_new, alpha=0.7):
    img_prev = img_prev.convert("RGBA")
    img_new = img_new.convert("RGBA")
    blended = Image.blend(img_new, img_prev, alpha=alpha)
    return blended.convert("RGB")

In [55]:
frames = [first_frame]
num_frames = 40
base_seed = 42
identity_threshold = 0.84
max_retries = 3

print("Generating frames with identity consistency...")

for i in range(1, num_frames):
    alpha = i / num_frames
    blended = blend_prompt(alpha)
    new_frame, sim_id = generate_frame_with_identity(
        img2img_pipe, frames[-1], blended, base_seed, i, max_retries, identity_threshold
    )

    smoothed = temporal_smoothing(frames[-1], new_frame, alpha=0.7)
    smoothed.save(f"frames/frame_{i:03d}.png")
    frames.append(smoothed)
    print(f"Frame {i} saved — identity_sim={sim_id:.4f}")

Generating frames with identity consistency...


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 1 saved — identity_sim=0.9200


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 2 saved — identity_sim=0.9313


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 3 saved — identity_sim=0.9239


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 4 saved — identity_sim=0.9082


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 5 saved — identity_sim=0.9251


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 6 saved — identity_sim=0.9213


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 7 saved — identity_sim=0.8904


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 8 saved — identity_sim=0.9171


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 9 saved — identity_sim=0.8760


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 10 saved — identity_sim=0.8947


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 11 saved — identity_sim=0.8831


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 12 saved — identity_sim=0.8699


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 13 saved — identity_sim=0.8864


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 14 saved — identity_sim=0.8874


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 15 saved — identity_sim=0.8719


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 16 saved — identity_sim=0.8771


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 17 saved — identity_sim=0.8661


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 18 saved — identity_sim=0.8936


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 19 saved — identity_sim=0.8943


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.825 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 20 saved — identity_sim=0.8746


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 21 saved — identity_sim=0.8745


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 22 saved — identity_sim=0.8759


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 23 saved — identity_sim=0.8749


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 24 saved — identity_sim=0.8716


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.840 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 25 saved — identity_sim=0.8727


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 26 saved — identity_sim=0.8902


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 27 saved — identity_sim=0.8654


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 28 saved — identity_sim=0.8737


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 29 saved — identity_sim=0.8676


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 30 saved — identity_sim=0.8434


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 31 saved — identity_sim=0.8640


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 32 saved — identity_sim=0.8679


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.826 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 2/3: identity_sim=0.832 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 3/3: identity_sim=0.833 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 4/3: identity_sim=0.812 < 0.84
Max retries used; final identity_sim=0.812
Frame 33 saved — identity_sim=0.8116


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.810 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 2/3: identity_sim=0.839 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 3/3: identity_sim=0.816 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 4/3: identity_sim=0.814 < 0.84
Max retries used; final identity_sim=0.814
Frame 34 saved — identity_sim=0.8144


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 35 saved — identity_sim=0.8603


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.837 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 2/3: identity_sim=0.828 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 3/3: identity_sim=0.817 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 4/3: identity_sim=0.827 < 0.84
Max retries used; final identity_sim=0.827
Frame 36 saved — identity_sim=0.8268


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 37 saved — identity_sim=0.8583


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 1/3: identity_sim=0.837 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Retry 2/3: identity_sim=0.795 < 0.84


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 38 saved — identity_sim=0.8572


  0%|          | 0/16 [00:00<?, ?it/s]

Frame 39 saved — identity_sim=0.8595


In [56]:
# --- Final similarity check ---
final_sim = clip_identity_similarity(frames[-1], identity_embedding)
print(f"Final identity similarity: {final_sim:.4f}")

Final identity similarity: 0.8516


In [57]:
# --- Video creation ---
video_path = "output_identity_video.mp4"
fps = 8
print(f"Saving video at {fps} fps...")
with imageio.get_writer(video_path, fps=fps) as writer:
    for i in range(len(frames)):
        img = imageio.imread(f"frames/frame_{i:03d}.png")
        writer.append_data(img)


Saving video at 8 fps...


In [58]:
# --- Display video inline ---
from IPython.display import HTML
from base64 import b64encode
mp4 = open(video_path,'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""<video width=600 controls><source src="{data_url}" type="video/mp4"></video>""")