In [None]:
# Text prompt reused by the generation cells that follow.
# NOTE(review): "<s1>" is presumably a placeholder token that only becomes meaningful
# once a LoRA checkpoint is patched in with patch_ti=True -- confirm.
prompt = "<s1> drinking a beer"

In [None]:
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler
import torch

# Base checkpoint identifier; reused later when the pipeline is rebuilt.
model_id = "stabilityai/stable-diffusion-2-1-base"

# Load the pipeline with fp16 weights, then move it onto the GPU.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Swap in the Euler-ancestral scheduler, built from the existing scheduler's config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

# Sample once with a fixed seed before any LoRA patch is applied.
torch.manual_seed(0)
baseline_output = pipe(prompt, num_inference_steps=25, guidance_scale=9.5)
image = baseline_output.images[0]

image  # nice. diffusers are cool.

There are two ways to apply LoRA to this model: you can (1) monkey-patch it, or (2) update the weights in place.

Monkey-patching essentially replaces each linear layer with a LoRA-linear layer, which computes the following:

$$
x_2 = Wx_1 + A B^T x_1
$$

Weight updating, on the other hand, literally replaces the original weight with the merged weight (the original weight plus the LoRA update):

$$
W' = W + A B^T
$$

You might find this odd: simply updating the weight seems like the logical option, so why monkey-patch when you can just add the weights? The reason is that by keeping the LoRA weights separate we can perform _weight mixing_ dynamically, i.e. compute $Wx_1 + \alpha A B^T x_1$ with a scale $\alpha$ that can be changed at any time. We can't do this if we only update the weight, because the merged weight is fixed. This is why there are two options. You can adjust the scale $\alpha$ with the `tune_lora_scale` function.


In [None]:
from lora_diffusion import monkeypatch_lora, tune_lora_scale, patch_pipe

# Two checkpoint files from the same directory. Each one is patched into the
# pipeline in turn and sampled with identical settings.
# NOTE(review): these are absolute, machine-specific paths -- point them at your own files.
lora_checkpoints = (
    "/Projects/Personal/leap-sd/training/lora_dataset_creator/lora_dataset/dragon/models/step_1000.safetensors",
    "/Projects/Personal/leap-sd/training/lora_dataset_creator/lora_dataset/dragon/models/step_inv_1000.safetensors",
)

for lora_path in lora_checkpoints:
    patch_pipe(
        pipe,
        lora_path,
        patch_text=True,
        patch_ti=True,
        patch_unet=True,
    )

    # Only the UNet scale is set here; the text encoder scale is not touched.
    tune_lora_scale(pipe.unet, 1.00)

    # Same seed for both checkpoints.
    torch.manual_seed(0)
    image = pipe(prompt, num_inference_steps=50, guidance_scale=9).images[0]
    display(image)

In [None]:
# Lower the LoRA scale to 0.3 on both the UNet and the text encoder.
tune_lora_scale(pipe.unet, 0.3)
tune_lora_scale(pipe.text_encoder, 0.3)

# Re-seed with the same seed as the scale-1.0 samples above. Without this the
# image would start from different noise, so the change in scale could not be
# isolated as the cause of any visual difference.
torch.manual_seed(0)
image = pipe(prompt, num_inference_steps=50, guidance_scale=9).images[0]
image

# Nice. Let's try another example:

In [None]:
# Rebuild the pipeline from the base checkpoint and move it onto the GPU.
# NOTE(review): unlike the first pipeline in this notebook, the scheduler is not
# replaced here -- confirm that is intentional.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

prompt = "superman, style of <s1><s2>"

# Fixed seed; this sample is taken before any patch_pipe call on the new pipeline.
torch.manual_seed(1)
unpatched_output = pipe(prompt, num_inference_steps=50, guidance_scale=4)
image = unpatched_output.images[0]

image


In [None]:
# Apply the pop-art LoRA file with the text, TI and UNet patch flags all enabled.
popart_lora_path = "../example_loras/lora_popart.safetensors"
patch_pipe(pipe, popart_lora_path, patch_text=True, patch_ti=True, patch_unet=True)

torch.manual_seed(1)

# Scale 1.0 on both patched modules (UNet first, then text encoder).
for lora_module in (pipe.unet, pipe.text_encoder):
    tune_lora_scale(lora_module, 1.0)

image = pipe(prompt, num_inference_steps=50, guidance_scale=4).images[0]
image


That is a good pop-art style, but we might get a better result with a lower scale $\alpha$ (set via `tune_lora_scale`) for both the text encoder and the UNet.


In [None]:
torch.manual_seed(1)

# Drop the LoRA scale to 0.5 on both modules (UNet first, then text encoder).
for lora_module in (pipe.unet, pipe.text_encoder):
    tune_lora_scale(lora_module, 0.5)

# NOTE(review): guidance_scale is 4.5 here but 4 in the scale-1.0 run -- confirm intended.
popart_output = pipe(prompt, num_inference_steps=50, guidance_scale=4.5)
image = popart_output.images[0]

# Write the sample to disk, then show it as the cell output.
image.save("../contents/pop_art.jpg")
image


# Appendix: generating the example images used in the README


In [None]:
prompt = "baby lion in style of <s1><s2>"

# Apply the disney LoRA file with the text, TI and UNet patch flags all enabled.
disney_lora_path = "../example_loras/lora_disney.safetensors"
patch_pipe(pipe, disney_lora_path, patch_text=True, patch_ti=True, patch_unet=True)

torch.manual_seed(6)

# Scale 0.5 on both patched modules (UNet first, then text encoder).
for lora_module in (pipe.unet, pipe.text_encoder):
    tune_lora_scale(lora_module, 0.5)

disney_output = pipe(prompt, num_inference_steps=50, guidance_scale=5)
image = disney_output.images[0]

# Write the sample to disk, then show it as the cell output.
image.save("../contents/disney_lora.jpg")
image


#


In [None]:

# Apply the "krk" LoRA file with the text, TI and UNet patch flags all enabled.
patch_pipe(
    pipe,
    "../example_loras/lora_krk.safetensors",
    patch_text=True,
    patch_ti=True,
    patch_unet=True,
)

# Prompt templates. Every "<TOK>" is replaced with "<s1><s2>" before generation.
# Fixes relative to the earlier version of this list:
#   * "photof of" -> "photo of" (typo)
#   * "illustration of <wday>" -> "illustration of <TOK>"; "<wday>" was a stale
#     placeholder that the "<TOK>" substitution below never replaced.
example_prompts = [
    "painting of <TOK>, a starry night, style of vincent van gogh",
    "portrait of <TOK> by mario testino 1950, 1950s style, hair tied in a bun, taken in 1950, detailed face of <TOK>, sony a7r",
    "photo of <TOK>, 50mm, sharp, muscular, detailed realistic face, hyper realistic, perfect face, intricate, natural light, <TOK> underwater photoshoot,collarbones, skin indentation, Alphonse Mucha, Greg Rutkowski",
    "a photo of <TOK> in advanced organic armor, biological filigree, detailed symmetric face, flowing hair, neon details, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, octane, art by Krenz Cushart , Artem Demura, Alphonse Mucha, digital cgi art 8K HDR by Yuanyuan Wang photorealistic",
    "a photo of <TOK> on the beach, small waves, detailed symmetric face, beautiful composition",
    "a photo of <TOK> rainbow background, wlop, dan mumford, artgerm, liam brazier, peter mohrbacher, jia zhangke, 8 k, raw, featured in artstation, octane render, cinematic, elegant, intricate, 8 k",
    "photo of Summoner <TOK> with a cute water elemental, fantasy illustration, detailed face, intricate, elegant, highly detailed, digital painting, artstation, concept art, wallpaper, smooth, sharp focus, illustration, art by artgerm and greg rutkowski",
    "<TOK>, cyberpunk 2077, 4K, 3d render in unreal engine",
    "a pencil sketch of <TOK>",
    "a minecraft render of <TOK>",
    "young woman <TOK>, eden, intense eyes, tears running down, crying, vaporwave aesthetic, synthwave, colorful, psychedelic, crown, long gown, flowers, bees, butterflies, ribbons, ornate, intricate, digital painting, artstation, concept art, smooth, sharp focus, illustration of <TOK>, art by artgerm and greg rutkowski and alphonse mucha",
    "<TOK> in a construction outfit",
]

# Same LoRA scale on both patched modules for the whole sweep.
tune_lora_scale(pipe.unet, 0.7)
tune_lora_scale(pipe.text_encoder, 0.7)

# One image per template, collected in order into `outs`.
outs = []
for idx, template in enumerate(example_prompts):
    # Use a loop-local name so the notebook-wide `prompt` variable is not overwritten.
    filled_prompt = template.replace("<TOK>", "<s1><s2>")
    # Seed with the prompt index so each image is individually reproducible.
    torch.manual_seed(idx)
    image = pipe(filled_prompt, num_inference_steps=50, guidance_scale=5).images[0]
    outs.append(image)


In [None]:
from lora_diffusion import image_grid

# Combine the collected samples into one grid image.
# NOTE(review): the two integers are presumably (rows, cols) -- confirm against image_grid.
grid_dims = (3, 4)
imgs = image_grid(outs, *grid_dims)

# Write the grid to disk, then show it as the cell output.
imgs.save("../contents/lora_pti_example.jpg")
imgs