In [None]:
!pip install transformers diffusers accelerate peft

In [1]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [2]:
from huggingface_hub import hf_hub_download

realistic_vision_path = hf_hub_download(repo_id="SG161222/Realistic_Vision_V5.1_noVAE", filename="Realistic_Vision_V5.1-inpainting.safetensors")
vae_path = hf_hub_download(repo_id="stabilityai/sd-vae-ft-mse-original", filename="vae-ft-mse-840000-ema-pruned.safetensors")

In [3]:
import sys
sys.path.append('../src')

In [4]:
from config import DatasetConfig, Config, ModelConfig, WandbConfig, EvaluationConfig, TrainConfig

dataset_config = DatasetConfig(
    roboflow_api_key='HNXIsW3WwnidNDQZHexX',
    roboflow_workspace='arked',
    project_name='facades-flzke',
    dataset_version=11,
    data_root='facades_data',
    image_size=512,
)

model_config = ModelConfig(
    model_path=realistic_vision_path,
    vae_path=vae_path,
)

wandb_config = WandbConfig(
    project_name='facades',
)

eval_config=EvaluationConfig(
    prompts=['white facade', 'brick facade'],
)

train_config=TrainConfig(
    checkpoint_folder = wandb_config.project_name + "_checkpoints",
    train_batch_size = 4,
    unet_lr=1e-4,
    text_encoder_lr=1e-4,
    scheduler_num_cycles=4,
    total_steps=1000,
)

config = Config(
    dataset=dataset_config,
    model=model_config,
    wandb=wandb_config,
    eval=eval_config,
    train=train_config,
)

In [5]:
from model import get_models

text_encoder, vae, unet, tokenizer, noise_scheduler, placeholder_token_ids = get_models(
    config.model.model_path,
    config.model.vae_path,
    device=config.device,
    load_from_safetensor=True,
)

loading VAE...
loading model...


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


In [7]:
from peft import LoraConfig, LoraModel

UNET_DEFAULT_TARGET_REPLACE = {"CrossAttention", "Attention", "GEGLU"}
UNET_TARGET_MODULES = ["to_q", "to_v", "to_k", "to_out.0", "ff.net.0.proj"] #, "proj_in", "conv1", "conv2"]

unet_peft = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=UNET_TARGET_MODULES,
    lora_dropout=0.1,
    bias='none',
)

text_encoder_peft = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "mlp.fc1", "mlp.fc2"],
    lora_dropout=0.1,
    bias='none',
)

In [14]:
text_encoder_peft

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False, r=8, target_modules={'q_proj', 'mlp.fc1', 'out_proj', 'v_proj', 'k_proj', 'mlp.fc2'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [8]:
text_encoder_lora = LoraModel(text_encoder, text_encoder_peft, "text_encoder_lora")

In [9]:
print_trainable_parameters(text_encoder_lora)

trainable params: 1327104 || all params: 124387584 || trainable%: 1.0669103437204794


In [28]:
import itertools

params_to_optimize = [
    {
        "params": itertools.chain(*[p for p in text_encoder_lora.parameters() if p.requires_grad]),
        "lr": 1e-4
    },
]

In [29]:
params_to_optimize

[{'params': <itertools.chain at 0x222a061ee60>, 'lr': 0.0001}]

In [21]:
text_encoder_lora

LoraModel(
  (model): CLIPTextModel(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(77, 768)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): ModuleDict(
                  (text_encoder_lora): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (text_encoder_lora): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (text_encoder_lora): Linear(in_features=8, out_features=768, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )

In [49]:
from peft import get_peft_model_state_dict
import torch 

def get_module_kohya_state_dict(
    state_dict, 
    prefix: str, 
    lora_alpha: int,
    dtype: torch.dtype = torch.float32, 
):
    kohya_ss_state_dict = {}
    for peft_key, weight in state_dict.items():
        kohya_key = peft_key.replace("model", prefix, 1)
        kohya_key = kohya_key.replace("lora_A", "lora_down")
        kohya_key = kohya_key.replace("lora_B", "lora_up")
        kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
        kohya_ss_state_dict[kohya_key] = weight.to(dtype)

        # Set alpha parameter
        if "lora_down" in kohya_key:
            alpha_key = f'{kohya_key.split(".")[0]}.alpha'
            kohya_ss_state_dict[alpha_key] = torch.tensor(lora_alpha).to(dtype)

    return kohya_ss_state_dict

In [47]:
text_encoder_lora.peft_config["text_encoder_lora"].lora_alpha

32

In [51]:
state_dict = get_peft_model_state_dict(text_encoder_lora, adapter_name="text_encoder_lora")
state_dict

{'model.text_model.encoder.layers.0.self_attn.k_proj.lora_A.weight': tensor([[ 0.0096,  0.0165, -0.0187,  ...,  0.0266, -0.0265,  0.0094],
         [-0.0286,  0.0005, -0.0334,  ..., -0.0253, -0.0106,  0.0294],
         [-0.0170, -0.0327,  0.0122,  ...,  0.0115, -0.0011, -0.0304],
         ...,
         [ 0.0172,  0.0234,  0.0123,  ...,  0.0230, -0.0283, -0.0328],
         [-0.0065,  0.0249, -0.0122,  ..., -0.0246,  0.0032, -0.0215],
         [ 0.0341, -0.0222, -0.0149,  ...,  0.0223,  0.0315, -0.0264]],
        device='cuda:0'),
 'model.text_model.encoder.layers.0.self_attn.k_proj.lora_B.weight': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 'model.text_model.encoder.layers.0.self_attn.v_proj.lora_A.weight': tensor([[-0.0147, -0.0285,  0.0164,  ...,  0.0071

In [50]:
from utils import get_peft_model_state_dict

lora_alpha = text_encoder_lora.peft_config["text_encoder_lora"].lora_alpha
state_dict = get_peft_model_state_dict(text_encoder_lora, adapter_name="text_encoder_lora")
get_module_kohya_state_dict(
    state_dict, 
    prefix="lora_te", 
    lora_alpha=lora_alpha,
    )

{'lora_te_text_lora_te_encoder_layers_0_self_attn_k_proj.lora_down.weight': tensor([[ 0.0096,  0.0165, -0.0187,  ...,  0.0266, -0.0265,  0.0094],
         [-0.0286,  0.0005, -0.0334,  ..., -0.0253, -0.0106,  0.0294],
         [-0.0170, -0.0327,  0.0122,  ...,  0.0115, -0.0011, -0.0304],
         ...,
         [ 0.0172,  0.0234,  0.0123,  ...,  0.0230, -0.0283, -0.0328],
         [-0.0065,  0.0249, -0.0122,  ..., -0.0246,  0.0032, -0.0215],
         [ 0.0341, -0.0222, -0.0149,  ...,  0.0223,  0.0315, -0.0264]],
        device='cuda:0'),
 'lora_te_text_lora_te_encoder_layers_0_self_attn_k_proj.alpha': tensor(32.),
 'lora_te_text_lora_te_encoder_layers_0_self_attn_k_proj.lora_up.weight': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 'lora_te_text_lora_te_encod

In [24]:
from safetensors import safe_open

file = safe_open('test_lora.safetensors', framework="pt", device=0)

In [28]:
file.keys()

['lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha',
 'lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha',
 'lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha',
 'lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_v_proj.alpha',
 'lora_te_text_model_encoder_layers_0_self_attn_v_proj.lora_down.weight',
 'lora_te_text_model_encoder_layers_0_self_attn_v_proj.lora_up.weight',
 'lora_te_text_model_encoder_layers_10_self_attn_k_proj.alpha',
 'lora_te_text_model_encoder_layers_10_self_attn_k_proj.lora_down.weight',
 'lora_te_text_mode

In [33]:
text_encoder_lora.peft_config['text_encoder_lora'].lora_alpha

32

In [30]:
file.get_tensor('lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha')

tensor(8., device='cuda:0', dtype=torch.float16)