In [1]:
!pip install -q diffusers transformers accelerate safetensors peft bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

#### This code fine-tunes the Stable Diffusion model using LoRA with a custom Pinterest dataset. The dataset, consisting of images in PNG format, is used to adapt the model to a specific style or theme related to Pinterest. 

#### The process includes:
##### Loading Pretrained Model: Stable Diffusion model is loaded from runwayml/stable-diffusion-v1-5.
##### Dataset Preparation: Images are resized to 512x512 and converted to tensors.
##### Fine-Tuning with LoRA: The UNet model is fine-tuned using the LoRA technique, optimizing model weights while keeping resource usage low.
##### Training Loop: The model is trained on the dataset by generating noisy images, applying the diffusion process, and minimizing the loss between predicted and actual noise.
##### Saving the LoRA Adapter: After training, the LoRA adapter is saved for future use.

In [None]:
import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel
from peft import LoraConfig, get_peft_model
from diffusers import AutoencoderKL
from PIL import Image
import os
from torchvision import transforms
from tqdm import tqdm

model_id = "runwayml/stable-diffusion-v1-5"
dataset_path = "/kaggle/input/hpdataset/HPdataset"
output_dir = "lora_finetuned_sd15"

# Load base model
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)  # Load the pretrained model
unet = pipe.unet.to("cuda").half()  # UNet model for denoising, moved to GPU and set to float16
vae = pipe.vae.to("cuda").half()  # VAE for encoding images, moved to GPU and set to float16
text_encoder = pipe.text_encoder.to("cuda").half()  # Text encoder, moved to GPU and set to float16

# LoRA configuration setup
lora_config = LoraConfig(
    r=8,  # Rank for the LoRA layers
    lora_alpha=16,  # Scaling factor for LoRA
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],  # LoRA applied to these linear layers
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none"  # No bias in LoRA layers
)

# Apply LoRA to the UNet model
unet = get_peft_model(unet, lora_config)  # Wrap UNet with LoRA layers
unet.print_trainable_parameters()  # Print parameters to verify trainable layers

# Load custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, image_folder):
        # Initialize dataset by loading all PNG images
        self.image_paths = [os.path.join(image_folder, img) for img in os.listdir(image_folder) if img.endswith(".png")]
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # Convert images to Tensor (range [0, 1])
            transforms.Resize((512, 512)),  # Resize images to 512x512
        ])
    
    def __len__(self):
        return len(self.image_paths)  # Return total number of images
    
    def __getitem__(self, idx):
        # Load and transform image for a specific index
        image = Image.open(self.image_paths[idx]).convert("RGB")  # Convert to RGB
        image = self.transform(image)  # Apply transformations
        return image

# Initialize dataset and dataloader
dataset = CustomDataset(dataset_path)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

# Training loop setup
optimizer = torch.optim.AdamW(unet.parameters(), lr=1e-4)  # AdamW optimizer for training
epochs = 10  # Number of epochs for training
max_timesteps = 1000  # Maximum timesteps for the diffusion process

# Training loop
for epoch in range(epochs):
    for batch in tqdm(dataloader):
        batch = batch.to("cuda").half()  # Move data to GPU and set to float16

        # Encode image using VAE to obtain latent representation
        with torch.no_grad():
            batch_latents = vae.encode(batch).latent_dist.sample().detach() * 0.18215  # Scale latents appropriately

        # Generate random timesteps for diffusion process
        timesteps = torch.randint(0, max_timesteps, (batch.size(0),), device=batch.device, dtype=torch.int64)
        
        # Generate random noise for the image
        noise = torch.randn_like(batch_latents, dtype=torch.float16)
        noisy_latents = batch_latents + 0.1 * noise  # Add noise to latents

        # Dummy text input to simulate text encoder processing
        dummy_input_ids = torch.randint(0, 1000, (batch.size(0), 77), device=batch.device)  # Random text tokens
        encoder_hidden_states = text_encoder(dummy_input_ids).last_hidden_state  # Encoder output

        # Forward pass through UNet model (denoising)
        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample  # Generate denoised latents

        # Compute loss (MSE loss between predicted and actual noise)
        loss = torch.nn.functional.mse_loss(noise_pred, noise)

        # Backpropagation and optimization
        optimizer.zero_grad()  # Reset gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights
        
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the LoRA-adapted UNet model
unet.save_pretrained(output_dir)
print("LoRA fine-tuning complete. Adapter saved to", output_dir)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

trainable params: 1,594,368 || all params: 861,115,332 || trainable%: 0.1852


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 1/10, Loss: 0.7578125


100%|██████████| 614/614 [13:49<00:00,  1.35s/it]


Epoch 2/10, Loss: 0.59033203125


100%|██████████| 614/614 [13:49<00:00,  1.35s/it]


Epoch 3/10, Loss: 0.79638671875


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 4/10, Loss: 0.82763671875


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 5/10, Loss: 0.7568359375


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 6/10, Loss: 0.80126953125


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 7/10, Loss: 0.76220703125


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 8/10, Loss: 0.78466796875


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 9/10, Loss: 0.73583984375


100%|██████████| 614/614 [13:50<00:00,  1.35s/it]


Epoch 10/10, Loss: 0.865234375
LoRA fine-tuning complete! Adapter saved to lora_finetuned_sd15
