In [1]:
import gc
import glob
import os

import torch
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler, StableDiffusionPipeline
from diffusers.optimization import get_scheduler
from huggingface_hub import notebook_login
from PIL import Image
from PIL import ImageOps
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

In [2]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# A later cell pushes the pipeline to the private HF_MODEL_REPO repository,
# which presumably requires these credentials.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Configuration
# --- Subject identity: the two words combined into the instance prompt ---
UNIQUE_TOKEN = "meAmr"
CLASS_TOKEN = "person"

# --- Local folders and the Hub repository ---
INSTANCE_DIR = "/content/amr_photos"
OUTPUT_DIR = "/content/fine_tuned_sd"
SAMPLE_DIR = "/content/samples"
HF_MODEL_REPO = "amr3303/fine-tuned-sd-amr"

# --- Training hyper-parameters ---
RESOLUTION = 512
TRAIN_BATCH_SIZE = 1
NUM_TRAIN_EPOCHS = 50
LEARNING_RATE = 2e-6
LR_WARMUP_STEPS = 0

# --- Example prompt built from the two tokens above ---
SAVE_PROMPT = "A photo of " + UNIQUE_TOKEN + " " + CLASS_TOKEN + " in a snowing mountain"

In [4]:
# Create the model-output and sample folders; existing folders are left alone.
for _needed_dir in (OUTPUT_DIR, SAMPLE_DIR):
    os.makedirs(_needed_dir, exist_ok=True)

In [5]:
class DreamBoothDataset(Dataset):
    """Instance-image dataset for DreamBooth-style fine-tuning.

    Each item pairs one preprocessed image with the token ids of a single
    instance prompt that is shared by every image.

    Args:
        image_paths: Iterable of image file paths.
        tokenizer: Callable tokenizer exposing ``model_max_length``; it is
            called with ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"`` and must return an object with
            ``input_ids``.
        size: Side length (pixels) used for the resize and the centre crop.
        instance_prompt: Prompt attached to every image. When ``None``
            (the default) the prompt is
            ``f"A photo of {UNIQUE_TOKEN} {CLASS_TOKEN}"``, resolved when an
            item is fetched.
    """

    def __init__(self, image_paths, tokenizer, size=512, instance_prompt=None):
        # Materialise the paths so generators/iterators work with __len__.
        self.paths = list(image_paths)
        self.tokenizer = tokenizer
        self.size = size
        self.instance_prompt = instance_prompt
        self.transform = transforms.Compose([
            transforms.Resize(size),
            transforms.CenterCrop(size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ])

    def __len__(self):
        """Return the number of image paths."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": ..., "input_ids": ...}`` for image ``idx``."""
        # The context manager closes the file handle deterministically.
        with Image.open(self.paths[idx]) as raw:
            # Apply the EXIF orientation tag (if any) so that photos taken on
            # a rotated phone are not fed to training sideways.
            img = ImageOps.exif_transpose(raw).convert("RGB")
        pv = self.transform(img)

        prompt = self.instance_prompt
        if prompt is None:
            prompt = f"A photo of {UNIQUE_TOKEN} {CLASS_TOKEN}"
        ids = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids[0]
        return {"pixel_values": pv, "input_ids": ids}

In [6]:
def train():
    """Fine-tune the Stable Diffusion v1.5 UNet on the images in ``INSTANCE_DIR``.

    Only the UNet is optimised; the VAE and the text encoder are frozen.
    Settings are read from the notebook-level constants ``INSTANCE_DIR``,
    ``RESOLUTION``, ``TRAIN_BATCH_SIZE``, ``NUM_TRAIN_EPOCHS``,
    ``LEARNING_RATE`` and ``LR_WARMUP_STEPS``.

    Returns:
        Tuple ``(unet, tokenizer, scheduler, vae, text_encoder)``; the UNet is
        returned in eval mode.

    Raises:
        FileNotFoundError: If ``INSTANCE_DIR`` contains no .jpg/.jpeg/.png file.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load components
    base = "runwayml/stable-diffusion-v1-5"
    vae = AutoencoderKL.from_pretrained(base, subfolder="vae").to(device)
    text_encoder = CLIPTextModel.from_pretrained(base, subfolder="text_encoder").to(device)
    tokenizer = CLIPTokenizer.from_pretrained(base, subfolder="tokenizer")
    unet = UNet2DConditionModel.from_pretrained(base, subfolder="unet").to(device)
    scheduler = DDPMScheduler.from_pretrained(base, subfolder="scheduler")

    # Freeze everything except the UNet.
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    vae.eval()
    text_encoder.eval()

    # Prepare data: match extensions case-insensitively (phones often write
    # ".JPG" / ".jpeg") and fail early with a clear message when nothing matches.
    files = sorted(
        path
        for path in glob.glob(os.path.join(INSTANCE_DIR, "*"))
        if path.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    if not files:
        raise FileNotFoundError(
            f"No .jpg/.jpeg/.png images found in {INSTANCE_DIR!r}"
        )
    ds = DreamBoothDataset(files, tokenizer, RESOLUTION)
    dl = DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

    # Optimizer & LR scheduler.
    opt = torch.optim.AdamW(unet.parameters(), lr=LEARNING_RATE)
    total_steps = len(dl) * NUM_TRAIN_EPOCHS
    # Keyword arguments are required here: in current diffusers the third
    # positional parameter of get_scheduler is `step_rules`, so a positional
    # call binds total_steps to num_warmup_steps and the learning rate ramps
    # up from 0 over the entire run instead of staying constant.
    lr_sch = get_scheduler(
        "constant_with_warmup",
        optimizer=opt,
        num_warmup_steps=LR_WARMUP_STEPS,
        num_training_steps=total_steps,
    )

    # Latent scale of the VAE; 0.18215 is the value the original code used.
    latent_scale = getattr(vae.config, "scaling_factor", 0.18215)

    # Train
    step = 0
    for epoch in range(NUM_TRAIN_EPOCHS):
        unet.train()
        for batch in dl:
            # The frozen encoders need no autograd bookkeeping.
            with torch.no_grad():
                latents = vae.encode(batch["pixel_values"].to(device)).latent_dist.sample()
                latents = latents * latent_scale
                enc = text_encoder(batch["input_ids"].to(device))[0]

            # Noise the latents at one random timestep per sample and train
            # the UNet to predict that noise.
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0, scheduler.config.num_train_timesteps,
                latents.shape[:1], device=device
            )
            noisy = scheduler.add_noise(latents, noise, timesteps)
            pred = unet(noisy, timesteps, enc).sample
            loss = torch.nn.functional.mse_loss(pred, noise)

            loss.backward()
            opt.step()
            lr_sch.step()
            opt.zero_grad()

            step += 1
            if step % 50 == 0:
                print(f"Step {step} loss {loss.item():.4f}")

    # Hand the UNet back ready for inference.
    unet.eval()
    return unet, tokenizer, scheduler, vae, text_encoder

In [7]:
# Run training
# train() returns the fine-tuned UNet together with the tokenizer, noise
# scheduler, VAE and text encoder; they are kept as notebook globals so the
# pipeline-building cell can reuse them.
unet, tokenizer, scheduler, vae, text_encoder = train()

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

Step 50 loss 0.2225
Step 100 loss 0.0018
Step 150 loss 0.0030
Step 200 loss 0.1095
Step 250 loss 0.0691
Step 300 loss 0.2654
Step 350 loss 0.0355
Step 400 loss 0.1117
Step 450 loss 0.0616
Step 500 loss 0.1500
Step 550 loss 0.0125


In [8]:
# Create a Diffusers pipeline from the trained components, save it locally
# and push it to the Hugging Face Hub.

# Use the GPU when present instead of assuming CUDA, matching train().
device = "cuda" if torch.cuda.is_available() else "cpu"

# train() switches the UNet to train mode every epoch; make sure it is in
# eval mode before it is saved and used for inference.
unet.eval()

# Build pipeline from fine-tuned components
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    safety_checker=None,     # the pipeline is built and saved WITHOUT a safety checker
    feature_extractor=None,  # presumably only needed as input prep for the safety checker
).to(device)

pipe.save_pretrained(OUTPUT_DIR)

# Push in Diffusers format; the repository is created as private.
pipe.push_to_hub(
    repo_id=HF_MODEL_REPO,
    private=True,
)
print(f"Diffusers pipeline pushed to https://huggingface.co/{HF_MODEL_REPO}")

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Diffusers pipeline pushed to https://huggingface.co/amr3303/fine-tuned-sd-amr


In [9]:
# Reload the pipeline from the local save location (OUTPUT_DIR) rather than
# repeating the literal path, and fall back to CPU when no GPU is available.
pipe = StableDiffusionPipeline.from_pretrained(OUTPUT_DIR).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# OR load from the Hugging Face Hub (e.g. after a runtime restart or on another
# machine); the repo was pushed as private, so log in first.
# pipe = StableDiffusionPipeline.from_pretrained(HF_MODEL_REPO, torch_dtype=torch.float16).to("cuda")


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


In [15]:
# Use the same "<unique token> <class token>" pair the model was trained on
# ("A photo of meAmr person"); dropping the class word weakens the match.
prompt = f"A photo of {UNIQUE_TOKEN} {CLASS_TOKEN} with a cat"
image = pipe(prompt).images[0]

# Save into SAMPLE_DIR under a file name without spaces.
sample_path = os.path.join(SAMPLE_DIR, "_".join(prompt.split()) + ".png")
image.save(sample_path)

# Display in notebook: a bare last expression renders the PIL image inline,
# whereas image.show() depends on which viewer Pillow finds.
image

  0%|          | 0/50 [00:00<?, ?it/s]