# Setup

Remove the following install cell once the kubeflow-training SDK is upgraded to 1.9.0.

In [None]:
%pip install -qqU kubeflow-training

In [None]:
%pip install -qqU datasets s3fs diffusers peft transformers

In [None]:

# Install the YAML magic
%pip install -qqU yamlmagic
%load_ext yamlmagic

## Training configuration
Edit the following training parameters:

In [None]:
%%yaml parameters

# Dataset specification and description
# name_of_your_concept and type_of_thing are combined into the training and inference
# prompt: "a photo of <name_of_your_concept> <type_of_thing>"
name_of_your_concept: 'ccorgi'
type_of_thing: 'dog'
# Dataset identifier and split passed to datasets.load_dataset
dataset_id: 'diffusers/dog-example'
dataset_train_split: 'train'
# Model definitions
# model_id is the checkpoint to fine-tune; the *_subfolder keys select its components
model_id: 'CompVis/stable-diffusion-v1-4'
model_text_encoder_subfolder: 'text_encoder'
model_vae_subfolder: 'vae'
model_unet_subfolder: 'unet'
model_tokenizer_subfolder: 'tokenizer'
# Loaded separately from model_id when the final pipeline is assembled
feature_extractor: 'openai/clip-vit-base-patch32'
safety_checker: 'CompVis/stable-diffusion-safety-checker'
# Storage configuration
local_trained_model_folder: './dreambooth'    # Notebook folder where the trained model is stored once downloaded from S3
# Model training configuration
learning_rate: 2.0e-06
max_train_steps: 300                          # Training stops after this many optimizer steps
train_batch_size: 1
gradient_accumulation_steps: 1                # Increase this if you want to lower memory usage
max_grad_norm: 1.0                            # Gradient clipping threshold applied to the UNet parameters
gradient_checkpointing: True                  # Set this to True to lower the memory usage
use_8bit_adam: True                           # Use 8bit optimizer from bitsandbytes
seed: 3434554
# Also used as the folder name appended to the bucket when the model is downloaded from S3
output_dir: "dreambooth"                      # Where to save the trained pipeline in PyTorchJob


Load and display a sample of the dataset images used for DreamBooth training.

In [None]:
from datasets import load_dataset
from PIL import Image


def image_grid(imgs, rows, cols):
    """Paste images into a single grid image of ``rows`` x ``cols`` tiles.

    Args:
        imgs: Sequence of PIL images. The size of the first image is used as
            the tile size for every cell.
        rows: Number of rows in the grid.
        cols: Number of columns in the grid.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)`` with the images laid
        out row by row.

    Raises:
        AssertionError: If ``len(imgs)`` is not exactly ``rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected rows * cols = {rows * cols} images, got {len(imgs)}"
    )
    w, h = imgs[0].size
    grid = Image.new("RGB", size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        # Row-major placement: column index is i % cols, row index is i // cols.
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid


dataset = load_dataset(parameters['dataset_id'], split=parameters['dataset_train_split'])
# Preview at most 4 images; capping at the dataset size keeps the number of
# images equal to the number of grid cells for small datasets.
num_samples = min(4, len(dataset))
# Slice the rows first and pick the column afterwards, so only the previewed
# rows are read (column access can be eager depending on the datasets version).
image_grid(dataset[:num_samples]["image"], rows=1, cols=num_samples)

# Distributed training
Training function to be run on all distributed training Pods.

In [None]:
def training_function(parameters):
    """Fine-tune the UNet of a Stable Diffusion checkpoint on one concept and upload the pipeline to S3.

    The function is self-contained: every import and helper lives inside it, because it is
    handed to TrainingClient.create_job as train_func and executed on the training Pods.

    Args:
        parameters: Mapping with the training configuration. Keys read here:
            name_of_your_concept, type_of_thing, dataset_id, dataset_train_split, model_id,
            model_tokenizer_subfolder, model_text_encoder_subfolder, model_vae_subfolder,
            model_unet_subfolder, feature_extractor, safety_checker, learning_rate,
            max_train_steps, train_batch_size, gradient_accumulation_steps, max_grad_norm,
            gradient_checkpointing, use_8bit_adam, seed, output_dir.

    Environment:
        AWS_S3_BUCKET: Required. Destination handed to s3fs for the trained pipeline.
        AWS_S3_ENDPOINT: Optional. S3 endpoint URL; the s3fs default is used when unset.

    Raises:
        KeyError: If AWS_S3_BUCKET is not set (checked before anything is loaded) or if a
            required key is missing from parameters.
    """
    import os

    # Fail fast: the bucket is only needed after training, but discovering that it is
    # missing at that point would waste the whole run.
    s3_bucket = os.environ["AWS_S3_BUCKET"]
    # An unset or empty endpoint falls back to the s3fs default.
    s3_endpoint = os.environ.get("AWS_S3_ENDPOINT") or None
    # NOTE(review): assumes https when the endpoint is given without a scheme - confirm.
    if s3_endpoint and "://" not in s3_endpoint:
        s3_endpoint = "https://" + s3_endpoint

    import math
    import torch

    import torch.nn.functional as F
    from accelerate import Accelerator
    from accelerate.utils import set_seed
    from diffusers import DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, AutoencoderKL, UNet2DConditionModel
    from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
    from torch.utils.data import DataLoader, Dataset
    from tqdm.auto import tqdm
    from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
    from torchvision import transforms
    from datasets import load_dataset
    import s3fs

    name_of_your_concept = parameters['name_of_your_concept']
    type_of_thing = parameters['type_of_thing']
    # The same prompt is attached to every training image.
    instance_prompt = f"a photo of {name_of_your_concept} {type_of_thing}"

    # PyTorch Dataset object that implements the __len__ and __getitem__ methods
    class DreamBoothDataset(Dataset):
        """Pairs every image of the wrapped dataset with the tokenized instance prompt."""

        def __init__(self, dataset, instance_prompt, tokenizer, size=512):
            self.dataset = dataset
            self.instance_prompt = instance_prompt
            self.tokenizer = tokenizer
            self.size = size
            # Resize and center-crop to a square of the given size, then convert to a
            # tensor normalized with mean 0.5 and std 0.5.
            self.transforms = transforms.Compose(
                [
                    transforms.Resize(size),
                    transforms.CenterCrop(size),
                    transforms.ToTensor(),
                    transforms.Normalize([0.5], [0.5]),
                ]
            )

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, index):
            example = {}
            image = self.dataset[index]["image"]
            example["instance_images"] = self.transforms(image)
            # Padding is deferred to collate_fn, which pads the whole batch at once.
            example["instance_prompt_ids"] = self.tokenizer(
                self.instance_prompt,
                padding="do_not_pad",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            ).input_ids
            return example

    # The Stable Diffusion checkpoint we'll fine-tune
    model_id = parameters['model_id']
    tokenizer = CLIPTokenizer.from_pretrained(
        model_id,
        subfolder=parameters['model_tokenizer_subfolder'],
    )

    dataset = load_dataset(parameters['dataset_id'], split=parameters['dataset_train_split'])
    train_dataset = DreamBoothDataset(dataset, instance_prompt, tokenizer)

    # Data collator function to couple prompt with image
    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]
        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }
        return batch

    text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder=parameters['model_text_encoder_subfolder'])
    vae = AutoencoderKL.from_pretrained(model_id, subfolder=parameters['model_vae_subfolder'])
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder=parameters['model_unet_subfolder'])
    feature_extractor = CLIPFeatureExtractor.from_pretrained(parameters['feature_extractor'])

    accelerator = Accelerator(
        gradient_accumulation_steps=parameters['gradient_accumulation_steps'],
    )

    set_seed(parameters['seed'])

    if parameters['gradient_checkpointing']:
        unet.enable_gradient_checkpointing()

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if parameters['use_8bit_adam']:
        import bitsandbytes as bnb

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    optimizer = optimizer_class(
        unet.parameters(),  # Only optimize unet
        lr=parameters['learning_rate'],
    )

    noise_scheduler = DDPMScheduler(
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        num_train_timesteps=1000,
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=parameters['train_batch_size'],
        shuffle=True,
        collate_fn=collate_fn,
    )

    unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)

    # The text encoder and the VAE are not passed to accelerator.prepare (they are only used
    # under torch.no_grad below), so they are moved to the training device by hand.
    text_encoder.to(accelerator.device)
    vae.to(accelerator.device)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / parameters['gradient_accumulation_steps'])
    num_train_epochs = math.ceil(parameters['max_train_steps'] / num_update_steps_per_epoch)

    # Train!
    # Only show the progress bar once on each machine
    progress_bar = tqdm(range(parameters['max_train_steps']), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")
    global_step = 0

    for _ in range(num_train_epochs):
        unet.train()
        for batch in train_dataloader:
            with accelerator.accumulate(unet):
                # Convert images to latent space
                with torch.no_grad():
                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                    # NOTE(review): 0.18215 is presumably the latent scaling factor of the
                    # default model_id - confirm before switching to another checkpoint.
                    latents = latents * 0.18215

                # Sample the noise that we'll add to the latents, directly on their device
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(
                    0,
                    noise_scheduler.config.num_train_timesteps,
                    (bsz,),
                    device=latents.device,
                ).long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                with torch.no_grad():
                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), parameters['max_grad_norm'])
                optimizer.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

            logs = {"loss": loss.detach().item()}
            progress_bar.set_postfix(**logs)

            # num_train_epochs is rounded up, so the step budget is reached during the
            # last epoch and this break ends the training.
            if global_step >= parameters['max_train_steps']:
                break

        accelerator.wait_for_everyone()

    # Create the pipeline using the trained modules and save it
    if accelerator.is_main_process:
        output_dir = parameters['output_dir']
        print(f"Loading pipeline and saving to {output_dir}...")
        scheduler = PNDMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            skip_prk_steps=True,
            steps_offset=1,
        )
        pipeline = StableDiffusionPipeline(
            text_encoder=text_encoder,
            vae=vae,
            unet=accelerator.unwrap_model(unet),
            tokenizer=tokenizer,
            scheduler=scheduler,
            safety_checker=StableDiffusionSafetyChecker.from_pretrained(parameters['safety_checker']),
            feature_extractor=feature_extractor,
        )
        pipeline.save_pretrained(output_dir)

        # Store the trained model in the S3 bucket, honouring the configured endpoint
        s3 = s3fs.S3FileSystem(endpoint_url=s3_endpoint)
        s3.put(output_dir, s3_bucket, recursive=True)

Invoke the training function on the PyTorchJob Pods, providing the packages to be installed and the AWS S3 configuration.

In [None]:
from kubeflow.training import TrainingClient
from kubernetes import client
from kubernetes.client import (
    V1EnvVar,
    V1EnvVarSource,
    V1SecretKeySelector
)

job_name = "dreambooth"

# Name of the Secret holding the S3 settings. It must be the same as the connection
# name in the Data Science project where the Workbench is running.
aws_connection_name = "workbench-aws"

# Provide the API server URL and a token with all required permissions.
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.
# Do not save or commit the notebook with a real token filled in.
token = ""
openshift_api_url = ""

if bool(token) != bool(openshift_api_url):
    raise ValueError("Set both `token` and `openshift_api_url`, or leave both empty.")

if token and openshift_api_url:
    api_key = {"authorization": "Bearer " + token}
    config = client.Configuration(host=openshift_api_url, api_key=api_key)
    # Un-comment if your cluster API server uses a self-signed certificate or an un-trusted CA
    # config.verify_ssl = False
    tc = TrainingClient(client_configuration=config)
else:
    # No explicit credentials given: use the default client configuration.
    # This requires the edit role for the user running this Notebook, e.g. with the oc CLI:
    # oc adm policy add-role-to-user edit system:serviceaccount:<namespace>:<workbench name> -n <namespace>
    tc = TrainingClient()

# Every S3 setting is exposed to the training Pods under the same name as its Secret key.
s3_secret_keys = [
    "AWS_ACCESS_KEY_ID",
    "AWS_S3_BUCKET",
    "AWS_S3_ENDPOINT",
    "AWS_SECRET_ACCESS_KEY",
]

tc.create_job(
    job_kind="PyTorchJob",
    name=job_name,
    train_func=training_function,
    num_workers=1,
    num_procs_per_worker="auto",
    resources_per_worker={"gpu": 2},
    base_image="quay.io/modh/training:py311-cuda121-torch241",
    parameters=parameters,
    env_vars=[
        V1EnvVar(
            name=key,
            value_from=V1EnvVarSource(
                secret_key_ref=V1SecretKeySelector(key=key, name=aws_connection_name)
            ),
        )
        for key in s3_secret_keys
    ],
    # Extra packages required by training_function, installed on the Pods before it runs.
    packages_to_install=[
        "diffusers",
        "torchvision",
        "s3fs",
        "bitsandbytes",
    ],
)

Once the training Pods start, you can watch the training progress using the following command.

In [None]:
# Follow the job logs as they are produced; only the first returned value is kept.
logs, _ = tc.get_job_logs(name=job_name, follow=True)

# Local inference in Workbench
Once training finishes, download the trained model from S3 to a Workbench folder.

In [None]:
import s3fs
import os

# Download the trained model into the local filesystem.
# Honour a custom S3 endpoint when the connection provides one; an unset or empty
# value falls back to the s3fs default.
s3_endpoint = os.environ.get("AWS_S3_ENDPOINT") or None
# NOTE(review): assumes https when the endpoint is given without a scheme - confirm.
if s3_endpoint and "://" not in s3_endpoint:
    s3_endpoint = "https://" + s3_endpoint
s3 = s3fs.S3FileSystem(endpoint_url=s3_endpoint)
# Strip a trailing slash from the bucket so the joined path has no empty segment.
s3_path = os.environ["AWS_S3_BUCKET"].rstrip("/") + "/" + parameters['output_dir']
# NOTE(review): if the local folder already exists from a previous run, the download
# may be nested inside it - confirm and clear the folder before re-running if needed.
_ = s3.get(s3_path, parameters['local_trained_model_folder'], recursive=True)

In [None]:
from diffusers import StableDiffusionPipeline
import torch

# Set up the Stable Diffusion pipeline from the downloaded model.
# Use the GPU with half precision when one is available; otherwise fall back to the
# CPU with full precision instead of failing on the hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained(
    parameters['local_trained_model_folder'],
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

In [None]:
# Build the inference prompt from the same concept tokens that were used for training
name_of_your_concept = parameters['name_of_your_concept']
type_of_thing = parameters['type_of_thing']
prompt = f"a photo of {name_of_your_concept} {type_of_thing} in the Acropolis"

# The guidance scale controls how closely the generations follow the prompt
# Values between 7-11 usually work best
guidance_scale = 7

# Run the pipeline once per grid column and collect everything it returns
num_cols = 2
all_images = []
for _ in range(num_cols):
    all_images += pipe(prompt, guidance_scale=guidance_scale).images

image_grid(all_images, 1, num_cols)

# Cleaning Up
Delete the PyTorchJob to clean up the OpenShift environment.

In [None]:
# Delete the PyTorchJob named job_name through the TrainingClient used to create it.
tc.delete_job(name=job_name)