<a href="https://colab.research.google.com/github/mitran27/GenerativeNetworks/blob/main/Dreambooth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sat Sep 14 06:33:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!pip install bitsandbytes
!pip install accelerate
!pip install diffusers

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting diffusers
  Downloading diffusers-0.30.2-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.30.2-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.30.2


In [2]:
from PIL import Image

def image_grid(imgs, rows, cols, resize=256):
    assert len(imgs) == rows * cols

    if resize is not None:
        imgs = [img.resize((resize, resize)) for img in imgs]

    w, h = imgs[0].size
    grid_w, grid_h = cols * w, rows * h
    grid = Image.new("RGB", size=(grid_w, grid_h))

    for i, img in enumerate(imgs):
        x = i % cols * w
        y = i // cols * h
        grid.paste(img, box=(x, y))

    return grid

In [None]:
# prompt: clone diffusers repo

!git clone https://github.com/huggingface/diffusers.git


Cloning into 'diffusers'...
remote: Enumerating objects: 70436, done.[K
remote: Counting objects: 100% (9752/9752), done.[K
remote: Compressing objects: 100% (1294/1294), done.[K
remote: Total 70436 (delta 9163), reused 8682 (delta 8363), pack-reused 60684 (from 1)[K
Receiving objects: 100% (70436/70436), 49.16 MiB | 10.55 MiB/s, done.


In [8]:
import os
from PIL import Image
from tqdm import tqdm

In [9]:
MODEL_NAME = "stabilityai/stable-diffusion-2"
PRECISION = "fp16"

In [10]:
from transformers import AutoTokenizer, PretrainedConfig
from transformers import CLIPTextModel
from torch.utils.data import Dataset
from torchvision import transforms
from diffusers.optimization import get_scheduler
import bitsandbytes as bnb
from accelerate.utils import ProjectConfiguration, set_seed
from accelerate import Accelerator
import torch
import diffusers
from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    DiffusionPipeline,
    StableDiffusionPipeline,
    UNet2DConditionModel,
)




In [11]:
class DreamBoothDataset(Dataset):
  def __init__(self,
               Instance_dir,
               instance_prompt,
               tokenizer,
               class_dir,
               class_prompt,
               class_num,
               img_size=512,
               tokenizer_max_length=None) -> None:

          self.tokenizer = tokenizer
          self.instance_dir = Instance_dir
          self.instance_prompt = instance_prompt
          self.class_dir = class_dir
          self.class_prompt = class_prompt
          self.class_num = class_num
          self.img_size = img_size
          self.tokenizer_max_length = tokenizer_max_length

          self.instance_images = os.listdir(self.instance_dir)
          self.class_images = os.listdir(self.class_dir)

          self._prepare_transforms();

          self._prepare_instance_tokens()
          self._prepare_class_tokens()

  def __len__(self):
        return len(self.instance_images)

  def _prepare_transforms(self, image):
        self._imaage_transforms = transforms.Compose(
            [
                transforms.Resize(self.img_size),
                transforms.CenterCrop(self.img_size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
  def _prepare_instance_tokens(self):
      max_length = self.tokenizer.model_max_length if self.tokenizer_max_length is None else self.tokenizer_max_length
      instance_tokens = self.tokenizer(
          self.instance_prompt,
          truncation=True,
          padding="max_length",
          max_length=max_length,
          return_tensors="pt",
      )

      self.instance_data = {
          "instance_prompt_ids": instance_tokens["input_ids"],
          "instance_attention_mask": instance_tokens["attention_mask"],
      }

  def _prepare_class_tokens(self):
        max_length = self.tokenizer.model_max_length if self.tokenizer_max_length is None else self.tokenizer_max_length
        class_tokens = self.tokenizer(
          self.class_prompt,
          truncation=True,
          padding="max_length",
          max_length=max_length,
          return_tensors="pt",
      )

        self.class_data = {
          "class_prompt_ids": class_tokens["input_ids"],
          "aclss_ttention_mask": class_tokens["attention_mask"],
      }

  def pre_compute_embedding(self, text_encoder):



    self.instance_data["instance_embedding"] = text_encoder(
        input_ids=self.instance_data["instance_prompt_ids"].to(text_encoder.device),
        attention_mask=self.instance_data["instance_attention_mask"].to(text_encoder.device),
        return_dict = False
    )[0]

    self.class_data["class_embedding"] = text_encoder(
        input_ids=self.class_data["class_prompt_ids"].to(text_encoder.device),
        attention_mask=self.class_data["aclss_ttention_mask"].to(text_encoder.device),
        return_dict = False
    )[0]



  def __getitem__(self, index) :
        index = index % len(self.instance_images)
        instance_image = Image.open(os.path.join(self.instance_dir, self.instance_images[index]))
        class_image = Image.open(os.path.join(self.class_dir, self.class_images[index]))

        assert class_image.mode == "RGB" and instance_image.mode == "RGB"

        data = {}

        data["instance_image"] = self._imaage_transforms(instance_image)
        data["class_image"] = instance_image.mode == "RGB"

        data.update(self.instance_data)
        data.update(self.class_data)

        return data


In [12]:
from abc import ABC, abstractmethod

class DLPipeline(ABC):
  def __init__(self):
    pass

  @abstractmethod
  def train(self):
    pass

  @abstractmethod
  def generate(self):
    pass

  @abstractmethod
  def save(self):
    pass

  @abstractmethod
  def load(self):
    pass

**Required Models for training Stable diffusion**

1) Tokenizer : to Convert text to input embedding

2) Text Encoder : prcess the text to latents representing the information of the context of the text

3) Scheduler : Noise and denoise the images/latents for input to the diffusion model
* Noise the input for training to predict the noise

* Denoise during inference to remove the predicted noise from input noise to process with next step

4)  VAE :encode the image to latent space

5) Unet : take the noised latent and try to predict the noise



**Training the Stable diffusion Model**

1) use tokenizer and



In [15]:
import itertools
class DreamBoothPipeline(DLPipeline):
  def __init__(self,training_configs,
               train_clip = False,
               generate_class_images=False,
               n_class=0,
               class_images_dir=None,
               class_prompt=None):

    print("*** Downloading and loading models ***")

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.dtype = torch.float16 if PRECISION=="fp16" else torch.float32

    self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, subfolder="tokenizer",variant=PRECISION)
    self.text_encoder = CLIPTextModel.from_pretrained(MODEL_NAME, subfolder="text_encoder", variant=PRECISION, torch_dtype=self.dtype).to(self.device)
    self.unet = UNet2DConditionModel.from_pretrained(MODEL_NAME, subfolder="unet", torch_dtype=self.dtype, variant=PRECISION,use_safetensors=True).to(self.device)
    self.vae = AutoencoderKL.from_pretrained(MODEL_NAME, subfolder="vae",  torch_dtype=self.dtype, variant=PRECISION,use_safetensors=True).to(self.device)
    self.scheduler = DDPMScheduler.from_pretrained(MODEL_NAME, subfolder="scheduler")

    self.text_encoder.requires_grad_(False) if train_clip else None
    self.vae.requires_grad_(False)
    self.train_clip = train_clip
    if train_clip:
      params = itertools.chain(self.unet.parameters(), self.text_encoder.parameters())
    else:
      params = self.unet.parameters()

    self.optimizer = bnb.optim.Adam8bit(params,
                                          lr = training_configs["lr"],
                                          betas=training_configs["betas"],
                                          eps=training_configs["adam_epsilon"],
                                          weight_decay=training_configs["adam_weight_decay"])


    accelerator_project_config = ProjectConfiguration(project_dir=os.path.join(training_configs["dir"],"project"), logging_dir=os.path.join(training_configs["dir"],"log"))

    self.accelerator = Accelerator(
        gradient_accumulation_steps=training_configs["gradient_accumulation_steps"],
        mixed_precision=training_configs["mixed_precision"],
        log_with=training_configs["report_to"],
        project_config=accelerator_project_config
    )

    self.lr_scheduler = get_scheduler(
        training_configs["lr_scheduler"],
        optimizer=self.optimizer,
        num_warmup_steps=training_configs["lr_warmup_steps"] * self.accelerator.num_processes,
        num_training_steps=training_configs["max_train_steps"] * self.accelerator.num_processes,
        num_cycles=training_configs["lr_num_cycles"],
        power=training_configs["lr_power"]
    )

    if generate_class_images:
      self.generate_class_images(class_images_dir, n_class, class_prompt)


  @staticmethod
  def generate_class_images(class_images_dir, n_class:int, class_prompt:str, device="cpu"):
    if not os.path.exists(class_images_dir):
      os.mkdir(class_images_dir)

    if n_class > len(os.listdir(class_images_dir)):
      print("Generating class images")

      vae = AutoencoderKL.from_pretrained(
          "madebyollin/sdxl-vae-fp16-fix",
          torch_dtype=torch.float16
      )
      pipe = DiffusionPipeline.from_pretrained(
          "stabilityai/stable-diffusion-xl-base-1.0",
          vae=vae,
          torch_dtype=torch.float16,
          variant="fp16",
          use_safetensors=True,
      )
      pipe.to("cuda");

      base_SD_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

      images = base_SD_pipeline(prompt=class_prompt, num_inference_steps=25, num_images_per_prompt=n_class-len(os.listdir(class_images_dir)))
      for i in range(len(images)):
        images[i].save(os.path.join(class_images_dir, f"generated_{i}.png"))
      del(base_SD_pipeline)

    else:
      print("Class images already generated")


  def train(self, train_dataloader):
    pass

  def generate():
    pass
  def save():
    pass
  def load():
    pass

In [16]:
configs = {
    "lr" : 1e-6 ,
    "adam_beta1" : 0.9,
    "adam_beta2" : 0.999,
    "betas":(0.9,0.999),
    "adam_weight_decay" : 1e-2,
    "adam_epsilon" : 1e-08,
    "lr_scheduler" : 'constant',
    "lr_warmup_steps": 0,
    "lr_num_cycles": 1,
    "max_train_steps":400,
    "lr_power" : 1.0,
    "gradient_accumulation_steps": 1,
    "mixed_precision": "fp16",
    "report_to": "wandb",
    "dir": "/content/dreambooth"
}




In [17]:
text = DreamBoothPipeline(configs,True,False)

*** Downloading and loading models ***




model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]



In [10]:
DreamBoothPipeline.generate_class_images("/content/classimgs",2,"a photo of a man")

Generating class images


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from diffusers import  AutoencoderKL
from transformers import CLIPTokenizer, CLIPTextModel
import torch
from PIL import Image

class StableDiffusionModel:
    def __init__(self, model_name, revision="fp16", variant="fp16"):
        # Set device and dtype to fp16 if GPU is available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # Load model components
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, subfolder="tokenizer",variant=PRECISION)
        self.text_encoder = CLIPTextModel.from_pretrained(MODEL_NAME, subfolder="text_encoder", variant=PRECISION, torch_dtype=self.dtype).to(self.device)
        self.unet = UNet2DConditionModel.from_pretrained(MODEL_NAME, subfolder="unet", torch_dtype=self.dtype, variant=variant).to(self.device)
        self.vae = AutoencoderKL.from_pretrained(MODEL_NAME, subfolder="vae",  torch_dtype=self.dtype, variant=variant).to(self.device)
        self.scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler")

    def inference(self, prompt, num_steps=50, output_image_path="generated_image.png"):
        # Tokenize the input prompt
        inputs = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)

        # Encode the text input to get text embeddings
        text_embeddings = self.text_encoder(inputs)[0]
        print(text_embeddings.shape)
        # Generate latent noise for the image
        latents = torch.randn((1, self.unet.config.in_channels, 64, 64), device=self.device, dtype=self.dtype)

        # Prepare for the denoising loop
        self.scheduler.set_timesteps(num_steps)
        latents = latents * self.scheduler.init_noise_sigma

        # Denoising loop using the UNet model
        for t in self.scheduler.timesteps:
            latent_model_input = self.scheduler.scale_model_input(latents, t)
            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        # Decode the latents to pixel space using the VAE
        latents = 1 / 0.18215 * latents
        image = self.vae.decode(latents).sample

        # Convert the image to PIL format
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        image = Image.fromarray((image[0] * 255).astype("uint8"))

        # Save or display the generated image
        image.save(output_image_path)
        image.show()

# Example usage
if __name__ == "__main__":
    # Create an instance of the StableDiffusionModel
    sd_model = StableDiffusionModel(model_name="stabilityai/stable-diffusion-xl-base-1.0", revision="fp16", variant="fp16")

    # Run inference and generate an image


In [None]:
    sd_model.inference(prompt="A scenic landscape with mountains during sunset.")


In [4]:
from diffusers import StableDiffusionPipeline
import torch

model_id = "stabilityai/stable-diffusion-2"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)



The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

text_encoder/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]



In [6]:
import os
prompts = ["A young beautiful women with medium hair sitting in a park",
          "A brown indian lady with long hair dressed in saree ",
          "A young  south indian women dressed modernly with bob hair ",
          "A beautiful women dressed formally wearing a handbang",
           "A beautiful women with blue eyes and blonde hair dressed in jeans wearing a bag",
         "A beautiful women with brown eyes and straigh hair dressed in traditional wearing a jewels",

]

index = 0
for prompt in prompts:
  images = pipe(prompt=prompt, num_inference_steps=50, num_images_per_prompt = 4)[0]
  for i in range(len(images)):
          images[i].save(os.path.join("/content/classimgs", f"generated_{index}.png"))
          index +=1
del(pipe)


KeyboardInterrupt: 