## Text to image using stable diffusion

In [None]:
# !pip install diffusers==0.3.0 --q
# !pip install transformers scipy ftfy --q
# !pip install "ipywidgets>=7,<8" --q
# import IPython.display

## Imports

In [2]:
import gc
import os
import warnings

import IPython.display
import torch
from PIL import Image
from torch import autocast
from tqdm.auto import tqdm
# from kaggle_secrets import UserSecretsClient
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import StableDiffusionPipeline
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import LMSDiscreteScheduler , PNDMScheduler

# Suppress all Python warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Never commit an access token to the notebook: read it from the environment.
# On Kaggle, the commented-out UserSecretsClient lines below serve the same purpose.
# The value is None when neither variable is set; it is later passed as
# `use_auth_token` to StableDiffusionPipeline.from_pretrained.
# NOTE(review): a token was previously hard-coded here and must be treated as
# leaked - revoke it in the Hugging Face account settings.
# user_secrets = UserSecretsClient()
# Hugging_face  = user_secrets.get_secret("Hugging_id")
Hugging_face = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

## Hyperparameters

In [3]:
class config:
    """Run-wide settings shared by the generation cells of this notebook."""

    # Output resolution in pixels; the latent tensors are created at 1/8 of it.
    HEIGHT, WIDTH = 512, 512
    # Number of scheduler timesteps used for one generation.
    NUM_INFERENCE_STEPS = 50
    # Weight applied to (text - unconditional) noise in the guidance step.
    GUIDANCE_SCALE = 20
    # Number of latents sampled and of empty prompts encoded per run.
    BATCH_SIZE = 1
    # torch.manual_seed seeds the global default generator and returns it.
    GENERATOR = torch.manual_seed(50)
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

## Helper functions

In [4]:
def image_grid(imgs, rows, cols):
    """Tile images into a single ``rows`` x ``cols`` RGB image.

    Images are pasted left to right, top to bottom, in the order given.
    The tile size is taken from ``imgs[0].size``; the other images are
    pasted at multiples of that size without being resized.

    Args:
        imgs: Sequence of PIL images, exactly ``rows * cols`` of them.
        rows: Number of rows in the grid.
        cols: Number of columns in the grid.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``.

    Raises:
        AssertionError: If ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}"
    )
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        # Row-major placement: index i lands in row i // cols, column i % cols.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

## Loading the pretrained models

* <font size = 3><span style="color:#3A3E59"> The model we are going to use is `CompVis/stable-diffusion-v1-4`; its model card can be found <a href="https://huggingface.co/CompVis/stable-diffusion-v1-4">here</a>. </span></font>
* <font size = 3><span style="color:#3A3E59"> We are going to load the
     `Variational autoencoder (VAE)`,
     `Tokenizer`,
     `Text encoder`  and
     `U-Net`.</span></font>
     
* <font size = 3><span style="color:#3A3E59">Stable Diffusion during inference</span></font>

In [5]:
# The VAE weights come from the `vae` subfolder of the Stable Diffusion v1-4 repository.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
# Tokenizer and text encoder are both loaded from the same CLIP checkpoint,
# so the token ids produced by one match the vocabulary of the other.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

In [6]:
# The U-Net weights come from the `unet` subfolder of the same Stable Diffusion repository.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
# Move every model to the configured device (the tokenizer has no weights to move).
vae = vae.to(config.DEVICE)
text_encoder = text_encoder.to(config.DEVICE)
unet = unet.to(config.DEVICE)

# \033[94m is an ANSI colour escape; it is never reset within the message.
print(f'\033[94mTokenizer, Text Encoder, VAE, Unet are loaded !!!')

[94mTokenizer, Text Encoder, VAE, Unet are loaded !!!


## Scheduler

<font size = 5><span style="color:#F60195"> </span></font>
* <font size = 3><span style="color:#3A3E59"> Using K - LMS Scheduler</span></font>
* <font size = 3><span style="color:#3A3E59"> The default scheduler is PNDM scheduler </span></font>
* <font size = 3><span style="color:#3A3E59"> Other available schedulers include DDIM, DDPM and <a href="https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers">several more</a>.</span></font>

In [7]:
# Build the K-LMS scheduler with an explicit beta schedule.
# NOTE(review): these values presumably mirror the noise schedule the
# checkpoint was trained with - confirm against the model card.
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
print(f'\033[94mThe scheduler loaded is K-LMS Sceheduler')

[94mThe scheduler loaded is K-LMS Sceheduler


In [9]:
# Prompt(s) to render, as a list of strings (a single entry here).
prompt = ["black basketball nike shoes with blue laces"]

In [10]:
# Tokenize the prompt, padded/truncated to the tokenizer's maximum length.
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
max_length = text_input.input_ids.shape[-1]

# Empty prompts, padded to the same length, feed the unconditional branch
# of classifier-free guidance.
uncond_input = tokenizer(
    [""] * config.BATCH_SIZE,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
)

# Encode both token batches in a single gradient-free context.
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(config.DEVICE))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(config.DEVICE))[0]

# Unconditional first, conditional second: the denoising loop splits the
# model output with chunk(2) in exactly this order.
text_embeddings = torch.cat((uncond_embeddings, text_embeddings))
print(f'\033[94mText Embeddings shape: {text_embeddings.shape}')

[94mText Embeddings shape: torch.Size([2, 77, 768])


In [11]:
# Sample the starting noise with the seeded generator (one latent per batch
# item, at 1/8 of the image resolution) and move it to the compute device.
latents = torch.randn(
    (config.BATCH_SIZE, unet.in_channels, config.HEIGHT // 8, config.WIDTH // 8),
    generator=config.GENERATOR,
).to(config.DEVICE)

print(f'\033[94mLatent shape: {latents.shape}')

[94mLatent shape: torch.Size([1, 4, 64, 64])


## Denoising the latents

In [12]:
# Build the inference schedule for the configured number of steps.
scheduler.set_timesteps(config.NUM_INFERENCE_STEPS)
# Scale the freshly sampled noise by the first sigma of the schedule.
# NOTE: this overwrites `latents`, so re-running this cell without
# re-sampling the latents first applies the scaling twice.
latents = latents * scheduler.sigmas[0]

In [13]:

# Denoise the latents step by step with classifier-free guidance.
# Nothing here needs gradients, so the whole loop runs under no_grad.
with autocast(config.DEVICE), torch.no_grad():
    # Wrap the timesteps themselves (not the enumerate object) so tqdm knows
    # the total and shows real progress instead of "0it [00:00, ?it/s]".
    for i, t in enumerate(tqdm(scheduler.timesteps)):
        # Duplicate the latents so one forward pass yields both the
        # unconditional and the text-conditioned prediction.
        latent_model_input = torch.cat([latents] * 2)
        # Scale the model input by the current noise level.
        sigma = scheduler.sigmas[i]
        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # Move the prediction away from the unconditional one, towards the prompt.
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + config.GUIDANCE_SCALE * (noise_pred_text - noise_pred_uncond)

        # Step with the schedule index `i`, consistent with the second
        # generation loop in this notebook and with the diffusers==0.3.0 pin
        # in the install cell, whose K-LMS step indexes sigmas by position.
        # NOTE(review): newer diffusers releases expect the timestep value `t`
        # here instead - confirm against the installed version.
        latents = scheduler.step(noise_pred, i, latents).prev_sample

0it [00:00, ?it/s]

KeyboardInterrupt: 

## Decoding the image

In [None]:
# Rescale the latents by 1/0.18215 before handing them to the VAE decoder.
# NOTE(review): 0.18215 is presumably the latent scaling factor of this
# checkpoint's VAE - confirm against the diffusers documentation.
# This overwrites `latents`, so re-running the cell rescales them again.
latents = 1 / 0.18215 * latents

# Decode to image space; no gradients are needed for inference.
with torch.no_grad():
  image = vae.decode(latents).sample
print(f'\033[94mImage shape: {image.shape}')

## Visualizing the image

In [None]:
# Shift/scale by (x / 2 + 0.5) and clamp to [0, 1].
# NOTE(review): assumes the decoder output is roughly in [-1, 1] - confirm.
image = (image / 2 + 0.5).clamp(0, 1)
# Move to the CPU and reorder axes (0, 2, 3, 1) before converting to numpy.
# NOTE(review): assumes a (batch, channels, height, width) layout - confirm.
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
# Quantize to 8-bit values and build one PIL image per batch entry.
images = (image * 255).round().astype("uint8")
pil_images = [Image.fromarray(image) for image in images]
# Last expression of the cell: displays the first image.
pil_images[0]

In [None]:
# Second prompt to render, again as a one-element list of strings.
prompt = ["extra large size blue kurta with red buttons"]

In [None]:
# Tokenize the prompt, padded/truncated to the tokenizer's maximum length.
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
max_length = text_input.input_ids.shape[-1]

# Empty prompts, padded to the same length, feed the unconditional branch
# of classifier-free guidance.
uncond_input = tokenizer(
    [""] * config.BATCH_SIZE,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
)

# Encode both token batches in a single gradient-free context.
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(config.DEVICE))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(config.DEVICE))[0]

# Unconditional first, conditional second: the denoising loop splits the
# model output with chunk(2) in exactly this order.
text_embeddings = torch.cat((uncond_embeddings, text_embeddings))
print(f'\033[94mText Embeddings shape: {text_embeddings.shape}')

In [None]:
# Sample new starting noise from the shared generator (one latent per batch
# item, at 1/8 of the image resolution) and move it to the compute device.
latents = torch.randn(
    (config.BATCH_SIZE, unet.in_channels, config.HEIGHT // 8, config.WIDTH // 8),
    generator=config.GENERATOR,
).to(config.DEVICE)

print(f'\033[94mLatent shape: {latents.shape}')

## Denoising the latents for the second prompt

In [None]:
# Rebuild the inference schedule for the configured number of steps.
scheduler.set_timesteps(config.NUM_INFERENCE_STEPS)
# Scale the freshly sampled noise by the first sigma of the schedule.
# NOTE: this overwrites `latents`, so re-running this cell without
# re-sampling the latents first applies the scaling twice.
latents = latents * scheduler.sigmas[0]

In [1]:

# Denoise the latents step by step with classifier-free guidance.
# Nothing here needs gradients, so the whole loop runs under no_grad.
with autocast(config.DEVICE), torch.no_grad():
    # Wrap the timesteps themselves (not the enumerate object) so tqdm knows
    # the total and can render a real progress bar.
    for i, t in enumerate(tqdm(scheduler.timesteps)):
        # Duplicate the latents so one forward pass yields both the
        # unconditional and the text-conditioned prediction.
        latent_model_input = torch.cat([latents] * 2)
        # Scale the model input by the current noise level.
        sigma = scheduler.sigmas[i]
        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # Move the prediction away from the unconditional one, towards the prompt.
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + config.GUIDANCE_SCALE * (noise_pred_text - noise_pred_uncond)

        # The scheduler is stepped with the schedule index `i`.
        # NOTE(review): this matches the diffusers==0.3.0 pin in the install
        # cell; newer releases expect the timestep value `t` - confirm.
        latents = scheduler.step(noise_pred, i, latents).prev_sample

NameError: name 'autocast' is not defined

## Decoding it

In [None]:
# Rescale the latents by 1/0.18215 before handing them to the VAE decoder.
# NOTE(review): 0.18215 is presumably the latent scaling factor of this
# checkpoint's VAE - confirm against the diffusers documentation.
# This overwrites `latents`, so re-running the cell rescales them again.
latents = 1 / 0.18215 * latents

# Decode to image space; no gradients are needed for inference.
with torch.no_grad():
  image = vae.decode(latents).sample
print(f'\033[94mImage shape: {image.shape}')

## Visualizing the image

In [None]:
# Shift/scale by (x / 2 + 0.5) and clamp to [0, 1].
# NOTE(review): assumes the decoder output is roughly in [-1, 1] - confirm.
image = (image / 2 + 0.5).clamp(0, 1)
# Move to the CPU and reorder axes (0, 2, 3, 1) before converting to numpy.
# NOTE(review): assumes a (batch, channels, height, width) layout - confirm.
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
# Quantize to 8-bit values and build one PIL image per batch entry.
images = (image * 255).round().astype("uint8")
pil_images = [Image.fromarray(image) for image in images]
# Write the first image to the working directory, then display it.
pil_images[0].save("img2.jpg")
pil_images[0]

In [None]:
# Drop the notebook's references to the last latents and to the individually
# loaded models, then run the garbage collector before the full pipeline
# is loaded in the next section.
del latents, vae, text_encoder, unet
gc.collect()

## PRETRAINED PIPELINE FOR STABLE DIFFUSION

* <font size = 3><span style="color:#3A3E59">StableDiffusionPipeline is an end-to-end inference pipeline that we can use to generate images from text with just a few lines of code.
</span></font>

In [None]:
# Load the end-to-end pipeline from the `fp16` revision with half-precision
# weights, authenticating with the token defined in the imports cell.
# NOTE(review): float16 weights presumably require a GPU - confirm before
# running with config.DEVICE == "cpu".
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, use_auth_token=Hugging_face)  
pipe = pipe.to(config.DEVICE)
print(f'\033[94mStable Diffusion Pipeline created !!!')

## Visualizing the images

In [None]:
# Generate `num_images` images for the same prompt in one pipeline call.
num_images = 2
prompt = ["met gala dress, modern, traditional indian, chickenkari, negative: salwar kamiz, fusion saree and gown"] * num_images
# Autocast on the device the pipeline was moved to, not a hard-coded "cuda".
with autocast(config.DEVICE):
  images = pipe(prompt , num_inference_steps=100).images

# Size the grid from num_images so changing it above cannot trip
# image_grid's length check.
grid = image_grid(images, rows=1, cols=num_images)
grid

In [None]:
# Generate `num_images` images for the same prompt in one pipeline call.
num_images = 2
prompt =["winter attire: black punk-style outfit, layered with a blue t-shirt. full size view. vibrant yellow shoes."] * num_images
# Autocast on the device the pipeline was moved to, not a hard-coded "cuda".
with autocast(config.DEVICE):
  images = pipe(prompt , num_inference_steps=200).images

# Size the grid from num_images so changing it above cannot trip
# image_grid's length check.
grid = image_grid(images, rows=1, cols=num_images)
grid

In [None]:
# num_images = 4
# prompt =["Cybernetic cloaked anime character concept design, dynamic pose, fantasy anime, dark, bejewelled and encrusted technological royal cloak, powerful aggressive sword stance, biological human face, iridescent, dark and intricate, Greg Rutkowski, Makoto Shinkai, anime CGI, animated, animation, artgerm, artstation, digital illustration, 8k"] * num_images
# with autocast("cuda"):
#   images = pipe(prompt , num_inference_steps=200).images

# grid = image_grid(images, rows=2, cols=2)
# grid