In [1]:
! pip install diffusers accelerate



In [3]:
import torch
from transformers import pipeline
from transformers import SamModel, SamProcessor
from transformers import AutoImageProcessor, AutoModelForDepthEstimation, AutoProcessor, AutoModelForZeroShotObjectDetection
from diffusers import StableDiffusionImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler, AutoPipelineForText2Image
from diffusers.pipelines.controlnet import StableDiffusionControlNetInpaintPipeline

import os
import numpy as np
from PIL import Image
import random
import gc
import cv2
import matplotlib.pyplot as plt
from io import BytesIO

In [4]:
device = "cuda"
dtype = torch.float16

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Image-to-Image Pipeline

In [28]:
model_id_or_path = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionImg2ImgPipeline.from_pretrained (model_id_or_path, dtype=torch.float16)
pipe = pipe.to (device)

Keyword arguments {'dtype': torch.float16} are not expected by StableDiffusionImg2ImgPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
im_h, im_w = 512, 768

In [15]:
init_image = Image.open("/content/drive/MyDrive/images/empty-room-1.jpg")
init_image = init_image.resize((im_w, im_h))

# prompt = """A small, inviting coffee shop is a warm refuge from the rain lashing against its large picture windows.
# The interior is filled with the rich aroma of freshly ground coffee and baked pastries.
# The lighting is soft and warm, coming from low-hanging Edison bulbs and the glow of the espresso machine.
#  A collection of mismatched, comfortable armchairs and small wooden tables are arranged throughout the space.
# The walls are lined with bookshelves filled with used paperbacks for patrons to read.
#  Steam fogs up the inside of the windows, and you can see streaks of rain running down the glass.
#  The gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background."""

prompt = """\
A small, inviting coffee shop is a warm refuge from the rain lashing against its large window.
The interior is filled with people enjoying the rich aroma of freshly ground coffee and baked pastries.
The lighting is soft and warm, coming from low-hanging lights and the glow of the espresso machine.
There are enclosed glass display shelves containing pastries and other tasty food items.
A collection of chairs and small tables are arranged throughout."""


### **Stable Diffusion's Guidance Scale (CFG Scale)**
Controls how closely the AI-generated image follows your text prompt, acting as a "prompt strength" knob: higher numbers (e.g., 12-16) make the AI stick strictly to the prompt but can limit creativity and add artifacts, while lower numbers (e.g., 6-9) give the AI more creative freedom, often resulting in better overall quality, with a default often around 7-7.5. Finding the right balance is key, with mid-range values typically offering the best blend of prompt adherence and artistic flair, although extreme values (very low or very high) can produce unique but often undesirable results like overly abstract or oversaturated images.

### **Stable Diffusion Strength (or Denoising Strength) parameter**
Controls how much the AI alters an input image, especially in Image-to-Image or Inpainting tasks, balancing creativity with adherence to the original; a lower value keeps the output closer to the input, while a higher value allows for more significant changes and artistic freedom, with values typically around 0.75 being a default for a blend.

In [13]:
# test runs
result = pipe (
    prompt = prompt,
    image = init_image,
    strength = 0.9,
    guidance_scale = 8.0,
    steps = 75 # number of sampling steps
)
images = result.images
images[0].save("coffee-shop-4.png")


  0%|          | 0/45 [00:00<?, ?it/s]

In [17]:
prompt = """\
A small, inviting coffee shop is a warm refuge from the rain lashing against its large window.
The interior has people enjoying the aroma of freshly ground coffee and pastries.
The lighting is soft and warm, coming from overhead lights and the glow of the espresso machine.
There are enclosed glass display shelves containing pastries.
A collection of chairs and small tables are arranged throughout."""

In [18]:
! rm -f output*.png

In [19]:
strengths = [0.2, 0.4, 0.6, 0.8]      # 0.2 = low strength, 0.7 = high strength
guidance_scales = [2, 4, 6, 8, 10]    # Specify the effect of the prompt
steps = 50
# total output images = 4 x 5 = 20

for st in strengths:
  for gs in guidance_scales:
    images = pipe (
        prompt = prompt,
        image = init_image,
        strength = st,
        guidance_scale = gs,
        steps=steps).images

    output_file = f"output - strength_{st} - gs_{gs}.png"
    print(output_file)
    images[0].save(output_file)


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

In [20]:
! mv /content/output*.png /content/drive/MyDrive/outputs

# Inpainting Pipeline

In [24]:
del pipe

In [22]:
base_model_name = "SG161222/Realistic_Vision_V5.1_noVAE"
# base_model_name = "runwayml/stable-diffusion-inpainting"
# seg_model_name = "BertChristiaens/controlnet-seg-room"
seg_model_name = "lllyasviel/control_v11p_sd15_seg" # for SD 1.5
depth_model_name = "lllyasviel/sd-controlnet-depth"

In [23]:
controlnet_seg = ControlNetModel.from_pretrained (seg_model_name, torch_dtype=dtype)
controlnet_depth = ControlNetModel.from_pretrained (depth_model_name, torch_dtype=dtype)

config.json:   0%|          | 0.00/994 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

In [29]:
sd_pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained (
    base_model_name,
    controlnet = [controlnet_seg, controlnet_depth],
    safety_checker = None,
    torch_dtype = dtype
)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
sd_pipe.schedular = UniPCMultistepScheduler.from_config (pipe.scheduler.config)
sd_pipe = pipe.to(device)

In [31]:
# Positive Prompt (things we want)

pos_prompt = """A small, inviting coffee shop is a warm refuge from the rain lashing against its large picture windows.
The interior is filled with the rich aroma of freshly ground coffee and baked pastries.
The lighting is soft and warm, coming from low-hanging Edison bulbs and the glow of the espresso machine.
 A collection of mismatched, comfortable armchairs and small wooden tables are arranged throughout the space.
The walls are lined with bookshelves filled with used paperbacks for patrons to read.
 Steam fogs up the inside of the windows, and you can see streaks of rain running down the glass.
 The gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background."""

# pos_prompt = """Photorealistic interior design, modern minimalist living room, clean aesthetic, high-end furniture, bright and airy.
#  Dominant color scheme of soft pastel yellow and light grey, with accents of dark wood and subtle green.
#   Neutral off-white walls and light beige rug. A long, contemporary, three-seater sofa upholstered in a muted pastel yellow fabric,
#    with thin, dark wooden legs. On the sofa, two matching yellow throw pillows and one light grey throw pillow.
#     Centered in front of the sofa, a rectangular coffee table with a dark wooden top and thin dark legs.
#      On the coffee table, a small stack of books and a round, light-colored decorative object.
#      Behind the sofa, three framed abstract art prints in a row, featuring geometric shapes in shades of yellow, grey, and off-white.
#       To the right of the sofa, a matching armchair in the same pastel yellow fabric and dark wooden legs.
#        To the right of the armchair, a low-slung, mid-century modern-style lounge chair with a light yellow upholstered seat and back,
#         set within a dark wooden frame. In front of the sofa and armchair, a soft, round ottoman in a matching pastel yellow fabric.
#          Recessed ceiling spotlights illuminating the room. Two modern pendant lights with dark, minimalist fixtures hanging from
#           the ceiling: one large, dark dome-shaped pendant light above the armchair area, and a smaller, dark, conical pendant light
#            above the coffee table area. A single, dark, wall-mounted sconce on the wall to the left of the art prints.
#             A single potted green plant with broad leaves (e.g., a Fiddle Leaf Fig or similar) placed on a small, dark wooden
#              side table between the sofa and the right armchair. The flooring is a light, smooth, possibly concrete or light wood finish,
#               mostly covered by a large, rectangular, light beige/off-white area rug. Soft, diffused interior lighting.
#                Professional interior photography. High resolution, sharp focus, exquisite detail, luxurious texture rendering."""

# Prompt Suffix (things that apply to all positive prompts)
prompt_suffix = """interior design, 8K, high resolution, ultra realistic, high quality, photorealistic, shot on a Canon EOS R5 with an 85mm f/1.2 lens."""
pos_prompt += prompt_suffix

# Negative Prompt (things we do NOT want)
neg_prompt = "windows, doors, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner"


In [32]:
gs = 10                # guidance scale
num_steps = 75         # iterative sampling steps
strength = 0.99        # strength of modification
im_h, im_w = 512, 768  # image height, width

In [33]:
im_path = "/content/drive/MyDrive/images/empty-room-2.jpg"
orig_image = Image.open(im_path)
image = orig_image.resize((im_w, im_h))

In [34]:
orig_w, orig_h = orig_image.size
print(orig_w, orig_h)

736 736


# Depth Map

In [35]:
depth_model_name = "LiheYoung/depth-anything-base-hf" # Zero shot depth map model

In [36]:
pipe = pipeline (
    task = "depth-estimation",
    model = depth_model_name,
    dtype = dtype,
    device = device
)

config.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


In [37]:
def get_depth_map (image):
  depth_map = pipe(orig_image)["depth"]
  return depth_map

In [38]:
depth_map = get_depth_map (orig_image)
depth_map.save("depth_map.png")

# Segmentation Map

In [39]:
# Download the bbox model
grounding_dino_model_id = "IDEA-Research/grounding-dino-tiny"
grounding_dino_processor = AutoProcessor.from_pretrained(grounding_dino_model_id)
grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(grounding_dino_model_id).to(device)

preprocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/689M [00:00<?, ?B/s]

In [40]:
def get_bboxes (image, text_labels):

  inputs = grounding_dino_processor (
    images = image, text = text_labels, return_tensors = "pt").to(device)
  with torch.no_grad():
    outputs = grounding_dino_model (**inputs)

  results = grounding_dino_processor.post_process_grounded_object_detection (
      outputs,
      inputs.input_ids,
      target_sizes = [image.size[::-1]]
  )

  # Extract the bounding boxes
  boxes  = results[0]["boxes"]
  scores = results[0]["scores"]
  labels = results[0]["labels"]

  return [boxes.tolist()], labels

In [41]:
# test the above cell
get_bboxes (image, "door. window.")



([[[2.1608219146728516,
    55.71337890625,
    122.01409149169922,
    368.69512939453125],
   [172.18069458007812,
    145.08230590820312,
    219.08575439453125,
    326.15069580078125],
   [172.06141662597656,
    145.0760498046875,
    218.90533447265625,
    326.3276672363281]]],
 ['window', 'window', 'door'])

In [42]:
# Download the segmentation model
sam_model_id = "facebook/sam-vit-base"
sam_processor = SamProcessor.from_pretrained(sam_model_id)
sam_model = SamModel.from_pretrained(sam_model_id).to(device)

preprocessor_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

In [43]:
def get_seg_map (image, text_labels):

  boxes, labels = get_bboxes (image, text_labels)
  inputs = sam_processor(image.convert("RGB"), return_tensors="pt").to(device)
  image_embeddings = sam_model.get_image_embeddings(inputs["pixel_values"])
  inputs = sam_processor (image, input_boxes=boxes, return_tensors="pt").to(device)

  inputs["input_boxes"].shape  # ???
  inputs.pop("pixel_values", None)
  inputs.update({"image_embeddings": image_embeddings})

  with torch.no_grad():
      outputs = sam_model (**inputs)
  masks = sam_processor.image_processor.post_process_masks (
      outputs.pred_masks.cpu(),
      inputs["original_sizes"].cpu(),
      inputs["reshaped_input_sizes"].cpu()
  )
  scores = outputs.iou_scores

  all_masks = masks[0]
  all_scores = scores[0]
  # Select the masks with the highest probability score
  print(len(all_masks), all_masks.shape, all_scores)

  image_np = np.array(image)
  combined_mask_overlay = np.zeros_like(image_np)    # black image
  select_mask_overlay = 255 * np.ones_like(image_np) # white image

  for i in range(all_masks.shape[0]):

    # identify the mask with the highest score for each mask region
    best_mask_idx = torch.argmax(all_scores[i]).item()
    best_mask = all_masks[i, best_mask_idx]

    # create a random color
    color = [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)]

    # convert the mask to numpy (from PT tensors)
    mask_np = best_mask.cpu().numpy().astype(np.uint8)

    # highlight the mask pixels with this mask's color
    combined_mask_overlay[mask_np > 0] = color

    if labels[i] == 'window' or labels[i] == 'door':
      select_mask_overlay[mask_np > 0] = [0, 0, 0]  # change mask to black

  return Image.fromarray(combined_mask_overlay), Image.fromarray(select_mask_overlay)


In [44]:
text_labels = "door. window. floor. ceiling. walls."
seg_map, seg_mask = get_seg_map (orig_image, text_labels)


7 torch.Size([7, 3, 736, 736]) tensor([[0.9797, 0.9947, 0.9739],
        [0.9873, 0.9797, 0.9166],
        [1.0156, 0.9891, 0.7867],
        [0.9747, 0.9777, 0.9069],
        [0.9876, 0.9925, 0.9779],
        [1.0044, 1.0051, 0.9615],
        [0.9741, 0.9776, 0.9112]], device='cuda:0')


In [45]:
seg_map.save("seg_map.png")

In [46]:
seg_mask.save("seg_mask.png")

In [47]:
depth_map_resized = depth_map.resize((im_w, im_h))
seg_map_resized = seg_map.resize((im_w, im_h))

In [48]:
seg_mask_np = np.array(seg_mask).astype(np.uint8)
seg_mask_resized = Image.fromarray(seg_mask_np).resize((im_w, im_h))

# Image Generation

In [49]:
results_folder = "results_final_1/"
os.makedirs(results_folder, exist_ok=True)

In [52]:
! mv results_final_1/ /content/drive/MyDrive/

In [50]:
cond_seg = [0.5] # [0.4, 0.5, 0.7, 0.75, 0.8, 0.9, 1.0]
cond_depth = [0.12, 0.15] # [0.1, 0.2, 0.3, 0.5]
strength = [0.9, 0.99] # [0.7, 0.8, 0.9]

In [51]:
# Try each combination of parameters and generate the output images
for cs in cond_seg:
  for cd in cond_depth:
    for st in strength:
      random_seed = random.randint(0, 2**32 - 1)
      generator = torch.Generator(device=device).manual_seed(random_seed)
      generated_image = sd_pipe (
          prompt = pos_prompt,
          negative_prompt = neg_prompt,
          num_inference_steps = num_steps,
          strength = st,
          guidance_scale = gs,
          generator = [generator],
          image = image,
          mask_image = seg_mask_resized,
          control_image = [seg_map_resized, depth_map_resized],
          controlnet_conditioning_scale = [cs, cd]
      ).images[0]

      design_image = generated_image.resize((orig_w, orig_h), Image.Resampling.LANCZOS)
      design_image.save(f"{results_folder}/design_image - cs_{cs} - cd_{cd} - st_{st}.png")


Token indices sequence length is longer than the specified maximum sequence length for this model (179 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['the space . the walls are lined with bookshelves filled with used paperbacks for patrons to read . steam fogs up the inside of the windows , and you can see streaks of rain running down the glass . the gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background . interior design , 8 k , high resolution , ultra realistic , high quality , photorealistic , shot on a canon eos r 5 with an 8 5 mm f / 1 . 2 lens .']


  0%|          | 0/67 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['the space . the walls are lined with bookshelves filled with used paperbacks for patrons to read . steam fogs up the inside of the windows , and you can see streaks of rain running down the glass . the gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background . interior design , 8 k , high resolution , ultra realistic , high quality , photorealistic , shot on a canon eos r 5 with an 8 5 mm f / 1 . 2 lens .']


  0%|          | 0/74 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['the space . the walls are lined with bookshelves filled with used paperbacks for patrons to read . steam fogs up the inside of the windows , and you can see streaks of rain running down the glass . the gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background . interior design , 8 k , high resolution , ultra realistic , high quality , photorealistic , shot on a canon eos r 5 with an 8 5 mm f / 1 . 2 lens .']


  0%|          | 0/67 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['the space . the walls are lined with bookshelves filled with used paperbacks for patrons to read . steam fogs up the inside of the windows , and you can see streaks of rain running down the glass . the gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background . interior design , 8 k , high resolution , ultra realistic , high quality , photorealistic , shot on a canon eos r 5 with an 8 5 mm f / 1 . 2 lens .']


  0%|          | 0/74 [00:00<?, ?it/s]

# Try it with a different interior image

In [54]:
im_path = "/content/drive/MyDrive/images/house-interior-1.jpg"
orig_image = Image.open(im_path)
image = orig_image.resize((im_w, im_h))

In [55]:
# Create a new prompt

pos_prompt = """Photorealistic interior design, modern minimalist living room, clean aesthetic, high-end furniture, bright and airy.
 Dominant color scheme of soft pastel yellow and light grey, with accents of dark wood and subtle green.
  Neutral off-white walls and light beige rug. A long, contemporary, three-seater sofa upholstered in a muted pastel yellow fabric,
   with thin, dark wooden legs. On the sofa, two matching yellow throw pillows and one light grey throw pillow.
    Centered in front of the sofa, a rectangular coffee table with a dark wooden top and thin dark legs.
     On the coffee table, a small stack of books and a round, light-colored decorative object.
     Behind the sofa, three framed abstract art prints in a row, featuring geometric shapes in shades of yellow, grey, and off-white.
      To the right of the sofa, a matching armchair in the same pastel yellow fabric and dark wooden legs.
       To the right of the armchair, a low-slung, mid-century modern-style lounge chair with a light yellow upholstered seat and back,
        set within a dark wooden frame. In front of the sofa and armchair, a soft, round ottoman in a matching pastel yellow fabric.
         Recessed ceiling spotlights illuminating the room. Two modern pendant lights with dark, minimalist fixtures hanging from
          the ceiling: one large, dark dome-shaped pendant light above the armchair area, and a smaller, dark, conical pendant light
           above the coffee table area. A single, dark, wall-mounted sconce on the wall to the left of the art prints.
            A single potted green plant with broad leaves (e.g., a Fiddle Leaf Fig or similar) placed on a small, dark wooden
             side table between the sofa and the right armchair. The flooring is a light, smooth, possibly concrete or light wood finish,
              mostly covered by a large, rectangular, light beige/off-white area rug. Soft, diffused interior lighting.
               Professional interior photography. High resolution, sharp focus, exquisite detail, luxurious texture rendering."""

# Prompt Suffix (things that apply to all positive prompts)
prompt_suffix = """interior design, 8K, high resolution, ultra realistic, high quality, photorealistic """
pos_prompt += prompt_suffix

# Negative Prompt (things we do NOT want)
neg_prompt = "windows, doors, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner"


## Depth Map

In [56]:
depth_model_name = "LiheYoung/depth-anything-base-hf"

In [57]:
pipe = pipeline(
    task="depth-estimation",
    model=depth_model_name,
    dtype=dtype,
    device=device
)

Device set to use cuda


In [58]:
def get_depth_map(image):
    depth_map = pipe(orig_image)["depth"]
    return depth_map

In [59]:
get_depth_map(image).save("depth_map.png")

## Segmentation Mask

In [60]:
grounding_dino_model_id = "IDEA-Research/grounding-dino-tiny"
grounding_dino_processor = AutoProcessor.from_pretrained(grounding_dino_model_id)
grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(grounding_dino_model_id).to(device)

In [None]:
sam_model_id = "facebook/sam-vit-huge"
sam_processor = SamProcessor.from_pretrained(sam_model_id)
sam_model = SamModel.from_pretrained(sam_model_id).to(device)

preprocessor_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

In [None]:
def get_bboxes(image, text_labels):

  inputs = grounding_dino_processor(
    images=image, text=text_labels, return_tensors="pt").to(device)

  with torch.no_grad():
    outputs = grounding_dino_model(**inputs)

  results = grounding_dino_processor.post_process_grounded_object_detection(
      outputs,
      inputs.input_ids,
      target_sizes=[image.size[::-1]]
  )

  # Extract the bounding boxes
  boxes = results[0]["boxes"]
  scores = results[0]["scores"]
  labels = results[0]["labels"]

  return [boxes.tolist()], labels

In [None]:
def get_seg_map(image, text_labels):

  boxes, labels = get_bboxes(image, text_labels)

  inputs = sam_processor(image.convert("RGB"), return_tensors="pt").to(device)
  image_embeddings = sam_model.get_image_embeddings(inputs["pixel_values"])

  inputs = sam_processor(image, input_boxes=boxes, return_tensors="pt").to(device)
  inputs["input_boxes"].shape
  inputs.pop("pixel_values", None)
  inputs.update({"image_embeddings": image_embeddings})

  with torch.no_grad():
      outputs = sam_model(**inputs)

  masks = sam_processor.image_processor.post_process_masks(
      outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
  scores = outputs.iou_scores
  print(masks[0])
  print(scores)

  image_np = np.array(image)

  combined_mask_overlay = np.zeros_like(image_np)
  all_masks = masks[0]

  all_scores = scores[0]

  for i in range(all_masks.shape[0]):
      best_mask_idx = torch.argmax(all_scores[i]).item()
      best_mask = all_masks[i, best_mask_idx]

      color = [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)]

      mask_np = best_mask.cpu().numpy().astype(np.uint8)

      combined_mask_overlay[mask_np > 0] = color

  alpha = 1.0
  final_image = cv2.addWeighted(image_np, 0, combined_mask_overlay, alpha, 0)

  final_image_rgb = cv2.cvtColor(final_image, cv2.COLOR_BGR2RGB)
  final_image_pil = Image.fromarray(final_image_rgb)
  return final_image_pil, masks, labels

In [None]:
text_labels = "door. window. floor. ceiling. walls"

seg_map, masks,labels = get_seg_map(orig_image, text_labels)

tensor([[[[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]],

         [[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]],

         [[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False,

In [None]:
seg_map.save("seg_map.png")

In [None]:
def get_seg(image, text_labels):
  seg_map, masks,labels = get_seg_map(image, text_labels)

  indices_to_select = []
  for i,label in enumerate(labels):
    if label == 'window' or label == 'door':
      indices_to_select.append(i)
  # color_win = (0,255,3)
  # color_door = (1,255,8)

  conv_masks = masks[0].numpy().astype(np.uint8) * 255
  final_masks = conv_masks[indices_to_select,0,...]

  total_masks = np.zeros_like(final_masks[0])
  for i in range(len(final_masks)):
    total_masks+=final_masks[i,...]
  print(np.unique(total_masks))

  seg_mask = 255-(np.clip(total_masks,0,1)*255)#255-total_masks

  return seg_map, seg_mask

In [None]:
from PIL import Image
import numpy as np

text_labels = "door. window. floor. ceiling. walls"
seg_map, seg_mask_np = get_seg(orig_image, text_labels)
seg_map.save("seg_map.png")

# Convert the NumPy array to a PIL Image before saving
seg_mask_pil = Image.fromarray(seg_mask_np.astype(np.uint8))
seg_mask_pil.save("seg_mask.png")

[  0 254 255]


In [None]:

depth_map = get_depth_map(orig_image)
depth_map = depth_map.resize((im_w, im_h))

In [None]:
seg_mask = seg_mask_pil.resize((im_w, im_h))
seg_map = seg_map.resize((im_w, im_h))


In [None]:
a=5

In [None]:
#seg_model_name = "BertChristiaens/controlnet-seg-room"
seg_model_name = "lllyasviel/control_v11p_sd15_seg"
depth_model_name = "lllyasviel/sd-controlnet-depth"
base_model_name = "SG161222/Realistic_Vision_V5.1_noVAE"
#base_model_name = "runwayml/stable-diffusion-inpainting",

controlnet_depth = ControlNetModel.from_pretrained(depth_model_name, torch_dtype=torch.float16)
#base_model_name = "runwayml/stable-diffusion-v1-5"
controlnet_seg = ControlNetModel.from_pretrained(seg_model_name, torch_dtype=torch.float16)

In [None]:
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    base_model_name,
    controlnet=[controlnet_seg, controlnet_depth],
    safety_checker=None,
    torch_dtype=dtype
)

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
neg_prompt = "windows, doors, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner"
guidance_scale = 10
num_steps = 100
strength = 0.999
im_h,im_w = 512,768
orig_h, orig_w = 1993, 3000

In [None]:
cond_seg = [0.5,0.7,0.75, 0.8,0.9, 1.0]
cond_depth = [0.1,0.2,0.3, 0.5]
strength = [0.7,0.8,0.9]

In [None]:
for cs in cond_seg:
  for cd in cond_depth:
    for st in strength:
      random_seed = random....
      generator = torch.Generator(device="cpu").manual_seed(random_seed)

      generated_image = pipe(
          prompt=pos_prompt,
          negative_prompt=neg_prompt,
          num_inference_steps=num_steps,
          strength=st,
          guidance_scale=guidance_scale,
          generator=[generator],
          image=image,
          mask_image=seg_mask,
          control_image=[seg_map, depth_map],
          controlnet_conditioning_scale=[cs,cd]
      ).images[0]
      design_image = generated_image.resize(
          (orig_w, orig_h), Image.Resampling.LANCZOS
      )
      design_image.save(f"design_image - cs_{cs} - cd_{cd} - st_{st}.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['the space. the walls are lined with bookshelves filled with used paperbacks for patrons to read. steam fogs up the inside of the windows, and you can see streaks of rain running down the glass. the gentle hum of conversation mixes with the sound of the coffee grinder and the soft jazz music playing in the background. interior design, 8 k, high resolution, ultra realistic, high quality, photorealistic, shot on a canon eos r 5 with an 8 5 mm f / 1. 2 lens.']


  0%|          | 0/90 [00:00<?, ?it/s]