<h1>
<hr style=" border:none; height:3px;">
<center>Main Workflow - Seamless Stable Diffusion CS</center>
<hr style=" border:none; height:3px;">
</h1>

<center><img src='https://netacad.centralesupelec.fr/img/cs.jpg' width=200></center>

<h4><center>Louis LHOTTE | Clément VERON | Edouard SEGUIER</center></h4>

# I. Imports

In [1]:
import cv2
import numpy as np

# II. Observation and Blending

<div class="alert alert-block alert-info"> After conducting tests, we realized that some additions were somewhat weird and unrealistic compared to the original image. For instance, adding headphones to a regular photo consistently (across three attempts) resulted in an outlier (very strange style) or half-headphones with poorly matched colors. The idea, therefore, is to add a <b>blending / filtering</b> step to the pipeline so that the headphones integrate better into the photo (aiming for realism and seamless integration).</div>

In [None]:
original_img = cv2.imread('sylvie.jpg')
modified_img = cv2.imread('headphones.png')

if original_img.shape[:2] != modified_img.shape[:2]:
    modified_img = cv2.resize(modified_img, (original_img.shape[1], original_img.shape[0]))

diff_mask = cv2.absdiff(original_img, modified_img)
gray_diff_mask = cv2.cvtColor(diff_mask, cv2.COLOR_BGR2GRAY)
_, binary_diff_mask = cv2.threshold(gray_diff_mask, 30, 255, cv2.THRESH_BINARY)

contours, _ = cv2.findContours(binary_diff_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

mask = np.zeros_like(binary_diff_mask)
cv2.drawContours(mask, contours, -1, (255), thickness=cv2.FILLED)

original_region = cv2.bitwise_and(original_img, original_img, mask=mask)
modified_region = cv2.bitwise_and(modified_img, modified_img, mask=mask)

modified_region_lab = cv2.cvtColor(modified_region, cv2.COLOR_BGR2LAB)
l, a, b = cv2.split(modified_region_lab)
a = cv2.add(a, 10)
b = cv2.add(b, 20)
modified_region_lab = cv2.merge((l, a, b))
modified_region_colored = cv2.cvtColor(modified_region_lab, cv2.COLOR_LAB2BGR)

blended_region = cv2.addWeighted(original_region, 0.4, modified_region_colored, 0.6, 0)

final_result = cv2.addWeighted(original_img, 1, blended_region, 1, 0)

cv2.imwrite('final_blended_output_v2.jpg', final_result)
cv2.imshow('Final Blended Image', final_result)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
original_img = cv2.imread('sylvie.jpg')
modified_img = cv2.imread('headphones.png')

if original_img.shape[:2] != modified_img.shape[:2]:
    modified_img = cv2.resize(modified_img, (original_img.shape[1], original_img.shape[0]))

diff_mask = cv2.absdiff(original_img, modified_img)
gray_diff_mask = cv2.cvtColor(diff_mask, cv2.COLOR_BGR2GRAY)
_, binary_diff_mask = cv2.threshold(gray_diff_mask, 30, 255, cv2.THRESH_BINARY)

# Morphological operations to refine the mask
kernel = np.ones((5, 5), np.uint8)
binary_diff_mask = cv2.erode(binary_diff_mask, kernel, iterations=1)
binary_diff_mask = cv2.dilate(binary_diff_mask, kernel, iterations=1)

x, y, w, h = cv2.boundingRect(binary_diff_mask)
center = (x + w // 2, y + h // 2)

original_img_lab = cv2.cvtColor(original_img, cv2.COLOR_BGR2LAB)
modified_img_lab = cv2.cvtColor(modified_img, cv2.COLOR_BGR2LAB)

l_orig, a_orig, b_orig = cv2.split(original_img_lab)
l_mod, a_mod, b_mod = cv2.split(modified_img_lab)

a_mod = cv2.addWeighted(a_mod, 0.9, a_orig, 0.1, 0)
b_mod = cv2.addWeighted(b_mod, 0.9, b_orig, 0.1, 0)

modified_img_lab = cv2.merge((l_mod, a_mod, b_mod))
modified_img = cv2.cvtColor(modified_img_lab, cv2.COLOR_LAB2BGR)

mask = cv2.absdiff(original_img, modified_img)
gray_mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
_, binary_mask = cv2.threshold(gray_mask, 30, 255, cv2.THRESH_BINARY)

binary_mask_cropped = np.zeros_like(binary_mask)
binary_mask_cropped[y:y+h, x:x+w] = binary_mask[y:y+h, x:x+w]

# Gaussian blur (feathering)
feathered_mask = cv2.GaussianBlur(binary_mask_cropped, (9, 9), 0)

blended_img = cv2.seamlessClone(modified_img, original_img, feathered_mask, center, cv2.NORMAL_CLONE)

cv2.imwrite('blended_output_v2.jpg', blended_img)
cv2.imshow('Blended Image', blended_img)
cv2.waitKey(0)
cv2.destroyAllWindows()


<div class="alert alert-block alert-danger">Makes the photo more natural so it looks effective to some extent. However, I have some difficulty zoning in the modified region and some unwanted elements gets modified (like the beard unfortunately). Overall colours are also changed slightly but it is linked with the same problem : I am modifying more than I should</div>

## III. Space color filter

In [18]:
original_img = cv2.imread('sylvie.jpg')
modified_img = cv2.imread('headphones.png')

if original_img.shape[:2] != modified_img.shape[:2]:
    modified_img = cv2.resize(modified_img, (original_img.shape[1], original_img.shape[0]))

diff_mask = cv2.absdiff(original_img, modified_img)
gray_diff_mask = cv2.cvtColor(diff_mask, cv2.COLOR_BGR2GRAY)
_, binary_diff_mask = cv2.threshold(gray_diff_mask, 30, 255, cv2.THRESH_BINARY)

contours, _ = cv2.findContours(binary_diff_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

mask = np.zeros_like(binary_diff_mask)
cv2.drawContours(mask, contours, -1, (255), thickness=cv2.FILLED)

original_region = cv2.bitwise_and(original_img, original_img, mask=mask)
modified_region = cv2.bitwise_and(modified_img, modified_img, mask=mask)

# Adjust brightness and color with High-Pass Filtering (frequency-based blending)
blurred_original = cv2.GaussianBlur(original_region, (21, 21), 0)
high_pass_original = cv2.subtract(original_region, blurred_original)

modified_blended = cv2.add(modified_region, high_pass_original)

# Subtle LAB adjustment to ensure color harmony
modified_region_lab = cv2.cvtColor(modified_blended, cv2.COLOR_BGR2LAB)
l, a, b = cv2.split(modified_region_lab)
a = cv2.add(a, 3)  # Small adjustment in color channels
b = cv2.add(b, 5)
modified_region_lab_adjusted = cv2.merge((l, a, b))
modified_region_final = cv2.cvtColor(modified_region_lab_adjusted, cv2.COLOR_LAB2BGR)

blended_region = cv2.addWeighted(original_region, 0.5, modified_region_final, 0.5, 0)
final_result = cv2.addWeighted(original_img, 1, blended_region, 1, 0)

output_path = 'final_blended_output_v4.jpg'
cv2.imwrite(output_path, final_result)
cv2.imshow('Final Blended Image', final_result)
cv2.waitKey(0)
cv2.destroyAllWindows()

<div class="alert alert-block alert-danger">Did not converge yet</div>

# IV. Evaluation Metric

In [24]:
import torch
from transformers import AutoFeatureExtractor, AutoModel
from PIL import Image
from torchvision import transforms
from torch.nn.functional import cosine_similarity
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/vit-base-patch16-224"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")
    return inputs["pixel_values"].to(device)

image_path1 = "sylvie.jpg"
image_path2 = "blended_output_v2.jpg"
image1 = preprocess_image(image_path1)
image2 = preprocess_image(image_path2)

with torch.no_grad():
    features1 = model(image1).last_hidden_state.mean(dim=1)
    features2 = model(image2).last_hidden_state.mean(dim=1)

similarity = cosine_similarity(features1, features2).item()
print(f"Similarité entre les deux images: {similarity:.4f}")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Similarité entre les deux images: 0.8044
