<a href="https://colab.research.google.com/github/liverishav13/EE798R_Code/blob/main/Updated_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPTokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt

In [None]:
# Define the Context Diffusion model
class ContextDiffusion(nn.Module):
    """Wrap a text encoder, an image encoder and a diffusion pipeline.

    The module encodes an optional text prompt and a visual context image,
    projects the image tokens to the text embedding width, concatenates both
    token sequences, and then asks the diffusion pipeline for an image.

    Args:
        text_encoder: Callable taking the tokenizer output as keyword
            arguments and returning an object with ``last_hidden_state``.
        image_encoder: Callable taking ``pixel_values=`` and returning an
            object with ``last_hidden_state``.
        diffusion_model: Callable taking ``(prompt, num_inference_steps=...)``
            and returning an object with an ``images`` sequence.
        tokenizer: Callable turning a prompt into a mapping of tensors.
    """

    def __init__(self, text_encoder, image_encoder, diffusion_model, tokenizer):
        super().__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.diffusion_model = diffusion_model
        self.tokenizer = tokenizer

        # Use the encoders' own hidden sizes when they expose a config; otherwise
        # fall back to the 768 -> 512 mapping this block originally hard-coded.
        image_dim = getattr(getattr(image_encoder, "config", None), "hidden_size", 768)
        text_dim = getattr(getattr(text_encoder, "config", None), "hidden_size", 512)

        # Projection layer to align image embedding dimensions with text embeddings
        self.image_projection = nn.Linear(image_dim, text_dim)

    def forward(self, query_image, context_image, prompt=None, num_inference_steps=10):
        """Encode prompt and context image, then generate one image.

        Args:
            query_image: Accepted for interface compatibility; not used here.
            context_image: Pixel tensor passed to the image encoder as
                ``pixel_values``.
            prompt: Text prompt. ``None`` and ``""`` both mean "no prompt".
            num_inference_steps: Forwarded to the diffusion pipeline.

        Returns:
            The first entry of the pipeline result's ``images``.
        """
        # New tensors follow the projection layer so they stay on the module's
        # device/dtype after ``model.to(...)``.
        projection_weight = self.image_projection.weight
        device, dtype = projection_weight.device, projection_weight.dtype

        # PART 1: Text Encoding
        if prompt:
            # truncation=True keeps over-long prompts within the tokenizer's limit.
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            text_embeddings = self.text_encoder(**inputs).last_hidden_state
        else:
            # A single all-zero token. It must be 3-D (batch, tokens, dim) so it can
            # be concatenated with the context tokens; a 2-D tensor makes torch.cat
            # fail with "Tensors must have same number of dimensions".
            text_embeddings = torch.zeros(
                1, 1, self.image_projection.out_features, device=device, dtype=dtype
            )

        # PART 2: Visual Context Encoding
        context_embedding = self.image_encoder(pixel_values=context_image).last_hidden_state
        # Average over dim 0 (the context images), keeping a leading dim of size 1.
        context_embedding = context_embedding.mean(dim=0, keepdim=True)

        # Project context embedding to match text embedding dimension
        context_embedding = self.image_projection(context_embedding)

        # PART 3: Combine text and context tokens along the token axis
        # NOTE(review): combined_embeddings is not passed to the pipeline call
        # below, so generation is currently conditioned on the text prompt only.
        combined_embeddings = torch.cat([text_embeddings, context_embedding], dim=1)

        # The pipeline is called with a prompt, so substitute "" when none is given.
        generated_image = self.diffusion_model(
            prompt or "", num_inference_steps=num_inference_steps
        ).images[0]

        return generated_image

In [None]:
# Utility function to load and normalize images dynamically
def load_image_as_tensor(image_path):
    """Load an image file and return it as a per-image standardized tensor.

    The image is converted to RGB, resized to 224x224, turned into a tensor,
    and each channel is normalized with that channel's own mean and standard
    deviation computed from this image.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        A tensor of shape ``(1, 3, 224, 224)``.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to 224x224 for CLIP
        transforms.ToTensor()
    ])

    # The context manager closes the underlying file; convert() yields a
    # loaded image that remains usable afterwards.
    with Image.open(image_path) as raw_image:
        image = transform(raw_image.convert("RGB"))

    # Per-channel statistics over the spatial dimensions
    mean = torch.mean(image, dim=[1, 2])
    # A constant-valued channel has std == 0, which transforms.Normalize rejects
    # (division by zero); clamp to a tiny floor so such images still load.
    std = torch.std(image, dim=[1, 2]).clamp_min(1e-6)

    normalize = transforms.Normalize(mean.tolist(), std.tolist())
    normalized_image = normalize(image)

    return normalized_image.unsqueeze(0)  # Add batch dimension

In [None]:
# Checkpoint identifiers and target device shared by the components below
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
DIFFUSION_CHECKPOINT = "CompVis/stable-diffusion-v1-4"
DEVICE = "cpu"

# A single CLIP checkpoint provides both encoder towers and the matching tokenizer
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
text_encoder, image_encoder = clip_model.text_model, clip_model.vision_model
tokenizer = CLIPTokenizer.from_pretrained(CLIP_CHECKPOINT)

# Stable Diffusion pipeline that performs the image generation
diffusion_model = StableDiffusionPipeline.from_pretrained(DIFFUSION_CHECKPOINT).to(DEVICE)

# Wire encoders, pipeline and tokenizer into the Context Diffusion wrapper
model = ContextDiffusion(
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    diffusion_model=diffusion_model,
    tokenizer=tokenizer,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

safety_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)kpoints/scheduler_config-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Example inputs: one query image and a single context image (as per the paper)
query_image_path = "/content/img1.jpg"
context_image_path = "/content/img5.jpg"

query_image = load_image_as_tensor(query_image_path).to("cpu")
context_image = load_image_as_tensor(context_image_path).to("cpu")

# Empty text prompt: generation runs without a textual description
prompt = ""

# Forward pass through the Context Diffusion wrapper
generated_image = model(query_image, context_image, prompt)

# Show the result on its own axes with the frame and ticks hidden
fig, ax = plt.subplots()
ax.imshow(generated_image)
ax.axis('off')
plt.show()

RuntimeError: Tensors must have same number of dimensions: got 2 and 3