### Installing and Importing the Packages and Libraries

In [None]:
# Install necessary libraries
!pip install diffusers transformers accelerate datasets safetensors xformers
!pip install datasets
!pip install torch torchvision transformers diffusers datasets accelerate
!pip install invisible_watermark transformers accelerate safetensors

# Import required libraries
import os
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from diffusers import StableDiffusionPipeline, DDPMScheduler, UNet2DConditionModel
from transformers import AutoTokenizer
import torch
from torch.optim import AdamW
from tqdm import tqdm
from datasets import load_dataset
import requests
from io import BytesIO

# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "train" split of the RIW/small-coco dataset.
dataset = load_dataset("RIW/small-coco", split="train")



### Using the MS COCO Dataset

In [None]:
# Prepare directories for saving images and captions
os.makedirs("./coco_images", exist_ok=True)
os.makedirs("./coco_captions", exist_ok=True)

# Upper bound on how many dataset rows are visited (rows 0 .. max_images - 1).
max_images = 5000
# Seconds to wait for a server before giving up on one download; without a
# timeout a single stalled connection would hang the whole cell.
request_timeout = 30

# Save images and captions
skipped = 0
for i, data in enumerate(dataset):
    # Check the limit *before* doing any work so exactly `max_images` rows
    # are processed (checking afterwards would process one extra row).
    if i >= max_images:
        break

    # Extract URL and caption
    url = data["url"]
    caption = data["caption"]

    # Download and decode the image. A failed request or undecodable payload
    # skips this row instead of aborting the whole export; image and caption
    # are only written together, so the two folders stay in step.
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()  # do not hand an HTTP error page to PIL
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except (requests.RequestException, OSError) as err:
        skipped += 1
        print(f"Skipping item {i} ({url}): {err}")
        continue

    image.save(f"./coco_images/image_{i}.jpg")

    # Save the caption under the same index as the image
    with open(f"./coco_captions/caption_{i}.txt", "w") as f:
        f.write(caption)

if skipped:
    print(f"Skipped {skipped} item(s) that could not be downloaded or decoded.")
print("Images and captions saved!")

In [None]:
from diffusers import DiffusionPipeline
import torch

In [None]:
from diffusers import DiffusionPipeline
import torch

# Load the SDXL base checkpoint with half-precision ("fp16") safetensors weights.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)

# Move the whole pipeline to the GPU.
device = "cuda"
pipe.to(device)

# if using torch < 2.0
# pipe.enable_xformers_memory_efficient_attention()

# Run the pipeline once on a sample prompt and keep only the first image.
prompt = "A young man with an umbrella standing with cattle"
generation = pipe(prompt=prompt)
images = generation.images[0]


In [None]:
from IPython.display import display
# Render the generated image inline; despite its plural name, `images` holds
# the single element taken from the pipeline output (`.images[0]`).
display(images)


In [None]:
# Print the pipeline object for inspection (presumably lists its components
# and configuration -- the exact output depends on the diffusers version).
print(pipe)

In [None]:
import os
import json

# Paths to your folders
images_folder = 'coco_images/'
captions_folder = 'coco_captions/'

# List image files (.jpg / .png) and caption files (caption_*.txt), sorted by
# name. Sorting is lexicographic, so "image_10.jpg" precedes "image_2.jpg";
# that is harmless because pairing below is done by the number in the name,
# not by position in these lists.
image_files = sorted(
    f for f in os.listdir(images_folder) if f.endswith(('.jpg', '.png'))
)
caption_files = sorted(
    f for f in os.listdir(captions_folder)
    if f.startswith('caption_') and f.endswith('.txt')
)

# Ensure that the number of images and captions are the same
if len(image_files) != len(caption_files):
    raise ValueError(f"The number of images ({len(image_files)}) does not match the number of captions ({len(caption_files)}). Please check your dataset.")

# Set for O(1) membership tests while pairing (a list scan per image would
# make the loop quadratic in the number of files).
caption_names = set(caption_files)

# Prepare the dataset list: one {"image": path, "caption": text} dict per pair
dataset = []

# Pair images with captions based on the numbers in the filenames
for image_file in image_files:
    # Extract the number from the filename (e.g., image_1.jpg -> 1)
    image_number = image_file.split('_')[1].split('.')[0]

    # Name of the caption file that belongs to this image
    caption_filename = f"caption_{image_number}.txt"

    if caption_filename not in caption_names:
        print(f"Warning: No matching caption for image {image_number}")
        continue

    # Construct the paths
    image_path = os.path.join(images_folder, image_file)
    caption_path = os.path.join(captions_folder, caption_filename)

    # Read the caption, dropping surrounding whitespace
    with open(caption_path, 'r') as file:
        caption = file.read().strip()

    # Create an entry for the dataset
    dataset.append({
        "image": image_path,
        "caption": caption
    })

# Save the dataset to a JSON file
dataset_path = 'coco_dataset.json'
with open(dataset_path, 'w') as f:
    json.dump(dataset, f, indent=4)

print(f"Dataset saved to {dataset_path}")

Dataset saved to coco_dataset.json


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import StableDiffusionXLPipeline
from PIL import Image
import os
from torchvision import transforms
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, so the
# cells below do not fail with a CUDA error on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
class CocoDataset(Dataset):
    """Map-style dataset over image/caption records.

    Each record is a mapping with an ``'image'`` key (path to an image file)
    and a ``'caption'`` key (caption text). Items are returned as a dict with
    the transformed image and the tokenized caption.

    Args:
        coco_data: Sequence of ``{'image': path, 'caption': text}`` records.
        tokenizer: Callable used to tokenize a caption. It is called with
            ``padding="max_length"``, ``truncation=True``, ``max_length`` and
            ``return_tensors="pt"``, and its result must provide
            ``'input_ids'`` and ``'attention_mask'`` entries.
        transform: Optional callable applied to the loaded RGB ``PIL`` image.
        max_length: Length every caption is padded/truncated to. Defaults to
            77, the value that was previously hard-coded.
    """

    def __init__(self, coco_data, tokenizer, transform=None, max_length=77):
        self.data = coco_data  # List of image-caption pairs
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of image/caption records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, transform and tokenize the record at position ``idx``.

        Returns:
            dict with ``'pixel_values'`` (the image after ``transform``, or
            the RGB ``PIL`` image when no transform is set), ``'input_ids'``
            and ``'attention_mask'`` (tokenizer outputs with their leading
            dimension squeezed away).
        """
        item = self.data[idx]
        image_path = item['image']
        caption = item['caption']

        # Open inside a context manager so the file handle is released
        # deterministically; convert() returns an independent in-memory copy.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # Preprocess the image. Compare against None explicitly: a valid
        # transform object may define __len__/__bool__ and evaluate as falsy.
        if self.transform is not None:
            image = self.transform(image)

        # Tokenize the caption to a fixed length
        inputs = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            'pixel_values': image,
            # squeeze(0) drops the size-1 leading dimension of the
            # single-caption tokenizer output
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }


In [None]:
# Image preprocessing, applied in order:
#   1. resize to 512x512,
#   2. convert the PIL image to a tensor,
#   3. normalize each of the three channels with mean 0.5 and std 0.5.
preprocessing_steps = [
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
]
transform = transforms.Compose(preprocessing_steps)


In [None]:
# Both the tokenizer and the text encoder are loaded from the same CLIP checkpoint.
clip_checkpoint = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)
text_encoder = CLIPTextModel.from_pretrained(clip_checkpoint)

# Move the encoder's weights to the selected device.
text_encoder.to(device)


In [None]:
# Read the image/caption records back from the JSON file
import json
with open('coco_dataset.json', 'r') as records_file:
    coco_data = json.load(records_file)

# Wrap the records in the dataset class, then serve them one sample per batch
# in shuffled order.
dataset = CocoDataset(coco_data=coco_data, tokenizer=tokenizer, transform=transform)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)


In [None]:
import torch

# Ask PyTorch's CUDA caching allocator to release cached blocks it is not
# currently using, before the memory-heavy fine-tuning cell below.
torch.cuda.empty_cache()  # Clear unused memory

### Start Finetuning

In [None]:
import torch
from torch.optim import AdamW
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

# NOTE: `pipe`, `dataloader` and `DDPMScheduler` come from earlier cells
# (pipeline loading, dataset creation and the first import cell respectively).

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision (autocast + gradient scaling) is only used on CUDA.
use_amp = device.type == "cuda"

# Move the pipeline to the device
pipe.to(device)

# Only the UNet is fine-tuned. The VAE and both SDXL text encoders are frozen:
# they are used purely to produce latents / conditioning for the UNet.
for frozen_module in (pipe.vae, pipe.text_encoder, pipe.text_encoder_2):
    frozen_module.requires_grad_(False)
    frozen_module.eval()

# GradScaler refuses to unscale fp16 gradients, so the trainable weights must
# be fp32 (autocast still runs the heavy ops in half precision on CUDA).
# NOTE(review): an fp32 SDXL UNet plus AdamW state needs a lot of GPU memory.
pipe.unet.to(dtype=torch.float32)
pipe.unet.requires_grad_(True)
# NOTE(review): the stock SDXL VAE is reported to overflow in fp16, which is
# why the reference training script encodes in fp32 -- confirm for this VAE.
pipe.vae.to(dtype=torch.float32)

# Training noise schedule built from the pipeline's scheduler configuration.
noise_scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
num_train_timesteps = noise_scheduler.config.num_train_timesteps
prediction_type = noise_scheduler.config.prediction_type
if prediction_type not in ("epsilon", "v_prediction"):
    raise ValueError(f"Unsupported prediction type: {prediction_type}")

# Define the optimizer over the trainable (UNet) parameters only
optimizer = AdamW(pipe.unet.parameters(), lr=5e-6)

# Training setup
epochs = 5  # Number of epochs
gradient_accumulation_steps = 4  # Micro-batches per optimizer step
scaler = GradScaler(enabled=use_amp)

# Training loop
for epoch in range(epochs):
    pipe.unet.train()  # Set UNet to training mode
    optimizer.zero_grad(set_to_none=True)
    loss = None
    micro_batches = 0

    for batch in tqdm(dataloader):
        # Validate batch inputs
        missing_keys = [key for key in ("pixel_values", "input_ids") if key not in batch]
        if missing_keys:
            raise ValueError(f"Input batch is missing required keys: {missing_keys}")

        # Move inputs to the device
        pixel_values = batch["pixel_values"].to(device=device, dtype=pipe.vae.dtype)
        input_ids = batch["input_ids"]

        with torch.no_grad():
            # Encode images to latents and apply the VAE's scaling factor so
            # the latents match the scale the UNet was trained on.
            latents = pipe.vae.encode(pixel_values).latent_dist.sample()
            latents = latents * pipe.vae.config.scaling_factor

            # SDXL conditions on two text encoders, but the batch only carries
            # token ids from a single CLIP tokenizer. Recover the caption text
            # and let the pipeline build both embeddings.
            # NOTE(review): assumes the dataset tokenizer shares its
            # vocabulary with pipe.tokenizer -- confirm.
            captions = pipe.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
            prompt_embeds, _, pooled_prompt_embeds, _ = pipe.encode_prompt(
                prompt=captions,
                device=device,
                num_images_per_prompt=1,
                do_classifier_free_guidance=False,
            )

        prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype)
        pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=pipe.unet.dtype)

        # SDXL micro-conditioning: (original size, crop top-left, target size).
        # The dataset resizes without cropping, so the crop offset is (0, 0)
        # and both sizes are the tensor's own height/width.
        batch_size = latents.shape[0]
        height, width = pixel_values.shape[-2:]
        add_time_ids = torch.tensor(
            [[height, width, 0, 0, height, width]],
            device=device,
            dtype=prompt_embeds.dtype,
        ).repeat(batch_size, 1)

        # Diffusion objective: noise the latents at a random timestep and
        # train the UNet to predict the scheduler's target.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, num_train_timesteps, (batch_size,), device=device)
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
        if prediction_type == "epsilon":
            target = noise
        else:
            target = noise_scheduler.get_velocity(latents, noise, timesteps)

        # Forward pass through the UNet. The text padding mask is deliberately
        # not passed: the UNet's `attention_mask` argument is not a text mask.
        with autocast(enabled=use_amp):
            model_pred = pipe.unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states=prompt_embeds,
                added_cond_kwargs={
                    "text_embeds": pooled_prompt_embeds,
                    "time_ids": add_time_ids,
                },
            ).sample

        # The UNet returns a prediction, not a loss; compute MSE in fp32.
        loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")

        # Accumulate scaled gradients; divide so the accumulated gradient is
        # the mean over the micro-batches of one optimizer step.
        scaler.scale(loss / gradient_accumulation_steps).backward()
        micro_batches += 1

        if micro_batches % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        print(f"Batch Loss: {loss.item()}")

    # Apply gradients left over from a final, incomplete accumulation window.
    if micro_batches % gradient_accumulation_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    torch.cuda.empty_cache()
    # Print epoch loss summary (if at least one batch was processed)
    if loss is not None:
        print(f"Epoch [{epoch + 1}/{epochs}], Last Batch Loss: {loss.item()}")

pipe.unet.eval()

# Save the fine-tuned model
pipe.save_pretrained("fine_tuned_stable_diffusion")


In [None]:
# Free memory after training: release cached CUDA blocks, run Python's garbage
# collector, then release the cache again (presumably so that memory freed by
# the collector is also handed back).
torch.cuda.empty_cache()
import gc
gc.collect()
torch.cuda.empty_cache()
