<a href="https://colab.research.google.com/github/mkwatra08/TextIt/blob/main/Text_It.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face libraries used below and upgrade PyTorch.
!pip install diffusers transformers accelerate
!pip install torch --upgrade
# Download the COCO 2014 training images and the train/val 2014 annotation archives.
!wget http://images.cocodataset.org/zips/train2014.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
# Unpack both archives; the code below reads from 'train2014' and 'annotations/'.
!unzip train2014.zip
!unzip annotations_trainval2014.zip

import json
import os

import torch
from diffusers import DDPMScheduler, DPMSolverMultistepScheduler, StableDiffusionPipeline
from PIL import Image
from torch import autocast
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import CLIPTextModel, CLIPTokenizer

# Read the COCO caption annotations from disk.
with open('annotations/captions_train2014.json', 'r') as f:
    captions = json.load(f)

# Build one (caption, image path) tuple per annotation entry; the image file
# name is derived from the annotation's zero-padded image id.
image_folder = 'train2014'
text_image_pairs = [
    (caption, os.path.join(image_folder, f'COCO_train2014_{image_id:012d}.jpg'))
    for image_id, caption in (
        (annot['image_id'], annot['caption']) for annot in captions['annotations']
    )
]

print(f"Total pairs: {len(text_image_pairs)}")
print(f"Example pair: {text_image_pairs[0]}")

class TextImageDataset(Dataset):
    """Dataset of (caption, image path) pairs yielding tokenized text and images.

    Args:
        text_image_pairs: Sequence of ``(caption, image_path)`` tuples.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            padding="max_length", truncation=True, max_length=...)``. Its
            result must expose ``input_ids`` and ``attention_mask``, each
            indexable with ``[0]`` to get the single encoded caption.
        transform: Optional callable applied to the loaded RGB image.
        max_length: Fixed length every caption is padded or truncated to.
    """

    def __init__(self, text_image_pairs, tokenizer, transform=None, max_length=77):
        self.text_image_pairs = text_image_pairs
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of (caption, image path) pairs."""
        return len(self.text_image_pairs)

    def _tokenize(self, text):
        """Encode one caption and return ``(input_ids, attention_mask)``."""
        # Pad to a fixed length: with padding=True a single caption is only
        # padded to its own length, so samples would differ in size and could
        # not be stacked into a batch by the default DataLoader collate.
        tokens = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        return tokens.input_ids[0], tokens.attention_mask[0]

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, image)`` for the pair at ``idx``."""
        text, image_path = self.text_image_pairs[idx]
        # Close the file handle as soon as the pixel data has been copied out.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        input_ids, attention_mask = self._tokenize(text)
        return input_ids, attention_mask, image

# Image preprocessing: resize to 512x512, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
    ]
)

# Tokenizer, dataset and loader: shuffled batches of 4 samples.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
dataset = TextImageDataset(
    text_image_pairs=text_image_pairs,
    tokenizer=tokenizer,
    transform=transform,
)
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Mixed precision is only enabled on GPU; on CPU everything runs in float32.
use_amp = device == "cuda"

# Load the weights in float32: the optimizer has to update full-precision
# parameters. Autocast (below) still gives float16 compute on GPU.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32)
pipe.to(device)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# The pipeline's __call__ only performs inference (it takes a prompt and
# returns images, not a loss), so the training step is written out explicitly.
# A DDPM scheduler is used to add noise to latents at random timesteps.
noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

# Only the UNet is fine-tuned; the VAE and text encoder stay frozen.
pipe.vae.requires_grad_(False)
pipe.text_encoder.requires_grad_(False)
pipe.unet.train()

num_epochs = 5
optimizer = optim.AdamW(pipe.unet.parameters(), lr=1e-5)
# Loss scaling avoids float16 gradient underflow; it is a no-op when disabled.
scaler = torch.amp.GradScaler(device, enabled=use_amp)

for epoch in range(num_epochs):
    # The attention mask is not passed to the text encoder, matching how the
    # pipeline encodes prompts at inference time for this model.
    for input_ids, _attention_mask, images in dataloader:
        optimizer.zero_grad()
        # The transform above ends with ToTensor(), i.e. pixels in [0, 1];
        # the VAE expects inputs scaled to [-1, 1].
        pixel_values = images.to(device) * 2.0 - 1.0
        input_ids = input_ids.to(device)

        with autocast(device, enabled=use_amp):
            with torch.no_grad():
                # Encode images into the latent space the UNet operates on.
                latents = pipe.vae.encode(pixel_values).latent_dist.sample()
                latents = latents * pipe.vae.config.scaling_factor
                # Text conditioning for the UNet cross-attention layers.
                encoder_hidden_states = pipe.text_encoder(input_ids)[0]

            # Forward diffusion: noise each latent at a random timestep.
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (latents.shape[0],),
                device=latents.device,
            ).long()
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # The UNet is trained to predict the noise that was added.
            noise_pred = pipe.unet(noisy_latents, timesteps, encoder_hidden_states).sample
            loss = torch.nn.functional.mse_loss(noise_pred.float(), noise.float())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Write the pipeline to disk.
pipe.save_pretrained("path_to_save_model")

# Load it back for inference: float16 on GPU, float32 otherwise.
pipe = StableDiffusionPipeline.from_pretrained(
    "path_to_save_model",
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
)
pipe.to(device)

# Generate one image from a text prompt and display it.
prompt = "A fantasy landscape with mountains and rivers at sunset"
with autocast(device):
    image = pipe(prompt).images[0]

image.show()


In [None]:
# Install the libraries used in this cell, including the Hugging Face Hub client.
!pip install diffusers transformers accelerate
!pip install torch --upgrade
!pip install huggingface_hub

from huggingface_hub import login
from huggingface_hub import HfApi, HfFolder, Repository

# Directory holding the saved pipeline (the path passed to
# pipe.save_pretrained() in the training cell); it must exist before this runs.
save_directory = "path_to_save_model"

repo_name = "pallavi13/Text-It"
api = HfApi()

# Authenticate only when no access token is stored yet; login() prompts for one.
# (An access token is an opaque string, so nothing can be parsed out of it.)
if HfFolder.get_token() is None:
    login()

# create_repo takes the full "<user>/<name>" identifier through repo_id;
# exist_ok=True makes re-running the cell safe.
repo_url = api.create_repo(repo_id=repo_name, exist_ok=True)

# Upload the saved files in one commit. This replaces the git-based Repository
# workflow, which is deprecated and cannot clone into a non-empty directory.
api.upload_folder(
    folder_path=save_directory,
    repo_id=repo_name,
    commit_message="Initial commit",
)
