In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [None]:
!pip install pycocotools
!pip install requests

# Download the MSCOCO dataset
import requests
import zipfile
import os

# COCO 2017 archives. The caption annotations ship in a separate archive from
# the images, and the COCO training cell reads
# coco/annotations/captions_train2017.json, so both archives are required.
coco_url = "http://images.cocodataset.org/zips/train2017.zip"
coco_zip = "train2017.zip"
coco_ann_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
coco_ann_zip = "annotations_trainval2017.zip"
coco_dir = "coco"


def download_and_extract(url, zip_path, dest_dir, chunk_size=1 << 20):
    """Stream ``url`` to ``zip_path``, unpack it into ``dest_dir``, then delete the archive.

    Args:
        url: HTTP(S) location of a zip archive.
        zip_path: Temporary local path the archive is written to.
        dest_dir: Directory the archive is extracted into.
        chunk_size: Number of bytes fetched per write (default 1 MiB).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    try:
        # stream=True keeps memory usage flat; the image archive is far too
        # large to be held in memory as a single `response.content` blob.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(dest_dir)
    finally:
        # Cleanup, also when the download is interrupted or fails part-way.
        if os.path.exists(zip_path):
            os.remove(zip_path)


# Skip work that has already been done so the cell can be re-run cheaply.
# NOTE: an interrupted extraction leaves a partial directory behind; delete
# `coco/` to force a fresh download in that case.
if not os.path.isdir(os.path.join(coco_dir, "train2017")):
    download_and_extract(coco_url, coco_zip, coco_dir)

if not os.path.isfile(os.path.join(coco_dir, "annotations", "captions_train2017.json")):
    download_and_extract(coco_ann_url, coco_ann_zip, coco_dir)

print("MSCOCO dataset downloaded and extracted.")




KeyboardInterrupt: 

In [None]:
import torch
from torch.utils.data import DataLoader, Subset
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load a small subset (100 images) of Flickr30k.
# NOTE(review): the bare id "flickr30k" cannot be resolved on the Hub (this
# cell's recorded output is a DatasetNotFoundError). "nlphuji/flickr30k" is
# believed to be a community mirror exposing a single "test" split with a list
# of captions per image -- confirm the id, split and column names (and whether
# your `datasets` version needs `trust_remote_code=True`) before relying on it.
dataset = load_dataset("nlphuji/flickr30k", split="test[:100]")

# The caption column may be spelled either way depending on the mirror.
caption_column = "captions" if "captions" in dataset.column_names else "caption"


def preprocess(example):
    """Convert a batched example dict into CLIP model inputs.

    Args:
        example: Dict of lists holding PIL images under ``"image"`` and, under
            ``caption_column``, either one caption string or a list of caption
            strings per image.

    Returns:
        The processor output with ``input_ids``, ``attention_mask`` and
        ``pixel_values`` as PyTorch tensors, padded to the longest caption.
    """
    # CLIP pairs exactly one text with one image, so keep the first caption
    # whenever an image comes with several.
    captions = [c[0] if isinstance(c, (list, tuple)) else c for c in example[caption_column]]
    images = [img.convert("RGB") for img in example["image"]]
    return processor(text=captions, images=images, return_tensors="pt", padding=True, truncation=True)


def collate_examples(examples):
    """Collate a list of dataset rows into one processed CLIP batch."""
    batch = {
        "image": [ex["image"] for ex in examples],
        caption_column: [ex[caption_column] for ex in examples],
    }
    return preprocess(batch)


# Preprocess lazily, per batch, inside the DataLoader. Running the processor
# through `dataset.map` would store the tensors as plain Python lists, which the
# default collate function cannot turn back into batched tensors.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_examples)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):  # Train for 3 epochs
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        # Move data to the device
        inputs = {k: v.to(device) for k, v in batch.items()}

        # return_loss=True is required: without it `outputs.loss` is None and
        # `loss.backward()` fails.
        outputs = model(**inputs, return_loss=True)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

# Evaluation
model.eval()
with torch.no_grad():
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity score
        logits_per_text = outputs.logits_per_text    # Text-to-image similarity score

        # You can print or save these logits for further analysis
        print(logits_per_image, logits_per_text)


DatasetNotFoundError: Dataset 'flickr30k' doesn't exist on the Hub or cannot be accessed.

In [None]:
import os
import torch
from pycocotools.coco import COCO
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# Locations of the caption annotation file and of the extracted images
ann_file = "coco/annotations/captions_train2017.json"
image_dir = "coco/train2017/"

# Build the COCO index from the caption annotations
coco = COCO(ann_file)

# Keep only the first 100 image ids, in the order the index stores them
img_ids = list(coco.imgs)[:100]

# Define a custom dataset class
class CocoDataset(Dataset):
    """Map-style dataset yielding ``(image, caption)`` pairs from COCO.

    Args:
        img_ids: Sequence of COCO image ids to expose.
        image_dir: Directory containing the image files.
        coco: COCO API object used to look up annotations and image metadata.
        transform: Optional callable applied to the decoded RGB image.
    """

    def __init__(self, img_ids, image_dir, coco, transform=None):
        self.img_ids, self.image_dir = img_ids, image_dir
        self.coco, self.transform = coco, transform

    def __len__(self):
        """Return the number of image ids exposed by the dataset."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return the image at position ``idx`` together with its first caption."""
        image_id = self.img_ids[idx]

        # The first annotation returned for the image supplies the caption
        annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=image_id))
        caption = annotations[0]['caption']

        # Resolve the file on disk and decode it as RGB
        file_name = self.coco.loadImgs(image_id)[0]['file_name']
        image = Image.open(os.path.join(self.image_dir, file_name)).convert("RGB")

        if self.transform:
            image = self.transform(image)
        return image, caption

# No torchvision preprocessing: the CLIP processor does its own resizing,
# cropping, rescaling and normalization. Feeding it `ToTensor()` output (already
# scaled to [0, 1]) would rescale the pixels a second time, and a fixed
# `Resize((224, 224))` would distort the aspect ratio first.
transform = None

# Create the dataset; it yields raw PIL images paired with caption strings
dataset = CocoDataset(img_ids=img_ids, image_dir=image_dir, coco=coco, transform=transform)

# Load the CLIP model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def collate_batch(samples):
    """Turn a list of ``(image, caption)`` pairs into one processed CLIP batch.

    Args:
        samples: List of ``(PIL image, caption string)`` tuples from the dataset.

    Returns:
        The processor output with ``input_ids``, ``attention_mask`` and
        ``pixel_values`` as PyTorch tensors, padded to the longest caption.
    """
    images, captions = zip(*samples)
    return processor(
        text=list(captions),
        images=list(images),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )


# A custom collate function is needed because the default one cannot batch PIL images
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)

# Training loop
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Train for 3 epochs
    total_loss = 0
    for inputs in tqdm(dataloader):
        inputs = {k: v.to(device) for k, v in inputs.items()}

        optimizer.zero_grad()
        # return_loss=True is required: without it `outputs.loss` is None and
        # `loss.backward()` fails.
        outputs = model(**inputs, return_loss=True)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

# Evaluation
model.eval()
with torch.no_grad():
    for inputs in dataloader:
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity score
        logits_per_text = outputs.logits_per_text    # Text-to-image similarity score

        # Print the logits for the first example in the batch
        print(logits_per_image[0], logits_per_text[0])



loading annotations into memory...


FileNotFoundError: [Errno 2] No such file or directory: 'coco/annotations/captions_train2017.json'