In [1]:
import torch
# Check if GPU is available
print("Is CUDA available:", torch.cuda.is_available())

# If GPU is available, print the name of the GPU
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Print the number of GPUs available
print("Number of GPUs available:", torch.cuda.device_count())

Is CUDA available: True
GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU
Number of GPUs available: 1


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import ViTModel, BertTokenizer, AutoTokenizer
from torchvision import transforms
from datasets import load_dataset
import os
from PIL import Image
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configuration
BATCH_SIZE = 16
EMBED_DIM = 768  # Dimension of embeddings (consistent with ViT output)
NUM_HEADS = 8
NUM_LAYERS = 6
VOCAB_SIZE = 30522  # Vocabulary size of the tokenizer (e.g., BERT tokenizer)
MAX_LEN = 128  # Max caption length
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Pretrained Vision Transformer (ViT)
class ViTImageEncoder(nn.Module):
    def __init__(self, embed_dim):
        super(ViTImageEncoder, self).__init__()
        # Load pre-trained ViT
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        # Linear layer to match embed_dim
        self.fc = nn.Linear(self.vit.config.hidden_size, embed_dim)

    def forward(self, images):
        # Pass the images through the ViT model
        outputs = self.vit(images)
        img_features = self.fc(outputs.last_hidden_state)
        return img_features

In [5]:
# Transformer Decoder for caption generation
class CaptionDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, max_len):
        super(CaptionDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.randn(max_len, embed_dim))
        decoder_layer = nn.TransformerDecoderLayer(embed_dim, num_heads)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, tgt, memory):
        # Embed the captions and add positional encoding
        tgt_embed = self.embedding(tgt) + self.positional_encoding[:tgt.size(1), :]
        # Pass the embedded tokens through the transformer decoder
        output = self.transformer_decoder(tgt_embed, memory)
        # Predict the next word in the sequence
        return self.fc_out(output)


In [6]:
# Complete Image Captioning Model (ViT Encoder + Transformer Decoder)
class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, max_len):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = ViTImageEncoder(embed_dim)
        self.decoder = CaptionDecoder(vocab_size, embed_dim, num_heads, num_layers, max_len)

    def forward(self, images, captions):
        img_features = self.encoder(images)
        output = self.decoder(captions, img_features)
        return output


In [7]:
# Prepare MS COCO Dataset
def preprocess_coco(coco_dir):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Load the COCO dataset
    dataset = load_dataset('coco', split='train')  # You can change split to 'validation' for validation

    # Convert the dataset into DataLoader format
    class COCODataset(torch.utils.data.Dataset):
        def __init__(self, dataset):
            self.dataset = dataset
            self.transform = transform

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            item = self.dataset[idx]
            image = Image.open(item['file_name']).convert('RGB')
            image = self.transform(image)
            caption = tokenizer(item['caption'], return_tensors="pt", padding='max_length', max_length=MAX_LEN, truncation=True).input_ids.squeeze(0)
            return image, caption

    return DataLoader(COCODataset(dataset), batch_size=BATCH_SIZE, shuffle=True)


In [8]:
# Tokenizer for captions
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Training Loop
def train(model, dataloader, optimizer, criterion, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, captions in tqdm(dataloader):
            images, captions = images.to(DEVICE), captions.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images, captions)
            loss = criterion(outputs.view(-1, VOCAB_SIZE), captions.view(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader)}")

In [11]:
# Main function to run the training
if __name__ == '__main__':
    # Initialize model
    model = ImageCaptioningModel(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, max_len=MAX_LEN).to(DEVICE)

    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    # Load the dataset
    coco_dir = './cocoapi/'  # Path to the MS COCO 2014 dataset directory
    train_loader = preprocess_coco(coco_dir)

    # Start training
    train(model, train_loader, optimizer, criterion, num_epochs=1)

DatasetNotFoundError: Dataset 'coco' doesn't exist on the Hub or cannot be accessed.

In [4]:
from transformers import pipeline

captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
captioner()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


ConnectionError: HTTPConnectionPool(host='c', port=80): Max retries exceeded with url: /%5CAbhi-MTech%5CSem-1%5CAI-Lab%5CAI_Project_final%5Cimage_captioning%5Ccocoapi%5Cimages%09est2014%5CCOCO_test2014_000000000182.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x0000022659613CD0>: Failed to resolve 'c' ([Errno 11001] getaddrinfo failed)"))

## ********************************************************************************

In [5]:
import torch

# Check if a GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


Using device: cuda


In [6]:
from transformers import ViTModel, ViTFeatureExtractor

# Load Vision Transformer and move it to GPU
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224').to(device)

def extract_image_features(image):
    # Move image to GPU
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    outputs = vit_model(**inputs)
    return outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import torch.nn as nn

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, ff_dim, max_seq_length):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model).to(device)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_length, d_model)).to(device)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers).to(device)
        self.fc = nn.Linear(d_model, vocab_size).to(device)

    def forward(self, image_features, captions):
        captions_emb = self.embedding(captions.to(device)) + self.positional_encoding[:, :captions.size(1), :].to(device)
        output = self.transformer_decoder(captions_emb.permute(1, 0, 2), image_features.permute(1, 0, 2))
        return self.fc(output.permute(1, 0, 2)).to(device)


In [20]:
import torch.optim as optim

def train_model(decoder_model, vit_model, dataloader, criterion, optimizer, num_epochs):
    decoder_model.train()
    vit_model.eval()  # Set the Vision Transformer model to evaluation mode (no gradients required)
    
    for epoch in range(num_epochs):
        running_loss = 0.0  # Track the loss for the epoch
        for images, captions in dataloader:
            optimizer.zero_grad()
            
            # Move images and captions to the GPU
            images = images.to(device)
            captions = captions.to(device)
            
            # Step 1: Extract image features using ViT
            with torch.no_grad():  # No gradients needed for feature extraction
                image_features = vit_model(images).last_hidden_state  # Extract features from ViT

            # Step 2: Pass image features and captions to the Transformer Decoder
            outputs = decoder_model(image_features, captions[:, :-1])  # Exclude the last token from the captions

            # Step 3: Compute loss
            loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))  # Shift captions for training
            loss.backward()  # Backpropagation
            
            optimizer.step()  # Optimization step

            running_loss += loss.item()
        
        avg_loss = running_loss / len(dataloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')



In [9]:
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

def train_model_mixed_precision(model, dataloader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for images, captions in dataloader:
            optimizer.zero_grad()
            
            # Move images and captions to the GPU
            images = images.to(device)
            captions = captions.to(device)

            # Automatic mixed precision
            with autocast():
                image_features = extract_image_features(images)
                outputs = model(image_features, captions[:, :-1])
                loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].reshape(-1))
            
            # Scaled backpropagation
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


  scaler = GradScaler()


In [10]:
from PIL import Image
from torchvision import transforms

# Define image preprocessing (resize and normalize)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Rescale image to 224x224 for ViT
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def load_image(image_path):
    image = Image.open(image_path).convert('RGB')
    image = image_transform(image)
    return image


In [11]:
from transformers import BertTokenizer

# Load a pre-trained tokenizer (can use BERT or any other transformer tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_caption(caption):
    return tokenizer.encode(caption, max_length=20, padding='max_length', return_tensors="pt")


In [13]:
import os
from pycocotools.coco import COCO

# Define the path to the MS COCO 2014 dataset
coco_images_dir = './cocoapi/images/train2014/'  # Update this to the correct path where your images are stored
coco_annotations_path = './cocoapi/annotations/captions_train2014.json'  # Path to the COCO annotations

# Load COCO annotations
coco = COCO(coco_annotations_path)

# Get image IDs and their corresponding captions
image_ids = list(coco.imgs.keys())
captions = []
image_paths = []

for img_id in image_ids:
    img_info = coco.loadImgs(img_id)[0]
    img_file_path = os.path.join(coco_images_dir, img_info['file_name'])
    
    # Get all captions for this image
    ann_ids = coco.getAnnIds(imgIds=img_id)
    anns = coco.loadAnns(ann_ids)
    for ann in anns:
        captions.append(ann['caption'])
        image_paths.append(img_file_path)

print(f"Loaded {len(image_paths)} images with captions.")


loading annotations into memory...
Done (t=0.37s)
creating index...
index created!
Loaded 414113 images with captions.


In [15]:
from transformers import ViTModel

# Initialize Vision Transformer model (pre-trained)
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224').to(device)


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import torch.nn as nn

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, ff_dim, max_seq_length):
        super(TransformerDecoder, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))
        
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        
        self.fc_out = nn.Linear(d_model, vocab_size)
    
    def forward(self, image_features, captions):
        # Embed the captions
        embedded_captions = self.embedding(captions) + self.positional_encoding[:, :captions.size(1), :]
        
        # Pass through transformer decoder
        output = self.transformer_decoder(embedded_captions, image_features)
        
        # Output layer (predict next word)
        return self.fc_out(output)


In [17]:
# Define the vocabulary size, which should match the tokenizer's vocabulary
vocab_size = len(tokenizer)  # Assume tokenizer is already defined

# Initialize the Transformer Decoder model
decoder_model = TransformerDecoder(
    vocab_size=vocab_size,
    d_model=512,  # Embedding dimension
    num_layers=6,  # Number of transformer decoder layers
    num_heads=8,  # Number of attention heads
    ff_dim=2048,  # Feedforward network dimension
    max_seq_length=20  # Maximum sequence length for captions
).to(device)


In [18]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # Ignore padding tokens
optimizer = torch.optim.Adam(decoder_model.parameters(), lr=1e-4)


In [21]:
from torch.utils.data import Dataset, DataLoader

# Define a PyTorch Dataset class for COCO
class COCODataset(Dataset):
    def __init__(self, image_paths, captions, transform=None):
        self.image_paths = image_paths
        self.captions = captions
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = load_image(self.image_paths[idx])
        caption = tokenize_caption(self.captions[idx])
        return image, caption

# Now that image_paths and captions are loaded
dataset = COCODataset(image_paths, captions, transform=image_transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Start training
train_model(decoder_model, vit_model, dataloader, criterion, optimizer, num_epochs=10)


RuntimeError: stack expects each tensor to be equal size, but got [1, 20] at entry 0 and [1, 21] at entry 1