# Image Captioning

# Show and Tell
"Show and Tell" is basic model architecture where a Convolutional Neural Network (CNN) is used to extract image features, which are then fed into a Long Short-Term Memory (LSTM) network to generate captions.

**Model Architecture**

```Python
import torch 
from torch import nn
from torchvision import models, transforms

class ShowAndTellModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ShowAndTellModel, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers

        # Encoder (CNN)
        self.encoder = models.resnet50()
        self.encoder.fc = nn.Linear(self.encoder.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

        # Decoder (RNN)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, images, captions, lengths):
        # Encode images
        features = self.encoder(images)
        features = self.dropout(self.relu(features))

        # Decode captions
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])

        return outputs
```

**Architecture**

In [23]:
import torch
from transformers import GPT2Model, GPT2Tokenizer
from torchvision import models, transforms
from torch import nn
from PIL import Image

In [19]:
# Load the pre-trained models
cnn = models.resnet50()
lstm = nn.LSTM(input_size=2048, hidden_size=512, num_layers=1)  # Example LSTM

# Remove the last layer of the CNN
modules = list(cnn.children())[:-1]
cnn = nn.Sequential(*modules)

# Define the image transformation
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [30]:
class ShowAndTellModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ShowAndTellModel, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers

        # Encoder (CNN)
        self.encoder = models.resnet50()
        self.encoder.fc = nn.Linear(self.encoder.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

        # Decoder (RNN)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.decoder = GPT2Model.from_pretrained("gpt2")
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.linear = nn.Linear(self.decoder.config.n_embd, vocab_size)

    def forward(self, images):
        # Encode images
        features = self.encoder(images)
        features = self.dropout(self.relu(features))

        # Decode captions
        embeddings = self.decoder(features)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)

        # Convert embeddings to tokens
        tokens = [self.tokenizer.encode(sentence, add_special_tokens=True) for sentence in embeddings]
        tokens_tensor = torch.tensor(tokens)

        outputs = self.decoder(input_ids=tokens_tensor)
        outputs = self.linear(outputs.last_hidden_state)

        return outputs

In [31]:
# Load and preprocess the image
image = Image.open("images/heinz.jpg")
input_tensor = preprocess(image)
input_batch = input_tensor.unsqueeze(0)

In [32]:
# Define the parameters
embed_size = 512  # Example size, you can adjust based on your requirements
hidden_size = 512  # Example size, you can adjust based on your requirements
vocab_size = 10000  # Example size, you should set it based on your vocabulary size
num_layers = 1  # Example number of layers, you can adjust based on your requirements

# Initialize the model instance
model = ShowAndTellModel(embed_size, hidden_size, vocab_size, num_layers)


In [33]:
# Preprocess the image
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the image to match the input size of ResNet-50
    transforms.ToTensor(),           # Convert the image to a PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the image
])

# Apply the preprocessing steps to the image
input_image = preprocess(image).unsqueeze(0)  # Add an extra dimension at the beginning for batch size

# Pass the image through the ShowAndTellModel
# Assuming your model instance is named model
output_captions = model(input_image)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [21]:
# Extract features from the image using the CNN
with torch.no_grad():
    features = cnn(input_batch)

# Prepare the features for the LSTM
features = features.view(1, -1)

# Generate the caption using the LSTM
hidden = (torch.randn(1, 1, 512), torch.randn(1, 1, 512))  # Example hidden state
output, _ = lstm(features.unsqueeze(0), hidden)


In [22]:
output

tensor([[[ 1.3498e-01, -1.3846e-01, -3.5764e-02,  2.0683e-01,  3.8889e-01,
           2.8958e-01, -6.5076e-01,  1.2353e-02,  4.4398e-01, -1.9044e-01,
          -1.8730e-02, -1.1456e-01, -7.6114e-02, -2.1864e-02,  1.7474e-01,
           6.1594e-01, -5.7552e-03,  3.4448e-01,  2.8342e-01, -2.2675e-01,
          -6.4897e-02, -1.8656e-01, -1.3069e-01,  6.4673e-01,  1.8054e-01,
          -8.3798e-02, -1.7044e-01,  1.2797e-01,  3.7055e-01,  1.1754e-02,
          -4.2841e-01,  1.8044e-01,  1.7171e-01, -4.6599e-02,  5.5438e-01,
          -3.4377e-02, -1.3975e-01, -3.2244e-01, -1.7203e-01, -7.3425e-03,
           1.9933e-01, -3.7366e-01,  1.6867e-01,  1.7807e-01,  2.1114e-01,
          -5.4326e-01,  5.0138e-02, -2.0958e-02, -1.8931e-01,  6.5047e-02,
           6.4941e-02, -1.0451e-01,  3.1256e-01,  2.3992e-01, -4.2102e-02,
          -9.0220e-02,  1.4510e-01,  7.8737e-02,  3.2548e-01, -3.0161e-02,
          -4.6526e-01,  1.1463e-01, -6.8010e-02, -3.1349e-02,  4.6978e-02,
           5.2373e-03, -4

# VisualBERT

Transformer-based model for image captioning tasks. It utilizes the attention mechanism and self-attention layers to capture long-range dependencies in both images and text.