## Dataset

*We need to convert text to numerical values*
* We need a vocabulary that maps each word (or character) to an int
* We need to set up a PyTorch dataset
* We need to make sure every sentence (input) in a batch has the same length (padding) and wrap the dataset in a DataLoader

In [1]:
import os
import pandas as pd
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from torchvision.transforms import transforms

In [None]:
# Download the small English spaCy pipeline ("en_core_web_sm") that Vocabulary loads below.
!python -m spacy download en_core_web_sm

In [2]:

class Vocabulary():
    """Word-level vocabulary mapping lowercase spaCy tokens to integer ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other token receives the next free id
    once it has been seen ``freq_threshold`` times by ``build_vocabulary``.
    """

    # Loaded once, when the class is defined, and shared by all instances.
    spacy_eng = spacy.load("en_core_web_sm")

    def __init__(self, freq_threshold):
        """Create a vocabulary that holds only the four special tokens.

        Args:
            freq_threshold: Number of occurrences a token needs within one
                ``build_vocabulary`` call before it is given an id. Counts
                start at 1, so a threshold below 1 never admits any token.
        """
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of ids in the vocabulary (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` with the spaCy tokenizer and lowercase every token."""
        return [tok.text.lower() for tok in Vocabulary.spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """Add every sufficiently frequent token of ``sentence_list``.

        Tokens get ids in the order in which their running count reaches
        ``freq_threshold``. Counts are local to one call.

        Args:
            sentence_list: Iterable of raw sentence strings.
        """
        frequency = {}
        # Continue after the ids already in use instead of restarting at 4,
        # so a repeated call cannot overwrite existing entries.
        idx = len(self.itos)

        for sentence in sentence_list:
            for token in self.tokenizer_eng(sentence):
                frequency[token] = frequency.get(token, 0) + 1

                # The count passes through the threshold exactly once per
                # call; the membership test keeps a token registered by an
                # earlier call from being assigned a second id.
                if frequency[token] == self.freq_threshold and token not in self.stoi:
                    self.stoi[token] = idx
                    self.itos[idx] = token
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of token ids.

        Tokens that are not in the vocabulary map to the ``<UNK>`` id.
        """
        unk_idx = self.stoi['<UNK>']
        return [self.stoi.get(token, unk_idx) for token in self.tokenizer_eng(text)]

In [3]:
# Demo: with a threshold of 1 every token enters the vocabulary on first sight.
v = Vocabulary(freq_threshold=1)

v.build_vocabulary(["This is a good place to find a city"])
print(v.stoi)
# Tokens that were not in the training sentence ("here", "!", "!") map to <UNK> (id 3).
print(v.numericalize("This is a good place to find a city here!!"))

{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3, 'this': 4, 'is': 5, 'a': 6, 'good': 7, 'place': 8, 'to': 9, 'find': 10, 'city': 11}
[4, 5, 6, 7, 8, 9, 10, 6, 11, 3, 3, 3]


In [4]:
class FlickrDataset(Dataset):
    """Image/caption dataset backed by a CSV file with ``image`` and ``caption`` columns.

    A ``Vocabulary`` is built from all captions when the dataset is created.
    Each item is the (optionally transformed) RGB image together with its
    caption encoded as ``<SOS>`` + token ids + ``<EOS>``.
    """

    def __init__(self, root_dir, caption_file, transform=None, freq_threshold=5):
        """Read the caption table and build the vocabulary.

        Args:
            root_dir: Directory that contains the image files.
            caption_file: Path passed to ``pd.read_csv``.
            transform: Optional callable applied to each loaded PIL image.
            freq_threshold: Forwarded to ``Vocabulary``.
        """
        self.root_dir = root_dir
        self.df = pd.read_csv(caption_file)
        self.transform = transform

        # Column views used by __getitem__.
        self.images = self.df['image']
        self.caption = self.df['caption']

        # Vocabulary derived from every caption in the file.
        self.vocabulary = Vocabulary(freq_threshold)
        self.vocabulary.build_vocabulary(self.caption.tolist())

    def __len__(self):
        """Return the number of rows in the caption table."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(image, caption_ids)`` for row ``index``."""
        # Image: load from disk, force three channels, then transform.
        img = Image.open(os.path.join(self.root_dir, self.images[index])).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)

        # Caption: wrap the numericalized text in <SOS> ... <EOS>.
        vocab = self.vocabulary
        token_ids = [
            vocab.stoi['<SOS>'],
            *vocab.numericalize(self.caption[index]),
            vocab.stoi['<EOS>'],
        ]

        return img, torch.tensor(token_ids)
    


In [5]:
class MyCollate:
    """Collate function that batches images and pads captions to equal length."""

    def __init__(self, pad_idx):
        """Remember the id used to fill caption positions past a caption's end."""
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Combine ``(image, caption)`` samples into two batch tensors.

        Returns:
            ``(images, captions)``: images stacked along a new leading batch
            dimension, and captions padded with ``pad_idx`` and laid out as
            (longest caption length, batch size) because ``batch_first=False``.
        """
        # Stacking along a new dim 0 equals unsqueeze(0) followed by cat.
        images = torch.stack([sample[0] for sample in batch], dim=0)
        captions = [sample[1] for sample in batch]
        padded = pad_sequence(captions, batch_first=False, padding_value=self.pad_idx)
        return images, padded

In [6]:
def get_loader(
        root_folder,
        annotation_file,
        transform,
        batch_size=32,
        num_worker=0,
        shuffle=True,
        pin_memory=False
):
    """Build a ``FlickrDataset`` and a ``DataLoader`` that pads its captions.

    Args:
        root_folder: Directory that contains the images.
        annotation_file: CSV file with the image names and captions.
        transform: Callable applied to every loaded image.
        batch_size: Samples per batch.
        num_worker: Passed to ``DataLoader`` as ``num_workers``.
        shuffle: Passed to ``DataLoader``.
        pin_memory: Passed to ``DataLoader``.

    Returns:
        ``(loader, dataset)``.
    """
    dataset = FlickrDataset(
        root_dir=root_folder,
        caption_file=annotation_file,
        transform=transform,
    )

    # Captions in a batch are padded with the vocabulary's <PAD> id.
    collate = MyCollate(pad_idx=dataset.vocabulary.stoi["<PAD>"])

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_worker,
        pin_memory=pin_memory,
        collate_fn=collate,
    )

    return loader, dataset

In [7]:
# Resize every image to 299x299 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor()
])

# Build the loader with get_loader's defaults; the returned dataset is not needed here.
loader, _ = get_loader(
    "data/images", "data/captions.txt", transform=transform
)

# Smoke test: print the image and caption batch shapes of the first six batches.
for idx, (imgs, captions) in enumerate(loader):
    print(imgs.shape)
    print(captions.shape)
    if idx==5:
        break


torch.Size([32, 3, 299, 299])
torch.Size([29, 32])
torch.Size([32, 3, 299, 299])
torch.Size([19, 32])
torch.Size([32, 3, 299, 299])
torch.Size([25, 32])
torch.Size([32, 3, 299, 299])
torch.Size([20, 32])
torch.Size([32, 3, 299, 299])
torch.Size([23, 32])
torch.Size([32, 3, 299, 299])
torch.Size([26, 32])


## Pad Sequence

In [18]:
import torch
from torch.nn.utils.rnn import pad_sequence
# Example sequences of different lengths
sequences = [torch.tensor([1, 2, 3]),
             torch.tensor([4, 5]),
             torch.tensor([6, 7, 8, 9])]
# batch_first=True: one row per sequence, shorter rows right-padded with 0.
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
padded_sequences

tensor([[1, 2, 3, 0],
        [4, 5, 0, 0],
        [6, 7, 8, 9]])

In [17]:
# batch_first=False: one column per sequence (the layout MyCollate produces).
padded_sequences = pad_sequence(sequences, batch_first=False, padding_value=0)
print(padded_sequences)


tensor([[1, 4, 6],
        [2, 5, 7],
        [3, 0, 8],
        [0, 0, 9]])


## Model

In [19]:
import torch
import torch.nn as nn
from torchvision import models

## Loading Inception v3

In [180]:
# Load Inception v3 with its default pretrained weights.
inception = models.inception_v3(weights='Inception_V3_Weights.DEFAULT', aux_logits=True)
# Disable and discard the auxiliary classifier so only the main head is used.
inception.aux_logits = False
inception.AuxLogits = None
# Put the network in evaluation mode for inference.
inception.eval()

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [179]:
# Display the auxiliary classifier head. The recorded output comes from a run
# (In [179]) made before the cell that sets AuxLogits to None (In [180]).
inception.AuxLogits

InceptionAux(
  (conv0): BasicConv2d(
    (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv1): BasicConv2d(
    (conv): Conv2d(128, 768, kernel_size=(5, 5), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(768, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (fc): Linear(in_features=768, out_features=1000, bias=True)
)

In [175]:
from PIL import Image
from torchvision import transforms

image_path = "data/images/667626_18933d713e.jpg"  # Replace with your image path

# Force three channels, as FlickrDataset does, so grayscale or RGBA files
# still produce a (3, H, W) tensor.
image = Image.open(image_path).convert("RGB")

# Same preprocessing as the data loader: resize to 299x299, then to tensor.
transformation = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
])

# Add a leading batch dimension: (3, 299, 299) -> (1, 3, 299, 299).
image_tensor = transformation(image).unsqueeze(0)
print(image_tensor.shape)


torch.Size([1, 3, 299, 299])


In [177]:
# Forward pass without gradient tracking; this is inference only.
with torch.no_grad():
    output = inception(image_tensor)

# output[0] holds the scores of the single image in the batch, so the softmax
# over dim 0 turns them into one probability per class.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
print(probabilities)


tensor([1.1042e-04, 1.6099e-04, 1.1182e-04, 1.4414e-04, 3.0983e-04, 5.5821e-04,
        2.2292e-03, 1.2322e-04, 1.0763e-04, 2.4891e-04, 9.1726e-05, 8.4824e-05,
        1.2684e-04, 2.0368e-04, 1.5469e-04, 1.2948e-04, 5.0540e-04, 1.5281e-04,
        1.1814e-04, 2.2996e-04, 9.4163e-05, 2.1933e-04, 1.1465e-04, 2.4604e-04,
        5.3684e-05, 1.4006e-04, 1.3197e-04, 4.4242e-04, 9.4141e-05, 1.4691e-04,
        2.4868e-04, 6.2199e-04, 4.9452e-04, 1.2766e-03, 1.2204e-03, 2.2431e-04,
        2.0432e-04, 7.6992e-04, 7.0516e-04, 2.9397e-04, 3.5319e-04, 2.9965e-04,
        1.6164e-04, 1.3094e-04, 2.1130e-04, 2.1709e-04, 1.3393e-04, 3.8514e-04,
        2.2806e-04, 1.3911e-04, 1.4848e-04, 4.0279e-04, 1.2871e-04, 3.0282e-04,
        2.4473e-04, 8.2770e-05, 5.3783e-04, 2.5323e-04, 1.9517e-04, 6.2746e-05,
        9.0976e-04, 2.9220e-04, 1.4304e-04, 1.9991e-04, 1.4704e-04, 8.4168e-05,
        3.1709e-04, 8.1795e-04, 5.0133e-04, 3.9154e-04, 4.8728e-04, 4.4958e-04,
        7.4184e-05, 2.0774e-04, 1.4131e-

## Encoder

In [20]:
class EncoderCNN(nn.Module):
    """Image encoder: pretrained Inception v3 with its final layer replaced.

    The replaced ``fc`` layer projects to ``embed_size``; the output then goes
    through ReLU and dropout.
    """

    def __init__(self, embed_size, train_CNN=False):
        """Build the encoder.

        Args:
            embed_size: Output size of the replacement ``fc`` layer.
            train_CNN: When False, every Inception parameter except the new
                ``fc`` layer has ``requires_grad`` set to False. When True,
                the whole network stays trainable.
        """
        super().__init__()
        self.train_CNN = train_CNN

        # Pretrained Inception v3; swap the classifier for an embedding
        # projection and drop the auxiliary head.
        self.inception = models.inception_v3(weights='Inception_V3_Weights.DEFAULT', aux_logits=True)
        self.inception.fc = nn.Linear(in_features=self.inception.fc.in_features, out_features=embed_size)
        self.inception.aux_logits = False
        self.inception.AuxLogits = None

        # Honor train_CNN here instead of relying on an external loop: the
        # new fc layer is always trainable, the backbone only when requested.
        for name, param in self.inception.named_parameters():
            is_head = "fc.weight" in name or "fc.bias" in name
            param.requires_grad = is_head or train_CNN

        self.relu = nn.ReLU()
        # Not read anywhere in the visible code; kept for compatibility.
        self.times = []
        self.dropout = nn.Dropout(0.5)

    def forward(self, images):
        """Return ``dropout(relu(inception(images)))``."""
        features = self.inception(images)
        return self.dropout(self.relu(features))


        


## Test for Decoder

In [38]:
# Shape experiment: how image features and embedded captions combine before
# they are fed to an LSTM.
encoder = EncoderCNN(256).to("mps")
embed = nn.Embedding(23556, 256).to("mps")
lstm = nn.LSTM(256, 256).to("mps")
lstm_2 = nn.LSTM(256, 512).to("mps")
for img, caption in loader:
    img, caption = img.to("mps"), caption.to("mps")

    # Run the encoder once and reuse the result. With dropout active, a
    # second forward pass would yield different features.
    features = encoder(img).unsqueeze(0)  # add a leading sequence dimension
    caption = embed(caption)  # token ids -> embedding vectors
    print(features.shape)
    print(caption.shape)

    # The image feature becomes the first step of the input sequence.
    sequence = torch.cat((features, caption), dim=0)
    print(sequence.shape)

    output, _ = lstm(features)
    out, _ = lstm_2(sequence)
    print(output.shape)
    print(out.shape)
    break


torch.Size([1, 32, 256])
torch.Size([22, 32, 256])
torch.Size([23, 32, 256])
torch.Size([1, 32, 256])
torch.Size([23, 32, 512])


*Question:* Do we need a start-of-sequence token in this case, given that the first input to the sequence is already the feature vector produced by the pretrained CNN block (the encoder)?

## Decoder

In [112]:
# Demo: concatenating two (1, 2, 3) tensors along dim -2 stacks their rows into a (1, 4, 3) tensor.
print(torch.cat((torch.Tensor([[1,2,3],[7,8,9]]).unsqueeze(0), torch.Tensor([[4,5,6],[9,10,11]]).unsqueeze(0)), dim=-2))

tensor([[[ 1.,  2.,  3.],
         [ 7.,  8.,  9.],
         [ 4.,  5.,  6.],
         [ 9., 10., 11.]]])


In [33]:
class DecoderRNN(nn.Module):
    """Caption decoder: embedding -> LSTM -> linear projection onto the vocabulary."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Create the embedding, LSTM, output projection and dropout layers."""
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        """Run the LSTM over the image feature followed by the embedded caption.

        Args:
            features: Image features; a leading sequence dimension is added
                here so they can be prepended to the caption embeddings.
            captions: Token ids laid out as (sequence, batch).

        Returns:
            ``(logits, hidden)``: the vocabulary scores and the LSTM outputs
            for every step, both one step longer than ``captions``.
        """
        word_steps = self.dropout(self.embed(captions))
        image_step = features.unsqueeze(0)

        # Teacher forcing: the ground-truth caption tokens follow the image
        # feature as the inputs of the sequence.
        sequence = torch.cat((image_step, word_steps), dim=0)
        hidden, _ = self.lstm(sequence)
        logits = self.linear(hidden)

        return logits, hidden




In [34]:
# Standalone vocabulary over all captions, keeping tokens seen at least 5 times.
vocabulary = Vocabulary(5)
vocabulary.build_vocabulary(pd.read_csv('data/captions.txt')['caption'].tolist())

In [35]:
# Smoke test: push one batch through the encoder and the decoder and print the shapes.
enocder = EncoderCNN(256).to("mps")
decoder = DecoderRNN(256, 256, len(vocabulary), num_layers=1).to("mps")
for img, caption in loader:
    img = img.to("mps")
    caption = caption.to("mps")
    features = enocder(img)
    print(features.shape)
    print(caption.shape)
    # The decoder prepends the image feature, so its outputs are one step longer than the caption.
    out, hid = decoder(features, caption)
    print(hid.shape)
    print(out.shape)
    break
    

torch.Size([32, 256])
torch.Size([28, 32])
torch.Size([29, 32, 256])
torch.Size([29, 32, 2994])


## CNNtoRNN

In [28]:
class CNNtoRNN(nn.Module):
    """Captioning model that chains ``EncoderCNN`` into ``DecoderRNN``."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Create the encoder and the decoder with a shared ``embed_size``."""
        super().__init__()
        self.encoder = EncoderCNN(embed_size=embed_size)
        self.decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, img, captions):
        """Encode ``img`` and return whatever the decoder returns for it and ``captions``."""
        return self.decoder(self.encoder(img), captions)

    def caption_img(self, img, vocab, max_length=100):
        """Greedily generate a caption for one image.

        Args:
            img: Image batch; ``.item()`` is called on each prediction, so it
                must hold exactly one image.
            vocab: Object whose ``itos`` maps token ids to strings.
            max_length: Maximum number of tokens to generate.

        Returns:
            The generated tokens as strings, ending with ``"<EOS>"`` when that
            token was produced before ``max_length`` was reached.
        """
        token_ids = []

        with torch.no_grad():
            # The image feature is the first LSTM input; unsqueeze adds the sequence dimension.
            step_input = self.encoder(img).unsqueeze(0)
            lstm_state = None

            for _ in range(max_length):
                hidden, lstm_state = self.decoder.lstm(step_input, lstm_state)
                scores = self.decoder.linear(hidden.squeeze(0))  # drop the sequence dimension
                predicted = scores.argmax(1)  # highest-scoring word
                word_id = predicted.item()
                token_ids.append(word_id)

                # The predicted word is the input of the next step.
                step_input = self.decoder.embed(predicted).unsqueeze(0)
                if vocab.itos[word_id] == "<EOS>":
                    break

        return [vocab.itos[idx] for idx in token_ids]


In [29]:
## Sanity check

# Build the vocabulary first so the model's output layer matches its size
# instead of relying on a hard-coded number.
vocabulary = Vocabulary(5)
vocabulary.build_vocabulary(pd.read_csv('data/captions.txt')['caption'].tolist())

model = CNNtoRNN(256, 132, len(vocabulary), 2)
# Inference only: evaluation mode turns off dropout and keeps the pretrained
# batch-norm statistics from being updated by a single-image batch.
model.eval()

# Caption one image of the first batch. The model is untrained, so the words are arbitrary.
for img, caption in loader:
    print(model.caption_img(img[6].unsqueeze(0), vocabulary))
    break


['pigtails', 'identical', 'start', 'start', 'start', 'start', 'rollerblader', 'rollerblader', 'luggage', 'luggage', 'pole', 'smaller', 'pigtails', 'pigtails', 'identical', 'loading', 'hood', 'hats', 'wade', 'start', 'tool', 'loading', 'pole', 'loading', 'raise', 'pole', 'bearing', 'boa', 'boa', 'moon', 'lounge', 'loading', 'loading', 'blocked', 'scuba', 'waterway', 'blocked', 'waterway', 'window', 'luggage', 'loading', 'loading', 'pole', 'raise', 'movie', 'movie', 'sideways', 'sideways', 'sideways', 'four', 'moon', 'luggage', 'mug', 'mug', 'mug', 'pitbull', 'loading', 'snowsuit', 'pitbull', 'pole', 'boa', 'boa', 'boa', 'moon', 'bearing', 'loading', 'loading', 'blocked', 'blocked', 'waterway', 'waterway', 'window', 'luggage', 'loading', 'loading', 'pole', 'raise', 'identical', 'movie', 'movie', 'movie', 'sideways', 'sideways', 'sideways', 'sideways', 'four', 'moon', 'luggage', 'hill', 'hill', 'hill', 'hill', 'hill', 'identical', 'loading', 'identical', 'identical', 'loading', 'loading',

## Training

In [60]:
import torch.nn as nn
import torch
from torchvision.transforms import transforms
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter


In [62]:
# Training preprocessing: resize to 299x299, convert to a tensor, then
# normalize each channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [66]:
# Recreate the loader with the normalizing transform; keep the dataset for its vocabulary.
loader, dataset = get_loader(
    "data/images/", "data/captions.txt", transform
)

In [67]:
# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(device)


mps


In [69]:
# Hyperparameters
train_CNN = False  # whether the Inception backbone (beyond its fc layer) is trained
embed_size = 256
hidden_size = 256
vocab_size = len(dataset.vocabulary)  # special tokens included
num_layers = 1
lr= 3e-4
num_epochs = 100

In [72]:
# TensorBoard writer that logs under runs/data.
writer = SummaryWriter(log_dir="runs/data")
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
# Targets equal to the <PAD> id are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocabulary.stoi["<PAD>"])
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=lr)

In [74]:
# The replacement fc layer always trains; every other Inception parameter
# trains only when train_CNN is set.
for name, param in model.encoder.inception.named_parameters():
    is_head = "fc.weight" in name or "fc.bias" in name
    param.requires_grad = is_head or train_CNN

In [120]:
# Debug pass over a single batch: inspect tensor shapes before writing the
# real training step. Both loops stop after the first batch.
model.train()
for epoch in range(num_epochs):
    for idx, (imgs, captions) in tqdm(enumerate(loader), total=len(loader), leave=False):
        imgs = imgs.to(device)
        captions = captions.to(device)

        optimizer.zero_grad()
        # Last time step of every caption in the batch (mostly <PAD>).
        print(captions[-1])

        # Drop the last caption step: the decoder prepends the image feature,
        # so the output has as many steps as the full caption.
        outputs = model(imgs, captions[:-1])
        # The decoder returns (logits, hidden); keep only the logits so the
        # shape inspection below works on a tensor.
        if isinstance(outputs, tuple):
            outputs = outputs[0]

        print(outputs.shape)
        # Flattened views, as a token-level cross-entropy loss would take them.
        print(outputs.reshape(-1, outputs.shape[2]).shape)
        print(captions.shape)
        print(captions.reshape(-1).shape)
        break
    break



  0%|          | 0/1265 [00:00<?, ?it/s]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 2, 0], device='mps:0')


                                        

torch.Size([26, 32, 2994])
torch.Size([832, 2994])
torch.Size([26, 32])
torch.Size([832])


