#Data

In [1]:
!pip install transformers



In [2]:
# Tiny dataset
# !gdown https://drive.google.com/uc?id=1qYPCnXXxjEcHEg3tLGt3fDkd2ialAgS4
import os
# Full dataset with jpeg
!gdown https://drive.google.com/uc?id=1-xJoBvzwQKgJjPzHb3fq1sFicwyIisx7

# Full dataset without jpeg
# https://drive.google.com/file/d/1gFSdm8K9SXNPXG9tQWS4bmO_nappN2AL/view?usp=share_link
# !gdown https://drive.google.com/uc?id=1gFSdm8K9SXNPXG9tQWS4bmO_nappN2AL
!unzip data_v1.zip -d /content/data
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Downloading...
From (original): https://drive.google.com/uc?id=1-xJoBvzwQKgJjPzHb3fq1sFicwyIisx7
From (redirected): https://drive.google.com/uc?id=1-xJoBvzwQKgJjPzHb3fq1sFicwyIisx7&confirm=t&uuid=7d157710-6cdb-440a-9b50-c4e99c494063
To: /content/data_v1.zip
100% 640M/640M [00:14<00:00, 45.0MB/s]
Archive:  data_v1.zip
   creating: /content/data/train-images/
  inflating: /content/data/train-images/00000000002.jpg  
  inflating: /content/data/train-images/00000000003.jpg  
  inflating: /content/data/train-images/00000000004.jpg  
  inflating: /content/data/train-images/00000000005.jpg  
  inflating: /content/data/train-images/00000000006.jpg  
  inflating: /content/data/train-images/00000000009.jpg  
  inflating: /content/data/train-images/00000000010.jpg  
  inflating: /content/data/train-images/00000000017.jpg  
  inflating: /content/data/train-images/00000000020.jpg  
  inflating: /content/data/train-images/00000000022.jpg  
  inflating: /content/data/train-images/00000000023.jpg  
  

In [3]:
import json
data = json.load(open("/content/data/train_data.json", "r"))

In [4]:
data['annotations'][0]

{'id': 0,
 'image_id': 2,
 'caption': 'ba chiếc thuyền đang di chuyển ở trên con sông',
 'segment_caption': 'ba chiếc thuyền đang di_chuyển ở trên con sông'}

#Vocab

In [5]:
from collections import Counter
import itertools
from itertools import count

#os.environ['CUDA_LAUNCH_BLOCKING'] = 1

class IMCP_Vocab():
  def __init__(self, texts) -> None:
    words = list(itertools.chain(*[text.split(" ") for text in texts]))
    counter = Counter(words)
    self.vocab = {key: i for i, key in zip(count(start = 5), counter.keys())}

    self.special_ids = [0, 1, 2, 3]
    self.max_seq_len = 256
    self.special_tokens = ["<s>", "</s>", "<pad>", "<unk>"]
    for id, token in zip(self.special_ids, self.special_tokens):
      self.vocab[token] = id
    self.vocab = {k: v for k, v in sorted(self.vocab.items(), key=lambda x:x[1])}

    self.bos_token = "<s>"
    self.eos_token = "</s>"
    self.pad_token = "<pad>"
    self.unk_token = "<unk>"

  def get_vocab(self):
    return self.vocab

# Dataset

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import json
import os

class IMCP_Dataset(Dataset):
  def __init__(self, image_path = "/content/data/train-images", summary_path = "/content/data/train_data.json", train = True):
    super().__init__()
    self.data = json.load(open(summary_path, "r"))
    self.image_path = image_path
    self.vocab = IMCP_Vocab(texts = [ann['segment_caption'] for ann in data['annotations']])
    self.imgid2imgname = {entry['id']: entry['filename'] for entry in data['images']}
    self.train = train

  def __len__(self):
    return len(self.data['annotations'])

  def __getitem__(self, index):
    if self.train:
      annotation = self.data['annotations'][index]
      image_id = annotation['image_id']
      # images = self.data['images'][index]
      # image_id = images['id']
      image_name = self.imgid2imgname[image_id]
      image = Image.open(os.path.join(self.image_path, image_name)).convert('RGB')
      caption = annotation['segment_caption']
      return image, caption
    else:
      pass

In [7]:
class IMCP_Test_Dataset(Dataset):
  def __init__(self, image_path = "/content/data/public-test-images", summary_path = "/content/data/test_data.json", train = True):
    super().__init__()
    self.data = json.load(open(summary_path, "r"))
    self.image_path = image_path
    self.vocab = IMCP_Vocab(texts = [ann['segment_caption'] for ann in data['annotations']])
    self.imgid2imgname = {entry['id']: entry['filename'] for entry in data['images']}
    self.train = train

  def __len__(self):
    return len(self.data['annotations'])

  def __getitem__(self, index):
    if self.train:
      annotation = self.data['annotations'][index]
      image_id = annotation['image_id']
      # images = self.data['images'][index]
      # image_id = images['id']
      image_name = self.imgid2imgname[image_id]
      image = Image.open(os.path.join(self.image_path, image_name)).convert('RGB')
      caption = annotation['segment_caption']
      return image, caption
    else:
      pass

In [8]:
train_dataset = IMCP_Dataset()
test_dataset = IMCP_Test_Dataset()
vocab = train_dataset.vocab

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Collator

In [10]:
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms

class IMCP_Collator:
  def __init__(self, vocab, train = True, model = "resnet101"):
    self.vocab = vocab
    self.bos_id = self.vocab.get_vocab()['<s>']
    self.eos_id = self.vocab.get_vocab()['</s>']
    self.pad_id = self.vocab.get_vocab()['<pad>']
    self.model = model

  def tokenize_texts(self, captions):
    raw_captions = [caption.split(" ") for caption in captions]
    truncated_captions = [s[:self.vocab.max_seq_len] for s in raw_captions]
    max_len = max([len(c) for c in truncated_captions])

    padded_captions = []
    for c in truncated_captions:
        c = [self.vocab.get_vocab()[word] for word in c]
        seq = [self.bos_id] + c + [self.eos_id] + [self.pad_id] * (max_len - len(c))
        padded_captions.append(seq)

    padded_captions = [torch.Tensor(seq).long() for seq in padded_captions]
    padded_captions = pad_sequence(padded_captions, batch_first=True)
    return padded_captions

  def resize_and_stack(self, images):
    if self.model == "resnet101":
      new_size = (224, 224)
      image_tensors = []
      transform = transforms.Compose([
          transforms.Resize((224, 224)),
          transforms.ToTensor(),
      ])

      for image in images:
        img_tensor = transform(image)
        image.close()
        image_tensors.append(img_tensor)

      stacked = torch.stack(image_tensors)
      return stacked
    else:
      pass

  def __call__(self, batch):
    images = [example[0] for example in batch]
    captions = [example[1] for example in batch]
    return {
        'images': self.resize_and_stack(images),
        'captions': self.tokenize_texts(captions)
    }


In [11]:
collator = IMCP_Collator(vocab, train = True, model = "resnet101")

# DataLoader

In [12]:
train_dataloader = DataLoader(train_dataset, batch_size = 16, collate_fn = collator)
test_dataloader = DataLoader(test_dataset, batch_size = 16, collate_fn = collator)

In [13]:
for x in train_dataloader:
  print(x['images'].shape)
  print(x['captions'].shape)
  break

torch.Size([16, 3, 224, 224])
torch.Size([16, 13])


#Model

In [14]:
import torch
import torch.nn as nn
import torchvision.models as models

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained RestNet101 model
encoder = models.resnet101(pretrained=True).to(device)

# Remove the last layer of the model
modules = list(encoder.children())[:-1]
encoder = nn.Sequential(*modules)

# Freeze the parameters of the model
for param in encoder.parameters():
    param.requires_grad = False

# Define LSTM decoder
class Decoder(nn.Module):
    def __init__(self, feature_size, embed_size, hidden_size, vocab_size, num_layers):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size + feature_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        captions = captions[:, :-1]
        embeddings = self.embed(captions)
        features = features.squeeze().unsqueeze(1).repeat(1, embeddings.size(1), 1)
        embeddings = torch.cat((features, embeddings), 2)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

# Define hyperparameters
num_epochs = 10
embed_size = 256
feature_size = 2048
hidden_size = 512
vocab_size = len(collator.vocab.get_vocab()) + 5
num_layers = 1
total_step = num_epochs * len(train_dataloader)

# Initialize decoder
decoder = Decoder(feature_size, embed_size, hidden_size, vocab_size, num_layers).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters())
optimizer = torch.optim.Adam(params, lr=0.001)

# Train the model
for epoch in range(num_epochs):
    print(f"Start Epoch {epoch}")
    for i, data in enumerate(train_dataloader):
        # Move data to GPU
        images = data['images'].to(device)
        captions = data['captions'].to(device)

        # Forward pass
        features = encoder(images)
        outputs = decoder(features, captions)
        loss = criterion(outputs.permute(0, 2, 1), captions[:, 1:])

        # Backward pass and optimization
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()

        # Print loss
        if i % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, len(train_dataloader), loss.item()))

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:02<00:00, 67.3MB/s]


Start Epoch 0
Epoch [1/10], Step [1/1178], Loss: 7.7658
Epoch [1/10], Step [101/1178], Loss: 3.0897
Epoch [1/10], Step [201/1178], Loss: 3.3393
Epoch [1/10], Step [301/1178], Loss: 2.4066
Epoch [1/10], Step [401/1178], Loss: 2.3837
Epoch [1/10], Step [501/1178], Loss: 2.4664
Epoch [1/10], Step [601/1178], Loss: 1.3105
Epoch [1/10], Step [701/1178], Loss: 1.8160
Epoch [1/10], Step [801/1178], Loss: 2.1253
Epoch [1/10], Step [901/1178], Loss: 1.9665
Epoch [1/10], Step [1001/1178], Loss: 1.7653
Epoch [1/10], Step [1101/1178], Loss: 1.4580
Start Epoch 1
Epoch [2/10], Step [1/1178], Loss: 2.4770
Epoch [2/10], Step [101/1178], Loss: 2.2512
Epoch [2/10], Step [201/1178], Loss: 2.0772
Epoch [2/10], Step [301/1178], Loss: 1.7697
Epoch [2/10], Step [401/1178], Loss: 1.7343
Epoch [2/10], Step [501/1178], Loss: 1.9512
Epoch [2/10], Step [601/1178], Loss: 1.0713
Epoch [2/10], Step [701/1178], Loss: 1.4759
Epoch [2/10], Step [801/1178], Loss: 1.8595
Epoch [2/10], Step [901/1178], Loss: 1.6857
Epoch 

In [15]:
torch.save(encoder.state_dict(), 'encoder.pth')
torch.save(decoder.state_dict(), 'decoder.pth')

In [16]:
!ls

data  data_v1.zip  decoder.pth	encoder.pth  sample_data


In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
!cp encoder.pth /content/drive/MyDrive
!cp decoder.pth /content/drive/MyDrive

#Inference

In [None]:
# for step, data in enumerate(train_dataloader):
#   images = data['images'].to(device)
#   out = model.generate(images)
#   print(out)
#   break