#Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir -p /content/drive/MyDrive/IC/ResNet

In [None]:
!pip install transformers > /dev/null

In [None]:
# Tiny dataset
# !gdown https://drive.google.com/uc?id=1qYPCnXXxjEcHEg3tLGt3fDkd2ialAgS4
import os
# Full dataset with jpeg
!gdown https://drive.google.com/uc?id=1-xJoBvzwQKgJjPzHb3fq1sFicwyIisx7

# Full dataset without jpeg
# https://drive.google.com/file/d/1gFSdm8K9SXNPXG9tQWS4bmO_nappN2AL/view?usp=share_link
# !gdown https://drive.google.com/uc?id=1gFSdm8K9SXNPXG9tQWS4bmO_nappN2AL
!unzip data_v1.zip -d /content/data > /dev/null

Downloading...
From: https://drive.google.com/uc?id=1-xJoBvzwQKgJjPzHb3fq1sFicwyIisx7
To: /content/data_v1.zip
100% 640M/640M [00:02<00:00, 275MB/s]


In [None]:
import json
data = json.load(open("/content/data/train_data.json", "r"))

In [None]:
data['annotations'][0]

{'id': 0,
 'image_id': 2,
 'caption': 'ba chiếc thuyền đang di chuyển ở trên con sông',
 'segment_caption': 'ba chiếc thuyền đang di_chuyển ở trên con sông'}

#Vocab

In [None]:
import random
import numpy as np
import torch
import os
def set_random_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
set_random_seed(10)

In [None]:
from collections import Counter
import itertools
from itertools import count

class IMCP_Vocab():
  def __init__(self, texts) -> None:
    words = list(itertools.chain(*[text.split(" ") for text in texts]))
    counter = Counter(words)
    self.vocab = {key: i for i, key in zip(count(start = 4), counter.keys())}
    self.special_ids = [0, 1, 2, 3]
    self.max_seq_len = 256
    self.counter = counter
    self.special_tokens = ["<unk>", "<pad>", "<bos>", "<eos>"]
    for id, token in zip(self.special_ids, self.special_tokens):
      self.vocab[token] = id
    self.vocab = {k: v for k, v in sorted(self.vocab.items(), key=lambda x:x[1])}
    self.id2word = {v: k for k, v in self.vocab.items()}
    
    self.bos_token = "<bos>"
    self.eos_token = "<eos>"
    self.pad_token = "<pad>"
    self.unk_token = "<unk>"
    
  def get_vocab(self):
    return self.vocab

  def get_vocab_dump(self):
    vocab = dict()
    vocab['itos'] = list(vocab.keys())
    vocab['stoi'] = self.vocab
    vocab['freqs'] = dict(self.counter)
    return vocab
  
  def batch_decode(self, predictions_ids):
    preds = []
    for seq in predictions_ids:
        preds.append(" ".join([self.id2word[id] for id in seq if id not in [0,1,2,3,4,5]]))
    return preds

# Dataset

In [17]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import json
import os

class IMCP_Dataset(Dataset):
  def __init__(self, image_path = "/content/data/train-images", summary_path = "/content/data/train_data.json", train = True):
    super().__init__()
    self.data = json.load(open(summary_path, "r"))
    self.image_path = image_path
    self.vocab = IMCP_Vocab(texts = [ann['segment_caption'] for ann in data['annotations']])
    self.imgid2imgname = {entry['id']: entry['filename'] for entry in data['images']}
    self.train = train

  def __len__(self):
    return len(self.data['annotations'])

  def __getitem__(self, index):
    if self.train:
      annotation = self.data['annotations'][index]
      image_id = annotation['image_id']
      # images = self.data['images'][index]
      # image_id = images['id']
      image_name = self.imgid2imgname[image_id]
      image = Image.open(os.path.join(self.image_path, image_name)).convert('RGB')
      caption = annotation['segment_caption']
      return image, caption
    else:
      pass

In [18]:
# class IMCP_Test_Dataset(Dataset):
#   def __init__(self, image_path = "/content/data/public-test-images", summary_path = "/content/data/test_data.json", train = True):
#     super().__init__()
#     self.data = json.load(open(summary_path, "r"))
#     self.image_path = image_path
#     self.vocab = IMCP_Vocab(texts = [ann['segment_caption'] for ann in data['annotations']])
#     self.imgid2imgname = {entry['id']: entry['filename'] for entry in data['images']}
#     self.train = train

#   def __len__(self):
#     return len(self.data['annotations'])

#   def __getitem__(self, index):
#     if self.train:
#       annotation = self.data['annotations'][index]
#       image_id = annotation['image_id']
#       # images = self.data['images'][index]
#       # image_id = images['id']
#       image_name = self.imgid2imgname[image_id]
#       image = Image.open(os.path.join(self.image_path, image_name)).convert('RGB')
#       caption = annotation['segment_caption']
#       return image, caption
#     else:
#       pass

In [19]:
train_dataset = IMCP_Dataset()
vocab = train_dataset.vocab
train_dataset, valid_dataset = torch.utils.data.random_split(train_dataset, [0.9, 0.1])
# test_dataset = IMCP_Test_Dataset()

In [20]:
# Save vocab to file
with open("/content/drive/MyDrive/IC/ResNet/vocab.json", 'w+') as file:
  json.dump(vocab.get_vocab_dump(), file, ensure_ascii = False)

In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Collator

In [22]:
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms

class IMCP_Collator:
  def __init__(self, vocab, train = True):
    self.vocab = vocab
    self.bos_id = self.vocab.get_vocab()['<bos>']
    self.eos_id = self.vocab.get_vocab()['<eos>']
    self.pad_id = self.vocab.get_vocab()['<pad>']

  def tokenize_texts(self, captions):
    raw_captions = [caption.split(" ") for caption in captions]
    truncated_captions = [s[:self.vocab.max_seq_len] for s in raw_captions]
    max_len = max([len(c) for c in truncated_captions])

    padded_captions = []
    for c in truncated_captions:
        c = [self.vocab.get_vocab()[word] for word in c]
        seq = [self.bos_id] + c + [self.eos_id] + [self.pad_id] * (max_len - len(c))
        padded_captions.append(seq)

    padded_captions = [torch.Tensor(seq).long() for seq in padded_captions]
    padded_captions = pad_sequence(padded_captions, batch_first=True)
    return padded_captions
  
  def resize_and_stack(self, images):
    new_size = (224, 224)
    image_tensors = []
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    
    for image in images:
      img_tensor = transform(image)
      image.close()
      image_tensors.append(img_tensor)
      
    stacked = torch.stack(image_tensors)
    return stacked

  def __call__(self, batch):
    images = [example[0] for example in batch]
    captions = [example[1] for example in batch]
    return {
        'images': self.resize_and_stack(images),
        'captions': self.tokenize_texts(captions)
    }
    

In [23]:
collator = IMCP_Collator(vocab, train = True)

# DataLoader

In [24]:
train_dataloader = DataLoader(train_dataset, batch_size = 16, collate_fn = collator, drop_last = True)
valid_dataloader = DataLoader(valid_dataset, batch_size = 16, collate_fn = collator, drop_last = True, shuffle = False)
# test_dataloader = DataLoader(test_dataset, batch_size = 16, collate_fn = collatorTest, shuffle = False)

In [25]:
for x in train_dataloader:
  print(x['images'].shape)
  print(x['captions'].shape)
  break

torch.Size([16, 3, 224, 224])
torch.Size([16, 19])


#Model

In [26]:
import torch
import numpy as np
import torch.nn as nn
import torchvision.models as models
import torch.optim.lr_scheduler as lr_scheduler
from transformers import get_scheduler, get_cosine_schedule_with_warmup

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained RestNet101 model
encoder = models.resnet101(pretrained=True).to(device)

# Remove the last layer of the model
modules = list(encoder.children())[:-1]
encoder = nn.Sequential(*modules)

# Freeze the parameters of the model
for param in encoder.parameters():
    param.requires_grad = False

# Define LSTM decoder
class Decoder(nn.Module):
    def __init__(self, feature_size, embed_size, hidden_size, vocab_size, num_layers):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size + feature_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, features, captions):
        captions = captions[:, :-1]
        embeddings = self.embed(captions)
        features = features.squeeze().unsqueeze(1).repeat(1, embeddings.size(1), 1)
        embeddings = torch.cat((features, embeddings), 2)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

# Define hyperparameters
num_epochs = 10
embed_size = 256
feature_size = 2048
hidden_size = 512
vocab_size = len(collator.vocab.get_vocab()) + 5
num_layers = 1
total_step = num_epochs * len(train_dataloader)


# Initialize decoder
decoder = Decoder(feature_size, embed_size, hidden_size, vocab_size, num_layers).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
# params = list(decoder.parameters())
# optimizer = torch.optim.Adam(params, lr=0.001)

# Creating optimizer and lr schedulers
param_optimizer = list(decoder.parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
optimizer = torch.optim.AdamW(param_optimizer, lr=0.001, weight_decay=0.001)
# optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=0.001, eps=1e-6)  # To reproduce BertAdam specific behavior set correct_bias=False
lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = 0.1 * total_step, num_training_steps = total_step)

# Train the model
for epoch in range(num_epochs):
    print(f"Start Epoch {epoch}")
    for i, data in enumerate(train_dataloader):
        # Move data to GPU
        images = data['images'].to(device)
        captions = data['captions'].to(device)
        
        # Forward pass
        features = encoder(images)
        outputs = decoder(features, captions)
        loss = criterion(outputs.permute(0, 2, 1), captions[:, 1:])
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        # Print loss
        if i % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, len(train_dataloader), loss.item()))
            
    valid_loss = []
    for i, data in enumerate(valid_dataloader):
        # Move data to GPU
        images = data['images'].to(device)
        captions = data['captions'].to(device)
        # Forward pass
        features = encoder(images)
        outputs = decoder(features, captions)
        loss = criterion(outputs.permute(0, 2, 1), captions[:, 1:])
        valid_loss.append(loss.item())
    
    print('Epoch [{}/{}], Valid Loss: {:.4f}'.format(epoch+1, num_epochs, np.mean(valid_loss)))
    

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:00<00:00, 247MB/s]


Start Epoch 0
Epoch [1/10], Step [1/1060], Loss: 7.7540
Epoch [1/10], Step [101/1060], Loss: 3.9190
Epoch [1/10], Step [201/1060], Loss: 3.1035
Epoch [1/10], Step [301/1060], Loss: 2.7878
Epoch [1/10], Step [401/1060], Loss: 2.2108
Epoch [1/10], Step [501/1060], Loss: 2.7448
Epoch [1/10], Step [601/1060], Loss: 2.0379
Epoch [1/10], Step [701/1060], Loss: 1.8433
Epoch [1/10], Step [801/1060], Loss: 1.4940
Epoch [1/10], Step [901/1060], Loss: 1.7577
Epoch [1/10], Step [1001/1060], Loss: 1.4926
Epoch [1/10], Valid Loss: 1.7853
Start Epoch 1
Epoch [2/10], Step [1/1060], Loss: 1.6716
Epoch [2/10], Step [101/1060], Loss: 1.6506
Epoch [2/10], Step [201/1060], Loss: 1.6229
Epoch [2/10], Step [301/1060], Loss: 1.5936
Epoch [2/10], Step [401/1060], Loss: 1.5196
Epoch [2/10], Step [501/1060], Loss: 2.0169
Epoch [2/10], Step [601/1060], Loss: 1.6426
Epoch [2/10], Step [701/1060], Loss: 1.4760
Epoch [2/10], Step [801/1060], Loss: 1.3026
Epoch [2/10], Step [901/1060], Loss: 1.5661
Epoch [2/10], Step

In [27]:
torch.save(encoder.state_dict(), '/content/drive/MyDrive/IC/ResNet/encoder.pth')
torch.save(decoder.state_dict(), '/content/drive/MyDrive/IC/ResNet/decoder.pth')