<a href="https://colab.research.google.com/github/luyuzhe111/image-caption-match/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

In [3]:
os.chdir('/content/drive/MyDrive/kaggle/wikipedia')

In [4]:
import pandas as pd
from tqdm import tqdm

# Import packages

In [5]:
! pip install -q albumentations==0.4.6

In [6]:
! pip install cairosvg



In [7]:
! pip install transformers



In [8]:
!pip install pytorch-metric-learning



In [9]:
import glob
import gc
gc.enable()
import multiprocessing
import cv2
import copy
import time
import random
from PIL import Image
from PIL import ImageFile

import base64
import pickle

# fold
from sklearn.model_selection import StratifiedKFold

# For downloading images
from io import BytesIO

# For data manipulation
import numpy as np
import pandas as pd
from numpy import asarray

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision


# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Image Models
# import timm

# For Transformer Models
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel, BertConfig

import torchvision.transforms as transforms

import warnings
warnings.filterwarnings("ignore")


# Load data

In [10]:
# load data from input files
df_train = pd.read_feather('./train-subsample.feather')
df_valid = pd.read_feather('./valid-subsample.feather')
df_test = pd.read_feather('./test-subsample.feather')

In [11]:
df_train.head(3)

Unnamed: 0,language,image_url,caption_title_and_reference_description,page_title,path
0,sk,http://upload.wikimedia.org/wikipedia/commons/...,Edubuntu </s> Edubuntu 7.04 - Gaim a napaľovanie,Edubuntu,./images/images0/sk/13737.jpg
1,en,https://upload.wikimedia.org/wikipedia/commons...,Lensfield Road </s>,Lensfield Road,./images/images4/en/95883.jpg
2,en,http://upload.wikimedia.org/wikipedia/commons/...,Argao </s> Old cannons of Argao,Argao,./images/images3/en/66896.jpg


In [12]:
print(f'size of the train/val/test dataset: {len(df_train)}, {len(df_valid)}, {len(df_test)}')

size of the train/val/test dataset: 79342, 9918, 9918


# Set up


## config

In [13]:
def optimal_num_of_loader_workers():
    num_cpus = multiprocessing.cpu_count()
    num_gpus = torch.cuda.device_count()
    optimal_value = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
    return optimal_value

In [14]:
CONFIG = {
    "seed": 2021,
    "epochs": 5,
    
    "img_size": 224,
    "embedding_size": 512,
    "text_model_name": "xlm-roberta-base",
    "pool": 'gem-2',
    "freeze_image_model": False,
    "freeze_text_model": False,
    "train_batch_size": 32,
    "valid_batch_size": 32,
    "learning_rate": 1e-5,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    'num_workers':4,
    
    "T_max": 500,
    "weight_decay": 1e-6,
    "max_length": 32,
    
    "n_accumulate": 1,
}

CONFIG["experiment"] = f"dim-{CONFIG['embedding_size']}_pool-{CONFIG['pool']}"
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['text_model_name'])

In [15]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

# Dataset

In [16]:
def load_img(dir):
  image = Image.open(dir)
  return image

In [17]:
class WikipediaDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, transforms=None):
        self.data = data.reset_index(drop=True)
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.transforms = transforms
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        img = load_img(self.data.at[index, "path"])
        if self.transforms:
          img = self.transforms(img)
        
        caption = self.data.at[index, "caption_title_and_reference_description"]
        caption = caption.replace("[SEP]", "</s>") # sep token for xlm-roberta
        inputs = self.tokenizer.encode_plus(
            caption,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
       
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'image': img,
        }

# Data Transforms

In [18]:
data_transforms = {
    "train": transforms.Compose([   
        transforms.RandomHorizontalFlip(),                                      
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]),
    
    "valid": transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
}

# Dataloader

In [19]:
train_dataset = WikipediaDataset(df_train, CONFIG["tokenizer"], CONFIG["max_length"], transforms=data_transforms["train"])
train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], num_workers=CONFIG['num_workers'], shuffle=True, pin_memory=True)

valid_dataset = WikipediaDataset(df_valid, CONFIG["tokenizer"], CONFIG["max_length"], transforms=data_transforms["valid"])
valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], num_workers=CONFIG['num_workers'], shuffle=False, pin_memory=True)

test_dataset = WikipediaDataset(df_test, CONFIG["tokenizer"], CONFIG["max_length"], transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], num_workers=CONFIG['num_workers'], shuffle=False, pin_memory=True)

# Image model

In [20]:
from torchvision.models import resnet50

In [21]:
class ImageFeatureExtractor(nn.Module):
    def __init__(self, projector=[2048, 512, 64, 8], pool='gem-1'):
        super(ImageFeatureExtractor, self).__init__()
        model = resnet50(pretrained=True)
        modules = nn.ModuleList(model.children())[:-2]
        self.features = nn.Sequential(*modules)

        arch=[]
        for i in range(1,len(projector)):
            arch.append(nn.Linear(projector[i-1], projector[i]))
            if i!=len(projector)-1:
                arch.append(nn.ReLU())
        
        self.projector=nn.Sequential(*arch)
        self.pool = pool

    def forward(self, inputs):
        output = self.features(inputs)
        output = torch.flatten(output, start_dim=-2)
        if self.pool == 'gem-1':
          output = output.mean(-1)
        elif self.pool == 'gem-2':
          output = torch.pow((output ** 2).mean(-1), 1/2)
        elif self.pool == 'gem-inf':
          output = output.max(-1)
        output = self.projector(output)
        return output

In [22]:
projector = [2048, CONFIG['embedding_size']]
image_model = ImageFeatureExtractor(projector=projector, pool=CONFIG['pool']).cuda()
image_model.train()
if CONFIG['freeze_image_model']:
  print('freeze image encoder features')
  for name, p in image_model.named_parameters():
    if 'features' in name:
      p.requires_grad = False

# Text model

In [23]:
class TextExtractorModel(nn.Module):
    def __init__(self, text_model, projector=[748, 96, 32]):
        super(TextExtractorModel, self).__init__()
        self.text_model = AutoModel.from_pretrained(text_model)

        arch=[]
        for i in range(1,len(projector)):
            arch.append(nn.Linear(projector[i-1], projector[i]))
            if i!=len(projector)-1:
                arch.append(nn.ReLU())
        
        self.projector=nn.Sequential(*arch)
        self.init_weights(self.projector)
        
    def init_weights(self, m):
        if type(m) == torch.nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0)
            
    def forward(self, ids, mask):
        out = self.text_model(input_ids=ids, attention_mask=mask, output_hidden_states=False)[1]
        text_embeddings = self.projector(out)
        return text_embeddings

In [24]:
projector=[768, CONFIG['embedding_size']]
text_model = TextExtractorModel(CONFIG['text_model_name'], projector=projector).cuda()
text_model.train()
if CONFIG['freeze_text_model']:
  print('freeze text encoder features')
  for name, p in text_model.named_parameters():
    if 'text_model' in name:
      p.requires_grad = False

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Optimizer

In [25]:
optimizer = optim.Adam([
                {'params': image_model.parameters()},
                {'params': text_model.parameters()}
            ], lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'], betas=(0.9, 0.999), amsgrad=True)

# Loss function

In [26]:
from pytorch_metric_learning import distances, losses, miners
loss_func = losses.TripletMarginLoss(distance=distances.LpDistance(normalize_embeddings=False))
miner = miners.MultiSimilarityMiner()

# Training & Validation

In [27]:
def train(epoch, image_model, text_model):
  losses = []
  image_model.train()
  text_model.train()
  for index, item in enumerate(tqdm(train_loader)):
    optimizer.zero_grad()

    # transformed image
    img = item['image'].cuda()
    
    # tokens that define the captions
    mask = item['mask'].cuda()
    ids = item['ids'].cuda()          
    
    # get encoded images; N x feature_dim (N = batch size)
    image_outputs = image_model(img)

    # get encoded captions; N x feature_dim
    text_outputs = text_model(ids, mask)

    embeddings = torch.cat((image_outputs, text_outputs), dim=0) # 2N x feature_dim

    batch_size = image_outputs.shape[0]
    labels = torch.arange(batch_size)
    labels = torch.cat([labels, labels], dim=0)

    hard_pairs = miner(embeddings, labels)
    loss = loss_func(embeddings, labels, hard_pairs)
    
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

  return np.array(losses).mean()

In [28]:
def validate(epoch, image_model, text_model, model_dir):
  losses = []
  
  with torch.no_grad():
    image_model.eval()
    text_model.eval()
    for index, item in enumerate(tqdm(valid_loader)):
      img = item['image'].cuda()
      mask = item['mask'].cuda()
      ids = item['ids'].cuda()          
      
      image_outputs = image_model(img)
      text_outputs = text_model(ids, mask)

      embeddings = torch.cat((image_outputs, text_outputs), dim=0) # 2N x feature_dim
      batch_size = image_outputs.shape[0]
      labels = torch.arange(batch_size)
      labels = torch.cat([labels, labels], dim=0)

      hard_pairs = miner(embeddings, labels)
      loss = loss_func(embeddings, labels, hard_pairs)

      losses.append(loss.item())
    
    torch.save(image_model.state_dict(), f'{model_dir}/image_model_epoch{epoch}.pt') 
    torch.save(text_model.state_dict(), f'{model_dir}/text_model_epoch{epoch}.pt') 

  return np.array(losses).mean()

In [29]:
train_losses = []
valid_losses = []
best_val_loss = 1000
best_epoch = 0
epochs = 15
exp_name = CONFIG['experiment']
model_dir = f'./checkpoints/{exp_name}'
os.makedirs(model_dir, exist_ok=True)

for epoch in range(epochs):
  train_loss = train(epoch, image_model, text_model)
  valid_loss = validate(epoch, image_model, text_model, model_dir)

  print(train_loss, valid_loss)

  train_losses.append(train_loss)
  valid_losses.append(valid_loss)

  if valid_loss < best_val_loss:
    print(f'loss has been decreased from {best_val_loss} to {valid_loss}')
    best_val_loss = valid_loss
    best_epoch = epoch

    torch.save(image_model.state_dict(), f'{model_dir}/best_image_model.pt') 
    torch.save(text_model.state_dict(), f'{model_dir}/best_text_model.pt')

100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [04:24<00:00,  1.17it/s]


0.22052122115067416 0.2809468926441285
loss has been decreased from 1000 to 0.2809468926441285


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.51it/s]


0.11793268034835497 0.2416702060449508
loss has been decreased from 0.2809468926441285 to 0.2416702060449508


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:40<00:00,  7.69it/s]


0.10944452862285318 0.21497617633112015
loss has been decreased from 0.2416702060449508 to 0.21497617633112015


100%|██████████| 2480/2480 [18:01<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.45it/s]


0.10381420264020562 0.19753167235082195
loss has been decreased from 0.21497617633112015 to 0.19753167235082195


100%|██████████| 2480/2480 [18:04<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.46it/s]


0.09977676488399025 0.18839292934825344
loss has been decreased from 0.19753167235082195 to 0.18839292934825344


100%|██████████| 2480/2480 [18:03<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.41it/s]


0.09695563606707559 0.17478933214179931
loss has been decreased from 0.18839292934825344 to 0.17478933214179931


100%|██████████| 2480/2480 [18:03<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.41it/s]


0.09482516340250449 0.17156431271183875
loss has been decreased from 0.17478933214179931 to 0.17156431271183875


100%|██████████| 2480/2480 [18:05<00:00,  2.29it/s]
100%|██████████| 310/310 [00:40<00:00,  7.60it/s]


0.09296978983667589 0.168347185032983
loss has been decreased from 0.17156431271183875 to 0.168347185032983


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.40it/s]


0.09150900668765027 0.16314026828735106
loss has been decreased from 0.168347185032983 to 0.16314026828735106


100%|██████████| 2480/2480 [18:05<00:00,  2.29it/s]
100%|██████████| 310/310 [00:42<00:00,  7.37it/s]


0.08983833774804108 0.16169912892003213
loss has been decreased from 0.16314026828735106 to 0.16169912892003213


100%|██████████| 2480/2480 [18:03<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.42it/s]


0.08973966094274674 0.16369799707205065


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.46it/s]


0.0881776280776267 0.16306299980609648


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.47it/s]


0.08828014301646861 0.16386158963845623


100%|██████████| 2480/2480 [18:03<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.41it/s]


0.087685390064613 0.1666185686184514


100%|██████████| 2480/2480 [18:02<00:00,  2.29it/s]
100%|██████████| 310/310 [00:41<00:00,  7.49it/s]


0.08723848586631638 0.16374253089870175


In [51]:
df = pd.DataFrame()
df['epochs'] = [i for i in range(epochs)]
df['train_loss'] = train_losses
df['val_loss'] = valid_losses
df.to_csv(f'{model_dir}/logger.csv')

In [52]:
df

Unnamed: 0,epochs,train_loss,val_loss
0,0,0.220521,0.280947
1,1,0.117933,0.24167
2,2,0.109445,0.214976
3,3,0.103814,0.197532
4,4,0.099777,0.188393
5,5,0.096956,0.174789
6,6,0.094825,0.171564
7,7,0.09297,0.168347
8,8,0.091509,0.16314
9,9,0.089838,0.161699


In [30]:
projector = [2048, CONFIG['embedding_size']]
best_image_model = ImageFeatureExtractor(projector=projector, pool=CONFIG['pool']).cuda()
projector=[768, CONFIG['embedding_size']]
best_text_model = TextExtractorModel(CONFIG['text_model_name'], projector=projector).cuda()

best_image_model.load_state_dict(torch.load(f'{model_dir}/best_image_model.pt'))
best_text_model.load_state_dict(torch.load(f'{model_dir}/best_text_model.pt'))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

# Test model

In [32]:
from torch.nn.functional import normalize

In [40]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import top_k_accuracy_score

In [33]:
test_captions = df_test['caption_title_and_reference_description']
text_model.eval()
test_caption_tokens = []
for caption in tqdm(test_captions):
  inputs = CONFIG["tokenizer"].encode_plus(
              caption,
              truncation=True,
              add_special_tokens=True,
              max_length=CONFIG["max_length"],
              padding='max_length'
  )
  ids = torch.tensor(inputs['input_ids'], dtype=torch.long).cuda().unsqueeze(0)
  mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).cuda().unsqueeze(0)
  text_features = normalize(best_text_model(ids, mask))
  text_features = text_features.cpu().detach().numpy()
  test_caption_tokens.append(text_features.squeeze())
test_caption_base = np.stack(test_caption_tokens, axis=1).T

100%|██████████| 9918/9918 [01:37<00:00, 102.20it/s]


In [41]:
caption_retriever = NearestNeighbors(n_neighbors=5, p=2).fit(test_caption_base)

In [42]:
relevant_k = 10

In [43]:
best_image_model.eval()
preds = []
for index, item in enumerate(tqdm(test_loader)):
  img = item['image'].cuda()
  mask = item['mask'].cuda()
  ids = item['ids'].cuda()          
  
  image_outputs = best_image_model(img).cpu().detach().numpy()
  _, retrieved_ids = caption_retriever.kneighbors(image_outputs, n_neighbors=relevant_k)
  preds.extend(retrieved_ids)

100%|██████████| 310/310 [00:59<00:00,  5.17it/s]


In [44]:
targets = list(range(len(preds)))

### Top1 Accuracy

In [45]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score([pred[0] for pred in preds], targets)

0.0071587013510788465

### Topk Accuracy

In [61]:
desired_k = 5

In [62]:
count = 0
for i in range(len(preds)):
  true = targets[i]
  if true in preds[i][:desired_k]:
    count += 1

print(count / len(preds))

0.031155474894131882
