In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from model import IDCM_NN

In [None]:
class ImgNN(nn.Module):
    """Two-layer fully connected encoder for image feature vectors.

    The input is projected to a fixed 1024-unit hidden layer and then to
    ``output_dim`` units; a ReLU follows each linear layer.
    """

    def __init__(self, input_dim=4096, output_dim=1024):
        super().__init__()
        hidden_dim = 1024  # hidden width is fixed, independent of output_dim
        self.denseL1 = nn.Linear(input_dim, hidden_dim)
        self.denseL2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the non-negative embedding ``relu(L2(relu(L1(x))))``."""
        hidden = F.relu(self.denseL1(x))
        return F.relu(self.denseL2(hidden))


class TextNN(nn.Module):
    """Two-layer fully connected encoder for text feature vectors.

    Both linear layers emit ``output_dim`` units and each is followed by a
    ReLU.
    """

    def __init__(self, input_dim=1024, output_dim=1024):
        super().__init__()
        self.denseL1 = nn.Linear(input_dim, output_dim)
        self.denseL2 = nn.Linear(output_dim, output_dim)

    def forward(self, x):
        """Return the non-negative embedding ``relu(L2(relu(L1(x))))``."""
        hidden = F.relu(self.denseL1(x))
        return F.relu(self.denseL2(hidden))


class IDCM_NN(nn.Module):
    """Two-branch network embedding images and texts with a shared projection.

    Each modality has its own encoder (``ImgNN`` / ``TextNN``); both encoder
    outputs then pass through the *same* ``linearLayer``, which maps them to
    ``minus_one_dim`` units.

    Args:
        img_input_dim: width of the raw image feature vectors.
        img_output_dim: width of the image encoder output.
        text_input_dim: width of the raw text feature vectors.
        text_output_dim: width of the text encoder output; must equal
            ``img_output_dim`` because one linear layer serves both branches.
        minus_one_dim: width of the shared embedding returned by ``forward``.
        output_dim: accepted for interface compatibility; no layer is built
            from it.

    Raises:
        ValueError: if the two encoder output widths differ.
    """
    def __init__(self, img_input_dim=4096, img_output_dim=2048,
                 text_input_dim=1024, text_output_dim=2048, minus_one_dim=1024, output_dim=10):
        super(IDCM_NN, self).__init__()
        # The shared projection is sized from img_output_dim only, so a
        # different text width would otherwise fail later inside forward().
        if img_output_dim != text_output_dim:
            raise ValueError(
                "img_output_dim ({}) and text_output_dim ({}) must match: both "
                "branches go through the same linear layer".format(img_output_dim, text_output_dim))
        self.img_net = ImgNN(img_input_dim, img_output_dim)
        self.text_net = TextNN(text_input_dim, text_output_dim)
        self.linearLayer = nn.Linear(img_output_dim, minus_one_dim)

    def forward(self, img, text):
        """Return ``(view1_feature, view2_feature, view1_predict, view2_predict)``.

        There is no separate prediction head: each ``*_predict`` entry is the
        very same tensor as the corresponding ``*_feature`` entry.
        """
        view1_feature = self.linearLayer(self.img_net(img))
        view2_feature = self.linearLayer(self.text_net(text))
        return view1_feature, view2_feature, view1_feature, view2_feature

In [2]:
from torch.utils.data.dataset import Dataset
from scipy.io import loadmat, savemat
from torch.utils.data import DataLoader
import numpy as np

class CustomDataSet(Dataset):
    """In-memory dataset of aligned (image, text, label) triples.

    Args:
        images: array-like of image features, stored as float32.
        texts: array-like of text features, stored as float32.
        labels: array-like of integer labels; singleton axes are squeezed
            away, so a ``(1, N)`` or ``(N, 1)`` input becomes ``(N,)``.
    """

    def __init__(
            self,
            images,
            texts,
            labels):
        self.images = np.array(images, dtype=np.float32)
        self.texts = np.array(texts, dtype=np.float32)
        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in NumPy >= 1.24; the builtin gives the same dtype.
        self.labels = np.squeeze(np.array(labels, dtype=int))
        if self.labels.ndim == 0:
            # A single-sample label squeezes to a 0-d array, which supports
            # neither len() nor indexing; keep one axis.
            self.labels = self.labels.reshape(1)

    def __getitem__(self, index):
        """Return the ``(image, text, label)`` triple stored at ``index``."""
        return self.images[index], self.texts[index], self.labels[index]

    def __len__(self):
        """Return the number of samples (images and labels must agree)."""
        count = len(self.images)
        assert count == len(self.labels)
        return count


def ind2vec(ind, N=None):
    """Turn a column of integer indices into a boolean one-hot matrix.

    ``ind`` has to be two-dimensional because it is repeated along axis 1;
    an ``(n, 1)`` column yields an ``(n, N)`` result. When ``N`` is omitted
    it defaults to ``ind.max() + 1``.
    """
    indices = np.asarray(ind)
    width = indices.max() + 1 if N is None else N
    tiled = np.repeat(indices, width, axis=1)
    return np.arange(width) == tiled

def get_loader(path, batch_size):
    """Load the six ``.mat`` files under ``path`` and wrap them in DataLoaders.

    ``path`` is used as a plain string prefix (no separator is inserted).

    Returns:
        dataloader: dict with ``'train'`` and ``'test'`` ``DataLoader`` objects,
            both unshuffled and using ``num_workers=0``.
        input_data_par: dict holding the raw arrays for both splits plus
            ``img_dim``, ``text_dim`` and ``num_class`` (each taken from axis 1
            of the corresponding training array).
    """
    def _read(filename, key, dtype):
        # Each file keeps its payload under a key named after the file.
        return loadmat(path + filename)[key].astype(dtype)

    img_train = _read("train_img.mat", 'train_img', float)
    img_test = _read("test_img.mat", 'test_img', float)
    text_train = _read("train_txt.mat", 'train_txt', float)
    text_test = _read("test_txt.mat", 'test_txt', float)
    label_train = _read("train_img_lab.mat", 'train_img_lab', int)
    label_test = _read("test_img_lab.mat", 'test_img_lab', int)
    print(label_train)

    imgs = {'train': img_train, 'test': img_test}
    texts = {'train': text_train, 'test': text_test}
    labels = {'train': label_train, 'test': label_test}

    dataloader = {}
    for split in ['train', 'test']:
        split_data = CustomDataSet(images=imgs[split], texts=texts[split], labels=labels[split])
        # Neither split is shuffled.
        dataloader[split] = DataLoader(split_data, batch_size=batch_size,
                                       shuffle=False, num_workers=0)

    input_data_par = {
        'img_test': img_test,
        'text_test': text_test,
        'label_test': label_test,
        'img_train': img_train,
        'text_train': text_train,
        'label_train': label_train,
        'img_dim': img_train.shape[1],
        'text_dim': text_train.shape[1],
        'num_class': label_train.shape[1],
    }
    return dataloader, input_data_par


In [3]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torchvision
import time
import copy
from evaluate import fx_calc_map_label
import torch.nn.functional as F
from evaluate import fx_calc_map_label
import numpy as np
print("PyTorch Version: ", torch.__version__)
print("Torchvision Version: ", torchvision.__version__)


def calc_label_sim(label_1, label_2):
    """Return the matrix of dot products between rows of two label matrices.

    Both inputs are cast to float; entry ``(i, j)`` of the result is the dot
    product of row ``i`` of ``label_1`` with row ``j`` of ``label_2``.
    """
    left = label_1.float()
    right = label_2.float()
    return torch.mm(left, right.t())

# def cos(x, y):
#     return x.mm(y.t())

def calc_loss(view1_feature, view2_feature, view1_predict, view2_predict, labels_1, labels_2, alpha, beta):
    """Combine a label-regression, a pairwise-similarity and a modality-gap term.

    The result is ``term1 + alpha * term2 + beta * term3`` where

    * ``term1`` is the mean row-wise L2 distance between each view's
      predictions and its (float-cast) labels, summed over both views;
    * ``term2`` sums ``mean(log(1 + exp(theta)) - Sim * theta)`` over the
      image-image, image-text and text-text pairings, with ``theta`` equal to
      half the cosine similarity and ``Sim`` the label dot products;
    * ``term3`` is the mean row-wise L2 distance between the two views'
      features.
    """
    def _mean_row_l2(diff):
        return (diff ** 2).sum(1).sqrt().mean()

    def _half_cosine(x, y):
        x_norm = (x ** 2).sum(1, keepdim=True).sqrt()
        y_norm = (y ** 2).sum(1, keepdim=True).sqrt()
        # The clamp guards against a zero denominator for all-zero rows.
        return x.mm(y.t()) / x_norm.mm(y_norm.t()).clamp(min=1e-6) / 2.

    def _pairwise_term(feat_a, feat_b, lab_a, lab_b):
        theta = _half_cosine(feat_a, feat_b)
        sim = calc_label_sim(lab_a, lab_b).float()
        return ((1 + torch.exp(theta)).log() - sim * theta).mean()

    term1 = _mean_row_l2(view1_predict - labels_1.float()) + _mean_row_l2(view2_predict - labels_2.float())

    term21 = _pairwise_term(view1_feature, view1_feature, labels_1, labels_1)
    term22 = _pairwise_term(view1_feature, view2_feature, labels_1, labels_2)
    term23 = _pairwise_term(view2_feature, view2_feature, labels_2, labels_2)
    term2 = term21 + term22 + term23

    term3 = _mean_row_l2(view1_feature - view2_feature)

    return term1 + alpha * term2 + beta * term3

def normalize(x, power = 2):
    """Divide every row of tensor ``x`` by its L-``power`` norm along dim 1.

    No epsilon is applied, so a row whose norm is zero yields non-finite
    values.
    """
    row_norm = torch.pow(torch.sum(torch.pow(x, power), 1, keepdim=True), 1. / power)
    return torch.div(x, row_norm)

def _masked_contrastive_term(anchor, contrast, positive_mask, temperature, base_temperature, device):
    """Mean supervised-contrastive loss of ``anchor`` rows against ``contrast`` rows.

    ``positive_mask[i, j]`` is 1 where column ``j`` counts as a positive for
    row ``i``. The diagonal is removed from both the positives and the softmax
    denominator. A row left without any positive divides by zero and turns
    the result into NaN.
    """
    num_rows = positive_mask.shape[0]
    logits = torch.div(torch.matmul(anchor, contrast.T), temperature)
    # Subtract the detached row maximum for numerical stability.
    row_max, _ = torch.max(logits, dim=1, keepdim=True)
    logits = logits - row_max.detach()

    # Ones everywhere except on the diagonal.
    off_diagonal = torch.scatter(
        torch.ones_like(positive_mask),
        1,
        torch.arange(num_rows).view(-1, 1).to(device),
        0
    )
    positive_mask = positive_mask * off_diagonal

    exp_logits = torch.exp(logits) * off_diagonal
    log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

    # Mean log-likelihood over the positives of each row.
    mean_log_prob_pos = (positive_mask * log_prob).sum(1) / positive_mask.sum(1)
    loss = - (temperature / base_temperature) * mean_log_prob_pos
    return loss.view(1, num_rows).mean()


def calc_loss1(view1_feature, view2_feature, view1_predict, view2_predict, labels_1, labels_2, alpha, beta, device, temperature, base_temperature, batch_size):
    """Sum of six supervised-contrastive terms over features and predictions.

    Features are L2-normalised and predictions are soft-maxed along dim 1.
    ``labels_1`` / ``labels_2`` are reduced to class ids with ``argmax`` over
    dim 1 and also used (cast to float) as the contrast side of the two
    prediction-vs-label terms. Samples sharing a class id are positives.

    The six terms are image-image, text-text, image-text, text-image,
    image-prediction-vs-labels and text-prediction-vs-labels.

    ``alpha``, ``beta`` and the incoming ``batch_size`` are not used; the
    batch size is taken from the labels.

    Returns:
        Scalar tensor: the sum of the six terms.
    """
    view1_feature = normalize(view1_feature, 2)
    view2_feature = normalize(view2_feature, 2)
    view1_predict = F.softmax(view1_predict, dim = 1)
    view2_predict = F.softmax(view2_predict, dim = 1)
    labels_1_ori = labels_1.float()
    labels_2_ori = labels_2.float()

    class_1 = torch.argmax(labels_1, dim = 1).contiguous().view(-1, 1)
    class_2 = torch.argmax(labels_2, dim = 1).contiguous().view(-1, 1)

    def same_class(rows, cols):
        return torch.eq(rows, cols.T).float().to(device)

    def term(anchor, contrast, positive_mask):
        return _masked_contrastive_term(anchor, contrast, positive_mask,
                                        temperature, base_temperature, device)

    loss1 = term(view1_feature, view1_feature, same_class(class_1, class_1))
    loss2 = term(view2_feature, view2_feature, same_class(class_2, class_2))
    loss3 = term(view1_feature, view2_feature, same_class(class_1, class_2))
    loss4 = term(view2_feature, view1_feature, same_class(class_2, class_1))
    loss5 = term(view1_predict, labels_1_ori, same_class(class_1, class_1))
    loss6 = term(view2_predict, labels_2_ori, same_class(class_2, class_2))

    # Same summation order as before, so floating-point results are unchanged.
    return loss3 + loss4 + loss1 + loss2 + loss5 + loss6

def calc_loss2(view1_feature, view2_feature, view1_predict, view2_predict, labels_1, labels_2, alpha, beta, device, temperature, base_temperature, batch_size):
    """Symmetric paired contrastive loss between image and text features.

    Features are L2-normalised; for every row the positive is the entry on
    the diagonal of the similarity matrix (row ``i`` of one view against row
    ``i`` of the other) and all entries of that row form the softmax
    denominator. The image-to-text and text-to-image losses are added and
    scaled by 100.

    Only the two feature tensors, ``device``, ``temperature`` and
    ``base_temperature`` are read; the remaining arguments are ignored.
    """
    img_unit = normalize(view1_feature, 2)
    txt_unit = normalize(view2_feature, 2)

    def _paired_term(anchor, candidates):
        scores = torch.div(torch.matmul(anchor, candidates.T), temperature)
        # Shift by the detached row maximum for numerical stability.
        row_max, _ = torch.max(scores, dim=1, keepdim=True)
        shifted = scores - row_max.detach()
        log_prob = shifted - torch.log(torch.exp(shifted).sum(1, keepdim=True))
        # The identity mask selects the diagonal (paired) entry of each row.
        pair_mask = torch.eye(shifted.shape[0], m=shifted.shape[1], out=None).to(device)
        mean_log_prob_pos = (pair_mask * log_prob).sum(1) / pair_mask.sum(1)
        per_row = - (temperature / base_temperature) * mean_log_prob_pos
        return per_row.view(1, shifted.shape[0]).mean()

    img2txt_loss = _paired_term(img_unit, txt_unit)
    txt2img_loss = _paired_term(txt_unit, img_unit)
    return 100*(img2txt_loss + txt2img_loss)

def _evaluate_retrieval(model, loader, device):
    """Embed every batch of ``loader`` and score retrieval in both directions.

    The model is switched to eval mode for the pass and restored afterwards.
    Returns ``(img2text, txt2img)`` from ``fx_calc_map_label`` using cosine
    distance on the L2-normalised features.
    """
    was_training = model.training
    model.eval()
    img_feats, txt_feats, all_labels = [], [], []
    with torch.no_grad():
        for imgs, txts, labels in loader:
            # Follow the caller's device instead of assuming the default GPU.
            img_feat, txt_feat, _, _ = model(imgs.to(device), txts.to(device))
            img_feats.append(normalize(img_feat, 2).cpu().numpy())
            txt_feats.append(normalize(txt_feat, 2).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    model.train(was_training)

    img_feats = np.concatenate(img_feats)
    txt_feats = np.concatenate(txt_feats)
    img2text = fx_calc_map_label(img_feats, txt_feats, all_labels, dist_method='COS')
    txt2img = fx_calc_map_label(txt_feats, img_feats, all_labels, dist_method='COS')
    return img2text, txt2img


def train_model(model, data_loaders, optimizer, alpha, beta, device="cpu", num_epochs=500):
    """Train ``model`` with ``calc_loss2`` and keep the best-scoring weights.

    Every epoch runs a ``'train'`` and a ``'test'`` phase over
    ``data_loaders``. After each phase the test loader is scored with
    ``fx_calc_map_label`` in both retrieval directions. Whenever the average
    of the two scores improves during the test phase, the whole model is
    pickled to ``./model_cl`` and its state dict is remembered.

    Args:
        model: network returning ``(view1_feature, view2_feature,
            view1_predict, view2_predict)`` for ``(imgs, txts)``.
        data_loaders: dict with ``'train'`` and ``'test'`` loaders yielding
            ``(imgs, txts, labels)`` batches.
        optimizer: optimizer over the model parameters.
        alpha, beta: forwarded to ``calc_loss2``.
        device: device the batches are moved to; the model must already be
            on it.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, test_img_acc_history, test_txt_acc_history,
        epoch_loss_history)`` with the best weights loaded into ``model`` and
        one history entry per epoch, taken from the test phase.
    """
    since = time.time()
    batch_size = 800
    temperature = 0.7#0.37 best now
    base_temperature = 0.07 #0.07 best now
    test_img_acc_history = []
    test_txt_acc_history = []
    epoch_loss_history =[]

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 20)

        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            for imgs, txts, labels in data_loaders[phase]:
                labels = labels.squeeze()
                # Flag any NaN at all (the old "> 1" test missed a single one).
                if torch.isnan(imgs).any() or torch.isnan(txts).any():
                    print("Data contains Nan.")

                imgs = imgs.to(device)
                txts = txts.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    view1_feature, view2_feature, view1_predict, view2_predict = model(imgs, txts)
                    loss = calc_loss2(view1_feature, view2_feature, view1_predict, view2_predict, labels, labels, alpha, beta, device, temperature, base_temperature, batch_size)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item()

            # NOTE(review): this divides the sum of per-batch losses by the
            # number of samples, so the value scales with the batch size;
            # kept as-is so logged curves stay comparable with earlier runs.
            epoch_loss = running_loss / len(data_loaders[phase].dataset)

            # Retrieval is always scored on the test loader, after both phases.
            img2text, txt2img = _evaluate_retrieval(model, data_loaders['test'], device)

            print('{} Loss: {:.4f} Img2Txt: {:.4f}  Txt2Img: {:.4f}  Average: {:.4f}  Best_acc: {:.4f}'.format(phase, epoch_loss, img2text, txt2img, (img2text + txt2img) / 2, best_acc))

            # deep copy the model
            if phase == 'test' and (img2text + txt2img) / 2. > best_acc:
                best_acc = (img2text + txt2img) / 2.
                torch.save(model, './model_cl')
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'test':
                test_img_acc_history.append(img2text)
                test_txt_acc_history.append(txt2img)
                epoch_loss_history.append(epoch_loss)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best average ACC: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, test_img_acc_history, test_txt_acc_history, epoch_loss_history



PyTorch Version:  1.6.0
Torchvision Version:  0.7.0


In [5]:
import numpy as np
import scipy
import scipy.spatial as T

def fx_calc_map_label(image, text, label, k = 0, dist_method='COS'):
  """Recall@10 of retrieving ``text`` rows with ``image`` rows as queries.

  Despite its name this does not compute mean average precision: for every
  query it finds the rank of the first retrieved item carrying the query's
  label and returns the fraction of those ranks that are below 10.

  Args:
    image: 2-D array of query embeddings.
    text: 2-D array of candidate embeddings.
    label: sequence indexed by row number; used for queries and candidates.
    k: unused, kept for interface compatibility.
    dist_method: ``'COS'`` (cosine distance) or ``'L2'`` (Euclidean).

  Returns:
    The Recall@10 value as a float; ``0.0`` when no query found a match.

  Raises:
    ValueError: if ``dist_method`` is neither ``'L2'`` nor ``'COS'``.
  """
  if dist_method == 'L2':
    metric = 'euclidean'
  elif dist_method == 'COS':
    metric = 'cosine'
  else:
    raise ValueError("dist_method must be 'L2' or 'COS', got {!r}".format(dist_method))

  dist = scipy.spatial.distance.cdist(image, text, metric)
  ranking = dist.argsort()
  numcases = dist.shape[0]

  first_hit_ranks = []
  for i in range(numcases):
    order = ranking[i]
    # Only the first `numcases` candidates are inspected; a query without a
    # match among them contributes nothing to the average.
    for j in range(numcases):
      if label[i] == label[order[j]]:
        first_hit_ranks.append(j)
        break

  if not first_hit_ranks:
    return 0.0
  return sum(rank < 10 for rank in first_hit_ranks) / len(first_hit_ranks)


def fx_calc_map_label1(image, text, label, k = 0, dist_method='COS'):
  """Mean average precision of retrieving ``text`` rows with ``image`` queries.

  Args:
    image: 2-D array of query embeddings.
    text: 2-D array of candidate embeddings.
    label: sequence indexed by row number; used for queries and candidates.
    k: number of top-ranked candidates scored per query; ``0`` means the
      number of queries.
    dist_method: ``'COS'`` (cosine distance) or ``'L2'`` (Euclidean).

  Returns:
    Mean over queries of the average precision; a query with no relevant
    candidate among its top ``k`` contributes 0.

  Raises:
    ValueError: if ``dist_method`` is neither ``'L2'`` nor ``'COS'``.
  """
  if dist_method == 'L2':
    metric = 'euclidean'
  elif dist_method == 'COS':
    metric = 'cosine'
  else:
    raise ValueError("dist_method must be 'L2' or 'COS', got {!r}".format(dist_method))

  dist = T.distance.cdist(image, text, metric)
  ranking = dist.argsort()
  numcases = dist.shape[0]
  if k == 0:
    k = numcases

  average_precisions = []
  for i in range(numcases):
    order = ranking[i]
    precision_sum = 0.0
    hits = 0.0
    for j in range(k):
      if label[i] == label[order[j]]:
        hits += 1
        # Precision at this rank, accumulated at every relevant hit.
        precision_sum += hits / (j + 1)
    average_precisions.append(precision_sum / hits if hits > 0 else 0)
  return np.mean(average_precisions)


In [None]:
def normalized(a, order=2, axis=-1):
    """Scale NumPy array ``a`` to unit ``order``-norm along ``axis``.

    Slices whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaN.
    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)


if __name__ == '__main__':
    # Environment: adjust these settings to your machine and data layout.
    #dataset = 'pascal'
    dataset = 'flicker30k'
    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # data parameters
    DATA_DIR = 'data/' + dataset + '/'
    alpha = 1e-3
    beta = 1e-1
    MAX_EPOCH = 500
    batch_size = 13000 #4096
    lr = 5e-5
    betas = (0.5, 0.999)
    weight_decay = 0

    print('...Data loading is beginning...')

    data_loader, input_data_par = get_loader(DATA_DIR, batch_size)

    print('...Data loading is completed...')

    model_ft = IDCM_NN(img_input_dim=input_data_par['img_dim'], text_input_dim=input_data_par['text_dim'], output_dim=input_data_par['num_class']).to(device)
    params_to_update = list(model_ft.parameters())

    # All parameters are optimized; weight_decay is now actually passed on
    # (0 is also Adam's default, so the behavior is unchanged).
    optimizer = optim.Adam(params_to_update, lr=lr, betas=betas, weight_decay=weight_decay)

    print('...Training is beginning...')
    # Train and evaluate
    model_ft, img_acc_hist, txt_acc_hist, loss_hist = train_model(model_ft, data_loader, optimizer, alpha, beta, device, MAX_EPOCH)
    print('...Training is completed...')

    print('...Evaluation on testing data...')
    model_ft.eval()
    with torch.no_grad():
        # get_loader returns float64 arrays; the model weights are float32.
        test_imgs = torch.tensor(input_data_par['img_test'], dtype=torch.float32).to(device)
        test_txts = torch.tensor(input_data_par['text_test'], dtype=torch.float32).to(device)
        view1_feature, view2_feature, view1_predict, view2_predict = model_ft(test_imgs, test_txts)

    # Labels may be stored as class ids with a singleton axis or as one-hot
    # rows; reduce either layout to one class id per sample.
    label = np.squeeze(input_data_par['label_test'])
    if label.ndim > 1:
        label = label.argmax(axis=1)

    view1_feature = view1_feature.cpu().numpy()
    view2_feature = view2_feature.cpu().numpy()
    view1_predict = view1_predict.cpu().numpy()
    view2_predict = view2_predict.cpu().numpy()
    # Both views are NumPy arrays here, so both need the NumPy helper.
    t_view1_feature, t_view2_feature = normalized(view1_feature, 2), normalized(view2_feature, 2)
    img_to_txt = fx_calc_map_label(t_view1_feature, t_view2_feature, label)
    print('...Image to Text MAP = {}'.format(img_to_txt))

    # Text queries against image candidates: arguments swapped.
    txt_to_img = fx_calc_map_label(t_view2_feature, t_view1_feature, label)
    print('...Text to Image MAP = {}'.format(txt_to_img))

    print('...Average MAP = {}'.format(((img_to_txt + txt_to_img) / 2.)))


...Data loading is beginning...
[[ 2772 12264 22100 ...   860 15795 23654]]
...Data loading is completed...
...Training is beginning...
Epoch 1/500
--------------------
train Loss: 1.8371 Img2Txt: 0.0406  Txt2Img: 0.0406  Average: 0.0406  Best_acc: 0.0000
test Loss: 9.4176 Img2Txt: 0.0406  Txt2Img: 0.0406  Average: 0.0406  Best_acc: 0.0000

Epoch 2/500
--------------------
train Loss: 1.8217 Img2Txt: 0.0567  Txt2Img: 0.0542  Average: 0.0554  Best_acc: 0.0406
test Loss: 9.2474 Img2Txt: 0.0567  Txt2Img: 0.0542  Average: 0.0554  Best_acc: 0.0406

Epoch 3/500
--------------------
train Loss: 1.7962 Img2Txt: 0.0387  Txt2Img: 0.0529  Average: 0.0458  Best_acc: 0.0554
test Loss: 9.1505 Img2Txt: 0.0387  Txt2Img: 0.0529  Average: 0.0458  Best_acc: 0.0554

Epoch 4/500
--------------------
train Loss: 1.7786 Img2Txt: 0.0645  Txt2Img: 0.0677  Average: 0.0661  Best_acc: 0.0554
test Loss: 9.0366 Img2Txt: 0.0645  Txt2Img: 0.0677  Average: 0.0661  Best_acc: 0.0554

Epoch 5/500
--------------------
tra

train Loss: 1.6942 Img2Txt: 0.0735  Txt2Img: 0.0825  Average: 0.0780  Best_acc: 0.1057
test Loss: 8.7784 Img2Txt: 0.0735  Txt2Img: 0.0825  Average: 0.0780  Best_acc: 0.1057

Epoch 41/500
--------------------
train Loss: 1.6985 Img2Txt: 0.0858  Txt2Img: 0.1044  Average: 0.0951  Best_acc: 0.1057
test Loss: 8.7482 Img2Txt: 0.0858  Txt2Img: 0.1044  Average: 0.0951  Best_acc: 0.1057

Epoch 42/500
--------------------
train Loss: 1.6923 Img2Txt: 0.0948  Txt2Img: 0.1083  Average: 0.1015  Best_acc: 0.1057
test Loss: 8.7375 Img2Txt: 0.0948  Txt2Img: 0.1083  Average: 0.1015  Best_acc: 0.1057

Epoch 43/500
--------------------
train Loss: 1.6932 Img2Txt: 0.0858  Txt2Img: 0.0941  Average: 0.0899  Best_acc: 0.1057
test Loss: 8.7378 Img2Txt: 0.0858  Txt2Img: 0.0941  Average: 0.0899  Best_acc: 0.1057

Epoch 44/500
--------------------
train Loss: 1.6892 Img2Txt: 0.0974  Txt2Img: 0.1122  Average: 0.1048  Best_acc: 0.1057
test Loss: 8.7173 Img2Txt: 0.0974  Txt2Img: 0.1122  Average: 0.1048  Best_acc: 0.