In [1]:
import pandas as pd
import torch

import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.nn.functional as f
from skimage.transform import rescale, resize, downscale_local_mean
from torchvision import datasets, transforms
from PIL import Image
from tqdm import tqdm
import h5py
import pickle
from transformers import *
from sklearn.externals import joblib
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tensorboardX import SummaryWriter
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class CAD120Dataset(Dataset):
    """Index-aligned feature/caption pairs loaded from two joblib dumps.

    Args:
        feature_file: path handed to ``joblib.load``; the loaded object is
            indexed by sample position in ``__getitem__``.
        caption_file: path handed to ``joblib.load``; every loaded caption has
            its first two axes swapped with ``permute(1, 0, 2)``.
        raw_caption_file: accepted so existing three-argument calls keep
            working; it is not read.
        transform: optional callable applied to the sample dict before it is
            returned. ``None`` (the default) returns the dict unchanged.
    """

    def __init__(self, feature_file, caption_file, raw_caption_file=None, transform=None):
        # NOTE(review): joblib.load unpickles its input, so only point these
        # paths at files you trust.
        self.caption = [cap.permute(1, 0, 2) for cap in joblib.load(caption_file)]
        self.features = joblib.load(feature_file)
        # Previously this argument was accepted and then silently dropped.
        self.transform = transform

    def __len__(self):
        # The caption list defines the dataset size; features are assumed to
        # be aligned with it index-for-index.
        return len(self.caption)

    def __getitem__(self, idx):
        """Return ``{"features": ..., "captions": ...}`` for position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {
            "features": self.features[idx],
            "captions": self.caption[idx],
        }

        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [3]:
# Subject-1 split, used below for a quick look at a single sample.
# The third path is bound to raw_caption_file, which the dataset does not read.
cad120 = CAD120Dataset(
    feature_file='features_subject1.bin',
    caption_file='caption_subject1.bin',
    raw_caption_file='caption_words_subject1.bin',
)

In [4]:
# Fetch the sample at position 10; __getitem__ returns a dict with the keys
# "features" and "captions".
sample = cad120[10]

In [5]:
# Inspect the feature tensor of that sample. The recorded output below is
# torch.Size([400, 1, 8192]); 8192 equals the 512*4*4 image input size given to Net.
sample['features'].shape

torch.Size([400, 1, 8192])

In [6]:
device = 'cuda:0'
class Net(nn.Module):
    """Two independent bidirectional LSTM encoders, one for image features and
    one for caption embeddings.

    Args:
        input_size_image: feature size of one image step.
        input_size_word_embedding: feature size of one caption step.
        hidden_size: LSTM hidden size (per direction).
        output_size: accepted for call compatibility; not used by the module.
        num_layers: number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size_image, input_size_word_embedding,
                 hidden_size, output_size, num_layers=1):
        super(Net, self).__init__()
        self.input_size_image = input_size_image
        self.input_size_word_embedding = input_size_word_embedding
        self.hidden_size = hidden_size

        self.num_layers = num_layers
        self.lstm_image = nn.LSTM(input_size_image, hidden_size, num_layers, bidirectional=True)
        self.lstm_caption = nn.LSTM(input_size_word_embedding, hidden_size, num_layers, bidirectional=True)

    def _encode(self, lstm, sequences):
        """Run ``lstm`` one step at a time over each sequence and collect the
        hidden state after every step.

        Returns a tensor of shape
        ``(len(sequences), steps, num_layers * 2 * hidden_size)``. All
        sequences must have the same number of steps because the per-sequence
        results are combined with ``torch.stack``.
        """
        state_shape = (self.num_layers * 2, 1, self.hidden_size)
        encoded = []
        for seq in sequences:
            # Allocate the initial state next to the data instead of on a
            # hard-coded global device, so the module runs wherever its inputs
            # (and weights) live.
            h = torch.zeros(state_shape, device=seq.device)
            c = torch.zeros(state_shape, device=seq.device)

            per_step = []
            for step in range(seq.shape[0]):
                # Each call sees a length-1 sequence with batch size 1; the
                # state is carried over to the next step.
                # NOTE(review): because every call is length-1, the reverse
                # direction also advances forward in time through the carried
                # state — confirm this is the intended use of bidirectional=True.
                _, (h, c) = lstm(seq[step].view(1, 1, -1), (h, c))
                per_step.append(h)
            encoded.append(torch.stack(per_step))

        stacked = torch.stack(encoded)
        # Flatten (directions*layers, 1, hidden) into one vector per step.
        return stacked.view(stacked.shape[0], stacked.shape[1], -1)

    def forward(self, image_features, caption_features):
        """Encode both modalities.

        Args:
            image_features: sequence of tensors; element ``i`` is iterated
                along its first axis and each slice is flattened into one
                LSTM input step.
            caption_features: same layout for the caption encoder.

        Returns:
            ``(image_states, caption_states)``, each of shape
            ``(batch, steps, num_layers * 2 * hidden_size)``.
        """
        image_states = self._encode(self.lstm_image, image_features)
        caption_states = self._encode(self.lstm_caption, caption_features)
        return image_states, caption_states



In [7]:
# Image steps are 512*4*4 = 8192 wide, caption steps 768 wide; hidden size 256.
model = Net(
    input_size_image=512 * 4 * 4,
    input_size_word_embedding=768,
    hidden_size=256,
    output_size=10,
    num_layers=1,
).to(device)
# TensorBoard writer; every call that would log to it is commented out in the cells below.
writer = SummaryWriter()

In [8]:
# optimizer = optim.SGD(model.parameters(), 0.01,momentum=0.9,weight_decay=1e-4)
# Adam over all model parameters; no hyperparameters are passed, so the
# library defaults apply. The SGD line above is a disabled alternative.
optimizer = optim.Adam(model.parameters())

def my_collate(batch):
    """Group a batch of sample dicts by field without stacking anything.

    Returns a two-element list ``[features, captions]``; each element is a
    plain Python list holding that field from every sample, in batch order.
    """
    return [[sample[field] for sample in batch] for field in ("features", "captions")]

# Training data: the "subjectall" dumps, shuffled, 16 samples per batch.
# NOTE(review): if "subjectall" also contains subject 5, the test split below
# overlaps the training data — confirm how these files were built.
train_loader = DataLoader(
    CAD120Dataset(
        'features_subjectall.bin',
        'caption_subjectall.bin',
        'caption_words_subjectall.bin',
    ),
    batch_size=16,
    shuffle=True,
    collate_fn=my_collate,
)

# Evaluation data: subject 5, delivered as one batch containing the whole set.
test_set = CAD120Dataset('features_subject5.bin', 'caption_subject5.bin')
test_loader = DataLoader(
    test_set,
    batch_size=int(len(test_set)),
    shuffle=False,
    collate_fn=my_collate,
)

In [9]:
#test_set = CAD120Dataset('features_subject3.bin','caption_subject3.bin')

In [10]:
def _train_direction_terms(queries, keys):
    """Hinge-loss sum and top-1 hit count for one retrieval direction.

    Args:
        queries: tensor indexed ``(item, step, dim)``; the caller has already
            divided each step vector by its L2 norm.
        keys: tensor with the same layout for the other modality.

    Returns:
        ``(loss_sum, hits)`` — the summed margin-1 hinge loss over all
        off-diagonal pairs, and the number of rows whose diagonal score is the
        row maximum.
    """
    # similarity[i, j, p, n]: dot product of step j of query i with step p of key n.
    similarity = torch.einsum('ijk, npk -> ijpn', queries, keys)
    # Best-matching key step for every query step, summed over query steps -> (items, items).
    scores = torch.sum(torch.max(similarity, 2).values, 1)
    positives = torch.diag(scores)

    # Row i is compared against its own positive score; the identity term
    # cancels the +1 margin on the diagonal so a pair never penalises itself.
    n_items = scores.shape[0]
    target = positives.unsqueeze(1) + torch.eye(n_items, device=scores.device)
    loss_sum = torch.sum(torch.clamp((scores + 1) - target, min=0))

    hits = torch.sum(positives >= torch.max(scores, 1).values)
    return loss_sum, hits


def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch with a bidirectional margin ranking loss.

    Args:
        model: callable returning ``(h_image, h_caption)`` for two lists of
            per-sample tensors.
        device: device the input tensors are moved to. All helper tensors are
            created on the device of the model outputs, so nothing here is
            tied to CUDA.
        train_loader: iterable of ``[features, captions]`` batches; must also
            support ``len()`` and expose ``.dataset`` for the progress line.
        optimizer: optimizer stepping the model parameters.
        epoch: epoch number, used only in the progress line.
    """
    model.train()
    for batch_idx, b in enumerate(train_loader):
        start_time = time.time()
        imgs = b[0]
        captions = b[1]
        batch_size = float(len(imgs))

        # as_tensor + detach avoids the copy (and the warning) that
        # torch.tensor(existing_tensor) produces, while still cutting any
        # autograd history attached to the stored features.
        img_feature = [torch.as_tensor(img).detach().to(device) for img in imgs]
        caption_feature = [torch.as_tensor(caption).detach().to(device) for caption in captions]

        optimizer.zero_grad()

        h_image, h_caption = model(img_feature, caption_feature)

        # Unit-normalise every step vector so the dot products are cosines.
        h_image = h_image / torch.norm(h_image, dim=2, keepdim=True)
        h_caption = h_caption / torch.norm(h_caption, dim=2, keepdim=True)

        loss_i2c, hits_i2c = _train_direction_terms(h_image, h_caption)
        loss_c2i, hits_c2i = _train_direction_terms(h_caption, h_image)

        accuracy1 = hits_i2c * 1.0 / batch_size
        accuracy2 = hits_c2i * 1.0 / batch_size
        accuracy = (hits_i2c + hits_c2i) * 1.0 / (2.0 * batch_size)
        loss = (loss_i2c + loss_c2i) / (2.0 * batch_size)

        loss.backward()
        optimizer.step()

        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.4f} \t Time: {:.4f} seconds Acc1: {:.4f} Acc2: {:.4f} Acc: {:.4f}'.format(
                epoch, (batch_idx) * len(imgs), len(train_loader.dataset),
                100. * (batch_idx) / len(train_loader), loss.item(), time.time()-start_time,
                accuracy1.item(), accuracy2.item(), accuracy.item()))
        


In [11]:
def test(model, device, test_loader,epoch):
    model.eval()
    for batch_idx, b in enumerate(test_loader):
        start_time = time.time()
        imgs = b[0]
        captions = b[1]
       
        img_feature, caption_feature  =  [torch.tensor(img).to(device) for img in imgs], \
                        [torch.tensor(caption).to(device) for caption in captions]
        
        optimizer.zero_grad()
        
        h_image, h_caption = model(img_feature, caption_feature)

        loss = torch.tensor(0.0).cuda()
        accuracy = 0
    
        norm_image = torch.norm(h_image, dim=2).unsqueeze(2)
        norm_caption = torch.norm(h_caption, dim=2).unsqueeze(2)
        
        h_image = h_image/norm_image
        h_caption = h_caption/norm_caption

        
        matrix = torch.einsum('ijk, npk -> ijpn', h_image, h_caption)
        matrix = torch.max(matrix, 2).values
        matrix = torch.sum(matrix,1)
        diag = torch.diag(matrix)
        tmp_diag = (diag*torch.ones(matrix.shape[0],matrix.shape[0]).cuda()).T 
        
        #tmp_diag = tmp_diag + torch.eye(matrix.shape[0]).cuda()
        plus_one_matrix = matrix+1


        loss += torch.sum(torch.clamp(plus_one_matrix - tmp_diag, 0))
        
        
        m = torch.max(matrix,1)
#         for i in range(len(m.values)):
#             index = m.indices[i]
#             if index == i:
#                 continue
#             loss += 3*max(m.values[i] + 1 - diag[i], 0.0)
        
        accuracy1 = torch.sum( diag  > m.values)*1.0 /  float(len(imgs))
        accuracy += torch.sum( diag  > m.values)*1.0
        
        
        matrix = torch.einsum('ijk, npk -> ijpn', h_caption, h_image)
        matrix = torch.max(matrix, 2).values
        matrix = torch.sum(matrix,1)
        diag = torch.diag(matrix)
        tmp_diag = (diag*torch.ones(matrix.shape[0],matrix.shape[0]).cuda()).T 
        #tmp_diag = tmp_diag + torch.eye(matrix.shape[0]).cuda()
        plus_one_matrix = matrix+1
  
        loss += torch.sum(torch.clamp(plus_one_matrix - tmp_diag, 0))
     
        m = torch.max(matrix,1)
#         for i in range(len(m.values)):
#             index = m.indices[i]
#             if index == i:
#                 continue
#             loss += max(m.values[i]+1-diag[i],0.0)

        accuracy2 = torch.sum( diag  > m.values)*1.0 / float(len(imgs))
        accuracy += torch.sum( diag  > m.values)*1.0

        accuracy /= 2.0* float(len(imgs))
        loss /= 2.0* float(len(imgs))
        print("different accuracy :", accuracy1, accuracy2)

        return accuracy, loss.item()

In [12]:
# Main loop: one training pass and one evaluation pass per epoch, then
# overwrite the single checkpoint file with the current weights.
#
# Currently disabled (kept here as a reminder of what was planned):
#   - writer.add_scalars(...) logging of data/loss, data/train_accuracy,
#     data/test_accuracy and data/test_loss per epoch
#   - printing test accuracy / test loss
#   - switching to optim.SGD(model.parameters(), 0.01, momentum=0.9,
#     weight_decay=1e-4) at epoch 500
#   - writer.export_scalars_to_json("./all_scalars.json") and writer.close()
#     after the loop
for epoch in range(1000):
    train(
        model=model,
        device='cuda:0',
        train_loader=train_loader,
        optimizer=optimizer,
        epoch=epoch,
    )
    accuracy, loss = test(
        model=model,
        device='cuda:0',
        test_loader=test_loader,
        epoch=epoch,
    )
    torch.save(model.state_dict(), "./checkPoint.pth")

  
  if __name__ == '__main__':




KeyboardInterrupt: 