In [1]:
import pandas as pd
import torch
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
from skimage.transform import rescale, resize, downscale_local_mean
from torchvision import datasets, transforms
from PIL import Image
from tqdm import tqdm
import h5py
import pickle
from transformers import *
from sklearn.externals import joblib
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tensorboardX import SummaryWriter

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class CAD120Dataset(Dataset):
    """Paired per-sample features and caption tensors loaded from joblib dumps.

    Item ``i`` pairs ``features[i]`` with ``caption[i]``. The dataset length is
    the number of captions.
    """

    def __init__(self, feature_file,  caption_file , transform=None):
        """Load both dumps into memory.

        Args:
            feature_file: path handed to ``joblib.load``; the loaded object is
                indexed by sample position.
            caption_file: path handed to ``joblib.load``; must yield an
                iterable of 3-D tensors. Each one has its first two axes
                swapped with ``permute(1, 0, 2)``.
            transform: optional callable applied to every sample dict returned
                by ``__getitem__``. ``None`` (default) returns the sample
                unchanged.
        """
        # NOTE(review): joblib.load unpickles its input, which can execute
        # arbitrary code. Only point this at files from a trusted source.
        self.caption = [cap.permute(1, 0, 2) for cap in joblib.load(caption_file)]
        self.features = joblib.load(feature_file)
        # Previously this argument was accepted and silently dropped.
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from the caption list."""
        return len(self.caption)

    def __getitem__(self, idx):
        """Return ``{"features": ..., "captions": ...}`` for position ``idx``.

        ``idx`` may be a plain index or a tensor; tensors are converted with
        ``tolist()`` before indexing.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {"features": self.features[idx], "captions": self.caption[idx]}
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [3]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs on a machine without one.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
class Net(nn.Module):
    """Two independent bidirectional LSTM encoders.

    ``lstm_image`` encodes image-feature sequences and ``lstm_caption`` encodes
    caption-embedding sequences. Each sequence is reduced to the LSTM output at
    its final step, a vector of width ``2 * hidden_size``.
    """

    def __init__(self, input_size_image, input_size_word_embedding, 
                hidden_size, output_size,num_layers=1):
        """Build both encoders.

        Args:
            input_size_image: flattened size of one image-feature step.
            input_size_word_embedding: flattened size of one caption step.
            hidden_size: LSTM hidden size per direction.
            output_size: accepted for interface compatibility; not used.
            num_layers: number of stacked LSTM layers in each encoder.
        """
        super(Net, self).__init__()
        self.input_size_image = input_size_image
        self.input_size_word_embedding = input_size_word_embedding
        self.hidden_size = hidden_size

        self.num_layers = num_layers
        self.lstm_image = nn.LSTM(input_size_image, hidden_size, num_layers,  bidirectional=True)
        self.lstm_caption = nn.LSTM(input_size_word_embedding, hidden_size, num_layers,  bidirectional=True)

    def forward(self, image_features, caption_features):
        """Encode a batch of image sequences and a batch of caption sequences.

        Args:
            image_features: sequence of tensors; each tensor's first axis is
                the step axis and every step is flattened before being fed to
                ``lstm_image``.
            caption_features: same layout, fed to ``lstm_caption``.

        Returns:
            ``(image_codes, caption_codes)``, each of shape
            ``(number_of_sequences, 2 * hidden_size)``.

        Raises:
            ValueError: if any sequence has zero steps.
        """
        image_codes = self._encode(self.lstm_image, image_features)
        caption_codes = self._encode(self.lstm_caption, caption_features)
        return image_codes, caption_codes

    def _encode(self, lstm, sequences):
        """Run ``lstm`` over each sequence and stack the final-step outputs."""
        # Use the LSTM's own device and dtype instead of a module-level global,
        # so the module works wherever it has been moved.
        reference = next(lstm.parameters())
        state_shape = (self.num_layers * 2, 1, self.hidden_size)

        codes = []
        for seq in sequences:
            if seq.shape[0] == 0:
                # Without this guard `out` would be unbound, or would silently
                # reuse the previous sequence's output.
                raise ValueError("cannot encode an empty sequence")
            seq = seq.to(reference.device)

            # Fresh zero state for every sequence.
            h = torch.zeros(state_shape, device=reference.device, dtype=reference.dtype)
            c = torch.zeros(state_shape, device=reference.device, dtype=reference.dtype)

            # NOTE(review): the LSTM is fed one step at a time with the state
            # carried over, so the "reverse" direction also sees the steps in
            # forward order. Kept as-is to preserve the trained behaviour;
            # confirm this is intended before feeding whole sequences.
            for step in seq:
                out, (h, c) = lstm(step.reshape(1, 1, -1), (h, c))
            codes.append(out)

        # Each `out` is (1, 1, 2 * hidden_size); flatten to one row per sequence.
        return torch.stack(codes).view(len(codes), -1)



In [4]:
# Encoder pair: 512*4*4 values per image step, 768 values per caption step,
# 256 hidden units per LSTM direction, a single LSTM layer.
model = Net(
    input_size_image=512 * 4 * 4,
    input_size_word_embedding=768,
    hidden_size=256,
    output_size=10,
    num_layers=1,
)
model = model.to(device)

# TensorBoard event writer (default log directory) used by the training loop.
writer = SummaryWriter()

In [5]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over every batch of ``train_loader``.

    For each batch the model produces one embedding per image sequence and one
    per caption sequence. Similarity is the dot product between embeddings.
    The loss sums, in both retrieval directions, ``max(0, best - positive + 1)``
    where ``best`` is the highest similarity in the row/column and ``positive``
    is the similarity of the matching pair, then divides by the batch size.

    Args:
        model: callable returning ``(h_image, h_caption)``, two 2-D tensors
            with one row per sample.
        device: device the batch tensors are moved to.
        train_loader: iterable of ``[image_list, caption_list]`` batches; must
            also support ``len()`` and expose ``.dataset`` for the progress line.
        optimizer: optimiser stepped once per batch.
        epoch: epoch number, used only in the progress message.

    Returns:
        ``(mean_loss, mean_accuracy)`` weighted by batch size over all batches,
        or ``(0.0, 0.0)`` when the loader yields no batches. Accuracy is the
        fraction of in-batch retrievals (image->caption and caption->image)
        whose top match is the paired item.
    """
    model.train()
    loss_sum = 0.0
    hits = 0
    seen = 0

    for batch_idx, b in enumerate(train_loader):
        imgs = b[0]
        captions = b[1]

        # Move both modalities to the target device (previously only the
        # captions were moved).
        img_feature = [img.to(device) for img in imgs]
        caption_feature = [caption.to(device) for caption in captions]

        optimizer.zero_grad()
        # Single forward pass; the extra discarded call was pure overhead.
        h_image, h_caption = model(img_feature, caption_feature)
        l = h_image.shape[0]

        # sim[i, j] = <h_image[i], h_caption[j]>
        sim = h_image @ h_caption.t()
        positive = sim.diag()
        best_for_image, caption_pick = sim.max(dim=1)    # image -> caption
        best_for_caption, image_pick = sim.max(dim=0)    # caption -> image

        target = torch.arange(l, device=sim.device)
        batch_hits = (caption_pick == target).sum().item() \
            + (image_pick == target).sum().item()

        # NOTE(review): `best_*` includes the positive pair itself, so each
        # term is always >= 1 and the margin never becomes inactive. Masking
        # the diagonal would give the usual hardest-negative hinge loss; left
        # unchanged here so the training objective is not silently altered.
        loss = (torch.clamp(best_for_image - positive + 1, min=0).sum()
                + torch.clamp(best_for_caption - positive + 1, min=0).sum()) / float(l)

        loss.backward()
        optimizer.step()

        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(imgs), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

        loss_sum += loss.item() * l
        hits += batch_hits
        seen += l

    # The return used to sit inside the loop, ending the epoch after one batch.
    if seen == 0:
        return 0.0, 0.0
    return loss_sum / seen, hits / float(2 * seen)

In [6]:
def test(model, device, test_loader,epoch):
    model.eval()
    for batch_idx, b in enumerate(test_loader):
        imgs = b[0]
        captions = b[1]
        
        img_feature, caption_feature  = imgs, [caption.to(device) for caption in captions]
        
        model(img_feature, caption_feature)
        h_image, h_caption = model(img_feature, caption_feature)
        l = h_image.shape[0]
        
        
        S_image = -1*torch.ones((l,1)).cuda()
        S_caption = -1*torch.ones((l,1)).cuda()
        accuracy = 0
        
        for i in range(l):
            index_image = 0
            for j in range(l):
                if S_image[i] <  h_image[i].dot(h_caption[j]):
                    S_image[i] =  h_image[i].dot(h_caption[j])
                    index_image = j
                    
            if i == index_image:
                accuracy += 1.0
            
        
        
        for i in range(l):
            index_caption = 0
            for j in range(l):    
                if S_caption[i] <  h_caption[i].dot(h_image[j]):
                    S_caption[i] =  h_caption[i].dot(h_image[j])
                    index_caption = j
                    
            if i == index_caption:
                accuracy += 1.0
           
        
        accuracy /= float(2*l)

        return accuracy

In [7]:
# Start with Adam at its library-default hyper-parameters. The alternative is
# SGD with lr 0.01, momentum 0.9 and weight decay 1e-4, which the training
# loop swaps in at epoch 50.
optimizer = optim.Adam(params=model.parameters())

def my_collate(batch):
    """Collate sample dicts into ``[features_list, captions_list]``.

    The per-sample values are kept in plain Python lists (no stacking), in the
    same order as ``batch``.
    """
    feature_list = []
    for sample in batch:
        feature_list.append(sample["features"])

    caption_list = []
    for sample in batch:
        caption_list.append(sample["captions"])

    return [feature_list, caption_list]

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    CAD120Dataset('features.bin', 'caption.bin'),
    batch_size=128,
    shuffle=True,
    collate_fn=my_collate,
)

# Evaluation keeps a fixed order. Accuracy is measured by retrieval within a
# batch, so shuffling would change batch composition and make the reported
# test accuracy vary from run to run for the same model.
test_loader = DataLoader(
    CAD120Dataset('features_subject3.bin', 'caption_subject3.bin'),
    batch_size=128,
    shuffle=False,
    collate_fn=my_collate,
)

In [None]:
# Train for 1000 epochs, logging loss and accuracies to TensorBoard each epoch.
# The try/finally ensures the scalars are exported and the writer is closed
# even if training is interrupted or raises.
try:
    for epoch in range(1000):
        # Use the notebook-wide `device` rather than a second 'cuda:0' literal.
        loss, accuracy = train(model, device, train_loader, optimizer, epoch)
        writer.add_scalars('data/loss', {'loss': loss}, epoch)
        writer.add_scalars('data/train_accuracy', {'train_accuracy': accuracy}, epoch)

        accuracy = test(model, device, test_loader, epoch)
        writer.add_scalars('data/test_accuracy', {'test_accuracy': accuracy}, epoch)

        # After epoch 50, continue with SGD instead of the initial optimiser.
        if epoch == 50:
            optimizer = optim.SGD(model.parameters(), 0.01, momentum=0.9, weight_decay=1e-4)
finally:
    try:
        writer.export_scalars_to_json("./all_scalars.json")
    finally:
        writer.close()

