In [15]:
import sys

# DIRECTORY STRUCTURE
# The bare string below is documentation only: it is an expression statement
# that is evaluated and discarded. It sketches the sibling directories this
# notebook expects; "this" presumably marks the notebook's own location
# inside mmml_f20/ - TODO confirm.
"""
ADARI/
    .....
mmml_f20/
    this
TwoWayNets/
    .....
"""
# REPLACE WITH PATH TO TwoWayNets Repo
# The directory is appended to the module search path so that later imports
# can resolve modules stored there - presumably the `model` module imported
# in the next cell; TODO confirm.
sys.path.append('../TwoWayNets/')

In [16]:
import torch
import torch.utils.data as data
import os
import time
import pickle
import numpy as np
import datetime
from PIL import Image
import pandas as pd

import json
from torchvision import transforms, datasets
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F


import torch.nn as nn
import torch.optim as optim

import itertools
import collections

from model import TwoWayNet

# Query CUDA availability once and derive the torch device from that flag.
cuda = torch.cuda.is_available()
device = torch.device('cuda' if cuda else 'cpu')
print(f"Device - {device}")

In [17]:


# Root of the ADARI data directory. Every path below is derived from it, so
# pointing the notebook at a different data location is a one-line change.
ADARI_ROOT = "../ADARI"

# IMAGES
im_path_fur = f"{ADARI_ROOT}/images/furniture/thumbs/big"

# ResNet IMAGE EMBEDDINGS
image_embeddings_path = f"{ADARI_ROOT}/image_embeddings/resnet_image_embeddings.json"

# JSON_FILES
data_path_fur = f"{ADARI_ROOT}/json_files/cleaned/furniture_cleaned.json"

# FURNITURE VOCAB
vocab_id2w = f"{ADARI_ROOT}/json_files/vocabulary/furniture/vocab_id2w.json"
vocab_w2id = f"{ADARI_ROOT}/json_files/vocabulary/furniture/vocab_w2id.json"

# WORD EMBEDDINGS
word_embeddings_path = f"{ADARI_ROOT}/word_embeddings/fur_5c_50d_sk_glove_ft.json"

# FILES FOR DATALOADER
dset_path = f"{ADARI_ROOT}/json_files/dataset_for_dataloader/dset_dataloader.json"
im2idx_path = f"{ADARI_ROOT}/json_files/dataset_for_dataloader/im2idx.json"


In [18]:
def open_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    Args:
        path: location of a JSON file on disk.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails, which the
    # previous open()/close() pair did not guarantee.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [19]:
# Parsed word-embedding JSON. The dataset class below indexes it by word and
# converts each value to a tensor, i.e. it is used as word -> embedding.
word_embs = open_json(word_embeddings_path)

In [20]:
# Parsed image-embedding JSON. The dataset class below looks entries up with
# str(image index) as the key.
img_embs = open_json(image_embeddings_path)

In [21]:
# Parsed contents of the vocab_w2id JSON file.
# NOTE(review): `vocab` is not referenced by any other visible cell.
vocab = open_json(vocab_w2id)

In [22]:
# dset is used by the dataset class as image name -> list of words.
dset = open_json(dset_path)
# im2idx is used by the dataset class as image name -> image index.
im2idx = open_json(im2idx_path)

In [23]:
# NOTE(review): img_size is not referenced by any other visible cell.
img_size = 64
class ResnetImagesGloveWordsDataset(Dataset):
    """Pairs a precomputed image embedding with a fixed-length text encoding.

    ``__getitem__`` returns ``((image_encoding, text_encoding), image_name)``
    where ``image_encoding`` has shape ``[img_embed_shape]`` and
    ``text_encoding`` has shape ``[word_embed_shape]``: the embeddings of up to
    ``max_words`` words laid end to end and zero padded to a constant length.

    Args:
        img_name_to_words: mapping image name -> list of words.
        im2idx: mapping image name -> image index; ``str(index)`` is the key
            used to look up ``img_embds``.
        word_embds: mapping word -> embedding (sequence of numbers).
        img_embds: mapping stringified image index -> embedding.
        train: accepted for interface compatibility; currently unused.
        device: device on which both returned tensors are created
            (``None`` means the torch default).

    Raises:
        ValueError: if ``word_embds`` or ``img_embds`` is empty, because the
            embedding sizes are inferred from their first entry.
    """
    def __init__(self,
                 img_name_to_words,
                 im2idx,
                 word_embds,
                 img_embds,
                 train=True,
                 device=None):
        if not word_embds or not img_embds:
            raise ValueError("word_embds and img_embds must both be non-empty")

        # word_embds is word -> embedding
        self.word_embds = word_embds
        # img_embeds is img_idx -> embedding
        self.img_embds = img_embds
        # Dataset is image_name -> [words]
        self.img_name_to_words = img_name_to_words
        self.images = list(img_name_to_words.keys())
        self.im2idx = im2idx

        # Embedding sizes are read off the first entry of each table.
        self.img_embed_shape = len(next(iter(img_embds.values())))

        # Every sample is truncated / zero padded to exactly max_words vectors.
        self.max_words = 40
        self.word_shape = len(next(iter(word_embds.values())))
        self.word_embed_shape = self.word_shape * self.max_words

        self.device = device

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_name = self.images[index]
        idx = self.im2idx[image_name]

        # Only the first max_words words are considered; words without an
        # embedding are dropped, so the padding below also fills their slots.
        words = self.img_name_to_words[image_name][:self.max_words]
        known = [self.word_embds[w] for w in words if w in self.word_embds]

        # Start from an all-zero vector and overwrite its prefix with the known
        # embeddings laid end to end.
        text_encoding = torch.zeros(self.word_embed_shape, device=self.device)
        if known:
            flat = torch.tensor(known, device=self.device).reshape(-1)
            text_encoding[:flat.shape[0]] = flat

        # The image tensor is created on the same device as the text tensor so
        # that a sample can be fed to a model without a device mismatch.
        image_encoding = torch.tensor(self.img_embds[str(idx)], device=self.device)

        return (image_encoding, text_encoding), image_name

In [24]:
dataset = ResnetImagesGloveWordsDataset(dset, im2idx, word_embs, img_embs, device=device)
# Split dataset into test and train
# The test size is derived from the train size so the two always add up to
# len(dataset). Truncating 0.8 * n and 0.2 * n independently can lose a sample
# (e.g. n = 9 gives 7 + 1), and random_split rejects lengths that do not sum
# to the dataset length.
n_train = int(.8 * len(dataset))
n_test = len(dataset) - n_train
train_set, test_set = torch.utils.data.random_split(dataset, [n_train, n_test])

In [33]:
def _unique_params(params_iter):
    """Return the parameters in ``params_iter`` in order, dropping repeated objects."""
    seen, unique = set(), []
    for p in params_iter:
        if id(p) not in seen:
            seen.add(id(p))
            unique.append(p)
    return unique


def train(model, dataloader, params, img_shape, words_shape, losses, device=None):
    """Train ``model`` with SGD on (image, text) embedding pairs.

    Args:
        model: module exposing ``gammas``, ``weights`` and
            ``hidden_output_layer``; calling it with ``{"x": ..., "y": ...}``
            must return ``(xprime, yprime, hidden_xs, hidden_ys)``.
        dataloader: sized iterable yielding ``((img_embed, words_embed), _)``.
        params: object carrying the hyperparameters read below
            (``BASE_LEARNING_RATE``, ``MOMENTUM``, ``EPOCH_NUMBER``, ...).
        img_shape: unused; kept so existing callers keep working.
        words_shape: unused; kept so existing callers keep working.
        losses: list that receives one average loss per epoch. It is extended
            in place, so the caller still sees partial results if training is
            interrupted.
        device: device the model and every batch are moved to.

    Returns:
        ``(model, losses)``.
    """
    model.to(device)

    # Three parameter groups: gammas and linear weights each get their own
    # weight decay, everything else gets none. Lists (not sets) keep the
    # parameter order deterministic from run to run.
    gamma_params = _unique_params(model.gammas)
    linear_params = _unique_params(model.weights)
    grouped_ids = {id(p) for p in gamma_params + linear_params}
    other_params = [p for p in model.parameters() if id(p) not in grouped_ids]

    # Named `optimizer` so the `torch.optim` module alias is not shadowed, and
    # momentum is read from the `params` argument rather than a global class.
    optimizer = torch.optim.SGD(
        [{'params': gamma_params, 'weight_decay': params.GAMMA_COEF},
         {'params': linear_params, 'weight_decay': params.WEIGHT_DECAY},
         {'params': other_params}],
        params.BASE_LEARNING_RATE,
        nesterov=True,
        momentum=params.MOMENTUM)
    mse_loss = torch.nn.MSELoss()

    # Each batch contributes its mean loss, so the epoch average is taken over
    # the number of batches (guarded against an empty loader).
    num_batches = max(len(dataloader), 1)

    model.train()
    for epoch in range(params.EPOCH_NUMBER):
        epoch_loss = 0.0
        for (img_embed, words_embed), _ in dataloader:
            img_embed = img_embed.to(device)
            words_embed = words_embed.to(device)

            data = {"x": img_embed, "y": words_embed}
            optimizer.zero_grad()
            # Forward pass
            xprime, yprime, hidden_xs, hidden_ys = model(data)

            # Reconstruction losses
            loss_x = mse_loss(xprime, data["x"])
            loss_y = mse_loss(yprime, data["y"])

            # Hidden representations of both branches at the shared layer,
            # shape [batch_size, hidden_dim]
            embedded_x = hidden_xs[model.hidden_output_layer]
            embedded_y = hidden_ys[model.hidden_output_layer]

            # Hidden loss: pulls the two representations together
            loss_hidden = mse_loss(embedded_x, embedded_y)

            # Second-moment matrices of the hidden features (no mean
            # centering), shape [hidden_dim, hidden_dim]
            cov_x = torch.matmul(embedded_x.T, embedded_x) / embedded_x.shape[0]
            cov_y = torch.matmul(embedded_y.T, embedded_y) / embedded_y.shape[0]

            # Covariance losses: full-matrix norm versus norm of the diagonal
            cov_loss_x = torch.sqrt(torch.norm(cov_x, p='fro')) - torch.sqrt(torch.norm(torch.diag(cov_x)))
            cov_loss_y = torch.sqrt(torch.norm(cov_y, p='fro')) - torch.sqrt(torch.norm(torch.diag(cov_y)))

            loss = (params.LOSS_X * loss_x +
                    params.LOSS_Y * loss_y +
                    params.L2_LOSS * loss_hidden +
                    params.WITHEN_REG_X * cov_loss_x +
                    params.WITHEN_REG_Y * cov_loss_y)

            # Backward step
            loss.backward()
            epoch_loss += loss.item()

            # Update step
            optimizer.step()
        losses.append(epoch_loss / num_batches)
        print(f"Epoch {epoch+1}, loss: {losses[-1]}")
    return model, losses

In [34]:
class Params:
    """Hyperparameters for TwoWayNet training, stored as class attributes."""

    # region Training Params
    BATCH_SIZE = 2
    VALIDATION_BATCH_SIZE = 1000
    EPOCH_NUMBER = 100
    DECAY_RATE = 0.5
    BASE_LEARNING_RATE = 0.0001
    MOMENTUM = 0.9
    # endregion

    # region Loss Weights
    WEIGHT_DECAY = 0.05
    GAMMA_COEF = 0.05
    WITHEN_REG_X = 0.5
    WITHEN_REG_Y = 0.5
    L2_LOSS = 0.25
    LOSS_X = 1
    LOSS_Y = 1
    # endregion

    # region Architecture
    LAYER_SIZES = [2000, 3000, 16000]
    TEST_LAYER = 1
    DROP_PROBABILITY = 0.5
    LEAKINESS = 0.3
    # endregion

    @classmethod
    def print_params(cls):
        """Print every upper-case hyperparameter defined on this class.

        Uses ``print`` because no ``OutputLog`` is defined or imported in this
        notebook, and ``dict.items()`` because ``iteritems()`` does not exist
        on Python 3. Restricting the output to upper-case names keeps dunder
        entries and this method itself out of the listing.
        """
        print('Params:')
        for key, value in vars(cls).items():
            if key.isupper():
                print('{0}: {1}'.format(key, value))

In [35]:
# Hyperparameters are class attributes of Params and are read through this instance.
params = Params()
# Shuffled mini-batch loader over the training split.
dataloader = DataLoader(train_set, batch_size=params.BATCH_SIZE, shuffle=True)

# Train the Model

In [36]:
# Timestamp that tags every artifact written for this run. It is formatted
# explicitly because str(datetime) contains spaces and ':' characters, which
# are not valid in file names on every platform.
model_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model = TwoWayNet(dataset.img_embed_shape,
                  dataset.word_embed_shape,
                  params.LAYER_SIZES,
                  params.TEST_LAYER,
                  params.DROP_PROBABILITY)
losses = []
# UNCOMMENT THIS AND MODIFY PATH TO RESUME TRAINING
#model.load_state_dict(torch.load(PATH))
try:
    train(model, dataloader, params, dataset.img_embed_shape, dataset.word_embed_shape, losses, device=device)
except KeyboardInterrupt:
    # train() appends to `losses` in place, so the epochs completed so far are kept.
    print("Training interrupted - saving the current weights and losses.")

# Save after a normal finish as well as after an interruption; previously a
# run that completed all epochs wrote nothing to disk. Both files share the
# same run tag so they can be matched up later.
torch.save(model.state_dict(), f"TwoWayNet_ADARI_{model_name}.pth")
with open(f"TwoWayNet_ADARI_losses_{model_name}.json", "w") as f:
    json.dump(losses, f)

X Encoder shapes:
D_in: 2048, D_out: 2000
D_in: 2000, D_out: 3000
D_in: 3000, D_out: 16000
D_in: 16000, D_out: 2000
Y Encoder shapes:
D_in: 2000, D_out: 16000
D_in: 16000, D_out: 3000
D_in: 3000, D_out: 2000
D_in: 2000, D_out: 2048


# Generate Embeddings of test images and text

In [37]:
# NOTE(review): embeddings are generated for train_set here. Pass test_set to
# DataLoader instead to embed the held-out split.
dataloader = DataLoader(train_set, batch_size=1, shuffle=False)
# img_name -> (img_embedding, text embedding)
all_embeddings = dict()
model.eval()
with torch.no_grad():
    for (img, txt), img_name in dataloader:
        # Move both inputs to the device used for training so the forward pass
        # does not hit a CPU/CUDA mismatch.
        img, txt = img.to(device), txt.to(device)
        # Only the hidden activations are needed; the reconstructions are discarded.
        _, _, hidden_imgs, hidden_txts = model({"x": img, "y": txt})
        embedded_x = hidden_imgs[model.hidden_output_layer]
        embedded_y = hidden_txts[model.hidden_output_layer]
        # batch_size is 1, so element 0 is the only sample in the batch.
        all_embeddings[img_name[0]] = (embedded_x[0].tolist(), embedded_y[0].tolist())
    
    

tensor([[-9.9253e-05,  4.8122e-02, -6.8531e-02,  ..., -8.6740e-05,
          9.3502e-02,  8.6351e-02]])
tensor([[-9.9253e-05, -9.6644e-02,  1.1464e-01,  ..., -8.6740e-05,
          1.6126e-01, -2.3337e-01]])
tensor([[-9.9253e-05,  7.0451e-02, -1.6967e-01,  ..., -8.6740e-05,
          9.4332e-02,  2.4264e-01]])
tensor([[-9.9253e-05, -2.4740e-02, -2.1265e-03,  ..., -8.6740e-05,
         -6.4868e-02,  4.7632e-02]])
tensor([[-9.9253e-05,  2.6802e-02,  3.8801e-02,  ..., -8.6740e-05,
          1.1330e-01,  1.0752e-01]])
tensor([[-9.9253e-05, -7.8923e-02,  1.9054e-03,  ..., -8.6740e-05,
          8.2679e-02,  7.2636e-02]])
tensor([[-9.9253e-05, -3.8653e-02,  1.8752e-01,  ..., -8.6740e-05,
          9.0655e-02,  2.8481e-02]])
tensor([[-9.9253e-05,  1.0653e-01, -5.3904e-02,  ..., -8.6740e-05,
         -8.2955e-02,  1.3139e-01]])
tensor([[-9.9253e-05,  1.4339e-01,  2.3375e-01,  ..., -8.6740e-05,
         -5.3440e-02, -9.1981e-02]])
tensor([[-9.9253e-05, -6.3471e-02,  2.0816e-01,  ..., -8.6740e-0

tensor([[-9.9253e-05, -1.1360e-01, -9.4421e-03,  ..., -8.6740e-05,
          4.1856e-02, -2.3337e-01]])
tensor([[-9.9253e-05,  1.0535e-01, -9.1422e-03,  ..., -8.6740e-05,
          3.9061e-02,  2.6506e-02]])
tensor([[-9.9253e-05, -1.9350e-02, -5.0569e-02,  ..., -8.6740e-05,
          5.0290e-02,  5.9134e-03]])
tensor([[-9.9253e-05, -2.9245e-02, -2.3930e-02,  ..., -8.6740e-05,
         -8.5769e-02,  4.8011e-02]])
tensor([[-9.9253e-05,  3.0404e-02, -1.1881e-01,  ..., -8.6740e-05,
         -6.1775e-02,  9.4130e-02]])
tensor([[-9.9253e-05, -1.4367e-02, -8.5737e-02,  ..., -8.6740e-05,
          1.0898e-01, -2.0689e-01]])
tensor([[-9.9253e-05, -1.7948e-02,  1.8571e-01,  ..., -8.6740e-05,
          1.9556e-01, -1.6017e-02]])
tensor([[-9.9253e-05,  1.0886e-01,  1.3377e-01,  ..., -8.6740e-05,
          4.5214e-03, -6.2884e-02]])


KeyboardInterrupt: 

In [None]:
# Write the img_name -> (image embedding, text embedding) mapping to a JSON
# file whose name carries this run's model_name.
with open(f"TwoWayNet_paired_embedding_result_{model_name}.json", "w") as f:
    json.dump(all_embeddings, f)
    
    