In [1]:
import sys
sys.path.append('/Users/alexschneidman/CMU/F20/777/TwoWayNets/')

In [2]:
import torch
import torch.utils.data as data
import os
import time
import pickle
import numpy as np
from PIL import Image

import json
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F


import torch.nn as nn
import torch.optim as optim

import itertools
import collections
import pdb

from model import TwoWayNet

cuda = torch.cuda.is_available()
cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# IMAGES
im_path_fur = "../ADARI/images/furniture/thumbs/big"

# ResNet IMAGE EMBEDDINGS
image_embeddings_path = "../ADARI/image_embeddings/resnet_image_embeddings.json"

# JSON_FILES
data_path_fur = "../ADARI/json_files/cleaned/furniture_cleaned.json"

# FURNITURE VOCAB 
vocab_id2w = "../ADARI/json_files/vocabulary/furniture/vocab_id2w.json"
vocab_w2id = "../ADARI/json_files/vocabulary/furniture/vocab_w2id.json"

# WORD EMBEDDINGS
word_embeddings_path = "../ADARI/word_embeddings/fur_5c_50d_sk_glove_ft.json"

# FILES FOR DATALOADER
dset_path = '../ADARI/json_files/dataset_for_dataloader/dset_dataloader.json'
im2idx_path = '../ADARI/json_files/dataset_for_dataloader/im2idx.json'


In [4]:
# Path for file dset_dataloader.json
def open_json(path):
    f = open(path) 
    data = json.load(f) 
    f.close()
    return data 

In [5]:
word_embs = open_json(word_embeddings_path)

In [6]:
img_embs = open_json(image_embeddings_path)

In [7]:
vocab = open_json(vocab_w2id)

In [8]:
dset = open_json(dset_path)
im2idx = open_json(im2idx_path)

In [9]:
img_size = 64
class ResnetImagesGloveWordsDataset(Dataset):
    def __init__(self, 
                 img_name_to_words, 
                 im2idx, 
                 word_embds, 
                 img_embds, 
                 train=True, 
                 device=None):
        # word_embds is word -> embedding
        self.word_embds = word_embds
        # img_embeds is img_idx -> embedding
        self.img_embds = img_embds
        # Dataset is image_name -> [words]
        self.img_name_to_words = img_name_to_words
        self.images = list(img_name_to_words.keys())
        self.img_embed_shape = len(img_embds[list(img_embds.keys())[0]])
        self.im2idx = im2idx
        
        
        self.max_words = 40
        self.word_shape = len(word_embds[list(word_embds.keys())[0]])
        self.word_embed_shape = self.word_shape * self.max_words
        
        self.device = device
        
    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_name = self.images[index]
        idx = self.im2idx[image_name]
        
        words = self.img_name_to_words[image_name]
        w_embds = []
        for w in words[:min(self.max_words, len(words))]:
            if w in self.word_embds:
                w_embds.append(torch.tensor(self.word_embds[w], device=self.device)
                               .reshape(self.word_shape, 1))
        # pad the rest
        for _ in range(self.max_words - len(w_embds)):
            w_embds.append(torch.full((self.word_shape, 1), 0.0, device=self.device))
            
        w_concat = torch.cat(w_embds)
        
        return torch.tensor(self.img_embds[str(idx)]), w_concat.reshape((w_concat.shape[0]))

    

In [10]:
dataset = ResnetImagesGloveWordsDataset(dset, im2idx, word_embs, img_embs, device=device)

In [11]:
def train(dataloader, params, img_shape, words_shape, device=None):
    model = TwoWayNet(img_shape, 
                      words_shape, 
                      params.LAYER_SIZES, 
                      params.TEST_LAYER, 
                      params.DROP_PROBABILITY)
    model.to(device)
    optim = torch.optim.SGD(model.parameters(), 
                            params.BASE_LEARNING_RATE, 
                            nesterov=True, 
                            momentum=Params.MOMENTUM)
    mse_loss = torch.nn.MSELoss()

    losses = []
    model.train()
    for epoch in range(params.EPOCH_NUMBER):
        avg_loss = 0
        for img_embed, words_embed in dataloader:
            img_embed = img_embed.to(device)
            words_embed = words_embed.to(device)
            
            print(f"img shape: {img_embed.shape}")
            print(f"words_embed shape: {words_embed.shape}")
            
            data = {"x": img_embed, "y": words_embed}
            optim.zero_grad()
            # Forward pass
            xprime, yprime, hidden_xs, hidden_ys = model(data)

            # Compute losses
            # reconstruction losses
            loss_x = mse_loss(xprime, data["x"])
            loss_y = mse_loss(yprime, data["y"])

            # hidden loss
            loss_hidden = mse_loss(hidden_xs[model.hidden_output_layer], hidden_ys[model.hidden_output_layer])

            # covariance loss
            hidden_xs_tensor, hidden_ys_tensor = torch.stack(hidden_xs), torch.stack(hidden_ys)
            cov_x = torch.dot(hidden_xs_tensor.T, hidden_xs_tensor) / hidden_xs_tensor.shape[0]
            cov_y = torch.dot(hidden_ys_tensor.T, hidden_ys_tensor) / hidden_ys_tensor.shape[0]

            cov_loss_x = torch.sqrt(torch.sum(cov_x ** 2)) - torch.sqrt(torch.sum(torch.diag(cov_x) ** 2))
            cov_loss_y = torch.sqrt(torch.sum(cov_y ** 2)) - torch.sqrt(torch.sum(torch.diag(cov_y) ** 2))

            # Weight Decay loss
            loss_weight_decay = model.get_summed_l2_weights()
            
            # Gamma loss
            loss_gamma = model.get_summed_gammas()

            loss = (params.LOSS_X * loss_x + 
                    params.LOSS_Y * loss_y + 
                    params.L2_LOSS * loss_hidden + 
                    params.WITHEN_REG_X * cov_loss_x + 
                    params.WITHIN_REG_Y * cov_loss_y + 
                    params.GAMMA_COEF * loss_gamma +
                    params.WEIGHT_DECAY * loss_weight_decay)
            
            # Backward step
            loss.backward()
            avg_loss += loss.item()

            # Update step
            optim.step()
        losses.append(avg_loss / len(dataset))
        print(f"Epoch {epoch+1}, loss: {losses[-1]}")
    return model

In [12]:
class Params:

    # region Training Params
    BATCH_SIZE = 128
    VALIDATION_BATCH_SIZE = 1000
    EPOCH_NUMBER = 100
    DECAY_RATE = 0.5
    BASE_LEARNING_RATE = 0.0001
    MOMENTUM = 0.9
    # endregion

    # region Loss Weights
    WEIGHT_DECAY = 0.05
    GAMMA_COEF = 0.05
    WITHEN_REG_X = 0.5
    WITHEN_REG_Y = 0.5
    L2_LOSS = 0.25
    LOSS_X = 1
    LOSS_Y = 1
    # endregion

    # region Architecture
    LAYER_SIZES = [2000, 3000, 16000]
    TEST_LAYER = 1
    DROP_PROBABILITY = 0.5
    LEAKINESS = 0.3
    # endregion

    @classmethod
    def print_params(cls):
        OutputLog().write('Params:\n')
        for (key, value) in cls.__dict__.iteritems():
            if not key.startswith('__'):
                OutputLog().write('{0}: {1}'.format(key, value))

In [13]:
params = Params()
dataloader = DataLoader(dataset, batch_size=params.BATCH_SIZE, shuffle=True)
model = train(dataloader, params, dataset.img_embed_shape, dataset.word_embed_shape, device=device)

X Encoder shapes:
D_in: 2048, D_out: 2000
D_in: 2000, D_out: 3000
D_in: 3000, D_out: 16000
D_in: 16000, D_out: 2000
Y Encoder shapes:
D_in: 2000, D_out: 16000
D_in: 16000, D_out: 3000
D_in: 3000, D_out: 2000
D_in: 2000, D_out: 2048
img shape: torch.Size([128, 2048])
words_embed shape: torch.Size([128, 2000])


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 2000 and 3000 in dimension 2 at ../aten/src/TH/generic/THTensor.cpp:612

In [None]:
print(model.x_encoder[0])