In [57]:
from __future__ import print_function
from __future__ import division

import sys
import torch
import torch.utils.data as data
import os
import time
import pickle
import numpy as np
from PIL import Image
import re
import io

import json
import matplotlib.pyplot as plt
from torchvision import transforms, datasets, models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F


from IPython.display import clear_output
import torch.nn as nn
import torch.optim as optim

import itertools
import collections
import pdb
# Pick the compute device once: use the GPU when CUDA is available, else the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
# Bare expression so the notebook displays whether CUDA was detected.
cuda

False

In [58]:
def open_json(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    Args:
        path: location of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle even when json.load raises.
    with open(path) as f:
        return json.load(f)

def flatten(S):
    """Flatten arbitrarily nested lists into a single flat list.

    Only ``list`` instances are expanded; every other element (including
    tuples and strings) is kept as a single item. Empty nested lists
    contribute nothing to the result.

    Args:
        S: a (possibly nested) list.

    Returns:
        A flat list with the elements of ``S`` in depth-first, left-to-right
        order. An empty input list is returned as-is.
    """
    if S == []:
        return S

    # Walk the structure with an explicit stack of iterators instead of
    # recursing once per element: the recursive form hits Python's recursion
    # limit on lists of ~1000 items and copies slices quadratically.
    flat = []
    stack = [iter(S)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend into the nested list; resume the parent afterwards.
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # Current iterator is exhausted: go back up one level.
            stack.pop()
    return flat

### Bar to visualize progress

In [59]:
def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: fraction completed. Integers are converted to float;
            any other non-float value is treated as 0. Values below 0 are
            shown as 0 and values of 1 or more are shown as 1.
    """
    total_cells = 20

    fraction = float(progress) if isinstance(progress, int) else progress
    if not isinstance(fraction, float) or fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(total_cells * fraction))
    bar = "#" * filled + "-" * (total_cells - filled)

    # Replace the previous bar rather than appending a new line per update.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [60]:
# FILES FOR DATALOADER
# NOTE(review): both paths are absolute and machine-specific; point them at
# your own copy of the data before running the cells below.
# dset_words_p is passed to MakeOneHots as image_words_path: a JSON object
# whose values are iterated as the labels belonging to each key.
dset_words_p = "/Users/manuelladron/iCloud_archive/Documents/_CMU/PHD-CD/PHD-CD_Research/ADARI/json_files/cleaned/ADARI_v2/furniture/ADARI_furniture_words.json"
# vocab_p is passed to MakeOneHots as vocab_path: a JSON object whose values
# are compared against the frequency cut-off and summed as counts.
vocab_p = "/Users/manuelladron/iCloud_archive/Documents/_CMU/PHD-CD/PHD-CD_Research/ADARI/json_files/cleaned/ADARI_v2/furniture/ADARI_furniture_vocab_adjs.json"

In [99]:
class MakeOneHots(object):
    """Turn per-image word lists into fixed-length binary label vectors.

    The vocabulary is first cut down to words whose count exceeds a
    threshold. Each image then gets a vector with one slot per kept word;
    the slots of (at most) its 10 most frequent kept words are set to 1.
    ``run`` writes the vectors and the vocabulary mappings as JSON files.
    """

    def __init__(self, image_words_path, vocab_path, out_path, freq_voc=2):
        """Load the input JSON files.

        Args:
            image_words_path: JSON file mapping an image key to a list of words.
            vocab_path: JSON file mapping a word to its count.
            out_path: directory in which ``run`` writes its output files.
            freq_voc: a word is kept only if its count is strictly greater
                than this value.
        """
        self.out_path = out_path
        self.freq_vocab = freq_voc
        self.vocab = open_json(vocab_path)
        self.image_words = open_json(image_words_path)

    def save_json(self, file_path, data):
        """Serialize ``data`` as JSON into ``file_path`` (overwriting it)."""
        # The context manager closes the handle even when json.dump raises.
        with open(file_path, "w") as out_file:
            json.dump(data, out_file)

    def chop_vocabulary(self, verbose=True):
        """Drop infrequent words and build word/index lookup tables.

        Args:
            verbose: when true, print the vocabulary size before and after.

        Returns:
            A tuple ``(sorted_vocab, w2i, i2w)``:
            ``sorted_vocab`` maps each kept word to its count, ordered by
            descending count; ``w2i`` maps word -> index and ``i2w`` maps
            index -> word. Indices follow the iteration order of the
            original vocabulary, not the sorted order.
        """
        chopped = dict()
        w2i = dict()
        i2w = dict()

        for word, count in self.vocab.items():
            if count > self.freq_vocab:
                index = len(w2i)
                chopped[word] = count
                w2i[word] = index
                i2w[index] = word

        if verbose:
            print('Original dictionary length: {}\nCropped dictionary length:{}'.format(len(self.vocab), len(chopped)))
        sorted_vocab = dict(sorted(chopped.items(), key=lambda item: item[1], reverse=True))

        return sorted_vocab, w2i, i2w

    def sampling_rate_and_negative_sample(self, vocab, w2i):
        """Compute the relative frequency of every word in ``vocab``.

        Despite the name, only relative frequencies are returned; no
        drop-probability or negative-sampling distribution is produced.

        Args:
            vocab: mapping word -> count.
            w2i: mapping word -> index; must contain every word of ``vocab``.

        Returns:
            A tuple ``(frequencies, frequencies_ids)`` holding the same
            values (count / total count) keyed by word and by word index.
        """
        total_number_words = sum(vocab.values())
        frequencies = dict()
        frequencies_ids = dict()
        for word, count in vocab.items():
            z_w = count / total_number_words  # these add up to 1 over the vocabulary
            frequencies[word] = z_w
            frequencies_ids[w2i[word]] = z_w

        return frequencies, frequencies_ids

    #### Get the most relevant labels
    def sort_list_by_sample_rate(self, idx_list, i2w, s_rate_idx, verbose=False, top_k=10):
        """Select the most frequent distinct indices from ``idx_list``.

        Args:
            idx_list: word indices, possibly containing repeats.
            i2w: mapping index -> word (used only for verbose output).
            s_rate_idx: mapping index -> frequency used as the sort key.
            verbose: when true, print each index with its word and rate.
            top_k: maximum number of indices to return.

        Returns:
            Up to ``top_k`` distinct indices ordered by descending frequency.
        """
        # Drop repeats (keeping first-seen order) so that a word occurring
        # several times cannot use up more than one of the top_k slots.
        unique_idxs = list(dict.fromkeys(idx_list))
        sorted_idxs = sorted(unique_idxs, key=lambda x: s_rate_idx[x], reverse=True)
        if verbose:
            for idx in unique_idxs:
                print('idx: {} corresponds to word: {} and has s_rate: {}'.format(idx, i2w[idx], s_rate_idx[idx]))
            print('sorted_idxs: {}'.format(sorted_idxs))

        return sorted_idxs[:top_k]

    def get_one_hot(self, w2i, i2w, s_rate_idx, max_labels=10):
        """Build the binary label vector of every image.

        Args:
            w2i: mapping word -> index; its length is the vector length.
            i2w: mapping index -> word.
            s_rate_idx: mapping index -> frequency, used to rank labels.
            max_labels: maximum number of slots set to 1 per image.

        Returns:
            A dict mapping each image key to a list of 0/1 values. Words
            missing from ``w2i`` are ignored; an image without any known
            word gets an all-zero vector.
        """
        images_onehot = dict()
        vocab_len = len(w2i)
        dic_length = len(self.image_words)
        for ii, (image_key, labels) in enumerate(self.image_words.items()):
            # Report the number of images handled so far, so the bar ends at 100%.
            update_progress((ii + 1) / dic_length)

            idxs = [w2i[label] for label in labels if label in w2i]
            most_common = self.sort_list_by_sample_rate(idxs, i2w, s_rate_idx, top_k=max_labels)

            onehot = [0] * vocab_len
            for x in most_common:
                onehot[x] = 1
            images_onehot[image_key] = onehot

        return images_onehot

    def run(self):
        """Run the full pipeline and write four JSON files into ``out_path``.

        Files written: the per-image vectors, the kept vocabulary, and the
        word -> index and index -> word tables. JSON object keys are always
        strings, so the integer keys of the index -> word table are stored
        as strings.
        """
        sorted_vocab, w2i, i2w = self.chop_vocabulary()
        s_rate, s_rate_idxs = self.sampling_rate_and_negative_sample(sorted_vocab, w2i)
        images_onehot = self.get_one_hot(w2i, i2w, s_rate_idxs)

        self.save_json(os.path.join(self.out_path, 'ADARI_furniture_onehots.json'), images_onehot)
        self.save_json(os.path.join(self.out_path, 'ADARI_furniture_onehots_vocab.json'), sorted_vocab)
        self.save_json(os.path.join(self.out_path, 'ADARI_furniture_onehots_w2i.json'), w2i)
        self.save_json(os.path.join(self.out_path, 'ADARI_furniture_onehots_i2w.json'), i2w)
        

In [100]:
# Output directory for the JSON files written by MakeOneHots.run().
# NOTE(review): absolute, machine-specific path — change it before running elsewhere.
save_path = '/Users/manuelladron/iCloud_archive/Documents/_CMU/PHD-CD/PHD-CD_Research/ADARI/json_files/cleaned/ADARI_v2/furniture'
# Loads both input JSON files; the default frequency cut-off (freq_voc=2) is used.
M = MakeOneHots(dset_words_p, vocab_p, save_path)


In [101]:
# Build the label vectors and write the four JSON output files into save_path.
M.run()

Progress: [####################] 100.0%
