In [2]:
import json
import sys
import os
from os.path import splitext, join, exists
import scipy.io as sio
import numpy as np
import keras
from keras.losses import mean_squared_error

Using TensorFlow backend.


## Data Generator

In [3]:
def load(filename):
    return sio.loadmat(filename, appendmat=False, squeeze_me=True)['data']

def normalize_rows(mat, ord=2):
    ''' return a row normalized matrix
    '''
    assert mat.ndim == 2
    norms = zeros_to_eps(np.linalg.norm(mat, ord=ord, axis=1))
    return mat / norms.reshape(-1, 1)

def zeros_to_eps(mat):
    ''' replace zeros in a matrix by a tiny constant
    '''
    mat[np.isclose(mat, 0.)] = np.finfo(mat.dtype).eps
    return mat

In [4]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, anno, vecs, set_, shuffle=True):
        'Initialization'
        self.anno = anno
        self.vecs = vecs
        self.batch_size = 1
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.set_ = set_
        self.dim_img = 4096
        self.dim_word = 300
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)

        ID = list_IDs_temp[0]
        fname_x = splitext(self.anno[self.set_][ID]["file_name"])[0] + ".dat"
        #x = load(join("data/", self.set_, fname_x))
        x = load(join("/media/nico/DATOS/COCO/vgg19", self.set_, fname_x))

        tags = self.anno[self.set_][ID]["tags"]
        y = [[0]*self.dim_word if self.vecs[w] is None else self.vecs[w] for w in tags]
        y = normalize_rows(np.array(y, dtype=np.float32))

        X = np.empty((len(y), self.dim_img))
        Y = np.empty((len(y), self.dim_word))

        # Generate data
        for i, y_elem in enumerate(y):
            # Initialization            
            X[i,] = x
            Y[i,] = y_elem
        return [X, Y], [X, Y]

In [5]:
anno = json.load(open("../data/coco_noun_0.5.tags", 'r'))
vecs = json.load(open("../data/coco_noun_0.5.word2vec", 'r'))

In [6]:
train_list_ids = list(anno["train2014"].keys())
val_list_ids = list(anno["val2014"].keys())

In [7]:
training_generator = DataGenerator(train_list_ids, anno, vecs, "train2014")
val_generator = DataGenerator(val_list_ids, anno, vecs, "val2014", shuffle=False)

In [8]:
x = load(join("../data/", "train2014", "COCO_train2014_000000111189.dat"))
x = normalize_rows(x.reshape(1, -1)).squeeze()
n_dim = len(x)
tags = anno["train2014"]["111189"]["tags"]
y = [[0]*vec_dim if vecs[w] is None else vecs[w] for w in tags]
y = normalize_rows(np.array(y, dtype=np.float32))

In [9]:
x.shape, y[0].shape

((4096,), (300,))

## Model

In [10]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers import Concatenate

In [11]:
# https://blog.keras.io/building-autoencoders-in-keras.html
# this is the size of our encoded representations
encoding_dim = 200  # 32 floats -> compression of factor 24.5, assuming the input is 784 floats

# Inputs
input_x = Input(shape=(4096,),name="input_img")
input_y = Input(shape=(300,),name="input_text")

fc1 = Dense(300, activation='relu', name="fc1")(input_x)

# "encoded" is the encoded representation of the input
fc_share = Dense(300, activation='relu', name="fc_share")

share_img = fc_share(fc1)
share_word = fc_share(input_y)
concat = Concatenate(name="concat")([share_img, share_word])
encoded = Dense(encoding_dim, activation='relu', name="encoded")(concat)

# "decoded" is the lossy reconstruction of the input
decoded1 = Dense(4096,name="fc2")(encoded)
decoded2 = Dense(300,name="fc3")(encoded)

# this model maps an input to its reconstruction
autoencoder = Model([input_x, input_y], [decoded1, decoded2])

# this model maps an input to its encoded representation
encoder = Model([input_x, input_y], encoded)

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
r = autoencoder.predict([x.reshape(1,-1), y[0].reshape(1,-1)])
r1 = encoder.predict([x.reshape(1,-1), y[0].reshape(1,-1)])

In [17]:
autoencoder.save_weights("model_autoencoder")

In [12]:
autoencoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_img (InputLayer)          (None, 4096)         0                                            
__________________________________________________________________________________________________
fc1 (Dense)                     (None, 300)          1229100     input_img[0][0]                  
__________________________________________________________________________________________________
input_text (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
fc_share (Dense)                (None, 300)          90300       fc1[0][0]                        
                                                                 input_text[0][0]           

In [14]:
r[0].shape
r[1].shape

(1, 300)

In [13]:
def mean_square_error_bis(y_true, y_pred):
    y_true_img = y_true[:,0]  # shape (4096,)
    y_true_word = y_true[:,1] # shape (300,)
    y_pred_img = y_pred[:,0]  # shape (4096,)
    y_pred_word = y_pred[:,1] # shape (300,)
    return mean_squared_error(y_true_img, y_pred_img) + mean_squared_error(y_true_word, y_pred_word)

In [14]:
autoencoder.compile(loss=mean_square_error_bis, optimizer='adam')
his = autoencoder.fit_generator(generator=training_generator,
                                epochs=1,
                                validation_data=val_generator)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


# Encoder

In [63]:
encoder = Model([input_x, input_y], autoencoder.get_layer('encoded').output)

In [64]:
encoder.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_img (InputLayer)          (None, 4096)         0                                            
__________________________________________________________________________________________________
fc1 (Dense)                     (None, 300)          1229100     input_img[0][0]                  
__________________________________________________________________________________________________
input_text (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
fc_share (Dense)                (None, 300)          90300       fc1[0][0]                        
                                                                 input_text[0][0]           

In [23]:
m1 = Model(input=input_x, output=share_img)
m1.predict(x.reshape(1,-1)).shape

  """Entry point for launching an IPython kernel.


(1, 300)

In [24]:
m2 = Model(input=input_y, output=share_word)
m2.predict(y[0].reshape(1,-1)).shape

  """Entry point for launching an IPython kernel.


(1, 300)

In [None]:
def mean_square_error_2_head(y_true, y_pred):
    y_true_img = y_true[:,0]  # shape (4096,)
    y_true_word = y_true[:,1] # shape (300,)
    y_pred_img = y_pred[:,0]  # shape (4096,)
    y_pred_word = y_pred[:,1] # shape (300,)
    return mean_squared_error(y_true_img, y_pred_img) + mean_squared_error(y_true_word, y_pred_word)

In [29]:
ls

Autoencoder.ipynb  model  pytorch.ipynb


In [18]:
his.history

{'fc2_loss': [7.822301e-05],
 'fc3_loss': [2.6554415e-05],
 'loss': [0.00010229377667907386],
 'val_fc2_loss': [5.655377026414499e-05],
 'val_fc3_loss': [4.771835847350303e-06],
 'val_loss': [9.093570406548679e-05]}

In [19]:
autoencoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_img (InputLayer)          (None, 4096)         0                                            
__________________________________________________________________________________________________
fc1 (Dense)                     (None, 300)          1229100     input_img[0][0]                  
__________________________________________________________________________________________________
input_text (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
fc_share (Dense)                (None, 300)          90300       fc1[0][0]                        
                                                                 input_text[0][0]           