In [1]:
import tensorflow as tf
import numpy as np

In [2]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import time
import glob, os, sys
import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
        
def load_object(filename):
    with open(filename, 'rb') as fp:
        return pickle.load(fp)

In [15]:
from keras import backend as K
from keras.preprocessing import image
import skimage
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM
from keras import optimizers

from sklearn.model_selection import train_test_split

from keras.applications.resnet50 import ResNet50 as resnet
from keras.applications.resnet50 import preprocess_input as resnet_pp

from keras.optimizers import Adam

In [4]:
main_path = os.path.abspath("/home/mikey/Repos/visual-attention-cnn-and-eye-tracking/POETdataset/PascalImages/")
classes = ['dog', 'sofa']
IMG_SIZE = 224

In [8]:
# Build a model by adding preprocessing before the pretrained CNN
def get_feature_extraction_model(img_size):
    cnn_object, pp_function = _get_pretrained_model()
    model = keras.models.Sequential()
    cnn_model = cnn_object(weights='imagenet', include_top=False, pooling='max')
    model.add(keras.layers.Lambda(pp_function, name='preprocessing', input_shape=(img_size, img_size, 3)))
    model.add(cnn_model)
    return model

# Unpacking information from the models dictionary
def _get_pretrained_model():
    cnn_object = resnet
    pp_function = resnet_pp
    return cnn_object, pp_function

def _get_patches(x, patch_width):
    patches = np.squeeze(np.asarray(skimage.util.view_as_windows(x, window_shape=(1,patch_width,patch_width,3), 
                                                                 step=(1,patch_width,patch_width,3)), dtype=np.int))
    print(patches.shape)
    patches = patches.reshape(x.shape[0], int(IMG_SIZE/patch_width)**2, patch_width, patch_width, 3)
    return patches

def get_features(files, model, patch_width):
    # Load images based on the size of the Lambda layer 
    # provided as the first layer before the pretrained CNN
    x = np.asarray([image.img_to_array(image.load_img(file, target_size=(IMG_SIZE, IMG_SIZE))) for file in files], dtype=np.int)
    patches = _get_patches(x, patch_width = patch_width)
    patches_shape = patches.shape
    features = model.predict(patches.reshape(-1, patch_width, patch_width, 3), verbose=1)
    print(features.shape)
    return features.reshape(patches_shape[0], patches_shape[1], 2048)

In [6]:
PATCH_WIDTH=56
model = get_feature_extraction_model(img_size = PATCH_WIDTH)

In [10]:
files_list = [glob.glob(os.path.join(main_path, '%s*' %class_)) for class_ in classes]
files_dict = {class_name.replace('*', ''): class_files for class_name, class_files in zip(classes, files_list)}

for files in files_list:
    assert len(files) > 0
    
new_dir = 'soft_attention_features'
if not os.path.exists(new_dir):
    os.makedirs(new_dir)

for class_number, (class_name, files) in enumerate(files_dict.items()):
    print(class_name)
    features = get_features(files, model, PATCH_WIDTH)
    print(features.shape)
    save_object(features, new_dir + os.path.sep + class_name+'_features.pkl')

dog
(1257, 4, 4, 56, 56, 3)
(20112, 2048)
(1257, 16, 2048)
sofa
(467, 4, 4, 56, 56, 3)
(7472, 2048)
(467, 16, 2048)


In [33]:
X = []
y = []
for class_number, (class_name, files) in enumerate(files_dict.items()):
    classfeatures_path = os.path.abspath(os.path.join('.', 'soft_attention_features', class_name + '_features.pkl'))
    print(classfeatures_path)
    features = load_object(classfeatures_path).reshape(-1,16,2048)
    print(features.shape)
    X.append(features)
    y.append([class_number] * features.shape[0])
    
X = np.concatenate(X)
y = np.concatenate(y)
y = keras.utils.to_categorical(y)
X.shape, y.shape

/home/mikey/Repos/visual-attention-cnn-and-eye-tracking/soft_attention_features/dog_features.pkl
(1257, 16, 2048)
/home/mikey/Repos/visual-attention-cnn-and-eye-tracking/soft_attention_features/sofa_features.pkl
(467, 16, 2048)


((1724, 16, 2048), (1724, 2))

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1551, 16, 2048), (1551, 2), (173, 16, 2048), (173, 2))

In [17]:
dense_model = Sequential()
dense_model.add(LSTM(6))
dense_model.add(Dense(units=128, activation='relu', input_shape=(2048,)))
dense_model.add(Dropout(0.25))
dense_model.add(Dense(units=2, activation='softmax'))

dense_model.compile(loss='categorical_crossentropy',
             optimizer=Adam(),
             metrics=['accuracy'])

history = dense_model.fit(X_train,y_train,
                    batch_size = 32,
                    epochs = 100,
                    validation_data=(X_test, y_test))

Train on 1551 samples, validate on 173 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

KeyboardInterrupt: 

In [71]:
NUM_CHANNELS = 1

tf.reset_default_graph()

X_input = tf.placeholder(tf.float32, [None, 16, 2048], name='X')
labels = tf.placeholder(tf.int32, [None, 2], name='labels')

rnn_units = 10
basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=rnn_units)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X_input, dtype=tf.float32)
# outputs = tf.reshape(outputs, shape=(tf.shape(outputs)[0] * tf.shape(outputs)[1], rnn_units))



outputs = tf.reduce_sum(outputs, axis=1)
# weights = tf.ones_like(pool2_flat)
# weights = tf.nn.softmax(weights)t

# pool2_flat = tf.einsum('ijk, ijk->ik', weights, X)
dense = tf.layers.dense(inputs=outputs, units=128, activation=tf.nn.relu)

# Logits Layer
logits = tf.layers.dense(inputs=dense, units=2)

classes = tf.argmax(logits, axis=1)
probabilites = tf.nn.softmax(logits, name="softmax_tensor")

loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
train_op = optimizer.minimize(
    loss=loss,
    global_step=tf.train.get_global_step()
)

accuracy = tf.reduce_mean(tf.cast(tf.equal(classes, tf.argmax(labels, 1)), tf.float32))

init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
n_epochs = 10
batch_size = 32

with tf.Session() as sess:
    sess.run(init_g)
    sess.run(init_l)
    
        # Training cycle
    for epoch in range(n_epochs):
        print("Epoch:", epoch)
        train_loss = []
        for b in range(0, X_train.shape[0], batch_size):
            _, acc = sess.run([train_op, accuracy], feed_dict={
                                                            X_input: X_train[b:b+batch_size],
                                                            labels: y_train[b:b+batch_size]
                                                        })
#             print(acc)
        
        _, acc = sess.run([train_op, accuracy], feed_dict={
                                                X_input: X_test,
                                                labels: y_test
                                            })
        print(acc)

Epoch: 0
0.87283236
Epoch: 1
0.9017341
Epoch: 2
0.91907513
Epoch: 3
0.91907513
Epoch: 4
0.9306358
Epoch: 5
0.9364162
Epoch: 6
0.9364162
Epoch: 7
0.9364162
Epoch: 8
0.9364162
Epoch: 9
0.9479769


Note to self:

I need to use lstm cell (building block) instead of dynamic rnn (unrolled rnn) like here:

    for t in range(self.T):
            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t!=0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t!=0))

            with tf.variable_scope('lstm', reuse=(t!=0)):
                _, (c, h) = lstm_cell(inputs=tf.concat( [x[:,t,:], context],1), state=[c, h])

            logits = self._decode_lstm(x[:,t,:], h, context, dropout=self.dropout, reuse=(t!=0))

            loss += tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=captions_out[:, t],logits=logits)*mask[:, t] )
            
So that I can take the output of the lstm at each step and feed it to my attention network