In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# Read the whole annotation file into memory, one list entry per text line.
annotationFilePath = '../data/33k_67k_CloudPics_groupedData_landmark_image.txt'
with open(annotationFilePath) as annotation_file:
    annTmp = annotation_file.read().splitlines()

In [3]:
# Tokenise every annotation line exactly once (whitespace separated) and drop
# blank lines, which would otherwise raise IndexError on the `[0]` lookup below.
# `fids` keeps the raw string tokens of each record; they are parsed later
# inside the TensorFlow input pipeline.
fids = [tokens for tokens in (line.split() for line in annTmp) if tokens]
# The PID is the part of the first token before the first "/", stored as int32.
# `pids[i]` corresponds to `fids[i]`.
pids = np.array([tokens[0].split("/")[0] for tokens in fids], dtype=np.int32)

In [4]:
# Number of annotation records held in `fids` (one per parsed line).
len(fids)

739936

In [5]:
# Build a tf.data pipeline in which a single "epoch" visits every distinct PID.
# The shuffle buffer spans all PIDs, so the order is re-drawn on each epoch
# once the dataset is repeated further down.
unique_pids = np.unique(pids)
num_unique_pids = len(unique_pids)
dataset = tf.data.Dataset.from_tensor_slices(unique_pids).shuffle(num_unique_pids)

In [6]:
batch_p = 2
batch_k = 5
# Trim each epoch to a whole number of P-sized groups so that a batch never
# straddles the boundary between two epochs.
pids_per_epoch = len(unique_pids) - len(unique_pids) % batch_p
# `repeat()` with no count loops over the (trimmed) dataset indefinitely.
dataset = dataset.take(pids_per_epoch).repeat()

In [7]:
def sample_k_fids_for_pid(pid,all_pids,all_fids,batch_k):
    """Sample exactly `batch_k` annotation rows that belong to one PID.

    Args:
        pid: scalar PID to sample for.
        all_pids: PID of every annotation row, aligned row-by-row with `all_fids`.
        all_fids: 2-D string records; column 0 is returned as the FID and the
            remaining columns are parsed as float32 values.
        batch_k: number of rows (K) to return.

    Returns:
        A tuple `(selected_fids, selected_landmark, pids)`: the K sampled
        column-0 strings, the K rows of parsed float32 values, and a length-K
        tensor filled with `pid`.

    Raises:
        tf.errors.InvalidArgumentError: when evaluated, if no row of
            `all_pids` equals `pid`.
    """
    possible_fids = tf.boolean_mask(all_fids, tf.equal(all_pids, pid))
    count = tf.shape(possible_fids)[0]

    # Fail loudly on a PID with no rows. Without this guard the arithmetic
    # below divides by zero and the function would return zero FIDs next to
    # K PIDs, which only surfaces later as a confusing shape mismatch.
    has_rows = tf.Assert(tf.greater(count, 0),
                         ["No FIDs available for PID", pid])
    with tf.control_dependencies([has_rows]):
        count = tf.identity(count)

    # If at least K rows are available, a random subset of K is used.
    # Otherwise the index list is padded up to the smallest multiple of
    # `count` that is >= K, so every row is equally likely to be repeated.
    # Integer ceil-division keeps this exact (no float rounding).
    k = tf.cast(batch_k, count.dtype)
    padded_count = ((k + count - 1) // count) * count
    full_range = tf.mod(tf.range(padded_count), count)

    # Sampling is always performed by shuffling and taking the first K.
    shuffled = tf.random_shuffle(full_range)
    selected_fids = tf.gather(possible_fids, shuffled[:batch_k])
    # Split each record: columns 1.. become floats, column 0 stays a string.
    selected_landmark = tf.string_to_number(selected_fids[:,1:],tf.float32)
    selected_fids = selected_fids[:,0]
    return selected_fids, selected_landmark, tf.fill([batch_k], pid)

In [8]:
# For every PID, draw K annotation rows (mapped with 4 parallel calls).
def _sample_for_pid(pid):
    return sample_k_fids_for_pid(pid, all_pids=pids, all_fids=fids,
                                 batch_k=batch_k)

dataset = dataset.map(_sample_for_pid, num_parallel_calls=4)

# Flatten the per-PID groups of K into a stream of single examples.
dataset = dataset.apply(tf.contrib.data.unbatch())

# Regroup into P*K batches, and prefetch so that producing and consuming
# batches can overlap.
batch_size = batch_p * batch_k
dataset = dataset.batch(batch_size).prefetch(5)

# The dataset repeats forever, so a one-shot iterator is sufficient.
pk_iterator = dataset.make_one_shot_iterator()
imgPath, landmark, labels = pk_iterator.get_next()

In [9]:
with tf.Session() as sess:
    for i in xrange(10):
        imgPath_val,landmark_val,labels_val = sess.run([imgPath,landmark,labels])
        print labels_val.shape, labels_val

(10,) [ 2307  2307  2307  2307  2307 20174 20174 20174 20174 20174]
(10,) [ 62383  62383  62383  62383  62383 123978 123978 123978 123978 123978]
(10,) [123042 123042 123042 123042 123042    192    192    192    192    192]
(10,) [74744 74744 74744 74744 74744 93452 93452 93452 93452 93452]
(10,) [ 41130  41130  41130  41130  41130 116159 116159 116159 116159 116159]
(10,) [83348 83348 83348 83348 83348  6231  6231  6231  6231  6231]
(10,) [83181 83181 83181 83181 83181 10920 10920 10920 10920 10920]
(10,) [137419 137419 137419 137419 137419 128298 128298 128298 128298 128298]
(10,) [41780 41780 41780 41780 41780 63668 63668 63668 63668 63668]
(10,) [143043 143043 143043 143043 143043  86165  86165  86165  86165  86165]


In [15]:
# Shape of the `imgPath_val` batch returned by the most recent `sess.run` call.
print imgPath_val.shape

(10,)


In [9]:
with tf.Session() as sess:
    for i in xrange(10):
        imgPath_val,landmark_val,labels_val = sess.run([imgPath,landmark,labels])
        print labels_val.shape, labels_val

[[ 54574  54574  54574  54574  54574]
 [ 44771  44771  44771  44771  44771]
 [  5464   5464   5464   5464   5464]
 [ 92208  92208  92208  92208  92208]
 [  9353   9353   9353   9353   9353]
 [  7830   7830   7830   7830   7830]
 [107139 107139 107139 107139 107139]
 [ 32676  32676  32676  32676  32676]
 [  9163   9163   9163   9163   9163]
 [112511 112511 112511 112511 112511]]
[[119658 119658 119658 119658 119658]
 [143018 143018 143018 143018 143018]
 [ 67021  67021  67021  67021  67021]
 [139286 139286 139286 139286 139286]
 [114781 114781 114781 114781 114781]
 [ 70869  70869  70869  70869  70869]
 [ 10340  10340  10340  10340  10340]
 [ 65278  65278  65278  65278  65278]
 [132451 132451 132451 132451 132451]
 [142777 142777 142777 142777 142777]]
[[ 54525  54525  54525  54525  54525]
 [138225 138225 138225 138225 138225]
 [  2180   2180   2180   2180   2180]
 [ 24917  24917  24917  24917  24917]
 [111470 111470 111470 111470 111470]
 [ 45116  45116  45116  45116  45116]
 [142804 1

In [13]:
# NOTE(review): `fid_batch_val` is not assigned in any cell visible in this
# notebook, so this presumably relies on leftover kernel state. Confirm, then
# either define it explicitly or remove this cell.
print fid_batch_val

[[[ 283.  302.  433.  300.  360.  405.  314.  470.  415.  469.]
  [ 291.  299.  440.  302.  366.  404.  313.  468.  411.  471.]
  [ 283.  302.  433.  300.  360.  405.  314.  470.  415.  469.]
  [ 283.  302.  433.  300.  360.  405.  314.  470.  415.  469.]
  [ 133.  144.  203.  144.  169.  187.  147.  218.  193.  217.]
  [ 137.  143.  208.  144.  171.  186.  147.  216.  192.  216.]
  [ 137.  143.  208.  144.  171.  186.  147.  216.  192.  216.]
  [ 291.  299.  440.  302.  366.  404.  313.  468.  411.  471.]
  [ 137.  143.  208.  144.  171.  186.  147.  216.  192.  216.]
  [ 133.  144.  203.  144.  169.  187.  147.  218.  193.  217.]]]
