In [1]:
# Import libraries for inspecting files
import json
import pickle
import h5py
import numpy as np

import time

# Length of each GloVe word vector; also selects which glove.6B.<N>d.txt file is loaded.
word_embed_size = 50
# Number of object slots per image (second axis of the bbox and feature matrices).
max_objects = 100

In [2]:
# Load the pre-trained GloVe vectors into a {word: [float, ...]} dictionary.
embed_dict = {}
start_time = time.time()
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale (the GloVe text files are distributed as UTF-8).
with open('./data/glove.6B.'+str(word_embed_size)+'d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        word, *vector = line.strip().split(' ')
        embed_dict[word] = list(map(float, vector))

print('Time elapsed:', np.round(time.time() - start_time,1), "seconds")

print(len(embed_dict))

Time elapsed: 3.9 seconds
400000


In [3]:
def encode_text(text, embeddings=None, embed_size=None):
    """Encode a piece of text as the mean of its words' embedding vectors.

    The text is split on whitespace. Words that are missing from the
    embedding dictionary contribute an all-zero vector, so they still count
    towards the mean.

    Args:
        text: String to encode.
        embeddings: Optional mapping word -> vector. Defaults to the
            notebook-level ``embed_dict``.
        embed_size: Optional vector length. Defaults to the notebook-level
            ``word_embed_size``.

    Returns:
        A 1-D numpy array of length ``embed_size``. Empty or whitespace-only
        text yields all zeros.
    """
    if embeddings is None:
        embeddings = embed_dict
    if embed_size is None:
        embed_size = word_embed_size

    word_list = text.split()
    if not word_list:
        # Averaging zero rows would return NaNs (and emit a RuntimeWarning),
        # which would then be written into the feature matrix.
        return np.zeros(embed_size)

    embed_list = np.zeros((len(word_list), embed_size))
    for i, word in enumerate(word_list):
        if word in embeddings:
            embed_list[i] = embeddings[word]
    # For a single word this is simply that word's vector (or zeros).
    return np.mean(embed_list, axis=0)


In [4]:

def _mean_text_encoding(texts):
    """Average ``encode_text`` over ``texts``.

    ``None`` entries stay as all-zero rows, so they still count towards the
    mean. An empty list yields an all-zero vector.
    """
    if not texts:
        return np.zeros(word_embed_size)
    encodings = np.zeros((len(texts), word_embed_size))
    for row, text in enumerate(texts):
        if text is not None:
            encodings[row] = encode_text(text)
    return np.mean(encodings, axis=0)


def parse_graph(data_sg, num_images_feed, index_start=0, report_interval=5000):
    """Convert scene graphs into fixed-size bounding-box and feature matrices.

    Args:
        data_sg: Mapping image_id -> scene graph. Each graph is read for
            'width', 'height' and 'objects' (mapping object_id -> object).
            Each object is read for 'name', 'x', 'y', 'h', 'w', 'attributes'
            (sequence of strings) and 'relations' (sequence of dicts with
            'object' and 'name').
        num_images_feed: Maximum number of images to process; also the first
            dimension of the returned matrices.
        index_start: Offset added to the 'index' stored for every image.
        report_interval: A progress line is printed every this many images.

    Returns:
        Tuple ``(bboxes_matrix, features_matrix, image_info_dict, image_count)``:
        * bboxes_matrix: (num_images_feed, max_objects, 4) array holding
          ``[x, y, h, w]`` per object (note: height before width).
        * features_matrix: (num_images_feed, max_objects, 4*word_embed_size)
          array. Each row concatenates the encodings of the object name, its
          attributes, the names of the objects it relates to, and the
          relation names.
        * image_info_dict: str(image_id) -> {'width', 'objectsNum', 'height',
          'index'}.
        * image_count: number of images actually processed.

    Objects beyond ``max_objects`` per image are dropped; unused slots stay zero.
    """
    # Initialise data files
    bboxes_matrix = np.zeros((num_images_feed, max_objects, 4))
    features_matrix = np.zeros((num_images_feed, max_objects, 4*word_embed_size))
    image_info_dict = {}
    image_count = 0

    start_time = time.time()

    for image_id, graph in data_sg.items():
        # Stop before touching a row that does not exist. This also avoids
        # the division by zero below when num_images_feed == 0.
        if image_count >= num_images_feed:
            break

        if image_count % report_interval == 0:
            print('Progress:', str(np.round(100*(image_count / num_images_feed),0))
                  +'%, Time:', np.round(time.time() - start_time,1), "seconds")

        objects = graph['objects']

        # First pass: id -> name lookup so relations can resolve their target object.
        object_name_dict = {object_id: obj['name'] for object_id, obj in objects.items()}

        # Second pass: bounding boxes and embeddings, one slot per object.
        object_count = 0
        for obj in objects.values():
            # Honour the configured capacity rather than a hard-coded 100.
            if object_count >= max_objects:
                break

            bboxes_matrix[image_count, object_count] = [obj['x'], obj['y'], obj['h'], obj['w']]

            name_encoding = encode_text(obj['name'])
            attribs_encoding = _mean_text_encoding(list(obj['attributes']))

            relations = list(obj['relations'])
            # Relation targets that are not in this image's object list map to
            # None and are kept as zero rows. The single-relation case now
            # goes through the same path as the multi-relation case;
            # previously its result was assigned to a misspelled variable and
            # silently discarded.
            relation_object_encoding = _mean_text_encoding(
                [object_name_dict.get(relation['object']) for relation in relations])
            relationship_encoding = _mean_text_encoding(
                [relation['name'] for relation in relations])

            features_matrix[image_count, object_count] = np.concatenate(
                [name_encoding, attribs_encoding, relation_object_encoding, relationship_encoding])

            object_count += 1

        # Create image summary info
        image_info_dict[str(image_id)] = {'width': graph['width'], 'objectsNum': object_count,
                                          'height': graph['height'], 'index': image_count+index_start}

        image_count += 1

    print('\nComplete. Time elapsed:', np.round(time.time() - start_time,0), "seconds")
    return bboxes_matrix, features_matrix, image_info_dict, image_count
        

In [5]:
# Load the training scene graphs. The encoding is stated explicitly so that
# decoding does not depend on the platform's default locale.
with open('./data/sceneGraphs/train_sceneGraphs.json', encoding='utf-8') as f:
    data_sg = json.load(f)


In [6]:
# Parse every training scene graph; n_train is the number of images processed.
(bboxes_train,
 features_train,
 image_info_train,
 n_train) = parse_graph(data_sg, num_images_feed=len(data_sg))

Progress: 0.0%, Time: 0.0 seconds
Progress: 7.0%, Time: 6.7 seconds
Progress: 13.0%, Time: 13.5 seconds
Progress: 20.0%, Time: 20.4 seconds
Progress: 27.0%, Time: 27.4 seconds
Progress: 33.0%, Time: 34.3 seconds
Progress: 40.0%, Time: 41.3 seconds
Progress: 47.0%, Time: 48.3 seconds
Progress: 53.0%, Time: 55.1 seconds
Progress: 60.0%, Time: 62.2 seconds
Progress: 67.0%, Time: 69.0 seconds
Progress: 73.0%, Time: 75.9 seconds
Progress: 80.0%, Time: 82.7 seconds
Progress: 87.0%, Time: 89.6 seconds
Progress: 93.0%, Time: 96.3 seconds

Complete. Time elapsed: 103.0 seconds


In [7]:
# Report the sizes of the parsed training outputs.
for label, value in (('bboxes_train.shape:', bboxes_train.shape),
                     ('features_train.shape:', features_train.shape),
                     ('len(image_info_train):', len(image_info_train)),
                     ('n_train:', n_train)):
    print(label, value)

bboxes_train.shape: (74942, 100, 4)
features_train.shape: (74942, 100, 200)
len(image_info_train): 74942
n_train: 74942


In [8]:
# Load the validation scene graphs. Rebinding data_sg drops the reference to
# the training graphs. The encoding is explicit so that decoding does not
# depend on the platform's default locale.
with open('./data/sceneGraphs/val_sceneGraphs.json', encoding='utf-8') as f:
    data_sg = json.load(f)
# Validation indices continue after the training ones (index_start = n_train).
bboxes_val, features_val, image_info_val, n_val = parse_graph(data_sg, len(data_sg), index_start=n_train)

Progress: 0.0%, Time: 0.0 seconds
Progress: 47.0%, Time: 6.7 seconds
Progress: 93.0%, Time: 13.6 seconds

Complete. Time elapsed: 15.0 seconds


In [9]:
# Report the sizes of the parsed validation outputs.
print('bboxes_val.shape:', bboxes_val.shape)
# Label corrected from 'features_tval.shape:' so it names the variable printed.
print('features_val.shape:', features_val.shape)
print('len(image_info_val):', len(image_info_val))
print('n_val:', n_val)

bboxes_val.shape: (10696, 100, 4)
features_tval.shape: (10696, 100, 200)
len(image_info_val): 10696
n_val: 10696


In [10]:
# Stack training rows first and validation rows after them, matching the
# 'index' values assigned during parsing (validation starts at n_train).
bboxes_matrix = np.concatenate([bboxes_train, bboxes_val], axis=0)
features_matrix = np.concatenate([features_train, features_val], axis=0)
# Merge the per-image info; on a duplicated image id the validation entry wins.
image_info_dict = dict(image_info_train)
image_info_dict.update(image_info_val)

In [11]:
# Drop the references to the per-split arrays and the raw scene graphs so
# that their memory can be reclaimed; only the merged objects are kept.
bboxes_train = None
bboxes_val = None
features_train = None
features_val = None
image_info_train = None
image_info_val = None
data_sg = None

In [12]:
# Show the first ten bounding boxes of the first three images.
print('Bounding boxes: shape =', bboxes_matrix.shape)
for image_idx, header in enumerate(('Image 0:\n', '\nImage 1:\n', '\nImage 2:\n')):
    print(header, bboxes_matrix[image_idx, :10])

Bounding boxes: shape = (85638, 100, 4)
Image 0:
 [[248.  55.  34.  64.]
 [245.  92.  16.  26.]
 [268.  32.  50.  49.]
 [  0.   0. 374. 499.]
 [402.  55.  95.  15.]
 [ 68. 123.  27.  24.]
 [ 57. 162.  57.  93.]
 [ 90. 147.  16.  24.]
 [  0.   0. 374. 396.]
 [178. 184.  99. 115.]]

Image 1:
 [[134.   0.  85.  56.]
 [143.  15.  16.  29.]
 [249.  13.  33.  32.]
 [304.  11.  65.  26.]
 [261.  54.  21.  29.]
 [382.   0. 145.  13.]
 [281.   0. 116.  23.]
 [460.   1. 104.  23.]
 [395.  40.  33.  21.]
 [213.  53.  30.  29.]]

Image 2:
 [[215. 227. 106. 147.]
 [ 12. 260.  43.  95.]
 [384. 186. 147. 104.]
 [108.  89. 116.  54.]
 [234. 110.  70.  40.]
 [  0. 242.  91. 109.]
 [  0. 201.  35.  52.]
 [108. 173.  31.  35.]
 [246. 109.  13.  15.]
 [ 44. 128.  98. 162.]]


In [13]:
# Total number of images, followed by the stored info for one sample image id.
print(len(image_info_dict))
sample_info = image_info_dict['2373554']
print(json.dumps(sample_info, sort_keys=True, indent=4))

85638
{
    "height": 281,
    "index": 1,
    "objectsNum": 29,
    "width": 500
}


In [14]:
print('Features: shape =', features_matrix.shape)
# Each feature row is four consecutive segments of width word_embed_size.
# Print the first 10 values of every segment for the first 5 objects of image 0.
for segment, header in enumerate(('\nImage 0 (name encoding):\n',
                                  '\nImage 0 (attributes encoding):\n',
                                  '\nImage 0 (relation object encoding):\n',
                                  '\nImage 0 (relationship encoding):\n')):
    offset = word_embed_size * segment
    print(header, np.round(features_matrix[0, 0:5, offset:offset + 10], 3))

Features: shape = (85638, 100, 200)

Image 0 (name encoding):
 [[-0.255 -0.752 -0.867  1.12   0.129  1.012 -0.572 -0.362  0.443 -0.122]
 [ 0.195  0.802  0.366  0.617 -0.028 -0.018 -1.003 -0.117  0.615 -0.642]
 [-0.241 -1.011 -0.825  0.313  0.564  0.434 -0.625 -0.936  0.145  0.366]
 [-0.303  1.244 -1.087  0.247  0.093 -0.772 -1.22  -0.17   0.564 -0.697]
 [-0.384  0.191 -0.505  0.106  0.138  0.135 -0.705 -0.468 -0.062 -1.069]]

Image 0 (attributes encoding):
 [[ 0.481  0.489 -0.239 -0.071  0.535  0.471 -0.685 -0.472  0.17  -0.573]
 [ 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]
 [ 0.481  0.489 -0.239 -0.071  0.535  0.471 -0.685 -0.472  0.17  -0.573]
 [ 0.667  0.064 -1.63   0.103  0.911  0.173 -0.099  0.092  0.101  1.165]
 [-0.464  0.333 -0.211 -0.129  0.763  0.812 -0.644 -1.022 -0.27  -0.704]]

Image 0 (relation object encoding):
 [[ 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.     0.     0.     0.     0.

In [15]:
# Each feature row is four consecutive segments of width word_embed_size.
# Print the first 10 values of every segment for the first 5 objects of image 1.
for segment, header in enumerate(('\nImage 1 (name encoding):\n',
                                  '\nImage 1 (attributes encoding):\n',
                                  '\nImage 1 (relation object encoding):\n',
                                  '\nImage 1 (relationship encoding):\n')):
    offset = word_embed_size * segment
    print(header, np.round(features_matrix[1, 0:5, offset:offset + 10], 3))


Image 1 (name encoding):
 [[ 0.622  1.199 -0.014  0.201  0.694  0.121 -0.904 -1.402  0.434 -0.485]
 [ 0.888  1.072 -0.09  -0.084  1.154  0.648  0.076  0.215 -0.259 -0.953]
 [ 0.888  1.072 -0.09  -0.084  1.154  0.648  0.076  0.215 -0.259 -0.953]
 [ 0.888  1.072 -0.09  -0.084  1.154  0.648  0.076  0.215 -0.259 -0.953]
 [ 0.888  1.072 -0.09  -0.084  1.154  0.648  0.076  0.215 -0.259 -0.953]]

Image 1 (attributes encoding):
 [[ 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]
 [-0.577  0.87  -0.491 -0.108  0.654  0.325 -1.326 -1.011 -0.207 -0.799]
 [-0.577  0.87  -0.491 -0.108  0.654  0.325 -1.326 -1.011 -0.207 -0.799]
 [-0.577  0.87  -0.491 -0.108  0.654  0.325 -1.326 -1.011 -0.207 -0.799]
 [-0.577  0.87  -0.491 -0.108  0.654  0.325 -1.326 -1.011 -0.207 -0.799]]

Image 1 (relation object encoding):
 [[ 0.546  0.89  -0.066  0.08   0.823  0.295 -0.481 -0.489  0.    -0.713]
 [ 0.644  1.043  0.032  0.17   0.726  0.214 -0.675 -0.857  0.178 -0.668]
 [ 0.685  1.048  0.011  

In [16]:
# Each feature row is four consecutive segments of width word_embed_size.
# Print the first 10 values of every segment for the first 5 objects of image 2.
for segment, header in enumerate(('\nImage 2 (name encoding):\n',
                                  '\nImage 2 (attributes encoding):\n',
                                  '\nImage 2 (relation object encoding):\n',
                                  '\nImage 2 (relationship encoding):\n')):
    offset = word_embed_size * segment
    print(header, np.round(features_matrix[2, 0:5, offset:offset + 10], 3))


Image 2 (name encoding):
 [[-1.000e-03 -3.010e-01 -1.880e-01 -5.450e-01 -6.460e-01  6.040e-01
  -1.050e-01 -3.010e-01 -2.430e-01 -6.140e-01]
 [ 5.350e-01  5.760e-01 -5.400e-02 -2.080e-01 -7.880e-01 -1.760e-01
  -2.130e-01 -1.440e-01  1.034e+00 -7.900e-02]
 [ 2.250e-01  3.840e-01 -5.010e-01  1.670e-01  2.530e-01  6.700e-02
  -4.540e-01  7.200e-02  2.800e-01 -1.385e+00]
 [-4.170e-01  4.650e-01  1.380e-01 -1.930e-01  1.011e+00  1.500e-01
  -2.000e-02  3.730e-01 -3.730e-01 -8.900e-01]
 [-9.400e-02  4.300e-01 -1.720e-01 -4.550e-01  1.645e+00  4.030e-01
  -3.730e-01  2.510e-01 -1.060e-01  1.080e-01]]

Image 2 (attributes encoding):
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Image 2 (relation object encoding):
 [[ 0.467  0.254 -0.004 -0.046 -0.518 -0.07  -0.408  0.102  0.407 -0.654]
 [-0.161 -0.044 -0.497 -0.215 -0.675  0.283 -0.236 -0.291  0.041 -0.641]
 [ 0.     0.  

### Checking the question-and-answer data for images that are missing from the scene graphs

In [35]:
# Load the balanced training questions. The encoding is stated explicitly so
# that decoding does not depend on the platform's default locale.
with open('./data/balanced_train_data.json', encoding='utf-8') as f:
    data_bal_train = json.load(f)

In [36]:
# Show the top-level keys of the question file and keep the first one.
key_view = data_bal_train.keys()
print("Keys: %s" % key_view)
bal_tr_key = list(key_view)[0]

Keys: dict_keys(['questions'])


In [37]:
# Number of entries under the 'questions' key.
print(len(data_bal_train['questions']))

132062


In [39]:
# Count the questions whose image id has no entry in image_info_dict.
i_max = 999000  # upper bound on the number of questions inspected
data_bal_train_sg = {}  # NOTE(review): never filled or read in the visible cells
n_false = 0

for n_seen, qa_item in enumerate(data_bal_train['questions']):
    # Inspect at most i_max questions (the previous hand-rolled counter
    # allowed one more than that).
    if n_seen >= i_max:
        break
    if qa_item['imageId'] not in image_info_dict:
        n_false += 1
print("Number not in dict =", n_false)

Number not in dict = 0


### Saving data to JSON and H5 file formats

In [20]:
# Persist the merged per-image info dictionary as JSON.
info_path = './data/gqa_objects_sg_merged_info.json'
with open(info_path, 'w') as outfile:
    json.dump(image_info_dict, fp=outfile)

In [40]:
# Write H5PY File
# The context manager closes the file even if creating a dataset raises,
# so a failed write does not leave the handle open.
with h5py.File('./data/gqa_objects_sg.h5', 'w') as export_file:
    export_file.create_dataset('bboxes', data=bboxes_matrix)
    export_file.create_dataset('features', data=features_matrix)

### Testing the saved files by reloading and reading them

In [21]:
# Read the merged info dictionary back from disk to verify the save.
info_path = './data/gqa_objects_sg_merged_info.json'
with open(info_path) as f:
    data_info_dict = json.load(fp=f)

In [22]:
# Number of entries in the reloaded info dictionary.
print(len(data_info_dict))

85638


In [25]:
# List the first four and the last five entries of the reloaded dictionary.
# With fewer than nine entries the two ranges overlap, so some are printed twice.
j = len(data_info_dict)  # later cells use j-1 to address the last image

i = 0
for i, item in enumerate(data_info_dict, start=1):
    near_start = i < 5
    near_end = (j - i) < 5
    if near_start:
        print(item, data_info_dict[str(item)])
    if i == 5:
        # Blank line separating the head of the listing from the tail.
        print()
    if near_end:
        print(item, data_info_dict[str(item)])


2386621 {'width': 500, 'objectsNum': 16, 'height': 375, 'index': 0}
2373554 {'width': 500, 'objectsNum': 29, 'height': 281, 'index': 1}
2370799 {'width': 500, 'objectsNum': 16, 'height': 333, 'index': 2}
2370791 {'width': 500, 'objectsNum': 16, 'height': 333, 'index': 3}

2374606 {'width': 500, 'objectsNum': 21, 'height': 375, 'index': 85633}
2360947 {'width': 500, 'objectsNum': 15, 'height': 375, 'index': 85634}
2360946 {'width': 375, 'objectsNum': 5, 'height': 500, 'index': 85635}
2379678 {'width': 500, 'objectsNum': 25, 'height': 334, 'index': 85636}
2379672 {'width': 500, 'objectsNum': 9, 'height': 333, 'index': 85637}


In [26]:
# Reopen the exported H5 file read-only for inspection.
# NOTE(review): the handle is not closed in the visible cells.
file_objects = './data/gqa_objects_sg.h5'
data_obs = h5py.File(file_objects, mode='r')

In [27]:
# Show the top-level names stored in the H5 file and keep the first one.
h5_keys = data_obs.keys()
print("Keys: %s" % h5_keys)
obs_key = list(h5_keys)[0]

Keys: <KeysViewHDF5 ['bboxes', 'features']>


In [29]:
# Compare the stored bounding boxes of image 1 and of the last image (j-1)
# against the values printed before saving.
bbox_ds = data_obs['bboxes']
last_index = j - 1
print('Image 1 objects (bounding boxes): shape =',
      bbox_ds.shape, '\n\n', bbox_ds[1][0:10])
print('\nImage', last_index, 'objects (bounding boxes): shape =',
      bbox_ds.shape, '\n\n', bbox_ds[last_index][0:10])

Image 1 objects (bounding boxes): shape = (85638, 100, 4) 

 [[134.   0.  85.  56.]
 [143.  15.  16.  29.]
 [249.  13.  33.  32.]
 [304.  11.  65.  26.]
 [261.  54.  21.  29.]
 [382.   0. 145.  13.]
 [281.   0. 116.  23.]
 [460.   1. 104.  23.]
 [395.  40.  33.  21.]
 [213.  53.  30.  29.]]

Image 85637 objects (bounding boxes): shape = (85638, 100, 4) 

 [[ 73.   2.  98. 267.]
 [158. 164.  56.  61.]
 [133. 100.   9.  12.]
 [ 58.  97. 145. 116.]
 [158. 163.  78.  90.]
 [ 32. 220.  50. 188.]
 [292.   0. 332. 207.]
 [ 97.  97.  45.  47.]
 [ 26.  88. 244. 198.]
 [  0.   0.   0.   0.]]


In [30]:
# Show the shape of the stored features and the first 10 values for the
# first 5 objects of image 1 and of the last image (j-1).
feature_ds = data_obs['features']
last_index = j - 1
print('Image 1 objects (features): shape =',
      feature_ds.shape, '\n\n', np.round(feature_ds[1, 0:5, 0:10], 2))
print('\nImage', last_index, 'objects (features): shape =',
      feature_ds.shape, '\n\n', np.round(feature_ds[last_index, 0:5, 0:10], 2))

Image 1 objects (features): shape = (85638, 100, 200) 

 [[ 0.62  1.2  -0.01  0.2   0.69  0.12 -0.9  -1.4   0.43 -0.49]
 [ 0.89  1.07 -0.09 -0.08  1.15  0.65  0.08  0.22 -0.26 -0.95]
 [ 0.89  1.07 -0.09 -0.08  1.15  0.65  0.08  0.22 -0.26 -0.95]
 [ 0.89  1.07 -0.09 -0.08  1.15  0.65  0.08  0.22 -0.26 -0.95]
 [ 0.89  1.07 -0.09 -0.08  1.15  0.65  0.08  0.22 -0.26 -0.95]]

Image 85637 objects (features): shape = (85638, 100, 200) 

 [[ 0.26  0.32  0.74 -0.37  0.66 -0.49 -0.56 -0.24 -0.45 -0.13]
 [-0.46  0.52 -1.   -0.45  0.54  1.37 -0.07 -1.2   0.07  0.45]
 [ 0.07 -0.03 -0.2  -0.27  0.18  0.78  0.88  0.37  0.53  0.08]
 [ 0.45 -0.5  -0.54 -0.02  0.22  0.55 -0.67 -0.69  0.63 -0.2 ]
 [ 0.3   0.41 -0.38 -1.21  1.05  1.58 -0.15 -0.28  1.01  0.09]]
