In [1]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from functools import reduce
from sklearn.model_selection import train_test_split

In [2]:
# Parsed JSON payload and image count for each dataset, keyed by dataset name.
json_data = {}
sizes = {}
# Only these two sets are loaded here; 'sydney' is currently left out of the list.
set_name = ['rsicd', 'ucm']

# Parse each dataset's annotation file and report how many images it lists.
for name in set_name:
    raw_path = '../data/raw/dataset_' + name + '_modified.json'
    with open(raw_path, 'r') as json_file:
        json_data[name] = json.load(json_file)
    sizes[name] = len(json_data[name]['images'])
    print(f'There are {sizes[name]} images in the {name} dataset.')

There are 10921 images in the rsicd dataset.
There are 2100 images in the ucm dataset.


In [3]:
# One table per source dataset, keyed by the dataset-prefixed image filename.
ucm_data = {}
sydney_data = {}
rsicd_data = {}

# Dataset name -> (table that collects its images, provenance label stored on each image).
destinations = {
    'rsicd': (rsicd_data, 'dataset_rsicd_modified'),
    'ucm': (ucm_data, 'dataset_ucm_modified'),
    'sydney': (sydney_data, 'dataset_sydney_modified'),
}

for name in set_name:
    destination = destinations.get(name)
    for single_image in json_data[name]['images']:
        # Prefix with the dataset name and swap the last three characters
        # of the original filename for 'jpg'.
        new_filename = name + '_' + single_image['filename'][:-3] + 'jpg'
        if destination is None:
            print("uh-oh") #should add try catch later
            continue
        table, provenance = destination
        # The image record itself is stored (not a copy), then tagged in place.
        table[new_filename] = single_image
        table[new_filename]['old_dataset_name'] = provenance

for shown_name, table in (('rsicd', rsicd_data), ('ucm', ucm_data), ('sydney', sydney_data)):
    print(f'There are {len(table)} images in the {shown_name} dataset.')

There are 10921 images in the rsicd dataset.
There are 2100 images in the ucm dataset.
There are 0 images in the sydney dataset.


In [4]:
# Python code to merge dict 
def merge_2(dict1, dict2, dict3):
    """Return a new dict containing the entries of all three dicts.

    On duplicate keys the earliest argument wins: ``dict1`` overrides
    ``dict2``, which overrides ``dict3``. This is the same content and key
    order the previous implementation left behind in ``dict3``.

    The previous version chained ``dict.update`` calls and returned the
    result of the last one, which is always ``None``; it also modified
    ``dict2`` and ``dict3`` as a side effect. This version returns the merged
    dict and leaves all three inputs untouched.

    Args:
        dict1: Highest-priority mapping.
        dict2: Middle-priority mapping.
        dict3: Lowest-priority mapping.

    Returns:
        A new ``dict`` with the combined entries.
    """
    # Later unpackings override earlier ones, so list lowest priority first.
    return {**dict3, **dict2, **dict1}

def merge(dict1, dict2, dict3):
    """Build a new dict from three dicts without modifying any of them.

    On duplicate keys the latest argument wins: ``dict3`` overrides
    ``dict2``, which overrides ``dict1``.
    """
    combined = dict(dict1)
    for later in (dict2, dict3):
        combined.update(later)
    return combined

In [5]:
# Combine the per-dataset tables into a single dict keyed by the prefixed filename.
combined_data = merge(rsicd_data, ucm_data, sydney_data)

In [6]:
# Number of entries in the merged dict.
len(combined_data)

13021

In [7]:
# Building a DataFrame from a dict of dicts puts the outer keys in the columns,
# so transpose to get one row per image, indexed by the prefixed filename.
combined_df = pd.DataFrame(combined_data).T

In [8]:
# With a negative n, head() returns every row except the last |n| (here: all but the last 5).
combined_df.head(-5)

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
rsicd_airport_1.jpg,airport_1.jpg,0,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[0, 1, 2, 3, 4]",dataset_rsicd_modified
rsicd_airport_10.jpg,airport_10.jpg,1,"[{'tokens': ['some', 'planes', 'are', 'parked'...",train,"[5, 6, 7, 8, 9]",dataset_rsicd_modified
rsicd_airport_100.jpg,airport_100.jpg,2,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[10, 11, 12, 13, 14]",dataset_rsicd_modified
rsicd_airport_101.jpg,airport_101.jpg,3,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[15, 16, 17, 18, 19]",dataset_rsicd_modified
rsicd_airport_102.jpg,airport_102.jpg,4,"[{'tokens': ['several', 'buildings', 'and', 'g...",train,"[20, 21, 22, 23, 24]",dataset_rsicd_modified
...,...,...,...,...,...,...
ucm_2091.jpg,2091.tif,2090,"[{'tokens': ['Two', 'tennis', 'courts', 'arran...",test,"[10450, 10451, 10452, 10453, 10454]",dataset_ucm_modified
ucm_2092.jpg,2092.tif,2091,"[{'tokens': ['Three', 'tennis', 'courts', 'arr...",test,"[10455, 10456, 10457, 10458, 10459]",dataset_ucm_modified
ucm_2093.jpg,2093.tif,2092,"[{'tokens': ['Three', 'tennis', 'courts', 'arr...",test,"[10460, 10461, 10462, 10463, 10464]",dataset_ucm_modified
ucm_2094.jpg,2094.tif,2093,"[{'tokens': ['Two', 'tennis', 'courts', 'on', ...",test,"[10465, 10466, 10467, 10468, 10469]",dataset_ucm_modified


In [9]:
# Two-stage random split: hold out a test set first, then carve the validation
# set out of what remains, using the same fraction and seed for both stages.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 123

train_valid, test = train_test_split(
    combined_df, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
train, valid = train_test_split(
    train_valid, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

In [10]:
# Number of rows in the training split.
len(train)

8332

In [11]:
# Number of rows in the test split.
len(test)

2605

In [12]:
# Number of rows in the validation split.
len(valid)

2084

In [13]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
ucm_1080.jpg,1080.tif,1079,"[{'tokens': ['Lots', 'of', 'boats', 'docked', ...",train,"[5395, 5396, 5397, 5398, 5399]",dataset_ucm_modified
rsicd_denseresidential_363.jpg,denseresidential_363.jpg,2793,"[{'tokens': ['the', 'area', 'is', 'a', 'densel...",train,"[13965, 13966, 13967, 13968, 13969]",dataset_rsicd_modified
rsicd_resort_126.jpg,resort_126.jpg,7330,"[{'tokens': ['many', 'green', 'trees', 'and', ...",train,"[36650, 36651, 36652, 36653, 36654]",dataset_rsicd_modified
rsicd_baseballfield_165.jpg,baseballfield_165.jpg,743,"[{'tokens': ['the', 'fan', 'shaped', 'baseball...",train,"[3715, 3716, 3717, 3718, 3719]",dataset_rsicd_modified
ucm_18.jpg,18.tif,17,"[{'tokens': ['There', 'is', 'a', 'piece', 'of'...",train,"[85, 86, 87, 88, 89]",dataset_ucm_modified


In [14]:
# Preview the first rows of the test split.
# NOTE: the 'split' column in this DataFrame still holds the value loaded from
# the source dataset, so it need not say 'test'.
test.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
ucm_1096.jpg,1096.tif,1095,"[{'tokens': ['Lots', 'of', 'boats', 'docked', ...",test,"[5475, 5476, 5477, 5478, 5479]",dataset_ucm_modified
rsicd_00188.jpg,00188.jpg,10187,"[{'tokens': ['the', 'bustling', 'block', 'embr...",train,"[50935, 50936, 50937, 50938, 50939]",dataset_rsicd_modified
rsicd_meadow_209.jpg,meadow_209.jpg,4342,"[{'tokens': ['several', 'green', 'trees', 'are...",train,"[21710, 21711, 21712, 21713, 21714]",dataset_rsicd_modified
rsicd_viaduct_37.jpg,viaduct_37.jpg,9880,"[{'tokens': ['there', 'is', 'a', 'lake', 'near...",train,"[49400, 49401, 49402, 49403, 49404]",dataset_rsicd_modified
rsicd_00536.jpg,00536.jpg,10535,"[{'tokens': ['the', 'football', 'field', 'is',...",train,"[52675, 52676, 52677, 52678, 52679]",dataset_rsicd_modified


In [15]:
# Preview the first rows of the validation split.
# NOTE: the 'split' column in this DataFrame still holds the value loaded from
# the source dataset, so it need not say 'valid'.
valid.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
rsicd_park_3.jpg,park_3.jpg,5352,"[{'tokens': ['the', 'lake', 'is', 'surrounded'...",train,"[26760, 26761, 26762, 26763, 26764]",dataset_rsicd_modified
rsicd_mountain_176.jpg,mountain_176.jpg,4875,"[{'tokens': ['it', 'is', 'a', 'piece', 'of', '...",train,"[24375, 24376, 24377, 24378, 24379]",dataset_rsicd_modified
rsicd_denseresidential_210.jpg,denseresidential_210.jpg,2624,"[{'tokens': ['the', 'residential', 'of', 'grey...",train,"[13120, 13121, 13122, 13123, 13124]",dataset_rsicd_modified
rsicd_school_163.jpg,school_163.jpg,8071,"[{'tokens': ['the', 'road', 'through', 'the', ...",train,"[40355, 40356, 40357, 40358, 40359]",dataset_rsicd_modified
ucm_997.jpg,997.tif,996,"[{'tokens': ['This', 'is', 'a', 'part', 'of', ...",test,"[4980, 4981, 4982, 4983, 4984]",dataset_ucm_modified


In [16]:
# Convert each split back to a dict of {row index: {column: value}} records.
train_dict, test_dict, valid_dict = (
    frame.to_dict(orient='index') for frame in (train, test, valid)
)

In [17]:
# Overwrite every image's 'split' entry with the name of the split it now belongs to.
new_set_name = ['train', 'test', 'valid']
dicts_by_split = {'train': train_dict, 'test': test_dict, 'valid': valid_dict}

for name in new_set_name:
    split_images = dicts_by_split.get(name)
    if split_images is None:
        # No dict registered for this split name.
        print("uh-oh")
        continue
    for value in split_images.values():
        value['split'] = name

In [18]:
# One entry per split: (dict of image records, split name, source DataFrame).
imgs_names = [
    (train_dict, 'train', train),
    (valid_dict, 'valid', valid),
    (test_dict, 'test', test),
]

In [19]:
# Write each split's image records to its own JSON file; the DataFrame in the
# third tuple slot is not needed here.
for imgs, name, _ in imgs_names:
    out_path = '../data/processed/json/' + name + '.json'
    with open(out_path, 'w') as out_file:
        json.dump(imgs, out_file)

In [20]:
# Display the training-split dict.
# NOTE(review): this renders every entry of the dict as cell output; consider
# previewing a single record instead to keep the notebook small.
train_dict

{'ucm_1080.jpg': {'filename': '1080.tif',
  'imgid': 1079,
  'sentences': [{'tokens': ['Lots',
     'of',
     'boats',
     'docked',
     'at',
     'the',
     'harbor',
     'and',
     'the',
     'boats',
     'are',
     'closed',
     'to',
     'each',
     'other'],
    'raw': 'Lots of boats docked at the harbor and the boats are closed to each other .',
    'imgid': 1079,
    'sentid': 5395},
   {'tokens': ['Lots',
     'of',
     'boats',
     'docked',
     'neatly',
     'at',
     'the',
     'harbor'],
    'raw': 'Lots of boats docked neatly at the harbor .',
    'imgid': 1079,
    'sentid': 5396},
   {'tokens': ['Many',
     'boats',
     'docked',
     'neatly',
     'at',
     'the',
     'harbor',
     'and',
     'the',
     'water',
     'is',
     'deep',
     'blue'],
    'raw': 'Many boats docked neatly at the harbor and the water is deep blue .',
    'imgid': 1079,
    'sentid': 5397},
   {'tokens': ['Many',
     'boats',
     'docked',
     'neatly',
     'at