In [1]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from functools import reduce
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw caption JSON of every source dataset and report how many
# image records each one contains.
set_name = ['rsicd', 'ucm', 'sydney']
json_data = {}
sizes = {}

for name in set_name:
    raw_path = f'../data/raw/dataset_{name}_modified.json'
    with open(raw_path, 'r') as raw_file:
        json_data[name] = json.load(raw_file)
    # The parsed payload is fully in memory, so counting happens after the file is closed.
    sizes[name] = len(json_data[name]['images'])
    print(f'There are {sizes[name]} images in the {name} dataset.')

There are 10921 images in the rsicd dataset.
There are 2100 images in the ucm dataset.
There are 613 images in the sydney dataset.


In [3]:
# Per-dataset lookup tables: dataset-prefixed '.jpg' filename -> image record.
rsicd_data = {}
ucm_data = {}
sydney_data = {}
tables_by_dataset = {'rsicd': rsicd_data, 'ucm': ucm_data, 'sydney': sydney_data}

for name in set_name:
    if name not in tables_by_dataset:
        # Fail loudly instead of printing a placeholder message once per image.
        raise ValueError(f'Unexpected dataset name: {name!r}')
    table = tables_by_dataset[name]
    for single_image in json_data[name]['images']:
        # Replace whatever extension is present with '.jpg'. Splitting on the last
        # dot (rather than chopping three characters) also handles extensions that
        # are not exactly three characters long.
        stem = single_image['filename'].rsplit('.', 1)[0]
        new_filename = f'{name}_{stem}.jpg'
        # The record is stored by reference, so this tag is also visible through json_data.
        single_image['old_dataset_name'] = f'dataset_{name}_modified'
        table[new_filename] = single_image

print(f'There are {len(rsicd_data)} images in the rsicd dataset.')
print(f'There are {len(ucm_data)} images in the ucm dataset.')
print(f'There are {len(sydney_data)} images in the sydney dataset.')

There are 10921 images in the rsicd dataset.
There are 2100 images in the ucm dataset.
There are 613 images in the sydney dataset.


In [4]:
# Helpers for merging the three per-dataset dicts into a single dict
def merge_2(dict1, dict2, dict3):
    """Merge three dicts in place and return the merged result.

    ``dict1`` is folded into ``dict2``, and the updated ``dict2`` is then folded
    into ``dict3``. On duplicate keys ``dict1`` wins over ``dict2``, which wins
    over ``dict3`` (the opposite precedence of ``merge``).

    Side effects: both ``dict2`` and ``dict3`` are mutated; ``dict1`` is not.

    Returns:
        ``dict3`` itself, now holding the merged contents.
    """
    dict2.update(dict1)
    dict3.update(dict2)
    # dict.update() returns None, so hand back the updated dict explicitly.
    return dict3

def merge(dict1, dict2, dict3):
    """Return a new dict holding the entries of all three inputs.

    The inputs are left untouched. When a key appears in more than one input,
    the value from the later argument wins (``dict3`` over ``dict2`` over ``dict1``).
    """
    combined = dict(dict1)
    for later in (dict2, dict3):
        combined.update(later)
    return combined

In [5]:
combined_data = merge(rsicd_data, ucm_data, sydney_data)

# A dict merge silently keeps only one record per duplicate key. Keys carry their
# dataset name as a prefix, so the merged size must equal the sum of the parts.
assert len(combined_data) == len(rsicd_data) + len(ucm_data) + len(sydney_data), \
    'filename collision while merging datasets'

In [6]:
# Sanity check: should equal the sum of the per-dataset counts printed earlier
# (10921 + 2100 + 613 = 13634), i.e. no filename key was overwritten by the merge.
len(combined_data)

13634

In [7]:
# A dict of dicts is loaded with the outer keys (the new filenames) as columns,
# so transpose to get one row per image with the record fields as columns.
combined_df = pd.DataFrame(combined_data).T

In [9]:
# NOTE: a negative n makes head() return every row except the last 5, so the short
# preview relies on pandas truncating the display; head()/tail() give a small slice.
combined_df.head(-5)

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
rsicd_airport_1.jpg,airport_1.jpg,0,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[0, 1, 2, 3, 4]",dataset_rsicd_modified
rsicd_airport_10.jpg,airport_10.jpg,1,"[{'tokens': ['some', 'planes', 'are', 'parked'...",train,"[5, 6, 7, 8, 9]",dataset_rsicd_modified
rsicd_airport_100.jpg,airport_100.jpg,2,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[10, 11, 12, 13, 14]",dataset_rsicd_modified
rsicd_airport_101.jpg,airport_101.jpg,3,"[{'tokens': ['many', 'planes', 'are', 'parked'...",train,"[15, 16, 17, 18, 19]",dataset_rsicd_modified
rsicd_airport_102.jpg,airport_102.jpg,4,"[{'tokens': ['several', 'buildings', 'and', 'g...",train,"[20, 21, 22, 23, 24]",dataset_rsicd_modified
...,...,...,...,...,...,...
sydney_604.jpg,604.tif,603,"[{'tokens': ['There', 'is', 'a', 'wide', 'runw...",val,"[3015, 3016, 3017, 3018, 3019]",dataset_sydney_modified
sydney_605.jpg,605.tif,604,"[{'tokens': ['A', 'narrow', 'runway', 'with', ...",val,"[3020, 3021, 3022, 3023, 3024]",dataset_sydney_modified
sydney_606.jpg,606.tif,605,"[{'tokens': ['A', 'wide', 'runway', 'with', 'w...",val,"[3025, 3026, 3027, 3028, 3029]",dataset_sydney_modified
sydney_607.jpg,607.tif,606,"[{'tokens': ['A', 'straight', 'runway', 'with'...",val,"[3030, 3031, 3032, 3033, 3034]",dataset_sydney_modified


In [10]:
# Two-stage random split with a fixed seed: hold out 20% of all images as the test
# set, then hold out 20% of the remaining pool as the validation set.
split_options = dict(test_size=0.2, random_state=123)
train_valid, test = train_test_split(combined_df, **split_options)
train, valid = train_test_split(train_valid, **split_options)

In [11]:
# Training partition size: 80% of the train+valid pool, roughly 64% of all images.
len(train)

8725

In [12]:
# Test partition size: 20% of all combined images.
len(test)

2727

In [13]:
# Validation partition size: 20% of the train+valid pool, roughly 16% of all images.
len(valid)

2182

In [14]:
# Preview the first rows of the new training partition (rows are shuffled across datasets).
train.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
rsicd_church_212.jpg,church_212.jpg,2036,"[{'tokens': ['some', 'buildings', 'and', 'many...",train,"[10180, 10181, 10182, 10183, 10184]",dataset_rsicd_modified
rsicd_church_145.jpg,church_145.jpg,1961,"[{'tokens': ['a', 'church', 'is', 'near', 'a',...",train,"[9805, 9806, 9807, 9808, 9809]",dataset_rsicd_modified
sydney_333.jpg,333.tif,332,"[{'tokens': ['A', 'curved', 'river', 'with', '...",train,"[1660, 1661, 1662, 1663, 1664]",dataset_sydney_modified
rsicd_farmland_12.jpg,farmland_12.jpg,3233,"[{'tokens': ['several', 'pieces', 'of', 'farml...",train,"[16165, 16166, 16167, 16168, 16169]",dataset_rsicd_modified
rsicd_beach_161.jpg,beach_161.jpg,959,"[{'tokens': ['two', 'rows', 'of', 'waves', 'in...",train,"[4795, 4796, 4797, 4798, 4799]",dataset_rsicd_modified


In [15]:
# Preview the new test partition. NOTE: the `split` column is the label carried over
# from the source dataset (e.g. 'train'/'val') and does not reflect this new partition.
test.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
ucm_1704.jpg,1704.tif,1703,"[{'tokens': ['It', 'is', 'a', 'straight', 'run...",train,"[8515, 8516, 8517, 8518, 8519]",dataset_ucm_modified
rsicd_meadow_272.jpg,meadow_272.jpg,4412,"[{'tokens': ['the', 'meadow', 'is', 'green', '...",train,"[22060, 22061, 22062, 22063, 22064]",dataset_rsicd_modified
rsicd_00666.jpg,00666.jpg,10665,"[{'tokens': ['the', 'grass', 'with', 'a', 'sma...",train,"[53325, 53326, 53327, 53328, 53329]",dataset_rsicd_modified
rsicd_viaduct_414.jpg,viaduct_414.jpg,9930,"[{'tokens': ['the', 'viaducts', 'crisscross', ...",val,"[49650, 49651, 49652, 49653, 49654]",dataset_rsicd_modified
rsicd_industrial_52.jpg,industrial_52.jpg,4168,"[{'tokens': ['there', 'is', 'a', 'viaduct', 'b...",val,"[20840, 20841, 20842, 20843, 20844]",dataset_rsicd_modified


In [16]:
# Preview the new validation partition. NOTE: the `split` column is the label carried
# over from the source dataset, so rows here can still read 'train'.
valid.head()

Unnamed: 0,filename,imgid,sentences,split,sentids,old_dataset_name
ucm_1029.jpg,1029.tif,1028,"[{'tokens': ['Lots', 'of', 'boats', 'docked', ...",train,"[5140, 5141, 5142, 5143, 5144]",dataset_ucm_modified
rsicd_park_282.jpg,park_282.jpg,5333,"[{'tokens': ['a', 'park', 'with', 'some', 'gre...",train,"[26665, 26666, 26667, 26668, 26669]",dataset_rsicd_modified
rsicd_bridge_306.jpg,bridge_306.jpg,1520,"[{'tokens': ['a', 'long', 'bridge', 'is', 'bet...",train,"[7600, 7601, 7602, 7603, 7604]",dataset_rsicd_modified
rsicd_playground_155.jpg,playground_155.jpg,5932,"[{'tokens': ['some', 'trees', 'are', 'near', '...",train,"[29660, 29661, 29662, 29663, 29664]",dataset_rsicd_modified
rsicd_mountain_330.jpg,mountain_330.jpg,5047,"[{'tokens': ['some', 'part', 'of', 'the', 'mou...",train,"[25235, 25236, 25237, 25238, 25239]",dataset_rsicd_modified


In [17]:
# Turn each partition back into a {new_filename: record} mapping, one entry per row.
train_dict, test_dict, valid_dict = (
    partition.to_dict(orient='index') for partition in (train, test, valid)
)

In [18]:
# Tag every image record with the name of the partition it now belongs to.
new_set_name = ['train', 'test', 'valid']
# Lookup table instead of one copy-pasted branch per partition.
records_by_set = {'train': train_dict, 'test': test_dict, 'valid': valid_dict}
for name in new_set_name:
    for record in records_by_set[name].values():
        record['current_set_name'] = name

In [23]:
# One (records dict, partition name, source DataFrame) triple per partition.
# The name is used as the output JSON file stem by the export loop.
imgs_names = [(train_dict, 'train', train), 
              (valid_dict, 'valid', valid), 
              (test_dict, 'test', test)]

In [27]:
# Export each partition's records to ../data/processed/json/<partition>.json.
# The DataFrame in each triple is not needed for the export.
for imgs, name, _ in imgs_names:
    out_path = f'../data/processed/json/{name}.json'
    with open(out_path, 'w') as out_file:
        json.dump(imgs, out_file)

In [28]:
# Show only the first record: echoing the whole dict prints the captions of every
# training image and bloats the notebook.
dict(list(train_dict.items())[:1])

{'rsicd_church_212.jpg': {'filename': 'church_212.jpg',
  'imgid': 2036,
  'sentences': [{'tokens': ['some',
     'buildings',
     'and',
     'many',
     'green',
     'trees',
     'are',
     'around',
     'a',
     'gray',
     'church'],
    'raw': 'some buildings and many green trees are around a gray church .',
    'imgid': 2036,
    'sentid': 10180},
   {'tokens': ['some',
     'buildings',
     'and',
     'many',
     'green',
     'trees',
     'are',
     'around',
     'a',
     'gray',
     'church'],
    'raw': 'some buildings and many green trees are around a gray church .',
    'imgid': 2036,
    'sentid': 10181},
   {'tokens': ['some',
     'buildings',
     'and',
     'many',
     'green',
     'trees',
     'are',
     'around',
     'a',
     'gray',
     'church'],
    'raw': 'some buildings and many green trees are around a gray church .',
    'imgid': 2036,
    'sentid': 10182},
   {'tokens': ['some',
     'buildings',
     'and',
     'many',
     'green',
