In [1]:
import os
import pandas as pd
import json

# Location of the annotation CSV; its first column is used as the row index
annotation_path = './data/ILT_data/ILT_data_annotations.csv'

# Read the annotations and force the image-path column to string dtype in one chained step
df = (
    pd.read_csv(annotation_path, index_col = 0)
      .astype({'img': 'str'})
)

In [2]:
def _tally_votes(votes):
    """Pair every distinct label in `votes` with its vote count: [(label1, num_votes), (label2, num_votes), ...]."""
    tally = votes.value_counts()
    return list(zip(tally.index, tally.values))


# One row per image, holding the list of (label, num_votes) pairs assigned to that image
df2 = df.groupby('img')['label'].agg([('label_counts', _tally_votes)]).reset_index()

# Show how many images received more than one distinct label (True) versus a single label (False)
display(df2['label_counts'].map(len).gt(1).value_counts())


def _most_voted(label_counts):
    """Return the label with the highest vote count; on a tie the first such pair in the list wins."""
    winner = max(label_counts, key = lambda pair: pair[1])
    return winner[0]


# The most voted label becomes the image's label
df2['best_label'] = df2['label_counts'].map(_most_voted)
# The vote tallies are no longer needed once the winner is known
df2 = df2.drop(['label_counts'], axis = 1)
# Attach the winning label to every annotation row of the corresponding image
df_labeled = df.merge(df2, on = 'img')

False    70
True     28
Name: label_counts, dtype: int64

In [3]:
from sklearn.model_selection import train_test_split
# One row per image, in order of first appearance. The first row of an image is enough
# because this cell only ever used the first 'best_label' value of each image.
image_rows = df_labeled.drop_duplicates(subset = 'img')
images = image_rows['img'].tolist()
labels = image_rows['best_label'].tolist()

print('Num images:', len(images), 'Num labels:', len(labels))
X, y = images, labels

# Fractions of the full dataset used for train and val; whatever remains is the test split
train_size, val_size = 0.8, 0.1
assert 0 < train_size and 0 < val_size and train_size + val_size < 1, \
    'train_size and val_size must be positive and leave room for a test split'

# Share of the held-out (val + test) part that goes to the test split.
# The previous formula, val_size / (1 - train_size), is the *validation* share; it only gave the
# right answer when val and test were equally large. The result is rounded to 6 decimals purely to
# strip floating-point noise (e.g. 0.49999999999999983 -> 0.5) before it is handed to sklearn.
test_size = round((1 - train_size - val_size) / (1 - train_size), 6)

# First split into a train split and the rest (i.e. val and test split together)
train_X, val_test_X, train_y, val_test_y = train_test_split(X, y, stratify = y, train_size = train_size, random_state = 42)

# Split the val-test split into their own splits
val_X, test_X, val_y, test_y = train_test_split(val_test_X, val_test_y, stratify = val_test_y, test_size = test_size,
                                                random_state = 42)

# Every image must land in exactly one of the three splits
assert len(train_X) + len(val_X) + len(test_X) == len(X)

Num images: 98 Num labels: 98


In [4]:
from collections import Counter
import numpy as np
np.random.seed(42) # Seed the global NumPy RNG so the random caption padding below is reproducible

NUM_CAPTIONS = 5 # Every exported image carries exactly this many captions


def _pad_captions(captions, target):
    """Return exactly `target` captions built from `captions`.

    If there are fewer than `target` captions, whole random permutations of the originals
    (np.random.choice without replacement) are appended until there are enough, so every
    caption is repeated approximately the same number of times. Anything beyond `target`
    is cut off.

    Raises:
        ValueError: if `captions` is empty, since there is nothing to repeat
            (the padding loop would otherwise never terminate).
    """
    if len(captions) == 0:
        raise ValueError('Cannot pad an empty list of captions')
    padded = list(captions)
    while len(padded) < target:
        padded.extend(np.random.choice(captions, len(captions), replace = False).tolist())
    return padded[:target]


# Make a dataset dictionary in a similar format to the other (remote sensing) datasets
ilt_dict = {'dataset': 'ILT', 'images': []}

# Path -> split lookup. 'train' is written last so it takes precedence over 'val';
# a path that is in neither list falls back to 'test' further down.
split_by_path = {path: 'val' for path in val_X}
split_by_path.update({path: 'train' for path in train_X})

# sort = False keeps the images in order of first appearance in df_labeled,
# and each group holds all annotation rows of one image
for abs_image_path, image_df in df_labeled.groupby('img', sort = False):
    # Convert the image path to a relative one for some more flexibility:
    # only the last two components (thematic folder + image filename) are kept.
    # NOTE(review): the components are joined with a hard-coded backslash, so the stored
    # filename is Windows-style - confirm that consumers on other platforms handle this.
    image_path = '\\'.join(abs_image_path.split('/')[-2:])

    image_label = image_df['best_label'].tolist()[0]
    image_captions = _pad_captions(image_df['caption'].tolist(), NUM_CAPTIONS)

    # Determine the split of this specific image based on its path
    split = split_by_path.get(abs_image_path, 'test')

    # Keep track of all the relevant information about the image
    image_info = {'filename': image_path, 'label': image_label, 'captions': image_captions, 'split': split}

    ilt_dict['images'].append(image_info) # Add the image info to the list

In [5]:
# Destination of the exported dataset description
data_info_path = './data/ILT_data/dataset.json'

# Serialise the dataset dictionary to a JSON string and write it out in one go
serialized = json.dumps(ilt_dict)
with open(data_info_path, 'w') as out_file:
    out_file.write(serialized)

In [6]:
# Alphabetically sorted list of the distinct labels that occur in the dataset
labels_lst = sorted({entry['label'] for entry in ilt_dict['images']})

In [7]:
from collections import Counter
# Labels and splits of all images, in dataset order
all_labels = [entry['label'] for entry in ilt_dict['images']]
all_splits = [entry['split'] for entry in ilt_dict['images']]

# Labels of the images belonging to each individual split
train_labels, val_labels, test_labels = (
    [label for label, split_name in zip(all_labels, all_splits) if split_name == wanted]
    for wanted in ('train', 'val', 'test')
)

# Label distribution per split, followed by the distribution over the whole dataset
tuple(Counter(split_labels) for split_labels in (train_labels, val_labels, test_labels, all_labels))

(Counter({'Eigenwerken/water engineering': 13,
          'Sea faring/inland shipping': 21,
          'Waste storage/waste processor': 13,
          'Airstrip/aviation': 12,
          'Miscellaneous': 17,
          'Waste transport': 2}),
 Counter({'Waste storage/waste processor': 2,
          'Miscellaneous': 2,
          'Sea faring/inland shipping': 3,
          'Eigenwerken/water engineering': 2,
          'Airstrip/aviation': 1}),
 Counter({'Sea faring/inland shipping': 3,
          'Eigenwerken/water engineering': 1,
          'Airstrip/aviation': 2,
          'Miscellaneous': 3,
          'Waste storage/waste processor': 1}),
 Counter({'Eigenwerken/water engineering': 16,
          'Sea faring/inland shipping': 27,
          'Waste storage/waste processor': 16,
          'Airstrip/aviation': 15,
          'Miscellaneous': 22,
          'Waste transport': 2}))

In [8]:
# Show the first image entry as an example (prints nothing if the dataset is empty)
for item in ilt_dict['images'][:1]:
    print(item)

{'filename': 'eigenwerken\\DJI_0060.JPG', 'label': 'Eigenwerken/water engineering', 'captions': ['a construction site around a river and some yellow floating deposit in the river and some boats with an excavator. ', 'construction site of a lock between a canal and the sea', 'construction site of a lock between a canal and the sea', 'a construction site around a river and some yellow floating deposit in the river and some boats with an excavator. ', 'a construction site around a river and some yellow floating deposit in the river and some boats with an excavator. '], 'split': 'train'}
