In [1]:
import os
import pandas as pd
import json
from collections import Counter
import numpy as np

# Load the annotation table; the first CSV column is used as the index.
annotation_path = './data/ILT_data/ILT_data_annotations.csv'
# Cast the image paths to strings right away so that later equality
# comparisons and path splitting operate on plain `str` values.
df = pd.read_csv(annotation_path, index_col=0).astype({'img': 'str'})

In [2]:
def _label_vote_counts(votes):
    """Return the votes of one image as [(label, num_votes), ...].

    `votes` is the Series of labels assigned to a single image. The pairs
    follow the order of `value_counts()`, i.e. most-voted label first.
    `value_counts()` is evaluated only once per image.
    """
    return list(votes.value_counts().items())


# Per image: the list of (label, num_votes) pairs over all its annotations
df2 = df.groupby('img')['label'].agg([('label_counts', _label_vote_counts)]).reset_index()

# Number of distinct labels each image received
num_distinct_labels = df2['label_counts'].map(len)
# Check which images have more than one label related to them
display((num_distinct_labels > 1).value_counts())
# Check which images have more than two labels related to them
display((num_distinct_labels > 2).value_counts())


# Take the most voted label as the best label for the image.
# `max` returns the first maximal entry, so a tie resolves to whichever
# tied label `value_counts()` listed first.
df2['best_label'] = df2['label_counts'].map(lambda counts: max(counts, key = lambda pair: pair[1])[0])
# Drop the label counts since we don't need them anymore
df2 = df2.drop(['label_counts'], axis = 1)
# Add the label of the image as a column to the original dataframe
df_labeled = pd.merge(df, df2, on = 'img')

False    70
True     28
Name: label_counts, dtype: int64

False    97
True      1
Name: label_counts, dtype: int64

In [3]:
# Keep a single row per image (the first one encountered) so that every image
# contributes exactly once to the label distribution shown below.
df_unique = df_labeled.drop_duplicates(subset='img')
# Note: 'Waste transport' is only used twice, a very rare label
df_unique['best_label'].value_counts()

Sea faring/inland shipping       27
Miscellaneous                    22
Eigenwerken/water engineering    16
Waste storage/waste processor    16
Airstrip/aviation                15
Waste transport                   2
Name: best_label, dtype: int64

In [4]:
# Fold the very infrequent 'Waste transport' label into a related, common label
rare_label_mapping = {'Waste transport': 'Waste storage/waste processor'}
df_labeled['best_label'] = df_labeled['best_label'].replace(rare_label_mapping)

In [5]:
# Label distribution (one vote per image) after merging the infrequent label,
# shown together with the shape of the full annotation table
label_distribution = df_labeled.drop_duplicates(subset='img')['best_label'].value_counts()
(label_distribution, df_labeled.shape)

(Sea faring/inland shipping       27
 Miscellaneous                    22
 Waste storage/waste processor    18
 Eigenwerken/water engineering    16
 Airstrip/aviation                15
 Name: best_label, dtype: int64,
 (195, 10))

In [6]:
from sklearn.model_selection import train_test_split
# Build two aligned lists with one entry per unique image: its path and its
# best label. `drop_duplicates` keeps the first row of every image, which
# gives the same order as `unique()` without re-filtering the frame per image.
unique_images = df_labeled.drop_duplicates(subset='img')
images = unique_images['img'].tolist()
labels = unique_images['best_label'].tolist()

print('Num images:', len(images), 'Num labels:', len(labels))
X, y = images, labels

train_size, val_size = 0.6, 0.2 # Specify train, val sizes (fractions of the full dataset)
# Fraction of the held-out (val + test) pool that goes to the test split.
# This is deliberately NOT rounded to one decimal: rounding distorts the split
# for most ratios (e.g. train 0.7 / val 0.1 would give 0.7 instead of 0.667).
test_size = 1 - val_size / (1 - train_size)
print(test_size)

# First split into a train split and the rest (i.e. val and test split together)
train_X, val_test_X, train_y, val_test_y = train_test_split(X, y, stratify = y, train_size = train_size, random_state = 42)

# Split the val-test split into their own splits
if val_size == 0.0:
    # No validation data requested: the whole held-out pool is the test split
    val_X, val_y = [], []
    test_X, test_y = val_test_X, val_test_y
elif test_size <= 1e-9:
    # Train and val together use up all data (up to float noise): the whole
    # held-out pool is the validation split and the test split stays empty.
    val_X, val_y = val_test_X, val_test_y
    test_X, test_y = [], []
else:
    val_X, test_X, val_y, test_y = train_test_split(val_test_X, val_test_y, stratify = val_test_y, test_size = test_size,
                                                    random_state = 42)
len(train_X), len(val_X), len(test_X)

Num images: 98 Num labels: 98
0.5


(58, 20, 20)

In [7]:
np.random.seed(42) # set seed for reproducibility

# Dataset description in a format similar to the other (remote sensing) datasets:
# a dataset name plus one info dictionary per image
ilt_dict = {'dataset': 'ILT', 'images': []}

# Every image exactly once, in order of first appearance in the dataframe
image_paths = df_labeled['img'].unique().tolist()
for abs_image_path in image_paths:
    # Keep only the last two path components (thematic folder + filename) so the
    # stored path is relative instead of absolute.
    # NOTE(review): the components are joined with a backslash, which is the
    # Windows separator - confirm that the readers of dataset.json expect this.
    path_parts = abs_image_path.split('/')
    image_path = '\\'.join(path_parts[-2:])

    # All annotation rows that belong to the current image
    rows_for_image = df_labeled.loc[df_labeled['img'] == abs_image_path]
    image_label = rows_for_image['best_label'].tolist()[0]
    image_captions = rows_for_image['caption'].tolist()

    # Look the image up in the path lists of the splits; anything that is in
    # neither the train nor the val list is treated as test
    if abs_image_path in train_X:
        split = 'train'
    elif abs_image_path in val_X:
        split = 'val'
    else:
        split = 'test'

    # Collect all the relevant information about the image and store it
    ilt_dict['images'].append({
        'filename': image_path,
        'label': image_label,
        'captions': image_captions,
        'split': split,
    })

In [8]:
# Serialize the dataset dictionary to JSON and write it next to the ILT data
data_info_path = './data/ILT_data/dataset.json'
serialized_dataset = json.dumps(ilt_dict)
with open(data_info_path, 'w') as f:
    f.write(serialized_dataset)

In [9]:
# Alphabetically sorted list of the distinct labels that occur in the dataset
labels_lst = sorted({image_info['label'] for image_info in ilt_dict['images']})

In [10]:
# Count how often each label occurs per split, and over all splits together.
# The labels are gathered in a single pass over the image entries.
train_labels, val_labels, test_labels, all_labels = [], [], [], []
labels_per_split = {'train': train_labels, 'val': val_labels, 'test': test_labels}
for image_info in ilt_dict['images']:
    all_labels.append(image_info['label'])
    if image_info['split'] in labels_per_split:
        labels_per_split[image_info['split']].append(image_info['label'])

print(f'Train split label counts\n{Counter(train_labels)}')
print(f'Validation split label counts\n{Counter(val_labels)}')
print(f'Test split label counts\n{Counter(test_labels)}')
print(f'All splits label counts\n{Counter(all_labels)}')

Train split label counts
Counter({'Sea faring/inland shipping': 16, 'Miscellaneous': 13, 'Waste storage/waste processor': 11, 'Eigenwerken/water engineering': 9, 'Airstrip/aviation': 9})
Validation split label counts
Counter({'Miscellaneous': 5, 'Sea faring/inland shipping': 5, 'Waste storage/waste processor': 4, 'Eigenwerken/water engineering': 3, 'Airstrip/aviation': 3})
Test split label counts
Counter({'Sea faring/inland shipping': 6, 'Eigenwerken/water engineering': 4, 'Miscellaneous': 4, 'Airstrip/aviation': 3, 'Waste storage/waste processor': 3})
All splits label counts
Counter({'Sea faring/inland shipping': 27, 'Miscellaneous': 22, 'Waste storage/waste processor': 18, 'Eigenwerken/water engineering': 16, 'Airstrip/aviation': 15})


In [11]:
# Show the first entry as an example of how the data dictionary looks
# (prints nothing when there are no images)
for item in ilt_dict['images'][:1]:
    print(item)

{'filename': 'eigenwerken\\DJI_0060.JPG', 'label': 'Eigenwerken/water engineering', 'captions': ['a construction site around a river and some yellow floating deposit in the river and some boats with an excavator. ', 'construction site of a lock between a canal and the sea'], 'split': 'train'}


# Image resizing

In [12]:
import os
from PIL import Image
ilt_data_path = './data/ILT_data/images/' # original data folder
resized_path = './data/ILT_data/images_resized' # folder for resized images
os.makedirs(resized_path, exist_ok = True) # make sure the folder for resized images exists
folders = os.listdir(ilt_data_path) # list the entries of the original data folder

# Keep track of the image dimensions
img_dims, new_img_dims = [], []

target_h = 224 # Target height for the resized images

# Loop over the image folders in the original data folder
for folder in folders:
    folder_path = os.path.join(ilt_data_path, folder)
    # Only real image folders are processed: hidden entries such as Jupyter's
    # `.ipynb_checkpoints` and stray files in the data folder are skipped, so
    # they are neither mirrored into the resized folder nor listed as a folder.
    if folder.startswith('.') or not os.path.isdir(folder_path):
        continue

    # Mirror the folder in the resized directory
    os.makedirs(os.path.join(resized_path, folder), exist_ok = True)
    # Get the images in the original folder, ignoring hidden entries and sub-directories
    folder_contents = [
        name for name in os.listdir(folder_path)
        if not name.startswith('.') and os.path.isfile(os.path.join(folder_path, name))
    ]
    # Loop over the content (i.e. images) in the image folder
    for content in folder_contents:
        content_path = os.path.join(folder_path, content)

        # Open the image in a context manager so its file handle is released
        # as soon as the image has been processed
        with Image.open(content_path) as img:
            img_w, img_h = img.size

            # Scale the width by the same factor as the height to keep the aspect ratio
            ratio = target_h/img_h
            new_img_w = round(img_w * ratio)

            # Resize the image and store it in the folder for resized images
            img_resized = img.resize((new_img_w, target_h), Image.Resampling.LANCZOS)
            img_resized.save(os.path.join(resized_path, folder, content))

            # Store the dimensions of the original and resized image
            img_dims.append(img.size)
            new_img_dims.append(img_resized.size)
    # Report how many images were processed for this folder
    print(folder, len(folder_contents))

.ipynb_checkpoints 0
afvalhopen 200
eigenwerken 20
luchtvaart 20
overig 30
schepen 187


In [13]:
from collections import Counter
# Tally how often each (width, height) occurs, first for the original images
# and then for the resized ones
for dims in (img_dims, new_img_dims):
    print(Counter(dims))

Counter({(1296, 972): 198, (1000, 750): 120, (1014, 760): 40, (1013, 760): 32, (1000, 562): 30, (960, 540): 13, (2000, 1500): 10, (1368, 912): 7, (1296, 729): 5, (1013, 759): 1, (480, 270): 1})
Counter({(299, 224): 401, (399, 224): 30, (398, 224): 19, (336, 224): 7})
