In [2]:
import os
import time
import numpy as np
import sys
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split

notebook_dir = os.getcwd()
project_root_path = os.path.dirname(notebook_dir)
sys.path.insert(0, project_root_path)

from config import PROJECT_ROOT

from src.preprocessing import *
from src.utils import get_filename_to_id_mapping
from src.dataset import ImageConceptDataset

## 1. Transform Images to Tensors
Convert each image to a tensor of shape - (3, 299, 299).
All tensors are stored in a list to improve efficiency. 
- tensors and np arrays require a single, contiguous block of memory
- would be > 12GB with all of our image tensors (all in ram)

In [6]:
# LOAD AND TRANSFORM IMAGES
input_dir = os.path.join(PROJECT_ROOT, 'images')
resol = 299
training = True
images_file = os.path.join(PROJECT_ROOT, 'data', 'images.txt')

image_tensors, image_paths = load_and_transform_images(input_dir, images_file, resol, training, batch_size=32, verbose=True)

Found 11788 images.
Processing in 369 batches of size 32 (for progress reporting)...


Processing batches: 100%|██████████| 369/369 [02:11<00:00,  2.81it/s]


Finished processing.
Successfully transformed: 11788 images.





## 2. Generate concept label and image label matrices

In [3]:
# CREATE CONCEPT LABELS MATRIX
concept_labels_file = os.path.join(PROJECT_ROOT, 'data', 'image_concept_labels.txt')

concept_labels, uncertainty_matrix = encode_image_concepts(concept_labels_file, verbose=True)

Found 11788 unique images.
Found 312 unique concepts.
Generated concept matrix with shape: (11788, 312)


In [5]:
# CREATE IMAGE LABELS MATRIX
labels_file = os.path.join(PROJECT_ROOT, 'data', 'image_class_labels.txt')
classes_file = os.path.join(PROJECT_ROOT, 'data', 'classes.txt')

image_labels = one_hot_encode_labels(labels_file, classes_file, verbose=True)

Found 200 classes.
Found labels for 11788 images.
Generated one-hot matrix with shape: (11788, 200)


In [10]:
np.sum((uncertainty_matrix == 1)&(concept_labels==0))/(uncertainty_matrix.shape[0]*uncertainty_matrix.shape[1])

np.float64(0.10737397005211732)

## 3. (Optional) Get image_id->filename mapping.
Allows us to check that tensors and label matrices have the same order.

In [11]:
# GET IMAGE ID TO IMAGE FILENAME MAPPING
images_file = os.path.join(PROJECT_ROOT, 'data', 'images.txt')
image_id_mapping = get_filename_to_id_mapping(images_file, reverse=True)

In [12]:
i = 4242
print(f'Filename of image {i}: \n\t{image_id_mapping[i]}')
print(f"Image {i} has concepts: \n\t{concept_labels[i]}")

image_idx = image_paths.index(image_id_mapping[i])
tensor = image_tensors[image_paths.index(image_id_mapping[i])]

print(f"Tensor index of image {i}: \n\t{image_idx}")
print(f"Shape of the first tensor: \n\t{tensor.shape}")

Filename of image 4242: 
	073.Blue_Jay/Blue_Jay_0002_62657.jpg
Image 4242 has concepts: 
	[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0]
Tensor index of image 4242: 
	4242
Shape of the first tensor: 
	torch.Size([3, 299, 299])


In [13]:
i=10

image_name = image_paths[i]
tensor = image_tensors[i]

print(f"Tensor index {i} has filename: \n\t{image_name}")

print(f"Filename {image_name} has index: \n\t{list(image_id_mapping.values()).index(image_name)}")
# print(concept_labels[i])

Tensor index 10 has filename: 
	001.Black_footed_Albatross/Black_Footed_Albatross_0023_796059.jpg
Filename 001.Black_footed_Albatross/Black_Footed_Albatross_0023_796059.jpg has index: 
	10


## 4. Create Train Test Splits using `train_test_split.txt`

In [30]:
# CREATE TRAIN TEST SPLIT USING TXT FILE
split_file = os.path.join(PROJECT_ROOT, 'data', 'train_test_split.txt')

split_data = split_datasets(split_file, concept_labels, image_labels, uncertainty_matrix, image_tensors)

train_concept_labels = split_data['train_concepts']
test_concept_labels = split_data['test_concepts']

train_img_labels = split_data['train_img_labels']
test_img_labels = split_data['test_img_labels']

train_uncertainty = split_data['train_uncertainty']

train_tensors = split_data['train_tensors']
test_tensors = split_data['test_tensors']

print(f"Train set size: {len(train_tensors)} tensors, {train_concept_labels.shape[0]} concepts, {train_img_labels.shape[0]} labels")
print(f"Test set size:  {len(test_tensors)} tensors, {test_concept_labels.shape[0]} concepts, {test_img_labels.shape[0]} labels")

Split complete: 5994 train images, 5794 test images.
Train set size: 5994 tensors, 5994 concepts, 5994 labels
Test set size:  5794 tensors, 5794 concepts, 5794 labels


In [43]:
class_level_concepts = compute_class_level_concepts(train_concept_labels, train_uncertainty, train_img_labels)

common_concept_indices = select_common_concepts(class_level_concepts, min_class_count=10)

print(f"Selected {len(common_concept_indices)} common concept indices.")

[9 8 9 9 9 9]
Selected 109 common concept indices.


In [33]:
theirs = set([1, 4, 6, 7, 10, 14, 15, 20, 21, 23, 25, 29, 30, 35, 36, 38, 40, 44, 45, 50, 51, 53, 54, 56, 57, 59, 63, 64, 69, 70, 72, 75, 80, 84, 90, 91, \
    93, 99, 101, 106, 110, 111, 116, 117, 119, 125, 126, 131, 132, 134, 145, 149, 151, 152, 153, 157, 158, 163, 164, 168, 172, 178, 179, 181, \
    183, 187, 188, 193, 194, 196, 198, 202, 203, 208, 209, 211, 212, 213, 218, 220, 221, 225, 235, 236, 238, 239, 240, 242, 243, 244, 249, 253, \
    254, 259, 260, 262, 268, 274, 277, 283, 289, 292, 293, 294, 298, 299, 304, 305, 308, 309, 310, 311])
mine = set(common_concept_indices.astype(int))

print(theirs - mine) # 225, 99, 198, 309, 152, 125
print(mine-theirs) # 104, 173, 182
print(mine)

{225, 99, 198, 309, 152, 125}
{np.int64(104), np.int64(173), np.int64(182)}
{np.int64(1), np.int64(4), np.int64(6), np.int64(7), np.int64(10), np.int64(14), np.int64(15), np.int64(20), np.int64(21), np.int64(23), np.int64(25), np.int64(29), np.int64(30), np.int64(35), np.int64(36), np.int64(38), np.int64(40), np.int64(44), np.int64(45), np.int64(50), np.int64(51), np.int64(53), np.int64(54), np.int64(56), np.int64(57), np.int64(59), np.int64(63), np.int64(64), np.int64(69), np.int64(70), np.int64(72), np.int64(75), np.int64(80), np.int64(84), np.int64(90), np.int64(91), np.int64(93), np.int64(101), np.int64(104), np.int64(106), np.int64(110), np.int64(111), np.int64(116), np.int64(117), np.int64(119), np.int64(126), np.int64(131), np.int64(132), np.int64(134), np.int64(145), np.int64(149), np.int64(151), np.int64(153), np.int64(157), np.int64(158), np.int64(163), np.int64(164), np.int64(168), np.int64(172), np.int64(173), np.int64(178), np.int64(179), np.int64(181), np.int64(182), np.i

In [20]:
train_concept_labels = train_concept_labels[:, common_concept_indices]
print(f"Filtered instance concepts shape: {train_concept_labels.shape}")
test_concept_labels = test_concept_labels[:, common_concept_indices]
print(f"Filtered instance concepts shape: {test_concept_labels.shape}")

Filtered instance concepts shape: (5994, 109)
Filtered instance concepts shape: (5794, 109)


## 5. Create Train and Test Datasets

In [None]:
full_train_dataset = ImageConceptDataset(
    image_tensors=train_tensors,
    concept_labels=train_concept_labels,
    image_labels=train_img_labels
)
print(f"Train dataset length: {len(full_train_dataset)}")

test_dataset = ImageConceptDataset(
    image_tensors=test_tensors,
    concept_labels=test_concept_labels,
    image_labels=test_img_labels
)
print(f"Test dataset length: {len(test_dataset)}")

Dataset initialized with 5994 pre-sorted items.
Train dataset length: 5994
Dataset initialized with 5794 pre-sorted items.
Test dataset length: 5794


**get validation set**

In [12]:
val_proportion = 0.20
all_indices = list(range(len(full_train_dataset)))
# Assuming you can get all class labels for the training set
all_train_labels = full_train_dataset.get_labels()

train_indices, val_indices, _, _ = train_test_split(
    all_indices,
    all_train_labels,
    test_size=val_proportion,
    random_state=42, # for reproducibility
    stratify=all_train_labels
)

train_dataset = Subset(full_train_dataset, train_indices)
val_dataset = Subset(full_train_dataset, val_indices)

**Test __getitem__**


In [13]:
concept_names_path = os.path.join(PROJECT_ROOT, 'data', 'concepts.txt')
image_id_mapping = get_filename_to_id_mapping(images_file, reverse=True)

item_index = 10
if item_index < len(train_dataset):
    img_tensor, concepts, img_label, img_id = train_dataset[item_index]
    print(f"Item at index {item_index}:")
    print(f"\tImage Tensor Shape: {img_tensor.shape}")
    print(f"\tConcept Labels Shape: {concepts.shape}")
    print(f"\tImage Label Shape: {img_label.shape}\n")

    # print(f"\tImage ID: {img_id}")
    # print(f"\tFilename (lookup): {image_id_mapping.get(img_id)}\n")

    print(f"\tConcept vector (first 10): {concepts[:10].numpy()}")
    print(f"\tHas {concepts.numpy().sum()} true concepts")
    print(f"\tHas concepts: {get_concepts(concepts.numpy(), concept_names_path)}\n")

    print(f"\tImage Class: {np.argmax(img_label.numpy())+1}")
else:
    print(f"Index {item_index} is out of bounds.")

Item at index 10:
	Image Tensor Shape: torch.Size([3, 299, 299])
	Concept Labels Shape: torch.Size([312])
	Image Label Shape: torch.Size([200])

	Concept vector (first 10): [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
	Has 34.0 true concepts
	Has concepts: ['has_bill_shape::cone', 'has_wing_color::brown', 'has_wing_color::buff', 'has_upperparts_color::brown', 'has_upperparts_color::buff', 'has_underparts_color::grey', 'has_breast_pattern::solid', 'has_back_color::brown', 'has_back_color::buff', 'has_tail_shape::notched_tail', 'has_upper_tail_color::brown', 'has_upper_tail_color::buff', 'has_head_pattern::eyeline', 'has_breast_color::grey', 'has_throat_color::white', 'has_eye_color::black', 'has_bill_length::shorter_than_head', 'has_forehead_color::grey', 'has_under_tail_color::brown', 'has_under_tail_color::buff', 'has_nape_color::grey', 'has_nape_color::black', 'has_belly_color::grey', 'has_size::very_small_(3_-_5_in)', 'has_shape::perching-like', 'has_back_pattern::striped', 'has_tail_pattern::st

## 5. Create Train and Test DataLoaders
These allow us to generate batches of data.

In [None]:
batch_size = 64
# Shuffle training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
print(f"Train DataLoader created with batch size {batch_size}.")
# Do NOT shuffle val or test data
#   pin_memory optimises data transfer from CPU to GPU
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)
print(f"Validation DataLoader created with batch size {batch_size}.")
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)
print(f"Test DataLoader created with batch size {batch_size}.")

Train DataLoader created with batch size 32.
Validation DataLoader created with batch size 32.
Test DataLoader created with batch size 32.


In [15]:
# Get one batch
for batch_idx, (batch_tensors, batch_concepts, batch_labels, batch_ids) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print(f"\tTensor Batch Shape: {batch_tensors.shape}")
    print(f"\tConcepts Batch Shape: {batch_concepts.shape}")
    print(f"\tLabels Batch Shape: {batch_labels.shape}")
    print(f"\tBatch IDs: {batch_ids}")
    break

KeyboardInterrupt: 