In [1]:
import os
import time
import numpy as np
import sys

# The notebook's working directory is assumed to sit one level below the
# project root; put that parent directory first on the import path so the
# project-local `config` and `src` imports can be resolved.
notebook_dir = os.getcwd()
project_root_path = os.path.split(notebook_dir)[0]
sys.path[:0] = [project_root_path]

from config import PROJECT_ROOT

from src.preprocessing import *
from src.utils import get_filename_to_id_mapping

## 1. Transform Images to Tensors
Convert each image to a tensor of shape (3, 299, 299).
The tensors are kept in a Python list rather than stacked into one large tensor or NumPy array, because:
- a single tensor or NumPy array requires one contiguous block of memory
- stacking all of our image tensors would need more than 12 GB

In [2]:
# LOAD AND TRANSFORM IMAGES
# Inputs for the loader: the image folder, the image listing file, the target
# resolution, and whether the training-time transformations should be used.
images_file = os.path.join(PROJECT_ROOT, 'data', 'images.txt')
input_dir = os.path.join(PROJECT_ROOT, 'images')
resol = 299
training = True

# Returns the transformed tensors together with their paths; the two
# sequences are indexed in parallel by the sanity-check cells further down.
image_tensors, image_paths = load_and_transform_images(
    input_dir,
    images_file,
    resol,
    training,
    batch_size=32,
    verbose=True,
    dev=False,
)

Using TRAINING transformations:
Found 11788 images.
Processing in 369 batches of size 32 (for progress reporting)...


Processing batches: 100%|██████████| 369/369 [01:02<00:00,  5.94it/s]


Finished processing.
Successfully transformed: 11788 images.





## 2. Generate concept label and image label matrices

In [3]:
# CREATE CONCEPT LABELS MATRIX
# One row per image, one column per concept (the verbose output reports the
# resulting matrix shape).
concept_labels_file = os.path.join(PROJECT_ROOT, 'data', 'image_concept_labels.txt')
concept_labels = encode_image_concepts(
    concept_labels_file,
    verbose=True,
)

Found 11788 unique images.
Found 312 unique concepts.
Generated concept matrix with shape: (11788, 312)


In [4]:
# CREATE IMAGE LABELS MATRIX
# One-hot class labels built from the per-image label file and the class list.
classes_file = os.path.join(PROJECT_ROOT, 'data', 'classes.txt')
labels_file = os.path.join(PROJECT_ROOT, 'data', 'image_class_labels.txt')

image_labels = one_hot_encode_labels(
    labels_file,
    classes_file,
    verbose=True,
)

Found 200 classes.
Found labels for 11788 images.
Generated one-hot matrix with shape: (11788, 200)


## 3. (Optional) Get image_id->filename mapping.
Allows us to check that tensors and label matrices have the same order.

In [5]:
# GET IMAGE ID TO IMAGE FILENAME MAPPING
# reverse=True is requested so the result can be indexed by image id to get a
# filename, which is how the cells below use it.
images_file = os.path.join(PROJECT_ROOT, 'data', 'images.txt')
image_id_mapping = get_filename_to_id_mapping(
    images_file,
    reverse=True,
)

In [6]:
# Spot-check one image id: show its filename and concept vector, then confirm
# where its tensor sits in `image_tensors` by looking the filename up in
# `image_paths`.
i = 4242
image_filename = image_id_mapping[i]
print(f'Filename of image {i}: \n\t{image_filename}')
print(f"Image {i} has concepts: \n\t{concept_labels[i]}")

# `.index` scans the sequence, so do the lookup once and reuse the position
# instead of repeating the search for the tensor fetch.
image_idx = image_paths.index(image_filename)
tensor = image_tensors[image_idx]

print(f"Tensor index of image {i}: \n\t{image_idx}")
# This is the tensor belonging to image `i`, not the first tensor in the list.
print(f"Shape of the tensor for image {i}: \n\t{tensor.shape}")

Filename of image 4242: 
	073.Blue_Jay/Blue_Jay_0002_62657.jpg
Image 4242 has concepts: 
	[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0]
Tensor index of image 4242: 
	4242
Shape of the first tensor: 
	torch.Size([3, 299, 299])


In [7]:
# Reverse direction of the previous check: start from a position in the
# tensor list and find where its filename appears among the mapping's values.
i = 10

tensor = image_tensors[i]
image_name = image_paths[i]

print(f"Tensor index {i} has filename: \n\t{image_name}")

mapped_filenames = list(image_id_mapping.values())
print(f"Filename {image_name} has index: \n\t{mapped_filenames.index(image_name)}")

Tensor index 10 has filename: 
	001.Black_footed_Albatross/Black_Footed_Albatross_0023_796059.jpg
Filename 001.Black_footed_Albatross/Black_Footed_Albatross_0023_796059.jpg has index: 
	10


## 4. Create Train Test Splits using `train_test_split.txt`

In [8]:
# CREATE TRAIN TEST SPLIT USING TXT FILE
split_file = os.path.join(PROJECT_ROOT, 'data', 'train_test_split.txt')
split_data = split_datasets(split_file, concept_labels, image_labels, image_tensors)

# Pull each train/test pair out of the returned dict.
train_concepts, test_concepts = split_data['train_concepts'], split_data['test_concepts']
train_img_labels, test_img_labels = split_data['train_img_labels'], split_data['test_img_labels']
train_tensors, test_tensors = split_data['train_tensors'], split_data['test_tensors']

# The three counts on each line should agree if the split kept tensors,
# concepts and labels aligned.
print(
    f"Train set size: {len(train_tensors)} tensors, {train_concepts.shape[0]} concepts, {train_img_labels.shape[0]} labels"
)
print(
    f"Test set size:  {len(test_tensors)} tensors, {test_concepts.shape[0]} concepts, {test_img_labels.shape[0]} labels"
)

Split complete: 5994 train images, 5794 test images.
Train set size: 5994 tensors, 5994 concepts, 5994 labels
Test set size:  5794 tensors, 5794 concepts, 5794 labels


## 5. Create Train and Test Datasets

In [9]:
from src.dataset import ImageConceptDataset

print("\n--- Creating Datasets ---")

# Wrap the train split (tensors, concept labels, class labels) in a dataset.
train_dataset = ImageConceptDataset(image_tensors=train_tensors, concept_labels=train_concepts, image_labels=train_img_labels)
print(f"Train dataset length: {len(train_dataset)}")

# Same wrapping for the test split.
test_dataset = ImageConceptDataset(image_tensors=test_tensors, concept_labels=test_concepts, image_labels=test_img_labels)
print(f"Test dataset length: {len(test_dataset)}")


--- Creating Datasets ---
Dataset initialized with 5994 pre-sorted items.
Train dataset length: 5994
Dataset initialized with 5794 pre-sorted items.
Test dataset length: 5794


In [28]:
# --- Test __getitem__ ---
# Fetch a single training item and print its pieces so they can be compared
# by eye against the id -> filename mapping and the concept names file.
concept_names_path = os.path.join(PROJECT_ROOT, 'data', 'concepts.txt')

item_index = 10
if item_index >= len(train_dataset):
    print(f"Index {item_index} is out of bounds.")
else:
    # Each item unpacks into four parts: image tensor, concept labels,
    # image label, and image id.
    img_tensor, concepts, img_label, img_id = train_dataset[item_index]

    print(f"Item at index {item_index}:")
    print(f"\tImage Tensor Shape: {img_tensor.shape}")
    print(f"\tConcept Labels Shape: {concepts.shape}")
    print(f"\tImage Label Shape: {img_label.shape}\n")

    print(f"\tImage ID: {img_id}")
    print(f"\tFilename (lookup): {image_id_mapping.get(img_id)}\n")

    print(f"\tConcept vector (first 10): {concepts[:10].numpy()}")
    concept_vector = concepts.numpy()
    print(f"\tHas {concept_vector.sum()} true concepts")
    print(f"\tHas concepts: {get_concepts(concept_vector, concept_names_path)}\n")

    # argmax gives the 0-based position of the hot entry; +1 reports it as a
    # 1-based class number.
    print(f"\tImage Class: {np.argmax(img_label.numpy())+1}")


Item at index 10:
	Image Tensor Shape: torch.Size([3, 299, 299])
	Concept Labels Shape: torch.Size([312])
	Image Label Shape: torch.Size([200])

	Image ID: 11
	Filename (lookup): 001.Black_footed_Albatross/Black_Footed_Albatross_0086_796062.jpg

	Concept vector (first 10): [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
	Has 22.0 true concepts
	Has concepts: ['has_bill_shape::hooked_seabird', 'has_wing_color::black', 'has_upperparts_color::black', 'has_breast_pattern::solid', 'has_back_color::black', 'has_upper_tail_color::black', 'has_head_pattern::eyeline', 'has_breast_color::grey', 'has_throat_color::black', 'has_eye_color::black', 'has_bill_length::about_the_same_as_head', 'has_forehead_color::grey', 'has_under_tail_color::black', 'has_nape_color::grey', 'has_size::medium_(9_-_16_in)', 'has_shape::duck-like', 'has_back_pattern::solid', 'has_tail_pattern::solid', 'has_primary_color::black', 'has_bill_color::grey', 'has_crown_color::grey', 'has_wing_pattern::solid']

	Image Class: 1


## 6. Create Train and Test DataLoaders
These allow us to generate batches of data.

In [11]:
from torch.utils.data import DataLoader

batch_size = 32
print("\n--- Creating DataLoaders ---")

# Settings shared by both loaders; only the shuffle flag differs.
loader_options = dict(batch_size=batch_size, num_workers=4, pin_memory=True)

# Training batches are drawn in shuffled order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
# Test batches keep the dataset order, as is usual for evaluation.
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"Train DataLoader created with batch size {batch_size}.")
print(f"Test DataLoader created with batch size {batch_size}.")


--- Creating DataLoaders ---
Train DataLoader created with batch size 32.
Test DataLoader created with batch size 32.


In [12]:
# Get one batch
# Pull the first batch from the training loader, report the shape of each
# component, and stop.
for batch_idx, batch in enumerate(train_loader):
    batch_tensors, batch_concepts, batch_labels, batch_ids = batch
    report_lines = [
        f"Batch {batch_idx + 1}:",
        f"\tTensor Batch Shape: {batch_tensors.shape}",
        f"\tConcepts Batch Shape: {batch_concepts.shape}",
        f"\tLabels Batch Shape: {batch_labels.shape}",
        f"\tBatch IDs: {batch_ids}",
    ]
    print("\n".join(report_lines))
    break

Batch 1:
	Tensor Batch Shape: torch.Size([32, 3, 299, 299])
	Concepts Batch Shape: torch.Size([32, 312])
	Labels Batch Shape: torch.Size([32, 200])
	Batch IDs: tensor([2164, 3313, 1355, 3610,  629, 3327, 5402,  857, 4007, 5432, 2211, 4080,
        2823, 5452, 2290, 2743, 5109,   42, 2577, 4940, 1061, 3900, 4603,  651,
        5260, 5354, 3196, 1505,  821,  366, 5468, 1210])
