In [1]:
import os
import time
import numpy as np
import sys
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split

# Put the parent of the current working directory at the front of sys.path so the
# `config` and `src` imports below resolve.
# NOTE(review): this assumes the kernel's working directory is the notebook's own
# folder, one level below the project root — confirm when launching from elsewhere.
notebook_dir = os.getcwd()
project_root_path = os.path.dirname(notebook_dir)
sys.path.insert(0, project_root_path)

from config import PROJECT_ROOT

from src.preprocessing import *
from src.preprocessing.CUB import *
from src.utils import get_filename_to_id_mapping
from src import ImageConceptDataset

## 1. Transform Images to Tensors
Convert each image to a tensor of shape (3, 299, 299).
The tensors are kept in a Python list rather than stacked into one big tensor, which is easier on memory:
- a stacked tensor or NumPy array requires a single, contiguous block of memory
- for all of our image tensors that block would be > 12 GB (all in RAM)

In [2]:
# LOAD AND TRANSFORM IMAGES
# Transform settings.
resol = 299
# NOTE(review): `training` is passed for every image, including those that later
# end up in the test split — confirm this flag does not enable train-only behaviour.
training = True

# Input locations.
input_dir = os.path.join(PROJECT_ROOT, 'images')
images_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'images.txt')

image_tensors, image_paths = load_and_transform_images(
    input_dir,
    images_file,
    resol,
    training,
    batch_size=32,
    verbose=True,
)

Found 11788 images.
Processing in 369 batches of size 32 (for progress reporting)...


Processing batches:   9%|▉         | 35/369 [00:05<00:54,  6.17it/s]


KeyboardInterrupt: 

## 2. Generate concept label and image label matrices

In [None]:
# CREATE CONCEPT LABELS MATRIX
concept_labels_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'image_concept_labels.txt')

# Two results come back: the concept labels and an uncertainty matrix. Later cells
# compare the two element-wise, so they are presumably the same shape — TODO confirm.
concept_labels, uncertainty_matrix = encode_image_concepts(concept_labels_file, verbose=True)

In [None]:
# Collapse identical rows to see how many distinct concept vectors exist.
unique_concept_vectors = np.unique(concept_labels, axis=0)
num_unique_concept_vectors = len(unique_concept_vectors)
print(f"Number of unique concept vectors: {num_unique_concept_vectors}")

In [None]:
# CREATE IMAGE LABELS MATRIX
labels_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'image_class_labels.txt')
classes_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'classes.txt')

# A later cell takes argmax over axis 1 of the split label matrices to recover class
# indices, so this is presumably one row per image and one column per class — TODO confirm.
image_labels = one_hot_encode_labels(labels_file, classes_file, verbose=True)

In [None]:
# Share of (image, concept) cells flagged uncertainty == 1 while the concept label is 0.
uncertain_negative_mask = (uncertainty_matrix == 1) & (concept_labels == 0)
# The mean of a boolean mask is the fraction of True cells; the `.2%` format scales
# it by 100 so the printed number really is the percentage the label promises
# (previously the raw fraction was printed under a "Percentage" label).
uncertain_negative_share = np.mean(uncertain_negative_mask)
print(f"Percentage of instances with uncertainty=1 and concept_label=0: {uncertain_negative_share:.2%}")

## 3. (Optional) Get image_id->filename mapping.
Allows us to check that tensors and label matrices have the same order.

In [None]:
# # GET IMAGE ID TO IMAGE FILENAME MAPPING
# images_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'images.txt')
# image_id_mapping = get_filename_to_id_mapping(images_file, reverse=True)

In [None]:
# i = 4242
# print(f'Filename of image {i}: \n\t{image_id_mapping[i]}')
# print(f"Image {i} has concepts: \n\t{concept_labels[i]}")

# image_idx = image_paths.index(image_id_mapping[i])
# tensor = image_tensors[image_paths.index(image_id_mapping[i])]

# print(f"Tensor index of image {i}: \n\t{image_idx}")
# print(f"Shape of the first tensor: \n\t{tensor.shape}")

In [None]:
# i=10

# image_name = image_paths[i]
# tensor = image_tensors[i]

# print(f"Tensor index {i} has filename: \n\t{image_name}")

# print(f"Filename {image_name} has index: \n\t{list(image_id_mapping.values()).index(image_name)}")
# # print(concept_labels[i])

## 4. Create Train Test Splits using `train_test_split.txt`

In [None]:
# CREATE TRAIN TEST SPLIT USING TXT FILE
split_file = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'train_test_split.txt')

split_data = split_datasets(
    split_file,
    concept_labels,
    image_labels,
    uncertainty_matrix,
    image_tensors,
)

# Training partition (the only one whose uncertainty matrix is used further down).
train_tensors = split_data['train_tensors']
train_concept_labels = split_data['train_concepts']
train_img_labels = split_data['train_img_labels']
train_uncertainty = split_data['train_uncertainty']

# Test partition.
test_tensors = split_data['test_tensors']
test_concept_labels = split_data['test_concepts']
test_img_labels = split_data['test_img_labels']

# Sanity check: the three counts on each line should agree.
print(f"Train set size: {len(train_tensors)} tensors, {train_concept_labels.shape[0]} concepts, {train_img_labels.shape[0]} labels")
print(f"Test set size:  {len(test_tensors)} tensors, {test_concept_labels.shape[0]} concepts, {test_img_labels.shape[0]} labels")

In [None]:
# concept processing
# Only the training split (concept labels, uncertainty, class labels) is passed in,
# so the class-level concepts are derived without looking at the test data.
class_level_concepts = compute_class_level_concepts(train_concept_labels, train_uncertainty, train_img_labels)

In [None]:
# apply class-level concepts to each instance
# Toggle: while False, the instance-level concept labels from the split are left as they are.
class_concepts = False
if class_concepts:
    # Overwrites both the train and the test concept matrices with the helper's result.
    train_concept_labels, test_concept_labels = apply_class_concepts_to_instances(train_img_labels, train_concept_labels, class_level_concepts, test_img_labels, test_concept_labels)

In [None]:
# Keep only the concept columns selected by select_common_concepts (min_class_count=10).
common_concept_indices = select_common_concepts(class_level_concepts, min_class_count=10)
# NOTE(review): both matrices are overwritten with a column subset of themselves, so
# this cell is not idempotent — running it a second time indexes the already-reduced
# matrices. Re-run the split cell first if this cell needs to be executed again.
train_concept_labels = train_concept_labels[:, common_concept_indices]
test_concept_labels = test_concept_labels[:, common_concept_indices]

# INVESTIGATE INSTANCE DIFFERENCES

In [None]:
# Class index per instance: position of the maximum along axis 1 of each label matrix.
Y_train, Y_test = (
    np.argmax(label_matrix, axis=1)
    for label_matrix in (train_img_labels, test_img_labels)
)

In [None]:
print("---TRAIN---")
threshold = 0.9

# Bucket the training row indices by class label.
indices_by_label = {}
for row, label in enumerate(Y_train):
    indices_by_label.setdefault(label, []).append(row)

# Per class: how many (instance pair, concept) combinations differ by more than `threshold`.
diff_concepts_by_label = {}
for label, members in indices_by_label.items():
    class_block = train_concept_labels[members]
    mismatches = 0
    # Compare row `pos` with every later row of the class so each unordered pair is counted once.
    for pos in range(len(members) - 1):
        gaps = np.abs(class_block[pos] - class_block[pos + 1:])
        mismatches += np.sum(gaps > threshold)
    diff_concepts_by_label[label] = mismatches

# Print results for each label
total_diff_concepts = sum(diff_concepts_by_label.values())
# print(f"Different concepts by label:")
# for label, count in sorted(diff_concepts_by_label.items()):
#     print(f"  Label {label}: {count} different concepts")
print(f"There are {total_diff_concepts} different concepts in total.")

In [None]:
# Compare training instance 0 against every training instance labelled class 0.
# NOTE(review): relies on `threshold` from the pairwise TRAIN cell above, and the
# "Class 0" total is only a within-class figure if training index 0 itself belongs
# to class 0 — confirm.
reference_concepts = train_concept_labels[0]
difference_in_class = 0
for i, y in enumerate(Y_train):
    if y != 0:
        continue
    # Use the absolute difference so a mismatch in either direction counts; this
    # matches the pairwise comparison cells (the signed difference only counted
    # concepts larger in image 0 than in image i).
    n_different = np.sum(np.abs(reference_concepts - train_concept_labels[i]) > threshold)
    print(f" Image 0 vs Image {i}: {n_different}")
    difference_in_class += n_different

print("Class 0:", difference_in_class)

In [None]:
print("---TEST---")
threshold = 0.9

# Bucket the test row indices by class label.
indices_by_label = {}
for row, label in enumerate(Y_test):
    indices_by_label.setdefault(label, []).append(row)

# Per class: how many (instance pair, concept) combinations differ by more than `threshold`.
diff_concepts_by_label = {}
for label, members in indices_by_label.items():
    class_block = test_concept_labels[members]
    mismatches = 0
    # Compare row `pos` with every later row of the class so each unordered pair is counted once.
    for pos in range(len(members) - 1):
        gaps = np.abs(class_block[pos] - class_block[pos + 1:])
        mismatches += np.sum(gaps > threshold)
    diff_concepts_by_label[label] = mismatches

# Print results for each label
total_diff_concepts = sum(diff_concepts_by_label.values())
# print(f"Different concepts by label:")
# for label, count in sorted(diff_concepts_by_label.items()):
#     print(f"  Label {label}: {count} different concepts")
print(f"There are {total_diff_concepts} different concepts in total.")

## 5. Create Train and Test Datasets

In [None]:
# Wrap the training tensors, concept labels and class labels in one dataset object.
# The concept labels passed here are the ones left after the concept-processing cells above.
train_dataset = ImageConceptDataset(
    image_tensors=train_tensors,
    concept_labels=train_concept_labels,
    image_labels=train_img_labels
)
print(f"Train dataset length: {len(train_dataset)}")

# Same construction for the held-out test split.
test_dataset = ImageConceptDataset(
    image_tensors=test_tensors,
    concept_labels=test_concept_labels,
    image_labels=test_img_labels
)
print(f"Test dataset length: {len(test_dataset)}")

**Test `__getitem__`**


In [None]:
concept_names_path = os.path.join(PROJECT_ROOT, 'data', 'CUB', 'concepts.txt')
# Only needed by the commented-out filename lookup further down.
image_id_mapping = get_filename_to_id_mapping(images_file, reverse=True)

item_index = 10
if item_index >= len(train_dataset):
    print(f"Index {item_index} is out of bounds.")
else:
    # One dataset item unpacks into four values.
    img_tensor, concepts, img_label, img_id = train_dataset[item_index]
    concept_vector = concepts.numpy()

    print(f"Item at index {item_index}:")
    print(f"\tImage Tensor Shape: {img_tensor.shape}")
    print(f"\tConcept Labels Shape: {concepts.shape}")
    print(f"\tImage Label Shape: {img_label.shape}\n")

    # print(f"\tImage ID: {img_id}")
    # print(f"\tFilename (lookup): {image_id_mapping.get(img_id)}\n")

    print(f"\tConcept vector (first 10): {concept_vector[:10]}")
    print(f"\tHas {concept_vector.sum()} true concepts")
    print(f"\tHas concepts: {get_concepts(concept_vector, concept_names_path)}\n")

    # +1 converts the zero-based argmax position into a one-based class number.
    print(f"\tImage Class: {np.argmax(img_label.numpy())+1}")

## 6. Create Train and Test DataLoaders
These allow us to generate batches of data.

In [None]:
batch_size = 64
# Options shared by both loaders.
#   pin_memory optimises data transfer from CPU to GPU
loader_options = dict(batch_size=batch_size, num_workers=4, pin_memory=True, drop_last=False)

# Shuffle training data
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
print(f"Train DataLoader created with batch size {batch_size}.")

# Do NOT shuffle val or test data
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)
print(f"Test DataLoader created with batch size {batch_size}.")