In [None]:
import pandas as pd
import numpy as np
import os
# Absolute path three directory levels above the current working directory.
# NOTE(review): "../../../" is resolved against the kernel's working directory,
# so this presumably expects the notebook to be launched from its own folder — confirm.
PROJECT_ROOT = os.path.abspath("../../../")

from derm7pt.dataset import Derm7PtDataset

In [None]:
# Image and metadata folders both live under a 'Derm7pt' sub-directory of the project root.
dir_images, dir_data = (
    os.path.join(PROJECT_ROOT, top_level, 'Derm7pt') for top_level in ('images', 'data')
)

# Metadata table plus one index file per split.
path_meta_csv = os.path.join(dir_data, 'meta.csv')
path_train_idx_csv, path_valid_idx_csv, path_test_idx_csv = (
    os.path.join(dir_data, csv_name)
    for csv_name in ('train_indexes.csv', 'valid_indexes.csv', 'test_indexes.csv')
)

metadata_df_original = pd.read_csv(path_meta_csv)

# Each index file contributes its 'indexes' column as a plain Python list.
train_indexes, valid_indexes, test_indexes = (
    list(pd.read_csv(idx_csv)['indexes'])
    for idx_csv in (path_train_idx_csv, path_valid_idx_csv, path_test_idx_csv)
)

In [None]:
# For granular labels:
# Build the Derm7pt dataset handler from the image directory, the metadata table
# and the three lists of split indexes loaded above.
dataset_handler = Derm7PtDataset(
    dir_images=dir_images,
    metadata_df=metadata_df_original.copy(), # Pass a copy as the class modifies it
    train_indexes=train_indexes,
    valid_indexes=valid_indexes,
    test_indexes=test_indexes
)

# Reaching this line means the constructor returned without raising.
print("\nDataset handler initialized successfully!")

In [None]:
# Preview the processed DataFrame. Leaving the frame as the cell's last expression
# lets Jupyter render it as a rich table instead of the plain-text dump print() gives.
print("First 5 rows of the processed DataFrame:")
dataset_handler.df.head()

In [None]:
# List the column index of the handler's processed DataFrame.
print("\nColumns in the processed DataFrame:")
print(dataset_handler.df.columns)
# You'll see original columns and new '_numeric' columns
# e.g., 'diagnosis' and 'diagnosis_numeric'

In [None]:
# Ask the handler to report its dataset statistics
# (exact content is defined by Derm7PtDataset.dataset_stats, not shown here).
dataset_handler.dataset_stats()

In [None]:
# Show the tag table held by the handler; its abbreviations are the keys used
# with get_labels() / get_label_by_abbrev() further down.
print("\nAvailable tags (categories of concepts):")
print(dataset_handler.tags) # Shows 'DIAG', 'PN', etc.

In [None]:
# Fetch the 'DIAG' labels for all rows in their integer-coded form (one_hot=False).
diag_labels_numeric = dataset_handler.get_labels(data_type='all', one_hot=False)['DIAG']
print(f"\nNumeric labels for 'DIAG' (Diagnosis) for all images (first 10): \n{diag_labels_numeric.head(10)}")

# Same labels again, this time one-hot encoded (one_hot=True).
diag_labels_one_hot = dataset_handler.get_labels(data_type='all', one_hot=True)['DIAG']
print(f"\nOne-hot encoded labels for 'DIAG' for all images (first 5 rows): \n{diag_labels_one_hot[:5]}")

# To understand what the numeric/one-hot labels mean:
diagnosis_definitions = dataset_handler.get_label_by_abbrev('DIAG')
print(f"\nDefinitions for 'DIAG' labels: \n{diagnosis_definitions}")

In [None]:
# Row of the handler's DataFrame whose dermoscopic image is loaded below.
idx = 10

image_derm = dataset_handler.derm_image(row_index=idx) # Loads the dermoscopic image for row `idx` of the df
print(f"\nShape (size) of dermoscopic image {idx}: {image_derm.shape}")
# You can use matplotlib to display it if you're in an environment that supports it
# import matplotlib.pyplot as plt
# plt.imshow(image_derm)
# plt.title("Dermoscopic Image")
# plt.show()

# Get Concept Matrix

In [None]:
# all_data = dataset_handler._get_data_frame()
# One-hot labels for every row, as a mapping keyed by tag abbreviation.
all_data = dataset_handler.get_labels(data_type='all', one_hot=True)

# Diagnosis block; its shape is displayed as the cell output.
Y = all_data['DIAG']
Y.shape

In [None]:
# Build the concept matrix by placing the one-hot blocks of the concept
# categories side by side (horizontal concatenation along axis 1).
#
# The first entry of `all_data` is skipped and the next 7 entries are used, so
# this depends on the insertion order of the dict returned by get_labels().
# NOTE(review): presumably the skipped first entry is 'DIAG' — confirm.
NUM_CONCEPT_CATEGORIES = 7

# Materialise the values once instead of rebuilding the list on every iteration.
label_blocks = list(all_data.values())
concept_blocks = label_blocks[1:1 + NUM_CONCEPT_CATEGORIES]
assert len(concept_blocks) == NUM_CONCEPT_CATEGORIES, (
    f"expected {NUM_CONCEPT_CATEGORIES} concept categories after the first entry, "
    f"found {len(concept_blocks)}"
)

# A single hstack avoids re-copying the growing matrix once per category.
concepts_matrix = np.hstack(concept_blocks)

# Check the final shape
print(f"Final concepts_matrix shape: {concepts_matrix.shape}")

In [None]:
# Rebuild the concept matrix tag by tag so that every column has a known
# (tag, name) meaning.
concept_blocks = []
concept_meanings = []

# Instead of using numeric indices, explicitly iterate through the tags
for tag in dataset_handler.tags.abbrevs:
    # NOTE(review): the 'DIAG' skip below is disabled, so the 'DIAG' columns are
    # part of this matrix — confirm that is intended.
    # if tag == 'DIAG':
    #     continue
    # Get the one-hot encoded matrix for this tag
    feature_set = all_data[tag]
    concept_blocks.append(feature_set)

    # Get the definitions for this tag
    tag_definitions = dataset_handler.get_label_by_abbrev(tag)

    # Store the meaning of each column for this tag
    concept_meanings.extend(
        (tag, tag_definitions.names[col]) for col in range(feature_set.shape[1])
    )

# Concatenate all blocks horizontally in one step (axis 1)
concepts_matrix = np.hstack(concept_blocks)

# concept_meanings = np.array(concept_meanings)
# Verify we have the correct number of mappings
assert concepts_matrix.shape[1] == len(concept_meanings), "column/meaning count mismatch"
print(f"Total number of concept columns: {concepts_matrix.shape[1]}")
print(f"Total number of concept meanings: {len(concept_meanings)}")

# Print a few examples to verify (slicing keeps this safe with fewer than 5 columns)
for col, (concept_tag, concept_name) in enumerate(concept_meanings[:5]):
    print(f"Column {col}: {concept_tag}-{concept_name}")


In [None]:
# concepts_present = np.where(concepts_matrix[0] == 1)[0]

# print(concept_meanings[concepts_present])

# Image Label Encoding

In [1]:
# Display the clinical and dermoscopic image path columns.
# NOTE(review): the recorded output is a NameError — this cell was executed before
# `dataset_handler` existed in the kernel; re-run the notebook top to bottom.
dataset_handler.df[['clinic', 'derm']]

NameError: name 'dataset_handler' is not defined

In [2]:
# Column views used to build the per-case lookup below.
clinic_imgs = dataset_handler.df['clinic']
derm_imgs = dataset_handler.df['derm']
case_nums = dataset_handler.df['case_num']

# Map every case number to its clinical and dermoscopic image entries.
case_images_dict = {
    case: {'clinic_img': clinic, 'derm_img': derm}
    for case, clinic, derm in zip(case_nums, clinic_imgs, derm_imgs)
}

# Print a sample to verify (first 5 entries only; dumping the whole mapping
# floods the cell output)
print(dict(list(case_images_dict.items())[:5]))

NameError: name 'dataset_handler' is not defined

In [None]:
# Upper-cased first '/'-separated component of every dermoscopic image path.
first_parts = (
    derm_imgs.str.split('/')
    .str.get(0)
    .str.upper()
)

# print(np.unique(first_parts))

In [None]:
# Project helper that takes the handler's DataFrame; judging by its name it writes
# image properties to text files (implementation not shown here).
from src.preprocessing.Derm7pt import export_image_props_to_text

export_image_props_to_text(dataset_handler.df)

In [None]:
# Run the project's preprocessing entry point with verbose output enabled and keep
# what it returns as `labels` (return structure is defined in src.preprocessing.Derm7pt).
from src.preprocessing.Derm7pt import preprocessing_main

labels = preprocessing_main(verbose=True)

In [None]:
# np.argmax(labels, axis=1)

In [None]:
# NOTE(review): `encode_image_concepts` is not imported by any cell above this one;
# the star imports that presumably provide it appear in a later cell, so on a fresh
# top-to-bottom run this line would likely raise NameError — confirm and import it explicitly.
concepts = encode_image_concepts(dataset_handler)

In [None]:
# Space-separated, header-less listing of images; column names are supplied here.
image_names_path = os.path.join(PROJECT_ROOT, 'data', 'Derm7pt', 'image_names.txt')
flattened_df = pd.read_csv(
    image_names_path,
    sep=' ',
    header=None,
    names=['img_id', 'img_path', 'img_type', 'case_id'],
)

In [None]:
# Gather one concept vector per image by using its case_id as a row index into
# concepts_matrix. Integer-array indexing does this in a single vectorized step
# instead of a Python-level iterrows() loop, and returns a new array.
# NOTE(review): this treats case_id as a 0-based row position in concepts_matrix —
# confirm it matches the row order of the handler's DataFrame.
case_ids = flattened_df['case_id'].to_numpy()
all_concepts = concepts_matrix[case_ids]
all_concepts.shape

# flattened_df['concepts'] = all_concepts

# Preprocessing Code


In [None]:
from src.preprocessing.Derm7pt import *
from src import ImageConceptDataset
from src.preprocessing import *
from src.utils import get_paths, load_Derm_dataset

from torch.utils.data import DataLoader
import os

In [4]:
# Resolve the project's path configuration (indexed by string keys in the cells below)
# and build the Derm7pt dataset handler from it.
paths = get_paths()
dataset_handler = load_Derm_dataset(paths)

In [5]:
# Ensure text files exist
# Only the labels file is checked; the export is triggered when it is missing.
# NOTE(review): other files read later (paths['mapping_file'], paths['classes_path'])
# are not checked here — presumably they are produced by the same export; confirm.
if not os.path.exists(paths['labels_file']):
    export_image_props_to_text(dataset_handler.df)

In [6]:
# Global verbosity switch passed to the preprocessing helpers in the cells below.
verbose = True

In [7]:
# One-hot class labels read from the exported text files
image_labels = one_hot_encode_labels(
    paths['labels_file'],
    paths['classes_path'],
    verbose=verbose,
)

# Concept matrix derived from the dataset handler
concepts_matrix = encode_image_concepts(dataset_handler, verbose=verbose)

# Image tensors together with the paths they were loaded from
image_tensors, image_paths = load_and_transform_images(
    paths['dir_images'],
    paths['mapping_file'],
    resol=299,
    use_training_transforms=True,
    batch_size=32,
    verbose=verbose,
)

Found 34 classes.
Found labels for 2022 images.
Generated one-hot matrix with shape: (2022, 34)
Total number of concept columns: 28
Found 2013 images.
Processing in 63 batches of size 32 (for progress reporting)...


Processing batches: 100%|██████████| 63/63 [00:14<00:00,  4.29it/s]


Finished processing.
Successfully transformed: 2013 images.





In [8]:
# Labels/concepts are passed through untouched when there is exactly one label row
# per loaded image; otherwise they are filtered down to the loaded images.
if image_labels.shape[0] == len(image_tensors):
    filtered_image_labels, filtered_concepts_matrix = image_labels, concepts_matrix
else:
    filtered_image_labels, filtered_concepts_matrix = filter_concepts_labels(
        paths['mapping_file'],
        image_tensors,
        image_paths,
        image_labels,
        concepts_matrix,
    )

# Optional sanity report of the resulting sizes
if verbose:
    print("Labels shape:", filtered_image_labels.shape)
    print("Concepts shape:", filtered_concepts_matrix.shape)
    print("Image tensors length:", len(image_tensors))


Labels shape: (2013, 34)
Concepts shape: (2013, 28)
Image tensors length: 2013


In [9]:
# Split tensors, concepts and labels into per-split dictionaries.
tensors_dict, concepts_dict, labels_dict = split_data_by_indices(
    image_tensors,
    image_paths,
    filtered_concepts_matrix,
    filtered_image_labels,
    paths,
    verbose=verbose,
)

# Unpack each dictionary into its train / val / test entries.
train_concept_labels, val_concept_labels, test_concept_labels = (
    concepts_dict['train'], concepts_dict['val'], concepts_dict['test']
)
train_img_labels, val_img_labels, test_img_labels = (
    labels_dict['train'], labels_dict['val'], labels_dict['test']
)
train_tensors, val_tensors, test_tensors = (
    tensors_dict['train'], tensors_dict['val'], tensors_dict['test']
)


In [10]:
# concept processing
from config import DERM7PT_CONFIG

# Always start from the untouched split outputs. The final names below are
# overwritten with column-filtered arrays, so reading them back as inputs would make
# a second run of this cell operate on already-filtered data.
train_concepts_split = concepts_dict['train']
test_concepts_split = concepts_dict['test']

class_level_concepts = compute_class_level_concepts(train_concepts_split, None, train_img_labels)

# apply class-level concepts to each instance
train_concepts_by_class, test_concepts_by_class = apply_class_concepts_to_instances(
    train_img_labels,
    train_concepts_split,
    class_level_concepts,
    test_img_labels,
    test_concepts_split,
    DERM7PT_CONFIG,
)

# Keep only the concept columns selected as common across classes.
# NOTE(review): val_concept_labels is neither class-levelled nor column-filtered here,
# so it keeps its original width — confirm the validation split is intentionally unused.
common_concept_indices = select_common_concepts(class_level_concepts, min_class_count=2, CUB=False)
train_concept_labels = train_concepts_by_class[:, common_concept_indices]
test_concept_labels = test_concepts_by_class[:, common_concept_indices]

In [11]:
# Number of concept columns kept by select_common_concepts (shown as the cell output).
common_concept_indices.shape

(14,)

In [12]:
# CREATE TRAIN AND TEST DATASET
# Each dataset bundles the image tensors of one split with its concept labels and
# image (class) labels. No dataset is built for the validation split in this cell.
train_dataset = ImageConceptDataset(
    image_tensors=train_tensors,
    concept_labels=train_concept_labels,
    image_labels=train_img_labels
)

test_dataset = ImageConceptDataset(
    image_tensors=test_tensors,
    concept_labels=test_concept_labels,
    image_labels=test_img_labels
)


Dataset initialized with 826 pre-sorted items.
Dataset initialized with 790 pre-sorted items.


In [None]:
# CREATE DATALOADERS FROM DATASETS
batch_size = 64

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)

# Test batches keep the dataset order.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)
