# Derm7pt preprocessing code


In [1]:
from src.preprocessing.Derm7pt import *
from src import ImageConceptDataset
from src.preprocessing import *
from src.utils import get_paths, load_Derm_dataset

from torch.utils.data import DataLoader
import os

In [2]:
# Resolve the project's path mapping (indexed by string keys such as
# 'labels_file' and 'mapping_file' further down) and build the dataset
# handler from it.
paths = get_paths()
dataset_handler = load_Derm_dataset(paths)

In [3]:
# Ensure text files exist
# Only the labels file is checked here; the export helper is invoked when it
# is missing from disk.
# NOTE(review): later cells also read paths['mapping_file'] — confirm that
# export_image_props_to_text produces it as well, or guard it separately.
if not os.path.exists(paths['labels_file']):
    export_image_props_to_text(dataset_handler.df)

In [4]:
# Notebook-wide verbosity switch: forwarded as `verbose=` to the preprocessing
# helpers and used to gate the shape printout after filtering.
verbose = True

In [12]:
# Metadata table held by the dataset handler; the bare expression on the last
# line renders it as the cell output.
# NOTE(review): consider `all_data.head()` to keep the rendered output small.
all_data = dataset_handler.df
all_data

Unnamed: 0,case_num,diagnosis,seven_point_score,pigment_network,streaks,pigmentation,regression_structures,dots_and_globules,blue_whitish_veil,vascular_structures,...,case_id,notes,diagnosis_numeric,pigment_network_numeric,blue_whitish_veil_numeric,vascular_structures_numeric,pigmentation_numeric,streaks_numeric,dots_and_globules_numeric,regression_structures_numeric
0,1,basal cell carcinoma,0,absent,absent,absent,absent,absent,absent,arborizing,...,,,0,0,0,1,0,0,0,0
1,2,basal cell carcinoma,1,absent,absent,absent,absent,irregular,absent,absent,...,,,0,0,0,0,0,0,2,0
2,3,basal cell carcinoma,1,absent,absent,absent,absent,irregular,absent,arborizing,...,,,0,0,0,1,0,0,2,0
3,4,basal cell carcinoma,4,absent,absent,absent,blue areas,irregular,present,within regression,...,,,0,0,1,4,0,0,2,1
4,5,basal cell carcinoma,1,absent,absent,diffuse irregular,absent,absent,absent,absent,...,,,0,0,0,0,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,1007,vascular lesion,0,absent,absent,absent,absent,absent,absent,absent,...,,,14,0,0,0,0,0,0,0
1007,1008,vascular lesion,0,absent,absent,absent,absent,absent,absent,absent,...,,,14,0,0,0,0,0,0,0
1008,1009,vascular lesion,0,absent,absent,absent,absent,absent,absent,absent,...,,,14,0,0,0,0,0,0,0
1009,1010,vascular lesion,0,absent,absent,diffuse regular,absent,absent,absent,absent,...,,,14,0,0,0,1,0,0,0


In [None]:
import numpy as np
import pandas as pd

# Per-concept one-hot labels (a mapping iterated with .items() and indexed by
# case id below) and the image-to-case mapping table.
all_labels = dataset_handler.get_labels(data_type='all', one_hot=True)
mapping_data = pd.read_csv(
    paths['mapping_file'],
    sep=' ',
    header=None,
    names=['img_id', 'img_path', 'img_type', 'case_id'],
)

concepts_matrix = []

for _, mapping_row in mapping_data.iterrows():
    case_id = mapping_row['case_id']

    # One-hot encodings of every concept for this case; the 'DIAG' entry is
    # skipped so it does not become part of the concept vector.
    per_concept = [
        encoding[case_id]
        for concept_name, encoding in all_labels.items()
        if concept_name != 'DIAG'
    ]

    # Fold the per-concept encodings into a single vector with np.hstack.
    # The first encoding is taken as-is; the result stays None when there is
    # nothing to stack.
    instance_concepts = None
    for one_hot in per_concept:
        if instance_concepts is None:
            instance_concepts = one_hot
        else:
            instance_concepts = np.hstack((instance_concepts, one_hot))

    concepts_matrix.append(instance_concepts)

# One row per line of the mapping file.
concepts_matrix1 = np.array(concepts_matrix)

In [None]:
# Class labels (one-hot) and concept annotations for the images
image_labels = one_hot_encode_labels(
    paths['labels_file'],
    paths['classes_path'],
    verbose=verbose,
)
concepts_matrix = encode_image_concepts(
    dataset_handler,
    paths['mapping_file'],
    verbose=verbose,
)

# Image loading + transformation
# NOTE(review): use_training_transforms=True is applied here, before the
# train/val/test split performed later — confirm that val/test images are
# meant to go through the training-time transforms.
image_tensors, image_paths = load_and_transform_images(
    paths['dir_images'],
    paths['mapping_file'],
    resol=299,
    use_training_transforms=True,
    batch_size=32,
    verbose=verbose,
)

Found 34 classes.
Found labels for 2022 images.
Generated one-hot matrix with shape: (2022, 34)
Total number of concept columns: 28


In [None]:
# Re-align labels/concepts with the loaded tensors only when the number of
# label rows differs from the number of image tensors; otherwise pass the
# matrices through unchanged.
if image_labels.shape[0] == len(image_tensors):
    filtered_image_labels = image_labels
    filtered_concepts_matrix = concepts_matrix
else:
    filtered_image_labels, filtered_concepts_matrix = filter_concepts_labels(
        paths['mapping_file'],
        image_tensors,
        image_paths,
        image_labels,
        concepts_matrix,
    )

# Sanity printout of the aligned sizes.
if verbose:
    print("Labels shape:", filtered_image_labels.shape)
    print("Concepts shape:", filtered_concepts_matrix.shape)
    print("Image tensors length:", len(image_tensors))


Labels shape: (2013, 34)
Concepts shape: (2013, 28)
Image tensors length: 2013


In [None]:
# Partition tensors, concepts and labels into per-split dictionaries
# (looked up below under the keys 'train', 'val' and 'test').
tensors_dict, concepts_dict, labels_dict = split_data_by_indices(
    image_tensors,
    image_paths,
    filtered_concepts_matrix,
    filtered_image_labels,
    paths,
    verbose=verbose,
)

# Unpack each split into flat notebook-level names, one split per statement.
train_tensors, train_concept_labels, train_img_labels = (
    tensors_dict['train'], concepts_dict['train'], labels_dict['train']
)
val_tensors, val_concept_labels, val_img_labels = (
    tensors_dict['val'], concepts_dict['val'], labels_dict['val']
)
test_tensors, test_concept_labels, test_img_labels = (
    tensors_dict['test'], concepts_dict['test'], labels_dict['test']
)


In [None]:
# concept processing
from config import DERM7PT_CONFIG

# Start from the per-split concept matrices produced by the split step rather
# than from whatever `*_concept_labels` currently hold. This cell reassigns
# those names, so without this reset a second execution would column-filter
# already-filtered arrays and compute class-level concepts from modified
# inputs. On a fresh run these are the same objects as before.
# NOTE(review): this assumes apply_class_concepts_to_instances does not modify
# its inputs in place — confirm against its implementation.
train_concept_labels = concepts_dict['train']
val_concept_labels = concepts_dict['val']
test_concept_labels = concepts_dict['test']

# Class-level concepts are derived from the training split only.
class_level_concepts = compute_class_level_concepts(train_concept_labels, None, train_img_labels)

# apply class-level concepts to each instance
# (argument order is train, test, val; the return order is train, val, test)
train_concept_labels, val_concept_labels, test_concept_labels = apply_class_concepts_to_instances(
    class_level_concepts, DERM7PT_CONFIG, train_img_labels, train_concept_labels,
    test_img_labels, test_concept_labels, val_img_labels, val_concept_labels)

# Keep only the selected concept columns, identically for every split.
common_concept_indices = select_common_concepts(class_level_concepts, min_class_count=0, CUB=False)
train_concept_labels = train_concept_labels[:, common_concept_indices]
val_concept_labels = val_concept_labels[:, common_concept_indices]
test_concept_labels = test_concept_labels[:, common_concept_indices]

In [None]:
import numpy as np
from config import PROJECT_ROOT

# Persist the class-level concept matrix under <PROJECT_ROOT>/output/Derm7pt.
# np.save does not create missing parent directories, so make sure the target
# folder exists first; exist_ok=True keeps re-runs harmless.
class_level_concepts_dir = os.path.join(PROJECT_ROOT, 'output', 'Derm7pt')
os.makedirs(class_level_concepts_dir, exist_ok=True)
np.save(os.path.join(class_level_concepts_dir, 'class_level_concepts.npy'), class_level_concepts)

In [None]:
# Display the shape of the selected concept indices as a quick check of how
# many concept columns were kept.
common_concept_indices.shape

In [None]:
# Wrap the train and test splits (tensors, concept labels, class labels) in
# ImageConceptDataset objects. No dataset is built for the validation split
# in this cell.
train_dataset = ImageConceptDataset(image_tensors=train_tensors, concept_labels=train_concept_labels, image_labels=train_img_labels)
test_dataset = ImageConceptDataset(image_tensors=test_tensors, concept_labels=test_concept_labels, image_labels=test_img_labels)


In [None]:
# Batch iterators over the two datasets. Only the training loader shuffles;
# all other settings are shared.
batch_size = 64

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)
