## <span style='color: darkgreen'>This portion of the code finds the images that are under-represented for one-hot encoding.</span>

In [29]:
import torch
from collections import defaultdict
import os


In [6]:
# === Step 1: Load the .pt file ===
# The file holds a sequence of [graph, filename] pairs (see the sample printed
# by this cell): the graph object exposes its class label as `.y`, and the
# second element is the image name as a string.
#
# weights_only=False is passed explicitly because the file stores pickled graph
# objects rather than plain tensors; PyTorch >= 2.6 defaults to
# weights_only=True and may refuse to load such a file.
# NOTE(security): with weights_only=False, torch.load unpickles arbitrary Python
# objects, so only load files that come from a trusted source.
training_graph_dataset = torch.load(
    '../DTDataset_Class/Graphs/1024/FullDataset/20250410_TrainGraphAugmentedDatasetWithLabels_and_filenames_one_hot.pt',
    weights_only=False,
)

# Show the first [graph, filename] pair followed by the label of its graph.
first_entry = training_graph_dataset[0]
print('\nA sample of the Dataset: \n', first_entry, '\n\n\n', first_entry[0].y)



A sample of the Dataset: 
 [Data(x=[1024, 7], pos=[1024, 2], edge_index=[2, 6250], y=1), 'ZT76_39_A_4_12'] 


 1


  training_graph_dataset = torch.load('../DTDataset_Class/Graphs/1024/FullDataset/20250410_TrainGraphAugmentedDatasetWithLabels_and_filenames_one_hot.pt')


In [18]:
# === Step 2: collect the filenames of every training entry whose graph label is 0 (benign) ===
# Each dataset entry is indexed as entry[0] -> graph (with label `.y`) and entry[1] -> filename.
benign_file_names = [sample[1] for sample in training_graph_dataset if sample[0].y == 0]

n_benign = len(benign_file_names)
n_total = len(training_graph_dataset)

print('The number of benign images with label healthy is :', n_benign)
print(f'The number of total images in the dataset is {n_total}')
# total - 2 * benign == (entries with y != 0) - (entries with y == 0),
# i.e. how many more benign samples would be needed to even out the two groups.
print(f'The number of under-represented benign images in model one which need to use upsampling for the better training is {n_total - 2 * n_benign} ')
print('Thus, For each benign image, we need to upsample it with 4 different rigid augmentation!!')
# Benign count once 4 augmented copies are added to every original benign image.
print(f'{4 * n_benign + n_benign}')

The number of benign images with label healthy is : 103
The number of total images in the dataset is 641
The number of under-represented benign images in model one which need to use upsampling for the better training is 435 
Thus, For each benign image, we need to upsample it with 4 different rigid augmentation!!
515


In [28]:
# === Step 3: copy the under-represented images (here the benign files) into the Under_rep_images folder ===
# For every name in `benign_file_names`, copy '<name>.jpg' from SOURCE_FOLDER to
# DEST_FOLDER unless a file with that name is already present in DEST_FOLDER.
import shutil
from pathlib import Path

SOURCE_FOLDER = '../../dataset/images/'
DEST_FOLDER = '../DTDataset_Class/Under_rep_images/'

source_dir = Path(SOURCE_FOLDER)
dest_dir = Path(DEST_FOLDER)

# shutil.copy does not create the destination directory, so make sure it exists
# before the first copy (no-op when it is already there).
dest_dir.mkdir(parents=True, exist_ok=True)

missing_images = []
for stem in benign_file_names:
    image_name = stem + '.jpg'
    src_path = source_dir / image_name
    dest_path = dest_dir / image_name

    # Already copied on a previous run: nothing to do.
    if dest_path.exists():
        continue

    # Report a missing source image and keep going instead of aborting the
    # whole loop on the first FileNotFoundError.
    if not src_path.is_file():
        missing_images.append(image_name)
        print(f'Warning: source image not found, skipped: {src_path}')
        continue

    shutil.copy(src_path, dest_path)
    print(f'file has been copied: {image_name}')

if missing_images:
    print(f'{len(missing_images)} of {len(benign_file_names)} benign images were not found in {SOURCE_FOLDER}')

In [30]:
! pip install albumentations opencv-python pillow



In [None]:
# === Step 4: RIGID UPSAMPLING ===
# Reads the image names listed in a CSV file (column 'Filename'), loads the
# matching source image from `input_image_folder`, and writes
# `augmentations_per_image` randomly augmented copies of it to
# `output_image_folder` as PNG files.

import os
import csv
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np

# === Configurations ===
# Folder containing your original images
input_image_folder = 'dataset/dataverse_files-2/Combined/Combined_images/'
# Where to save augmented images
output_image_folder = 'dataset/dataverse_files-2/Combined/Augmented_images/'
# CSV file listing the under-represented images; it must have a 'Filename' column.
# NOTE(review): this CSV is not produced by any cell visible in this notebook —
# confirm where it comes from.
csv_file = 'dataset/dataverse_files-2/Combined/underrepresented_Images/underrepresented_images.csv'

# How many augmented copies to make per image
augmentations_per_image = 4

# Prefix that distinguishes a mask filename from its source image filename.
MASK_PREFIX = 'mask_'

# Create output folder if it doesn't exist
os.makedirs(output_image_folder, exist_ok=True)

# === Define the augmentation pipeline ===
# NOTE(review): only the two flips are rigid transforms. The remaining ones
# change pixel intensities/colours or locally warp the image (ElasticTransform),
# which does not match the "rigid" wording of this step — confirm this is intended.
# NOTE(review): every transform is applied with probability < 1 and no seed is
# set here, so each run produces different augmented files.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    # A.Rotate(limit=30, p=0.7),
    A.RandomBrightnessContrast(p=0.6),
    A.GaussNoise(p=0.2),
    A.ElasticTransform(p=0.2),
    A.HueSaturationValue(p=0.3),
])


def _source_image_name(listed_name):
    """Translate a CSV 'Filename' entry into the name of its source image.

    A trailing '.png' extension becomes '.jpg' and a leading 'mask_' prefix is
    dropped (presumably entries look like 'mask_XYZ.png' -> 'XYZ.jpg'; verify
    against the CSV). Only the extension and the prefix of the file name are
    touched, so a 'png' or 'mask_' substring elsewhere in the name is preserved.
    """
    directory, name = os.path.split(listed_name)
    stem, extension = os.path.splitext(name)
    if extension == '.png':
        name = stem + '.jpg'
    if name.startswith(MASK_PREFIX):
        name = name[len(MASK_PREFIX):]
    return os.path.join(directory, name)


# === Read the CSV file and apply augmentations ===
images_augmented = 0
images_missing = 0
with open(csv_file, newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        filename = row['Filename']
        # Output names are built from the CSV entry as listed (extension removed),
        # so any 'mask_' prefix is kept in the augmented file names.
        base_name = os.path.splitext(filename)[0]

        image_path = os.path.join(input_image_folder, _source_image_name(filename))
        if not os.path.exists(image_path):
            print(f"⚠️ Warning: {image_path} not found.")
            images_missing += 1
            continue

        # Load image; the context manager closes the file handle as soon as the
        # pixel data has been copied into the array.
        with Image.open(image_path) as source_image:
            image = np.array(source_image.convert('RGB'))

        # Apply multiple augmentations
        for i in range(augmentations_per_image):
            augmented = transform(image=image)['image']
            aug_image = Image.fromarray(augmented)

            # Save augmented image
            aug_filename = f"{base_name}_aug{i+1}.png"
            aug_image.save(os.path.join(output_image_folder, aug_filename))
        images_augmented += 1

# Summarise the run so a CSV whose images were all missing is not mistaken for success.
print(f"Augmented {images_augmented} image(s) with {augmentations_per_image} copies each; {images_missing} source image(s) not found.")
print("✅ Augmentation completed!")
