In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import math
from PIL import Image
import cv2

In [16]:
isic_train = pd.read_csv('../datasets/ISIC_2019_Training_GroundTruth.csv')


id_col = 'image'  # Assuming 'image' is the ID column
category_cols = [col for col in isic_train.columns if col != id_col]

isic_train_unpivoted = pd.melt(
    isic_train,
    id_vars=[id_col],
    value_vars=category_cols,
    var_name='category',
    value_name='present'
)
isic_train_unpivoted = isic_train_unpivoted[isic_train_unpivoted['present'] == 1]
isic_train_unpivoted = isic_train_unpivoted.drop(columns=['present'])
isic_train_unpivoted.reset_index(drop=True, inplace=True)



AMOUNT_TO_TAKE = {
    "NV": 1000, 
    "MEL": 1000, 
    "BCC": 1000, 
    "BKL": 1000, 
    "AK": 1000, 
    "SCC": 1000, 
    "VASC": 1000, 
    "DF": 1000
}

ORDER_OF_OPERATIONS = [
    'rot0',
    'rot90', 
    'rot180', 
    'rot270', 
    'flipud', 
    'fliplr'
]


In [17]:
def preprocess_image(img, augmentations=['rot90', 'rot180', 'rot270', 'flipud', 'fliplr'], do_crop=True):
    # Preprocessing steps:
    # 1. Crop the image to square dimensions
    # 2. Determine number of transformations needed. First 3 will be rotations, then the next 2 will be flips 

    if do_crop:
        height, width = img.shape[:2]
        min_dim = min(height, width)
        start_x = (width - min_dim) // 2
        start_y = (height - min_dim) // 2
        cropped_img = img[start_y:start_y + min_dim, start_x:start_x + min_dim]
    else:
        cropped_img = img

    augmented_samples = []
    if 'rot0' in augmentations:
        augmented_samples.append(cropped_img)
    if 'rot90' in augmentations:
        augmented_samples.append(np.rot90(cropped_img, k=1, axes=(0, 1)))
    if 'rot180' in augmentations:
        augmented_samples.append(np.rot90(cropped_img, k=2, axes=(0, 1)))
    if 'rot270' in augmentations:
        augmented_samples.append(np.rot90(cropped_img, k=3, axes=(0, 1)))
    if 'flipud' in augmentations:
        augmented_samples.append(np.flipud(cropped_img))
    if 'fliplr' in augmentations:
        augmented_samples.append(np.fliplr(cropped_img))
    
    return augmented_samples



In [20]:


# Clear previous variable definitions
augmented_data = []
INPUT_DIRECTORY = '../datasets/ISIC_2019_Training/'
OUTPUT_DIRECTORY = '../datasets/ISIC_2019_cleaned/'

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

# Track our resampled dataset
final_dataset = []

# Process each category
for category in tqdm(isic_train_unpivoted['category'].unique(), desc="Processing categories"):
    if category not in AMOUNT_TO_TAKE:
        continue
        
    target_count = AMOUNT_TO_TAKE[category]
    category_df = isic_train_unpivoted[isic_train_unpivoted['category'] == category]
    available_count = len(category_df)
    
    print(f"\nProcessing category: {category}")
    print(f"Available images: {available_count}, Target: {target_count}")
    
    # Case 1: We have enough images, downsample to target count
    if available_count >= target_count:
        print(f"Downsampling to {target_count} images")
        sampled_df = category_df.sample(n=target_count, random_state=42)
        
        # Process and save original images
        for _, row in tqdm(sampled_df.iterrows(), desc=f"Saving {category} images", total=len(sampled_df)):
            img_path = f'{INPUT_DIRECTORY}{row["image"]}.jpg'
            img = np.array(Image.open(img_path))
            processed_img = preprocess_image(img, augmentations=['rot0'], do_crop=True)[0]
            
            output_filename = f'{row["image"]}_preprocessed_rot0.jpg'
            output_path = f'{OUTPUT_DIRECTORY}{output_filename}'
            Image.fromarray(processed_img).save(output_path)
            
            final_dataset.append({
                'image': output_filename,
                'category': category,
                'augmentation_method': 'rot0'
            })
            
    # Case 2: We need to augment to reach target count
    else:
        print(f"Need augmentation: {available_count} available, need {target_count - available_count} more")
        
        # Calculate how many augmentation methods we need
        needed_count = target_count - available_count
        augmentations_needed = math.ceil(needed_count / available_count)
        
        # Get the augmentation methods we'll use
        augmentation_methods = ORDER_OF_OPERATIONS[:min(augmentations_needed + 1, len(ORDER_OF_OPERATIONS))]
        print(f"Using augmentations: {augmentation_methods}")
        
        # First, process all original images
        original_images = []
        for _, row in tqdm(category_df.iterrows(), desc=f"Processing original {category} images"):
            img_path = f'{INPUT_DIRECTORY}{row["image"]}.jpg'
            img = np.array(Image.open(img_path))
            processed_img = preprocess_image(img, augmentations=['rot0'], do_crop=True)[0]
            
            output_filename = f'{row["image"]}_preprocessed_rot0.jpg'
            output_path = f'{OUTPUT_DIRECTORY}{output_filename}'
            Image.fromarray(processed_img).save(output_path)
            
            final_dataset.append({
                'image': output_filename,
                'category': category,
                'augmentation_method': 'rot0'
            })
            
            # Also add this image to our augmentation candidates
            original_images.append({
                'image_id': row["image"],
                'img_array': img
            })
        
        # Now create augmented images
        augmented_candidates = []
        
        # Skip 'rot0' since we already used it for originals
        for aug_method in augmentation_methods[1:]:
            for img_data in tqdm(original_images, desc=f"Augmenting with {aug_method}"):
                img = img_data['img_array']
                image_id = img_data['image_id']
                
                processed_imgs = preprocess_image(img, augmentations=[aug_method], do_crop=True)
                
                if processed_imgs:  # Make sure we have results
                    output_filename = f'{image_id}_preprocessed_{aug_method}.jpg'
                    output_path = f'{OUTPUT_DIRECTORY}{output_filename}'
                    Image.fromarray(processed_imgs[0]).save(output_path)
                    
                    augmented_candidates.append({
                        'image': output_filename,
                        'category': category,
                        'augmentation_method': aug_method
                    })
        
        # Sample the needed number of augmented images
        if augmented_candidates:
            augmented_df = pd.DataFrame(augmented_candidates)
            
            # If we have more augmented images than needed, sample down
            if len(augmented_df) > needed_count:
                augmented_df = augmented_df.sample(n=needed_count, random_state=42)
            
            # Add the sampled augmented images to our final dataset
            final_dataset.extend(augmented_df.to_dict('records'))

# Convert to DataFrame
final_df = pd.DataFrame(final_dataset)
print(f"\nFinal dataset contains {len(final_df)} images")
print(final_df['category'].value_counts())

# Save metadata
final_df.to_csv(f'{OUTPUT_DIRECTORY}resampled_metadata.csv', index=False)

Processing categories:   0%|          | 0/8 [00:00<?, ?it/s]


Processing category: MEL
Available images: 4522, Target: 1000
Downsampling to 1000 images


Saving MEL images: 100%|██████████| 1000/1000 [00:22<00:00, 43.51it/s]
Processing categories:  12%|█▎        | 1/8 [00:22<02:40, 22.99s/it]


Processing category: NV
Available images: 12875, Target: 1000
Downsampling to 1000 images


Saving NV images: 100%|██████████| 1000/1000 [00:18<00:00, 55.52it/s]
Processing categories:  25%|██▌       | 2/8 [00:41<02:00, 20.06s/it]


Processing category: BCC
Available images: 3323, Target: 1000
Downsampling to 1000 images


Saving BCC images: 100%|██████████| 1000/1000 [00:24<00:00, 40.46it/s]
Processing categories:  38%|███▊      | 3/8 [01:05<01:50, 22.19s/it]


Processing category: AK
Available images: 867, Target: 1000
Need augmentation: 867 available, need 133 more
Using augmentations: ['rot0', 'rot90']


Processing original AK images: 867it [00:20, 41.64it/s]
Augmenting with rot90: 100%|██████████| 867/867 [00:19<00:00, 45.26it/s]
Processing categories:  50%|█████     | 4/8 [01:45<01:56, 29.22s/it]


Processing category: BKL
Available images: 2624, Target: 1000
Downsampling to 1000 images


Saving BKL images: 100%|██████████| 1000/1000 [00:19<00:00, 52.13it/s]
Processing categories:  62%|██████▎   | 5/8 [02:04<01:16, 25.60s/it]


Processing category: DF
Available images: 239, Target: 1000
Need augmentation: 239 available, need 761 more
Using augmentations: ['rot0', 'rot90', 'rot180', 'rot270', 'flipud']


Processing original DF images: 239it [00:04, 50.67it/s]
Augmenting with rot90: 100%|██████████| 239/239 [00:03<00:00, 64.45it/s]
Augmenting with rot180: 100%|██████████| 239/239 [00:03<00:00, 67.62it/s]
Augmenting with rot270: 100%|██████████| 239/239 [00:03<00:00, 66.84it/s]
Augmenting with flipud: 100%|██████████| 239/239 [00:03<00:00, 66.03it/s]
Processing categories:  75%|███████▌  | 6/8 [02:24<00:46, 23.44s/it]


Processing category: VASC
Available images: 253, Target: 1000
Need augmentation: 253 available, need 747 more
Using augmentations: ['rot0', 'rot90', 'rot180', 'rot270']


Processing original VASC images: 253it [00:04, 52.67it/s]
Augmenting with rot90: 100%|██████████| 253/253 [00:03<00:00, 70.21it/s]
Augmenting with rot180: 100%|██████████| 253/253 [00:03<00:00, 71.81it/s]
Augmenting with rot270: 100%|██████████| 253/253 [00:03<00:00, 73.06it/s]
Processing categories:  88%|████████▊ | 7/8 [02:39<00:20, 20.83s/it]


Processing category: SCC
Available images: 628, Target: 1000
Need augmentation: 628 available, need 372 more
Using augmentations: ['rot0', 'rot90']


Processing original SCC images: 628it [00:13, 45.44it/s]
Augmenting with rot90: 100%|██████████| 628/628 [00:12<00:00, 52.33it/s]
Processing categories: 100%|██████████| 8/8 [03:05<00:00, 23.18s/it]


Final dataset contains 8000 images
category
MEL     1000
NV      1000
BCC     1000
AK      1000
BKL     1000
DF      1000
VASC    1000
SCC     1000
Name: count, dtype: int64



