In [2]:
# Example dataset path in Kaggle
image_dir_1 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_1'
image_dir_2 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_2'
metadata_file = '/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv'


In [3]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tensorflow.keras.utils import to_categorical

# Paths
image_dir_1 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_1'
image_dir_2 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_2'
metadata_file = '/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv'

# Load metadata
labels = pd.read_csv(metadata_file)
images = []
label_list = []

# Load images from both directories
for _, row in labels.iterrows():
    file_path_1 = os.path.join(image_dir_1, row['image_id'] + '.jpg')
    file_path_2 = os.path.join(image_dir_2, row['image_id'] + '.jpg')
    if os.path.exists(file_path_1):
        img = Image.open(file_path_1).resize((128, 128))
    elif os.path.exists(file_path_2):
        img = Image.open(file_path_2).resize((128, 128))
    else:
        continue  # Skip if file is missing
    images.append(np.array(img) / 255.0)  # Normalize
    label_list.append(row['dx'])

# Convert to numpy arrays
X = np.array(images)
# Convert labels to one-hot encoding
y = to_categorical(pd.factorize(pd.Series(label_list))[0])  # Ensure label_list is treated as a Series
class_names = pd.factorize(pd.Series(label_list))[1]  # Extract class names without warning

# Print shapes for verification
print("Dataset shape:", X.shape)
print("Labels shape:", y.shape)
print("Class names:", class_names)


Dataset shape: (10015, 128, 128, 3)
Labels shape: (10015, 7)
Class names: Index(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype='object')


In [4]:
import os

# Define the path for saving outputs
save_dir = '/kaggle/working/gan_outputs/'

# Create the directory if it does not exist
os.makedirs(save_dir, exist_ok=True)

# Example of usage: Save generated images or models
generated_image_path = os.path.join(save_dir, 'generated_image_epoch_1.png')
checkpoint_path = os.path.join(save_dir, 'model_checkpoint_epoch_1.h5')

print(f"All outputs will be saved in: {save_dir}")


All outputs will be saved in: /kaggle/working/gan_outputs/


In [30]:
!ls /kaggle/input/


skin-cancer-mnist-ham10000


In [31]:
!ls /kaggle/input/skin-cancer-mnist-ham10000/


HAM10000_images_part_1	ham10000_images_part_1	hmnist_28_28_RGB.csv
HAM10000_images_part_2	ham10000_images_part_2	hmnist_8_8_L.csv
HAM10000_metadata.csv	hmnist_28_28_L.csv	hmnist_8_8_RGB.csv


In [34]:
import os
import pandas as pd

# Directories for images
image_dir_1 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_1'
image_dir_2 = '/kaggle/input/skin-cancer-mnist-ham10000/ham10000_images_part_2'

# Combine image paths
image_paths = [
    os.path.join(image_dir_1, f) for f in os.listdir(image_dir_1)
] + [
    os.path.join(image_dir_2, f) for f in os.listdir(image_dir_2)
]

# Check available image files
available_files = set(os.listdir(image_dir_1) + os.listdir(image_dir_2))

# Load metadata
metadata = pd.read_csv('/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')

# Filter metadata to include only available files
metadata = metadata[metadata['image_id'].apply(lambda x: f"{x}.jpg" in available_files)]

# Generate image paths aligned with filtered metadata
image_paths = metadata['image_id'].apply(
    lambda x: os.path.join(image_dir_1, f"{x}.jpg")
    if f"{x}.jpg" in os.listdir(image_dir_1)
    else os.path.join(image_dir_2, f"{x}.jpg")
).tolist()

# Encode labels
labels = metadata['dx']
label_mapping, encoded_labels = pd.factorize(labels)

print(f"Label mapping: {dict(enumerate(label_mapping))}")
assert len(image_paths) == len(encoded_labels), "Mismatch between image paths and labels!"

# Split into training and validation datasets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    image_paths, encoded_labels, test_size=0.2, random_state=42
)

print(f"Number of training samples: {len(X_train)}")
print(f"Number of validation samples: {len(X_val)}")


Label mapping: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 0, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 0, 59: 0, 60: 0, 61: 0, 62: 0, 63: 0, 64: 1, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 0, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 0, 82: 0, 83: 0, 84: 0, 85: 0, 86: 0, 87: 0, 88: 0, 89: 0, 90: 0, 91: 0, 92: 0, 93: 0, 94: 0, 95: 0, 96: 0, 97: 0, 98: 0, 99: 0, 100: 0, 101: 0, 102: 0, 103: 0, 104: 0, 105: 0, 106: 0, 107: 0, 108: 0, 109: 0, 110: 0, 111: 0, 112: 0, 113: 0, 114: 0, 115: 0, 116: 0, 117: 0, 118: 0, 119: 0, 120: 0, 121: 0, 122: 0, 123: 0, 124: 0, 125: 0, 126: 0, 127: 0, 128: 0, 129: 0, 130: 0, 131: 0, 132: 0, 133: 0, 134: 0, 135: 0, 136: 0

AssertionError: Mismatch between image paths and labels!