In [1]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"noyalbennyndl","key":"3b9c0e9705b5b23290cd2493cb5e6a5c"}'}

In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000


Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
 99% 5.17G/5.20G [01:51<00:02, 18.3MB/s]
100% 5.20G/5.20G [01:51<00:00, 49.9MB/s]


In [4]:
import zipfile

with zipfile.ZipFile("skin-cancer-mnist-ham10000.zip", 'r') as zip_ref:
    zip_ref.extractall("data")


In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
import glob # Add this import

meta_file = "data/HAM10000_metadata.csv"
# images_dir = "data/" # This line is no longer directly used for constructing src in copy_files
output_dir = "processed_data"

df = pd.read_csv(meta_file)

# Build a mapping of image_id to its full path by searching within known image subdirectories
image_paths_map = {}
# List of potential image directories based on previous `os.listdir('data')` output
image_subdirs = [d for d in os.listdir('data') if 'ham10000_images_part' in d.lower() and os.path.isdir(os.path.join('data', d))]

for part_dir_name in image_subdirs:
    full_path_to_part = os.path.join("data", part_dir_name)
    for img_file in glob.glob(os.path.join(full_path_to_part, '*.jpg')):
        image_id = os.path.basename(img_file).replace('.jpg', '')
        image_paths_map[image_id] = img_file

# Add the full image path to the dataframe
df['full_image_path'] = df['image_id'].map(image_paths_map)
# Drop rows where image path couldn't be found (i.e., image files are missing)
df = df.dropna(subset=['full_image_path'])

print(df.head())


     lesion_id      image_id   dx dx_type   age   sex localization  \
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp   
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp   
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp   
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp   
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear   

                                full_image_path  
0  data/ham10000_images_part_1/ISIC_0027419.jpg  
1  data/ham10000_images_part_1/ISIC_0025030.jpg  
2  data/ham10000_images_part_1/ISIC_0026769.jpg  
3  data/ham10000_images_part_1/ISIC_0025661.jpg  
4  data/ham10000_images_part_2/ISIC_0031633.jpg  


In [6]:
malignant = ["akiec", "bcc", "mel"]
df["label"] = df["dx"].apply(lambda x: 1 if x in malignant else 0)

df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,8061
1,1954


In [7]:
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label"], random_state=42)

val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label"], random_state=42)

In [8]:
def copy_files(subset_df, subset_name):
    for label, name in [(0, "benign"), (1, "malignant")]:
        os.makedirs(f"{output_dir}/{subset_name}/{name}", exist_ok=True)

    for _, row in subset_df.iterrows():
        src = row["full_image_path"] # Use the pre-computed full path from the dataframe
        dst = f"{output_dir}/{subset_name}/{'malignant' if row['label']==1 else 'benign'}/{row['image_id']}.jpg"
        # The `if os.path.exists(src)` check is less critical now as we dropped rows with missing paths
        shutil.copy(src, dst)

copy_files(train_df, "train")
copy_files(val_df, "val")
copy_files(test_df, "test")

In [9]:
import tensorflow as tf
from tensorflow.keras import layers, models

def create_cnn(input_shape=(128,128,3)):
    model = models.Sequential()

    model.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2,2)))

    model.add(layers.Conv2D(64, (3,3), activation='relu'))
    model.add(layers.MaxPooling2D((2,2)))

    model.add(layers.Conv2D(128, (3,3), activation='relu'))
    model.add(layers.MaxPooling2D((2,2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation="sigmoid"))

    return model

model = create_cnn()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_gen = ImageDataGenerator(rescale=1./255)
val_gen = ImageDataGenerator(rescale=1./255)

train_data = train_gen.flow_from_directory(
    "processed_data/train",
    target_size=(128,128),
    batch_size=32,
    class_mode='binary'
)

val_data = val_gen.flow_from_directory(
    "processed_data/val",
    target_size=(128,128),
    batch_size=32,
    class_mode='binary'
)

history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=10
)

Found 7010 images belonging to 2 classes.
Found 1502 images belonging to 2 classes.


  self._warn_if_super_not_called()


Epoch 1/10
[1m191/220[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m30s[0m 1s/step - accuracy: 0.7920 - loss: 0.5225

In [None]:
import os

# List contents of the 'data' directory after extraction
print("Contents of data directory:")
print(os.listdir('data'))

# If there are subdirectories, list their contents too to find the images
for root, dirs, files in os.walk('data'):
    if files:
        print(f"Files found in {root}: {files[:5]} (showing first 5 if many)")
        break # Stop after finding files in the first subdirectory


In [None]:
test_gen = ImageDataGenerator(rescale=1./255)

test_data = test_gen.flow_from_directory(
    "processed_data/test",
    target_size=(128,128),
    batch_size=32,
    class_mode='binary',
    shuffle=False
)

model.evaluate(test_data)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define a custom preprocessing layer to wrap mobilenet_v2.preprocess_input
class PreprocessLayer(layers.Layer):
    def call(self, inputs):
        return tf.keras.applications.mobilenet_v2.preprocess_input(inputs)

# Load MobileNetV2 base model
base_model = tf.keras.applications.MobileNetV2(
    input_shape=(224, 224, 3),
    include_top=False,
    weights="imagenet"
)

# Freeze base model
base_model.trainable = False

# Build the full model, now using the custom PreprocessLayer
inputs = tf.keras.Input(shape=(224,224,3))
x = PreprocessLayer()(inputs) # Use the custom layer for preprocessing
x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

model.summary()

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_gen = ImageDataGenerator(rescale=1./255)
val_gen = ImageDataGenerator(rescale=1./255)

train_data = train_gen.flow_from_directory(
    "processed_data/train",
    target_size=(224,224),
    batch_size=32,
    class_mode='binary'
)

val_data = val_gen.flow_from_directory(
    "processed_data/val",
    target_size=(224,224),
    batch_size=32,
    class_mode='binary'
)

history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=10
)


In [None]:
import matplotlib.pyplot as plt

# Accuracy graph
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy Curve')
plt.show()

# Loss graph
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss Curve')
plt.show()


In [None]:
model.save("skin_cancer_model.keras")


The warning indicates that saving the model in HDF5 format (`.h5`) is considered legacy. To avoid this warning and use the recommended native Keras format, you can change the file extension to `.keras`.

In [None]:
import os
from google.colab import files

# Remove the old .h5 file if it exists, to avoid confusion
if os.path.exists('skin_cancer_model.h5'):
    os.remove('skin_cancer_model.h5')

# Save the model in the recommended .keras format
model_filename_keras = 'skin_cancer_model.keras'
model.save(model_filename_keras)

# Download the .keras file
files.download(model_filename_keras)
