In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the path of every file under the Kaggle input directory, then list them.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the HAM10000 metadata table, then show its first rows and its row count.
metadata_csv = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv"
df = pd.read_csv(metadata_csv)

headers = df.head()  # despite the name, this is the first five rows of the table
print(headers)
print(df.shape[0])

In [None]:
# Survey the distinct values present in each metadata column of interest.

unique_dx, unique_dx_type, unique_age, unique_gender, unique_local = (
    df[column].unique()
    for column in ('dx', 'dx_type', 'age', 'sex', 'localization')
)

print(f"dx:{unique_dx}")
print(f"dx type:{unique_dx_type}")
# NOTE(review): an earlier note here claimed that dx and dx_type are the same;
# compare the two arrays printed above to confirm before relying on that.

print(f"age:{unique_age}")
print(f"Unique genders:{unique_gender}")
print(f"Unique localization:{unique_local}")

# Observation: 'sex', 'localization' and 'age' contain some unknown/NaN entries.

In [None]:
# Class grouping used in this notebook:
#   benign:    nv, bkl, vasc, df
#   malignant: mel, bcc, akiec

# Keep only the rows whose age is present and whose sex and localization
# are not the placeholder value 'unknown'.
has_age = df['age'].notna()
has_sex = df['sex'].ne('unknown')
has_localization = df['localization'].ne('unknown')
filtered_df = df.loc[has_age & has_sex & has_localization]

# Re-check the distinct values after filtering.
filtered_unique_age = filtered_df['age'].unique()
filtered_unique_gender = filtered_df['sex'].unique()
filtered_unique_local = filtered_df['localization'].unique()

print(f"Filtered age:{filtered_unique_age}")
print(f"Filtered genders:{filtered_unique_gender}")
print(f"Filtered localization:{filtered_unique_local}")

# No unknown values should remain in the output above.


In [None]:
# Divide the filtered rows into benign and malignant subsets by diagnosis code.

is_benign = filtered_df['dx'].isin(['nv', 'bkl', 'vasc', 'df'])
is_malignant = filtered_df['dx'].isin(['mel', 'bcc', 'akiec'])

benign_df = filtered_df.loc[is_benign]
malignant_df = filtered_df.loc[is_malignant]

# Show which diagnosis codes ended up in each subset.
benign_dx = benign_df['dx'].unique()
malignant_dx = malignant_df['dx'].unique()

print(f"dx:{benign_dx}")
print(f"dx:{malignant_dx}")

In [None]:
from PIL import Image

In [None]:
# Resize the source images to the CNN input size and write them to the working
# directory as JPEG files named "<image_id>.jpg".
# NOTE(review): only HAM10000_images_part_1 is read here, so metadata rows whose
# image is not in this folder are dropped later, when the metadata is matched
# against the resized files. Confirm that restricting to part 1 is intentional.
input_folder = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1"
output_folder = "/kaggle/working/resized_images/" #get output folder

os.makedirs(output_folder, exist_ok=True)

target_size = (224, 224)

for filename in os.listdir(input_folder):
    stem, extension = os.path.splitext(filename)
    # Accept the extension case-insensitively (".JPG" as well as ".jpg").
    if extension.lower() not in (".jpg", ".png"):
        continue

    img_path = os.path.join(input_folder, filename)
    # The context manager closes the source file once the pixels have been
    # converted, instead of leaving that to garbage collection.
    with Image.open(img_path) as source_img:
        img_resized = source_img.convert("RGB").resize(target_size)

    # Always save as "<stem>.jpg": the later cell that builds img_path assumes
    # one .jpg file per image_id, so a resized PNG kept under its original
    # name would never be found.
    img_resized.save(os.path.join(output_folder, f"{stem}.jpg"))

print("Resize succeeded!")

In [None]:
resized_folder = "/kaggle/working/resized_images"

# Recover each image id: the part of the file name before its first dot.
resized_images = []
for resized_file in os.listdir(resized_folder):
    resized_images.append(resized_file.partition(".")[0])

In [None]:
# Final dataset: only the filtered metadata rows that have a resized image,
# re-indexed from zero.
has_resized_image = filtered_df['image_id'].isin(resized_images)
df_final = filtered_df.loc[has_resized_image].reset_index(drop=True)

In [None]:
# Attach a numeric label to every row (benign = 0, malignant = 1).
benign_labels = ['nv', 'bkl', 'vasc', 'df']
malignant_labels = ['mel', 'bcc', 'akiec']


def _binary_label(dx_code):
    """Return 0 for a benign diagnosis code and 1 for any other code."""
    if dx_code in benign_labels:
        return 0
    return 1


df_final['label'] = df_final['dx'].apply(_binary_label)

In [None]:
def _resized_path(image_id):
    """Return the path of the resized .jpg file for one image id."""
    return os.path.join(resized_folder, f"{image_id}.jpg")


# Add the path of the resized image for every row.
df_final['img_path'] = df_final['image_id'].apply(_resized_path)

In [None]:
# Preview the first five rows of the final dataset.
df_final.head(n=5)

In [None]:
# Confirm that the filtering carried through to the final dataset.
print("Shape of df_final:", df_final.shape)

(
    unique_dx_final,
    unique_dx_type_final,
    unique_age_final,
    unique_gender_final,
    unique_local_final,
) = (
    df_final[column].unique()
    for column in ('dx', 'dx_type', 'age', 'sex', 'localization')
)

print(f"dx:{unique_dx_final}")
print(f"dx type:{unique_dx_type_final}")
# NOTE(review): an earlier note here claimed that dx and dx_type are the same;
# compare the two arrays printed above to confirm before relying on that.

print(f"age:{unique_age_final}")
print(f"Unique genders:{unique_gender_final}")
print(f"Unique localization:{unique_local_final}")

In [None]:
#Baseline CNN training
#import required libraries

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report

# NOTE: training is done on the CPU rather than a GPU, so the batch size and number of epochs below are kept small

In [None]:
# Split the dataset 70% train / 30% test, stratified on the label so both
# splits keep the same class balance; the fixed seed makes the split repeatable.
split_options = dict(
    test_size=0.3,
    stratify=df_final['label'],
    random_state=42,
)
train_df, test_df = train_test_split(df_final, **split_options)

In [None]:
# Build the Keras image generators that feed the CNN.
batch_size = 16   # smaller batch for CPU
target_size_model = (224, 224)

# The labels are passed to the generators as strings for class_mode='binary'.
# .assign() builds new frames instead of writing a column into the frames
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
train_df = train_df.assign(label=train_df['label'].astype(str))
test_df = test_df.assign(label=test_df['label'].astype(str))

# Random augmentation is applied to the training images only.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True
)

# Test images are only rescaled by 1/255, never augmented.
test_datagen = ImageDataGenerator(rescale=1./255)

# img_path is built from the resized-images folder path, so no base directory
# is passed to either generator.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    directory=None,
    x_col='img_path',
    y_col='label',
    target_size=target_size_model,
    batch_size=batch_size,
    class_mode='binary',
    seed=42  # fixed seed so shuffling and random transformations are repeatable
)

test_generator = test_datagen.flow_from_dataframe(
    test_df,
    directory=None,
    x_col='img_path',
    y_col='label',
    target_size=target_size_model,
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False  # keep a fixed order so predictions line up with .classes
)

In [None]:
# Baseline CNN: three Conv2D + MaxPooling2D stages (16, 32 and 64 filters),
# followed by a small dense classifier with a single sigmoid output.
cnn_layers = [Input(shape=(224,224,3))]  # the input shape is declared once, here

for n_filters in (16, 32, 64):
    cnn_layers.append(Conv2D(n_filters, (3,3), activation='relu'))
    cnn_layers.append(MaxPooling2D(2,2))

cnn_layers.extend([
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model_CNN = Sequential(cnn_layers)

model_CNN.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_CNN.summary()

In [None]:
# Train the baseline CNN.
# NOTE(review): validation_data is the test generator, so the "val_*" curves
# recorded in history are measured on the test split.
n_epochs = 10  # smaller epochs for CPU
history = model_CNN.fit(
    x=train_generator,
    epochs=n_epochs,
    validation_data=test_generator,
    verbose=1,
)

In [None]:
# Evaluate on the test generator; the result unpacks into the loss and the
# accuracy metric the model was compiled with.
evaluation = model_CNN.evaluate(test_generator)
test_loss, test_acc = evaluation
print("Test Accuracy:", test_acc)

In [None]:
# Sigmoid output of the model for every image in the test generator.
y_pred_probs = model_CNN.predict(test_generator)

# Threshold at 0.5 to obtain hard labels (0=benign, 1=malignant), flattened to 1-D.
y_pred = np.where(y_pred_probs > 0.5, 1, 0).ravel()

# Ground-truth class indices as stored on the generator.
y_true = test_generator.classes

# Recall on the malignant class (label 1).
malignant_recall = recall_score(y_true, y_pred, pos_label=1)
print("Malignant Recall:", malignant_recall)

# Per-class precision / recall / F1 summary.
report = classification_report(y_true, y_pred, target_names=['Benign', 'Malignant'])
print(report)

In [None]:
# Visualize training vs validation performance per epoch.

import matplotlib.pyplot as plt

# One row per figure:
# (train key, train legend, validation key, validation legend, title, y-axis label)
curve_specs = (
    ('accuracy', 'train_acc', 'val_accuracy', 'val_acc',
     'Training vs Validation Accuracy', 'Accuracy'),
    ('loss', 'train_loss', 'val_loss', 'val_loss',
     'Training vs Validation Loss', 'Loss'),
)

for train_key, train_legend, val_key, val_legend, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key], label=train_legend)
    plt.plot(history.history[val_key], label=val_legend)
    plt.title(figure_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()