In [None]:
# STEP 1: Restrict the whole session to the CPU
import os

# A device index of "-1" leaves no CUDA device visible, which disables the GPU
# for the libraries imported in the later cells.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [None]:
# STEP 2: Install required packages
!pip install -q kaggle timm albumentations
!pip install -q seaborn scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# STEP 3: Import libraries
import os
import pandas as pd
import numpy as np
import cv2
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import shutil
import zipfile
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.models import Model, clone_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
# STEP 4: Upload kaggle.json to access dataset
# Opens the Colab file-upload widget. The chosen file is written to the current
# working directory; the shell commands below expect it to be named exactly
# `kaggle.json`.
from google.colab import files
files.upload()  # Upload your kaggle.json

# Save to correct location
# Move the credentials file into ~/.kaggle/ and restrict it to owner
# read/write (mode 600), since it contains an API secret.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Download dataset
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

# Unzip
!unzip -q skin-cancer-mnist-ham10000.zip -d ham10000_data

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.20G/5.20G [02:33<00:00, 27.5MB/s]
100% 5.20G/5.20G [02:33<00:00, 36.3MB/s]


In [None]:
# Directories that hold the extracted JPEG images
image_dir1 = "ham10000_data/HAM10000_images_part_1"
image_dir2 = "ham10000_data/HAM10000_images_part_2"

# Metadata table; the code below reads its `image_id` and `dx` columns
df = pd.read_csv("ham10000_data/HAM10000_metadata.csv")

# Lookup table: file name without extension -> path of the .jpg on disk
all_image_paths = {
    os.path.splitext(image_file)[0]: os.path.join(folder, image_file)
    for folder in (image_dir1, image_dir2)
    for image_file in os.listdir(folder)
    if image_file.endswith('.jpg')
}

# Resolve every row to its image file and discard rows whose image is missing
df = (
    df.assign(path=df['image_id'].map(all_image_paths))
    .dropna(subset=['path'])
    .reset_index(drop=True)
)

# The `dx` column is used as the categorical class label
df['label'] = df['dx']

# Prepare k-fold cross validation (stratified on the class label, fixed seed)
# NOTE(review): the split is made per image. If the metadata identifies lesions
# and a lesion has several images, images of one lesion can end up in both the
# training and validation folds — confirm, and consider a grouped split.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Data generators (augmentation only on training)
# The ImageNet-pretrained MobileNetV2 used for training expects pixels scaled to
# [-1, 1]. `preprocess_input` performs exactly that scaling, so it is used here
# instead of `rescale=1./255`, which would feed the network [0, 1] inputs.
# Any inference code that loads the saved checkpoints must apply the same function.
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

train_datagen = ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1
)
# Validation images get the same pixel scaling but no augmentation
val_datagen = ImageDataGenerator(preprocessing_function=mobilenet_preprocess)

# Train one MobileNetV2 classifier per cross-validation fold.
# `histories` receives one (stage-1 History, stage-2 History) tuple per fold.
histories = []
for fold_no, (train_idx, val_idx) in enumerate(skf.split(df, df['label']), start=1):
    print(f"\n--- Fold {fold_no} ---")

    # A fresh model is built on every iteration; clear Keras' global state so
    # leftovers from the previous fold do not accumulate across the loop.
    tf.keras.backend.clear_session()

    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Generators
    train_gen = train_datagen.flow_from_dataframe(
        train_df, x_col='path', y_col='label',
        target_size=(224, 224), batch_size=32,
        class_mode='categorical', shuffle=True
    )
    # Validation batches are not shuffled, so their order follows val_df
    val_gen = val_datagen.flow_from_dataframe(
        val_df, x_col='path', y_col='label',
        target_size=(224, 224), batch_size=32,
        class_mode='categorical', shuffle=False
    )

    num_classes = len(train_gen.class_indices)

    # Build base model: ImageNet MobileNetV2 without its classifier, followed by
    # global pooling, dropout, a 128-unit hidden layer and a softmax output.
    base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224,224,3))
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=preds)

    # Freeze all base layers so stage 1 only trains the new head
    for layer in base_model.layers:
        layer.trainable = False

    # Compile
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks (the same instances are passed to both training stages)
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    # One checkpoint file per fold, written when val_accuracy improves
    ckpt = ModelCheckpoint(f"mobilenetv2_fold{fold_no}.h5", monitor='val_accuracy', save_best_only=True)
    rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

    # Stage 1: Train top layers
    history1 = model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=15,
        callbacks=[es, ckpt, rlrop]
    )

    # Stage 2: Fine-tune some of the base model layers
    # Unfreeze the last 20 layers, but keep BatchNormalization layers frozen:
    # a frozen BatchNormalization layer keeps using its pretrained statistics,
    # which is the recommended setup when fine-tuning a pretrained network.
    for layer in base_model.layers[-20:]:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True
    # Recompile with lower lr (changes to `trainable` only take effect on compile)
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Continue training
    history2 = model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=10,
        callbacks=[es, ckpt, rlrop]
    )

    histories.append((history1, history2))

# After all folds, you can analyze average accuracy and loss
print("Training complete. Analyze `histories` for per-fold metrics.")


NameError: name 'pd' is not defined