<a href="https://colab.research.google.com/github/manavidubey/skin_cancer/blob/main/P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub. `path` holds the
# location that dataset_download reports for the downloaded files.
dataset_handle = "fanconic/skin-cancer-malignant-vs-benign"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/fanconic/skin-cancer-malignant-vs-benign/versions/4


In [2]:
# Step 1: Install Kaggle and authenticate
!pip install -q kaggle

# Upload your Kaggle API key
from google.colab import files
files.upload()  # Choose the 'kaggle.json' file

# Move kaggle.json to the proper directory and set permissions
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 2: Download the Skin Cancer dataset from Kaggle
!kaggle datasets download -d fanconic/skin-cancer-malignant-vs-benign

# Unzip the downloaded dataset
!unzip -q skin-cancer-malignant-vs-benign.zip -d skin_cancer_data

# Step 3: Set up train and test directories



KeyboardInterrupt: 

In [None]:
import os
from sklearn.model_selection import train_test_split
import shutil

# Define base directory for extracted dataset
base_dir = "skin_cancer_data"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")

# Define the original dataset folder (unzipped location)
# NOTE(review): assumes <base_dir>/data/<label>/ holds the images -- confirm
# against the actual layout of the unzipped archive.
data_dir = os.path.join(base_dir, "data")

# Create an 80% train / 20% test split, one class folder at a time
for label in ['benign', 'malignant']:
    src_dir = os.path.join(data_dir, label)

    # Sort the listing: os.listdir returns entries in arbitrary order, so
    # without sorting the seeded split below would not be reproducible from
    # one machine to the next. Sub-directories are skipped because
    # shutil.copy only handles regular files.
    all_images = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )

    # Split images into training and testing sets
    train_images, test_images = train_test_split(all_images, test_size=0.2, random_state=42)

    # Copy (the originals are left in place) each image into its split folder
    for subset_dir, subset_images in ((train_dir, train_images), (test_dir, test_images)):
        dest_dir = os.path.join(subset_dir, label)
        os.makedirs(dest_dir, exist_ok=True)
        for image in subset_images:
            shutil.copy(os.path.join(src_dir, image), os.path.join(dest_dir, image))

print(f"Training data directory: {train_dir}")
print(f"Testing data directory: {test_dir}")

In [None]:
import os
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Locations of the train and test image folders
dataset_dir = 'skin_cancer_data'
train_dir, test_dir = (os.path.join(dataset_dir, split) for split in ('train', 'test'))

# Preprocessing applied to both splits (no random augmentation is included)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # every image becomes 224x224
    transforms.ToTensor(),          # image -> PyTorch tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # (x - 0.5) / 0.5 per channel
])

# One ImageFolder dataset per split; the class label comes from the sub-folder
train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
test_dataset = datasets.ImageFolder(root=test_dir, transform=transform)

# Both loaders share batch size and worker count; only training is shuffled
loader_options = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Summary of what was loaded
class_names = train_dataset.classes
print(f"Classes: {class_names}")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

# Optional sanity check: pull a single batch and inspect it
data_iter = iter(train_loader)
images, labels = next(data_iter)
print(f"Batch image shape: {images.shape}")
print(f"Batch labels: {labels}")


In [None]:
import os
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import kagglehub

# Download the dataset using kagglehub
fanconic_skin_cancer_malignant_vs_benign_path = kagglehub.dataset_download('fanconic/skin-cancer-malignant-vs-benign')

# Point vis_dir at the downloaded dataset.
# Assumes a 'train' folder is present inside the downloaded dataset.
vis_dir = os.path.join(fanconic_skin_cancer_malignant_vs_benign_path, 'train')

# Only sub-directories are class folders; a stray file here would make the
# os.listdir(class_dir) call below fail.
classes = [
    entry for entry in os.listdir(vis_dir)
    if os.path.isdir(os.path.join(vis_dir, entry))
]

# Collect the path of every image across all classes
image_paths = []
for cls in classes:
    class_dir = os.path.join(vis_dir, cls)
    images = os.listdir(class_dir)
    for img in images:
        image_paths.append(os.path.join(class_dir, img))

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of available images.
random_images = random.sample(image_paths, min(16, len(image_paths)))

fig, axes = plt.subplots(4, 4, figsize=(10, 10))

for i, ax in enumerate(axes.flat):
    # Panels beyond the sampled images are left blank
    if i < len(random_images):
        img = mpimg.imread(random_images[i])
        ax.imshow(img)
    ax.axis('off')  # Hide axes

plt.show()

In [None]:

# Count the entries in each class folder under vis_dir and plot them as bars
classes = os.listdir(vis_dir)

per_class_totals = [len(os.listdir(os.path.join(vis_dir, name))) for name in classes]
image_count = dict(zip(classes, per_class_totals))

plt.figure(figsize=(12, 6))
plt.bar(list(image_count.keys()), list(image_count.values()), color='skyblue')
plt.title('Number of Images per Class in the Training Data')
plt.xlabel('Class')
plt.ylabel('Number of Images')
plt.xticks(rotation=90, ha='right')
plt.show()

In [None]:
import os
import matplotlib.pyplot as plt

# Assumes a 'test' folder sits inside the downloaded dataset directory;
# adjust the path below if the layout differs.
test_vis_dir = os.path.join(fanconic_skin_cancer_malignant_vs_benign_path, 'test')

# Report a missing directory first; otherwise count and plot
if not os.path.exists(test_vis_dir):
    print(f"Error: Directory not found: {test_vis_dir}")
    print("Please check the path and ensure the 'test' folder exists within the downloaded dataset.")
else:
    classes = os.listdir(test_vis_dir)

    per_class_totals = [len(os.listdir(os.path.join(test_vis_dir, name))) for name in classes]
    image_count = dict(zip(classes, per_class_totals))

    plt.figure(figsize=(12, 6))
    plt.bar(list(image_count.keys()), list(image_count.values()), color='orange')
    plt.title('Number of Images per Class in the Test Data')
    plt.xlabel('Class')
    plt.ylabel('Number of Images')
    plt.xticks(rotation=90, ha='right')
    plt.show()

In [None]:
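# Don't run this cell (it reads from /kaggle/input and writes to /kaggle/working, which are Kaggle-specific paths).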


import os
from PIL import Image


# Source folder for each subset, and the root folder the resized copies go to
input_dirs = dict(
    train='/kaggle/input/skin-cancer-malignant-vs-benign/train',
    test='/kaggle/input/skin-cancer-malignant-vs-benign/test',
)
output_dir = '/kaggle/working/'

# Make sure each subset has an output folder before any processing starts
for subset in input_dirs:
    subset_dir = os.path.join(output_dir, subset)
    if os.path.exists(subset_dir):
        continue
    os.makedirs(subset_dir)

def resize_and_save_image(input_path, output_path, size=(224, 224)):
    """Resize one image and write it to ``output_path`` encoded as JPEG.

    Args:
        input_path: Path of the image file to read.
        output_path: Path the resized image is written to. The data is always
            JPEG-encoded, whatever extension this path carries.
        size: Target ``(width, height)`` passed to ``Image.resize``.

    Returns:
        None. Any exception raised while opening, converting, resizing or
        saving is caught and reported with ``print`` (best effort), so one bad
        file does not abort a batch run.
    """
    # Pixel modes the JPEG writer can store directly; anything else has to be
    # converted first or ``save`` fails.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    try:
        with Image.open(input_path) as img:
            # Palette images go through RGBA first so that any palette
            # transparency is expanded before the alpha channel is dropped.
            if img.mode == 'P':
                img = img.convert('RGBA')

            # Alpha-carrying and other non-JPEG modes are flattened to RGB.
            if img.mode not in jpeg_modes:
                img = img.convert('RGB')

            img = img.resize(size, Image.LANCZOS)
            img.save(output_path, format='JPEG')
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

def process_directory(input_directory, output_directory):
    """Mirror ``input_directory`` under ``output_directory``, resizing images.

    Every folder found by ``os.walk`` is recreated at the same relative
    position under ``output_directory``. Files whose lower-cased name ends in
    ``.png``, ``.jpg`` or ``.jpeg`` are handed to ``resize_and_save_image``;
    all other files are ignored.

    Args:
        input_directory: Root of the tree to read.
        output_directory: Root of the tree to write.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    for root, _subdirs, file_names in os.walk(input_directory):
        # Recreate the current folder at the same relative spot in the output
        output_path = os.path.join(output_directory, os.path.relpath(root, input_directory))
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        for file_name in file_names:
            if not file_name.lower().endswith(image_suffixes):
                continue
            resize_and_save_image(
                os.path.join(root, file_name),
                os.path.join(output_path, file_name),
            )

# Resize every subset into the folder of the same name under output_dir
for subset in input_dirs:
    dir_path = input_dirs[subset]
    process_directory(dir_path, os.path.join(output_dir, subset))

print("Resizing and saving images completed.")

In [None]:

# Folder holding the resized training images, one sub-folder per class
vis_dir = '/kaggle/working/train'

classes = os.listdir(vis_dir)

# Collect the path of every image across all classes
image_paths = []
for cls in classes:
    class_dir = os.path.join(vis_dir, cls)
    images = os.listdir(class_dir)
    for img in images:
        image_paths.append(os.path.join(class_dir, img))

# Sample once, after all paths are collected: random.sample raises ValueError
# when the population is smaller than the requested sample, which is always
# the case while the list is still being filled. min() also covers datasets
# with fewer than 16 images.
random_images = random.sample(image_paths, min(16, len(image_paths)))

fig, axes = plt.subplots(4, 4, figsize=(10, 10))

for i, ax in enumerate(axes.flat):
    # Panels beyond the sampled images are left blank
    if i < len(random_images):
        img = mpimg.imread(random_images[i])
        ax.imshow(img)
    ax.axis('off')  # Hide axes

# Display the plot
plt.show()

In [None]:
# Don't run this cell (it reads the Kaggle-specific path /kaggle/working/train).


import os
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

vis_dir = '/kaggle/working/train'

classes = os.listdir(vis_dir)

# Flatten every class folder into a single list of image file paths
image_paths = []
for cls in classes:
    class_dir = os.path.join(vis_dir, cls)
    images = os.listdir(class_dir)
    image_paths.extend(os.path.join(class_dir, img) for img in images)

# Show at most 16 images: a random draw when enough exist, otherwise all of
# them (which may be none) in listing order
if len(image_paths) >= 16:
    random_images = random.sample(image_paths, 16)
else:
    random_images = image_paths

fig, axes = plt.subplots(4, 4, figsize=(10, 10))

for i, ax in enumerate(axes.flat):
    # Only the first len(random_images) panels get a picture; every panel,
    # filled or empty, has its axes hidden
    if i < len(random_images):
        img = mpimg.imread(random_images[i])
        ax.imshow(img)
    ax.axis('off')

# Display the plot
plt.show()

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

# NOTE: train_dir and test_dir are not defined in this cell; they must already
# exist in the kernel from an earlier cell.

# Verify the directory structure
print("Train directory classes:", os.listdir(train_dir))
print("Test directory classes:", os.listdir(test_dir))

# Training images: scale pixel values by 1/255 and apply light random
# augmentation
augmentation_options = dict(
    rescale=1./255,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Test images: scaling only, no augmentation
test_datagen = ImageDataGenerator(rescale=1./255)

# Both generators read 224x224 images in batches of 20 with binary labels
flow_options = dict(
    seed=777,
    target_size=(224, 224),
    batch_size=20,
    class_mode='binary',
)
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
test_generator = test_datagen.flow_from_directory(test_dir, **flow_options)

In [None]:
from tensorflow.keras.applications import Xception, ResNet50, VGG16, InceptionV3, DenseNet121
from tensorflow.keras.applications import MobileNet, MobileNetV2, NASNetMobile, EfficientNetB0, EfficientNetB3, EfficientNetB4, ResNet101, ResNet152
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

def create_model(base_model_class, input_shape=(224, 224, 3),
                 dense_units=(1024, 512), dropout_rates=(0.8, 0.5)):
    """Build and compile a binary classifier on top of a frozen backbone.

    The backbone is instantiated with ImageNet weights and without its top
    layers, and is marked non-trainable. Its output is globally average-pooled
    and passed through one Dense(relu) -> BatchNormalization -> Dropout stage
    per entry in ``dense_units``/``dropout_rates``, followed by a single
    sigmoid unit.

    Args:
        base_model_class: Callable accepting ``weights``, ``include_top`` and
            ``input_shape`` keyword arguments and returning a Keras model
            (e.g. a ``tensorflow.keras.applications`` constructor).
        input_shape: Input shape handed to the backbone.
        dense_units: Width of each hidden Dense layer in the head, in order.
        dropout_rates: Dropout rate applied after each hidden stage; must have
            the same length as ``dense_units``.

    Returns:
        A compiled ``Model`` (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Raises:
        ValueError: If ``dense_units`` and ``dropout_rates`` differ in length.

    NOTE(review): the backbones in keras.applications each document their own
    expected input scaling; confirm the scaling applied by the data generators
    suits every backbone passed in here.
    """
    # Checked before the backbone is built so a bad configuration fails fast,
    # without downloading or loading any weights.
    if len(dense_units) != len(dropout_rates):
        raise ValueError(
            "dense_units and dropout_rates must have the same length, "
            f"got {len(dense_units)} and {len(dropout_rates)}"
        )

    base_model = base_model_class(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False

    x = GlobalAveragePooling2D()(base_model.output)
    for units, rate in zip(dense_units, dropout_rates):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(rate)(x)
    predictions = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:

# Candidate backbones to compare. Written as a name -> constructor mapping and
# then flattened into the list of (name, constructor) tuples that the training
# loop iterates over; dict insertion order keeps the sequence unchanged.
_backbone_by_name = {
    'Xception': Xception,
    'ResNet50': ResNet50,
    'VGG16': VGG16,
    'InceptionV3': InceptionV3,
    'DenseNet121': DenseNet121,
    'MobileNet': MobileNet,
    'MobileNetV2': MobileNetV2,
    'NASNetMobile': NASNetMobile,
    'EfficientNetB0': EfficientNetB0,
    'EfficientNetB3': EfficientNetB3,
    'EfficientNetB4': EfficientNetB4,
    'ResNet101': ResNet101,
    'ResNet152': ResNet152,
}
models = list(_backbone_by_name.items())


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ProgbarLogger
from tqdm.keras import TqdmCallback  # Add tqdm for enhanced progress bars

# Local import: used to release Keras state between candidate models
from tensorflow.keras.backend import clear_session

# Epochs each candidate is trained for (shared by fit() and the progress bar)
EPOCHS_PER_MODEL = 10

# Initialize variables to track the best model
best_model_name = None
best_val_accuracy = 0

# Validate the generators once, up front: the check does not depend on the
# model, so there is no reason to build a network before running it.
# len() on an object without __len__ raises TypeError, so that is caught
# alongside AttributeError.
generators_ok = True
try:
    if len(train_generator) == 0:
        raise ValueError("Training generator is empty. Check the data path or preprocessing.")

    if len(test_generator) == 0:
        raise ValueError("Validation generator is empty. Check the data path or preprocessing.")
except (AttributeError, TypeError):
    print("Error: Data generators must support the length operation. Check your generator implementation.")
    generators_ok = False

# NOTE(review): test_generator is used as the validation data that picks the
# best model, so the reported accuracy is not an independent test estimate.
candidates = models if generators_ok else []

# Iterate over the list of models
for model_name, model_class in candidates:
    print(f"\nTraining model: {model_name}...")

    # Drop the global Keras state left by the previous candidate so memory
    # does not keep growing across the sweep.
    clear_session()

    # Create a new model instance
    model = create_model(model_class)

    # Define ModelCheckpoint to save the best version of the current model
    checkpoint = ModelCheckpoint(
        filepath=f'{model_name}_best_model.keras',  # Unique file for each model
        monitor='val_accuracy',
        verbose=1,
        save_best_only=True,
        mode='max'
    )

    # Add tqdm progress callback
    progress_callback = TqdmCallback(epochs=EPOCHS_PER_MODEL)

    # Train the model; a ValueError skips this candidate instead of ending
    # the whole sweep
    try:
        history = model.fit(
            train_generator,
            epochs=EPOCHS_PER_MODEL,
            validation_data=test_generator,
            verbose=0,  # Suppress default verbose to avoid duplication
            callbacks=[checkpoint, progress_callback]
        )
    except ValueError as e:
        print(f"Error during training of {model_name}: {e}")
        continue

    # Extract the highest validation accuracy from the training history
    if 'val_accuracy' in history.history and history.history['val_accuracy']:
        val_accuracy = max(history.history['val_accuracy'])
    else:
        print(f"Warning: No validation accuracy found for {model_name}. Skipping...")
        continue

    # Update the best model if the current model performs better
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_name = model_name

    # Log the performance of the current model
    print(f"{model_name} validation accuracy: {val_accuracy:.4f}")

# Final output: the best model
if best_model_name:
    print(f"\nThe best performing model is {best_model_name} with a validation accuracy of {best_val_accuracy:.4f}")
else:
    print("\nNo model produced a validation accuracy. Please check your models and data.")


In [None]:
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization


# DenseNet121 feature extractor with ImageNet weights, without its top layers,
# and frozen so that only the new head is trainable
base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

# Classification head: pooled features, then two Dense/BatchNorm/Dropout
# stages, then a single sigmoid unit
x = GlobalAveragePooling2D()(base_model.output)
for units, drop_rate in ((1024, 0.8), (512, 0.5)):
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(drop_rate)(x)
predictions = Dense(1, activation='sigmoid')(x)

# Assemble and compile the full network, then print its layer summary
model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint


# Write the model to disk whenever validation accuracy reaches a new maximum
checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train on the training generator, validating against the test generator
fit_options = dict(
    epochs=150,
    validation_data=test_generator,
    verbose=2,
    callbacks=[checkpoint],
)
history = model.fit(train_generator, **fit_options)

# Pull the per-epoch curves out of the History object for later plotting
metrics_log = history.history
train_accuracy, train_loss = metrics_log['accuracy'], metrics_log['loss']
val_accuracy, val_loss = metrics_log['val_accuracy'], metrics_log['val_loss']

In [None]:
!pip install visualkeras

In [None]:
from tensorflow.keras.models import load_model
import visualkeras
from PIL import Image


# Load the checkpoint from the same relative path that ModelCheckpoint wrote
# it to ('best_model.keras' in the working directory), so the cell does not
# depend on a /kaggle/working folder existing.
model_path = 'best_model.keras'
model = load_model(model_path)

# layered_view returns an image object: save it, and reuse that same object
# for display instead of re-opening the file from disk.
visualization_path = 'model_visualization.png'
img = visualkeras.layered_view(model)
img.save(visualization_path)

# Display the image using matplotlib
plt.figure(figsize=(20, 10))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
import matplotlib.pyplot as plt



# One panel per metric: (metric name, training curve, validation curve, legend corner)
panels = [
    ('Accuracy', train_accuracy, val_accuracy, 'lower right'),
    ('Loss', train_loss, val_loss, 'upper right'),
]

plt.figure(figsize=(12, 6))

# Draw training and validation curves side by side for each metric
for position, (metric, train_curve, val_curve, legend_corner) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(train_curve, label=f'Train {metric}')
    plt.plot(val_curve, label=f'Validation {metric}')
    plt.title(f'Model {metric}')
    plt.ylabel(metric)
    plt.xlabel('Epoch')
    plt.legend(loc=legend_corner)

# Show the plots
plt.tight_layout()
plt.show()
