<a href="https://www.kaggle.com/code/misspriyagill/major-project?scriptVersionId=210336795" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
import random
from sklearn.model_selection import train_test_split
import warnings
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Root folder of the melanoma dataset; the train/ and test/ class folders
# used below are resolved against it.
dataset_path = "/kaggle/input/melanoma-skin-cancer-dataset-of-10000-images/melanoma_cancer_dataset"

def count_images_in_folders(base_path, folder_list):
    """Count the directory entries found in each folder of a dataset.

    Args:
        base_path: Root directory that every entry of ``folder_list`` is
            resolved against.
        folder_list: Iterable of folder paths relative to ``base_path``.

    Returns:
        A dict mapping each folder (exactly as given) to the number of
        entries in that directory. A folder that is missing, or whose path
        is not a directory, maps to 0.

    Note:
        Every directory entry is counted, not only image files.
    """
    counts = {}
    for folder in folder_list:
        folder_path = os.path.join(base_path, folder)
        # isdir rather than exists: a stray *file* at this path must count
        # as 0 instead of making os.listdir raise NotADirectoryError.
        counts[folder] = len(os.listdir(folder_path)) if os.path.isdir(folder_path) else 0
    return counts

# Class folders, relative to dataset_path, that make up the dataset.
folders = ["train/benign", "train/malignant", "test/benign", "test/malignant"]

# Report how many entries each class folder holds.
image_counts = count_images_in_folders(dataset_path, folders)
for folder, count in image_counts.items():
    print(f"{folder}: {count} images")

# Full path of each class folder, in the same order as `folders`.
train_benign_path, train_malignant_path, test_benign_path, test_malignant_path = (
    os.path.join(dataset_path, split, label)
    for split in ("train", "test")
    for label in ("benign", "malignant")
)

def check_filename_consistency(base_path, subfolders):
    """Report files whose names do not follow ``melanoma_<digits>.jpg``.

    Prints one ``Issue in <subfolder>: <filename>`` line for every directory
    entry whose complete name fails the pattern.

    Args:
        base_path: Root directory that every subfolder is resolved against.
        subfolders: Iterable of folder paths relative to ``base_path``.

    Returns:
        None. Findings are only printed.

    Raises:
        OSError: Propagated from ``os.listdir`` when a subfolder cannot be
            listed (for example, it does not exist).
    """
    # fullmatch, not match: re.match anchors only the start of the string,
    # so names with trailing junk ("melanoma_1.jpg.bak") would go unreported.
    expected_name = re.compile(r"melanoma_\d+\.jpg")
    for subfolder in subfolders:
        folder_path = os.path.join(base_path, subfolder)
        for filename in os.listdir(folder_path):
            if not expected_name.fullmatch(filename):
                print(f"Issue in {subfolder}: {filename}")

# Flag any file in the four class folders that breaks the naming scheme.
check_filename_consistency(base_path=dataset_path, subfolders=folders)

In [None]:
# Destination root for the validation split; created up front if missing.
validation_base_path = "/kaggle/working/validation"
os.makedirs(name=validation_base_path, exist_ok=True)

# Training class folders (relative to dataset_path) to draw validation images from.
subfolders = ["train/benign", "train/malignant"]

def create_validation_split(base_path, subfolders, validation_base_path, split_ratio=0.2, random_state=42):
    """Symlink a reproducible random sample of each class folder into a validation tree.

    For every subfolder, a ``split_ratio`` share of its entries is chosen and
    a symlink to each chosen file is created under
    ``<validation_base_path>/<last component of subfolder>``.

    Args:
        base_path: Root directory that every subfolder is resolved against.
        subfolders: Iterable of class folders relative to ``base_path``
            (e.g. ``"train/benign"``).
        validation_base_path: Directory under which the per-class validation
            folders are created.
        split_ratio: Share of each folder to sample, forwarded to
            ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``. With a fixed
            seed and sorted input, re-running selects the same files;
            an unseeded split would pick a new sample on every run and keep
            adding links next to the existing ones.

    Returns:
        None.

    Note:
        Files are linked, not moved, so every validation image is still
        present in its source folder. NOTE(review): exclude them from training
        downstream if the validation set must be disjoint.
    """
    for subfolder in subfolders:
        source_folder = os.path.join(base_path, subfolder)
        if not os.path.isdir(source_folder):
            print(f"Source folder {source_folder} does not exist. Skipping...")
            continue

        target_folder = os.path.join(validation_base_path, subfolder.split("/")[-1])
        os.makedirs(target_folder, exist_ok=True)

        # os.listdir order is arbitrary; sorting makes the seeded split
        # select the same files on every run.
        images = sorted(os.listdir(source_folder))
        if len(images) < 2:
            # train_test_split cannot produce two non-empty parts here.
            print(f"Not enough images in {source_folder} to split. Skipping...")
            continue

        # No `stratify`: each call handles a single class, so there is
        # nothing to stratify over.
        _, val_images = train_test_split(images, test_size=split_ratio, random_state=random_state)

        for val_image in val_images:
            src = os.path.join(source_folder, val_image)
            dst = os.path.join(target_folder, val_image)
            # lexists also detects dangling symlinks, which exists() reports
            # as absent and os.symlink would then reject with FileExistsError.
            if not os.path.lexists(dst):
                os.symlink(src, dst)

# Build the validation folders from the two training class folders.
create_validation_split(
    base_path=dataset_path,
    subfolders=subfolders,
    validation_base_path=validation_base_path,
)
print("Validation split created successfully!")

In [None]:
# Class folders, derived from dataset_path so the dataset location is
# defined in exactly one place.
train_benign_dir = os.path.join(dataset_path, "train", "benign")
train_malignant_dir = os.path.join(dataset_path, "train", "malignant")
test_benign_dir = os.path.join(dataset_path, "test", "benign")
test_malignant_dir = os.path.join(dataset_path, "test", "malignant")

# Count the number of entries in each directory
train_benign = len(os.listdir(train_benign_dir))
train_malignant = len(os.listdir(train_malignant_dir))
test_benign = len(os.listdir(test_benign_dir))
test_malignant = len(os.listdir(test_malignant_dir))

# Debug: print the counts
print(f"train_benign: {train_benign} images")
print(f"train_malignant: {train_malignant} images")
print(f"test_benign: {test_benign} images")
print(f"test_malignant: {test_malignant} images")

# Bar label -> image count, in the order the bars are drawn.
class_counts = {
    "Train Benign": train_benign,
    "Train Malignant": train_malignant,
    "Test Benign": test_benign,
    "Test Malignant": test_malignant,
}

# Parallel lists for plotting (also reused by later cells).
keys = list(class_counts)
values = list(class_counts.values())

# Debug: print the keys and values
print(f"Keys: {keys}")
print(f"Values: {values}")

plt.figure(figsize=(8, 6))
sns.barplot(x=keys, y=values, palette='coolwarm')
plt.xlabel('Class', fontsize=12)
plt.ylabel('Number of Images', fontsize=12)
plt.title('Class Distribution in Dataset', fontsize=14)
# Write each count just above its bar.
for index, value in enumerate(values):
    plt.text(index, value + 5, str(value), ha='center', fontsize=10)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Same class distribution as a plain matplotlib bar chart; colours alternate
# blue/red across the four bars in `keys`.
bar_colors = ['blue', 'red'] * 2
plt.bar(keys, values, color=bar_colors)
plt.title('Class Distribution in Dataset')
plt.xlabel('Class')
plt.ylabel('Number of Images')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Benign-to-malignant ratio for each split (1.00 means equal class sizes).
train_ratio, test_ratio = (
    train_benign / train_malignant,
    test_benign / test_malignant,
)

print(f"Train Benign to Malignant Ratio: {train_ratio:.2f}")
print(f"Test Benign to Malignant Ratio: {test_ratio:.2f}")

In [None]:
base_path = '/kaggle/input/melanoma-skin-cancer-dataset-of-10000-images/melanoma_cancer_dataset'
train_split_path = os.path.join(base_path, 'train')
test_split_path = os.path.join(base_path, 'test')

# Top-level contents of each split.
train_dir = os.listdir(train_split_path)
test_dir = os.listdir(test_split_path)

print("Train directory:", train_dir)
print("Test directory:", test_dir)

# NOTE: from here on the *_dir names hold directory *listings* (lists of
# entry names), not paths.
train_benign_dir = os.listdir(os.path.join(train_split_path, 'benign'))
train_malignant_dir = os.listdir(os.path.join(train_split_path, 'malignant'))

test_benign_dir = os.listdir(os.path.join(test_split_path, 'benign'))
test_malignant_dir = os.listdir(os.path.join(test_split_path, 'malignant'))

print(f"Train Benign Files: {len(train_benign_dir)}")
print(f"Train Malignant Files: {len(train_malignant_dir)}")
print(f"Test Benign Files: {len(test_benign_dir)}")
print(f"Test Malignant Files: {len(test_malignant_dir)}")


In [None]:
# List the training class folders and report how many files each holds.
# os.listdir needs a directory path: `train_benign` / `train_malignant` hold
# integer counts, so the *_path variables are used instead.
try:
    train_benign_files = os.listdir(train_benign_path)
    train_benign_count = len(train_benign_files)
    print(f"Train Benign Files: {train_benign_count}")
    train_malignant_files = os.listdir(train_malignant_path)
    train_malignant_count = len(train_malignant_files)
    print(f"Train Malignant Files: {train_malignant_count}")

except OSError as e:
    # Only filesystem problems are expected here; anything else (e.g. a
    # NameError) should surface instead of being reported as a file error.
    print(f"Error accessing files: {e}")


In [None]:
# Per-class validation folders: create_validation_split names each one after
# the last component of its source subfolder.
validation_benign = os.path.join(validation_base_path, "benign")
validation_malignant = os.path.join(validation_base_path, "malignant")

# List the validation class folders and report how many files each holds.
try:
    validation_benign_files = os.listdir(validation_benign)
    validation_benign_count = len(validation_benign_files)
    print(f"Validation Benign Files: {validation_benign_count}")
    validation_malignant_files = os.listdir(validation_malignant)
    validation_malignant_count = len(validation_malignant_files)
    print(f"Validation Malignant Files: {validation_malignant_count}")

except OSError as e:
    # Only filesystem problems are expected here; anything else should
    # surface instead of being reported as a file error.
    print(f"Error accessing files: {e}")

In [None]:
categories = ['Benign', 'Malignant']
class_colors = ['blue', 'red']

# Read the counts from disk instead of hard-coding them, so the charts
# always reflect the folders as they currently are.
train_counts = [
    len(os.listdir(os.path.join(dataset_path, "train", category.lower())))
    for category in categories
]
validation_counts = [
    len(os.listdir(os.path.join(validation_base_path, category.lower())))
    for category in categories
]

# (title prefix, counts) for the left and right panel of each figure.
split_counts = [("Training", train_counts), ("Validation", validation_counts)]

# Figure 1: absolute class counts per split.
plt.figure(figsize=(12, 6))
for panel, (split_name, counts) in enumerate(split_counts, start=1):
    plt.subplot(1, 2, panel)
    plt.bar(categories, counts, color=class_colors)
    plt.title(f'{split_name} Set Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Number of Images')
plt.tight_layout()
plt.show()

# Figure 2: class proportions per split.
plt.figure(figsize=(12, 6))
for panel, (split_name, counts) in enumerate(split_counts, start=1):
    plt.subplot(1, 2, panel)
    plt.pie(counts, labels=categories, autopct='%1.1f%%', startangle=90, colors=class_colors)
    plt.title(f'{split_name} Set Class Proportion')
plt.tight_layout()
plt.show()

In [None]:
# Share of each class within its split; index 0 is benign, index 1 malignant.
train_total = sum(train_counts)
train_benign_ratio = train_counts[0] / train_total
train_malignant_ratio = train_counts[1] / train_total

validation_total = sum(validation_counts)
validation_benign_ratio = validation_counts[0] / validation_total
validation_malignant_ratio = validation_counts[1] / validation_total

print(f"Training Set - Benign Ratio: {train_benign_ratio:.2f}")
print(f"Training Set - Malignant Ratio: {train_malignant_ratio:.2f}")
print(f"Validation Set - Benign Ratio: {validation_benign_ratio:.2f}")
print(f"Validation Set - Malignant Ratio: {validation_malignant_ratio:.2f}")

In [None]:
# Show a few randomly chosen training images from each class.
train_dir = '/kaggle/input/melanoma-skin-cancer-dataset-of-10000-images/melanoma_cancer_dataset/train/'
class_folders = ['benign', 'malignant']
samples_per_class = 5
image_extensions = ('.jpg', '.png')

for class_folder in class_folders:
    class_dir = os.path.join(train_dir, class_folder)
    # Case-insensitive match so files such as "X.JPG" are not skipped.
    image_filenames = [f for f in os.listdir(class_dir) if f.lower().endswith(image_extensions)]
    if not image_filenames:
        print(f"No images found in {class_dir}")
        continue

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available images.
    random_images = random.sample(image_filenames, min(samples_per_class, len(image_filenames)))

    plt.figure(figsize=(15, 10))
    for i, img_filename in enumerate(random_images):
        img_path = os.path.join(class_dir, img_filename)
        plt.subplot(1, samples_per_class, i + 1)
        # The context manager closes the file handle once the image is drawn.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')
        plt.title(f'{class_folder} - {img_filename}')

    plt.show()

In [None]:
# img_to_array is used below but is not among the notebook's top-level imports.
from tensorflow.keras.preprocessing.image import img_to_array

# Random geometric augmentations applied to each generated image.
data_gen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

augmented_dir = "augmented_images"
os.makedirs(augmented_dir, exist_ok=True)

# Full paths of the malignant training images, sorted so the same source
# images are picked on every run.
malignant_images = sorted(
    os.path.join(train_malignant_path, name)
    for name in os.listdir(train_malignant_path)
    if name.lower().endswith(('.jpg', '.png'))
)

source_image_limit = 10      # number of source images to augment
augmentations_per_image = 6  # augmented files written per source image
preview_count = 5            # images shown in each preview row

# Write augmented variants of the first few malignant images to augmented_dir.
for img_path in malignant_images[:source_image_limit]:
    try:
        img = load_img(img_path)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # add a leading batch axis of size 1
        flow = data_gen.flow(x, batch_size=1, save_to_dir=augmented_dir, save_prefix='aug', save_format='jpg')
        # flow() yields batches indefinitely (each one is saved to disk),
        # so stop explicitly.
        for generated, _ in enumerate(flow, start=1):
            if generated >= augmentations_per_image:
                break
    except (OSError, ValueError) as e:
        print(f"Error processing {img_path}: {e}")

# Preview: in-memory augmentations of the first malignant image.
if malignant_images:
    img = load_img(malignant_images[0])
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
    plt.figure(figsize=(15, 5))
    for i, batch in enumerate(data_gen.flow(x, batch_size=1)):
        if i == preview_count:
            break
        plt.subplot(1, preview_count, i + 1)
        plt.imshow(batch[0].astype('uint8'))
        plt.axis('off')
    plt.show()
else:
    print("No malignant images found.")

# Preview: augmented files that were written to disk.
augmented_files = sorted(
    os.path.join(augmented_dir, f) for f in os.listdir(augmented_dir) if f.endswith('.jpg')
)
if augmented_files:
    # Own figure, so this row gets the same size as the preview above.
    plt.figure(figsize=(15, 5))
    for i, img_path in enumerate(augmented_files[:preview_count]):
        img = load_img(img_path)
        plt.subplot(1, preview_count, i + 1)
        plt.imshow(img)
        plt.axis('off')
    plt.show()
else:
    print("No augmented images found.")


PRIYA SAYS:      I am done with my part. I completed steps 1, 2, 3, 5, and 7, plus a small amount of data augmentation (step 6).
                 Please consult me before making any changes to the original (this) notebook!
Make a copy of this notebook and edit it locally.