In [1]:
# --- Cell 1: Import Libraries ---
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
import cv2

# --- Cell 2: Define Dataset Paths ---
# Adjust these paths according to your actual folder structure
train_dir = 'diagnosis-breast-cancer/complete_set/training_set'
test_dir = 'diagnosis-breast-cancer/complete_set/testing_set'


data_dir = 'diagnosis-breast-cancer/complete_set/training_set'  # Using training folder

# --- Cell 3: Image Preprocessing and Feature Extraction ---
def extract_features(image_paths, target_size=(224, 224)):
    """
    Extract features from images using a pre-trained model (VGG16)
    """
    # Load pre-trained VGG16 model without top classification layers
    base_model = VGG16(weights='imagenet', include_top=False, 
                       input_shape=(224, 224, 3))
    
    features = []
    labels = []
    
    for image_path in image_paths:
        # Read and preprocess image
        img = cv2.imread(image_path)
        if img is not None:
            # Resize image to match VGG16 input size
            img = cv2.resize(img, target_size)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
            # Normalize pixel values
            img = img / 255.0
            
            # Expand dimensions to create batch of size 1
            img_batch = np.expand_dims(img, axis=0)
            
            # Extract features
            feature = base_model.predict(img_batch, verbose=0)
            
            # Flatten the features
            feature_flat = feature.flatten()
            
            features.append(feature_flat)
            
            # Extract label from folder name
            if 'malignant' in image_path:
                labels.append('High Priority')
            else:
                labels.append('Low Priority')
    
    return np.array(features), np.array(labels)

# --- Cell 4: Load and Prepare Image Data ---
def load_image_paths(data_directory):
    """
    Load all image paths and their corresponding labels
    """
    image_paths = []
    
    # Process malignant images (High Priority)
    malignant_path = os.path.join(data_directory, 'malignant')
    if os.path.exists(malignant_path):
        for img_file in os.listdir(malignant_path):
            if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_paths.append(os.path.join(malignant_path, img_file))
    
    # Process benign images (Low Priority)
    benign_path = os.path.join(data_directory, 'benign')
    if os.path.exists(benign_path):
        for img_file in os.listdir(benign_path):
            if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_paths.append(os.path.join(benign_path, img_file))
    
    return image_paths

# Load all image paths
print("Loading image paths...")
image_paths = load_image_paths(data_dir)
print(f"Found {len(image_paths)} images")

# --- Cell 5: Extract Features ---
print("Extracting features from images...")
X, y = extract_features(image_paths)

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Class distribution: {pd.Series(y).value_counts()}")

# --- Cell 6: Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# --- Cell 7: Train Random Forest Model ---
print("Training Random Forest model...")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Cell 8: Evaluate Model ---
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label='High Priority', average='binary')

print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score (for 'High Priority'): {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- Cell 9: Feature Importance (Optional) ---
# Since we're using image features, feature importance is less interpretable
# but we can check the overall feature importance pattern
feature_importance = model.feature_importances_
print(f"\nFeature importance stats:")
print(f"Mean importance: {np.mean(feature_importance):.6f}")
print(f"Max importance: {np.max(feature_importance):.6f}")
print(f"Min importance: {np.min(feature_importance):.6f}")

# --- Cell 10: Sample Prediction Visualization ---
def visualize_predictions(X_test, y_test, y_pred, image_paths_test, num_samples=5):
    """
    Visualize sample predictions (conceptual - would need original images)
    """
    print(f"\nSample Predictions (first {num_samples}):")
    for i in range(min(num_samples, len(y_test))):
        status = "✓" if y_test[i] == y_pred[i] else "✗"
        print(f"Sample {i+1}: True={y_test[i]}, Predicted={y_pred[i]} {status}")

# Note: To properly visualize, we would need to keep track of which test images
# correspond to which predictions

print("\nModel training and evaluation completed!")

Loading image paths...
Found 1112 images
Extracting features from images...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1us/step
Feature matrix shape: (1112, 25088)
Labels shape: (1112,)
Class distribution: Low Priority     791
High Priority    321
Name: count, dtype: int64
Training set size: 778
Testing set size: 334
Training Random Forest model...
Performance Metrics:
Accuracy: 0.8982
F1-Score (for 'High Priority'): 0.7952

Classification Report:
               precision    recall  f1-score   support

High Priority       0.94      0.69      0.80        96
 Low Priority       0.89      0.98      0.93       238

     accuracy                           0.90       334
    macro avg       0.91      0.84      0.86       334
 weighted avg       0.90      0.90      0.89       334


Feature importance stats:
Mean importance