In [2]:
import os
import re

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
##  CLASSES ONLY

# Root folder that is scanned recursively for slide images
data_dir = r"C:\Users\abbyh\OneDrive\Desktop\MSc Health Data Analytics and Machine Learning\ML\ML Project\archive (3)\BreaKHis_v1\BreaKHis_v1\histology_slides\breast"

# One (filepath, label) record per PNG; the label is "malignant" when that
# word appears anywhere in the containing folder path, otherwise "benign".
metadata = [
    (
        os.path.join(folder, filename),
        "malignant" if "malignant" in folder else "benign",
    )
    for folder, _subfolders, filenames in os.walk(data_dir)
    for filename in filenames
    if filename.endswith(".png")
]

# Tabulate the collected records
df = pd.DataFrame(metadata, columns=["filepath", "label"])

# Sanity check: number of records and a preview of the first rows
print(f"DataFrame shape: {df.shape}")
print(df.head())



DataFrame shape: (7909, 2)
                                            filepath   label
0  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign
1  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign
2  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign
3  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign
4  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign


In [None]:
## CLASSES + MAGNIFICATION

def extract_magnification(root, file):
    """Return the magnification (e.g. "40X") for one image, or None if not found.

    Lookup order:
      1. A folder in `root` named like "40X" / "100X" (digits followed by "X").
      2. A "<2-3 digits>X" token anywhere in the filename.
      3. The magnification field of filenames shaped like
         "SOB_B_A-14-22549AB-40-001.png", i.e. "-<2-3 digits>-<sequence>.png"
         at the end of the name; "X" is appended so the value has the same
         form as the folder-derived one.

    Args:
        root: Directory containing the file (as yielded by os.walk).
        file: Bare filename of the image.
    """
    # Option 1: folder name; first matching path component wins
    for part in os.path.normpath(root).split(os.sep):
        if part.endswith("X") and part[:-1].isdigit():
            return part

    # Option 2: explicit "<digits>X" token in the filename
    match = re.search(r"\d{2,3}X", file)
    if match:
        return match.group(0)

    # Option 3: filenames such as "SOB_B_A-14-22549AB-40-001.png" carry the
    # magnification without an "X", as the second-to-last dash-separated field
    match = re.search(r"-(\d{2,3})-\d+\.png$", file)
    if match:
        return f"{match.group(1)}X"

    return None


# Load metadata
data_dir = r"C:\Users\abbyh\OneDrive\Desktop\MSc Health Data Analytics and Machine Learning\ML\ML Project\archive (3)\BreaKHis_v1\BreaKHis_v1\histology_slides\breast"
metadata = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".png"):
            # Extract label from the folder structure
            label = "malignant" if "malignant" in root else "benign"

            # Extract magnification from the folder name or, failing that, the filename
            magnification = extract_magnification(root, file)

            # Append filepath, label, and magnification to metadata
            metadata.append((os.path.join(root, file), label, magnification))

# Convert to DataFrame
df = pd.DataFrame(metadata, columns=["filepath", "label", "magnification"])

# Debugging: Check the shape and first few rows of the DataFrame
print(f"DataFrame shape: {df.shape}")
print(df.head())

DataFrame shape: (7909, 3)
                                            filepath   label magnification
0  C:\Users\OneDrive\Desktop\MSc Health Dat...  benign          100X
1  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X
2  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X
3  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X
4  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X


In [3]:
## CLASSES + MAGNIFICATION + SUBTYPE

# Folder names that identify a tumor subtype
TUMOR_SUBTYPES = (
    "adenosis", "fibroadenoma", "phyllodes_tumor", "tubular_adenoma",  # Benign subtypes
    "ductal_carcinoma", "lobular_carcinoma", "mucinous_carcinoma", "papillary_carcinoma",  # Malignant subtypes
)


def parse_slide_metadata(root, file):
    """Derive (label, magnification, tumor_subtype) for one image file.

    Args:
        root: Directory containing the file (as yielded by os.walk).
        file: Bare filename of the image.

    Returns:
        A 3-tuple:
          label          "malignant" if that word occurs in `root`, else "benign".
          magnification  e.g. "40X"; taken from a folder named "<digits>X",
                         else from a "<2-3 digits>X" token in the filename,
                         else from filenames shaped like
                         "SOB_B_A-14-22549AB-40-001.png" (field before the
                         sequence number, with "X" appended); None if absent.
          tumor_subtype  first path component listed in TUMOR_SUBTYPES, or None.
    """
    parts = os.path.normpath(root).split(os.sep)

    # Extract label from the folder structure
    label = "malignant" if "malignant" in root else "benign"

    # Option 1: magnification from a folder name such as "40X" or "100X"
    magnification = None
    for part in parts:
        if part.endswith("X") and part[:-1].isdigit():
            magnification = part
            break

    # Option 2: magnification from the filename
    if magnification is None:
        match = re.search(r"\d{2,3}X", file)
        if match:
            magnification = match.group(0)
        else:
            # Filenames such as "SOB_B_A-14-22549AB-40-001.png" carry the
            # magnification without an "X", just before the sequence number
            match = re.search(r"-(\d{2,3})-\d+\.png$", file)
            if match:
                magnification = f"{match.group(1)}X"

    # Extract tumor subtype from the folder structure (first match wins)
    tumor_subtype = None
    for part in parts:
        if part in TUMOR_SUBTYPES:
            tumor_subtype = part
            break

    return label, magnification, tumor_subtype


# Load metadata
data_dir = r"C:\Users\abbyh\OneDrive\Desktop\MSc Health Data Analytics and Machine Learning\ML\ML Project\archive (3)\BreaKHis_v1\BreaKHis_v1\histology_slides\breast"
metadata = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".png"):
            label, magnification, tumor_subtype = parse_slide_metadata(root, file)

            # Append filepath, label, magnification, and tumor subtype to metadata
            metadata.append((os.path.join(root, file), label, magnification, tumor_subtype))

# Convert to DataFrame
df = pd.DataFrame(metadata, columns=["filepath", "label", "magnification", "tumor_subtype"])

# Debugging: Check the shape and first few rows of the DataFrame
print(f"DataFrame shape: {df.shape}")
print(df.head())

DataFrame shape: (7909, 4)
                                            filepath   label magnification  \
0  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X   
1  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X   
2  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X   
3  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X   
4  C:\Users\abbyh\OneDrive\Desktop\MSc Health Dat...  benign          100X   

  tumor_subtype  
0      adenosis  
1      adenosis  
2      adenosis  
3      adenosis  
4      adenosis  


In [None]:
# Keep a 10% random subsample of the metadata rows; the fixed seed makes the
# selection repeatable for a given input frame.
# NOTE: `df` is rebound here, so running this cell again samples 10% of the
# already-reduced frame.
sample_options = {"frac": 0.1, "random_state": 42}
df = df.sample(**sample_options)

In [36]:
# Preprocess images
def preprocess_image(filepath, target_size=(128, 128)):
    """Read an image from disk, resize it, and divide its pixel values by 255.

    Args:
        filepath: Path of the image file, passed to cv2.imread.
        target_size: Size passed to cv2.resize; defaults to (128, 128).

    Returns:
        The resized image divided by 255.0 (normalizes 8-bit pixel values to
        [0, 1]), or None when cv2.imread could not load the file.
    """
    image = cv2.imread(filepath)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; passing that to cv2.resize would raise, so
        # return None and let the caller skip the file.
        return None
    image = cv2.resize(image, target_size)
    return image / 255.0  # Normalize to [0, 1]

# Load and preprocess images
X = []
y = []
# Select the two needed columns by name: unpacking whole rows of df.values
# only works when df has exactly three columns, but the metadata frame may
# also carry magnification and tumor_subtype.
for filepath, label in zip(df["filepath"], df["label"]):
    image = preprocess_image(filepath)
    if image is not None:  # skip entries for which no image was produced
        X.append(image)
        y.append(1 if label == "malignant" else 0)  # Convert labels to binary (0: benign, 1: malignant)

X = np.array(X)
y = np.array(y)
y = to_categorical(y, num_classes=2)  # One-hot encoding

# Split data into train and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 632
Testing samples: 159
