In [1]:
# Imports and GPU check
%matplotlib inline
import os
import random
import time
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import torch.nn.init as init
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader, Subset
from torchvision import models
from tqdm import tqdm
from itertools import product

# Fix HTTPS certificate error
import ssl
ssl_create_default_https_context = ssl._create_unverified_context
from CustomImageDataset import CustomImageDataset


# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [2]:
def set_seed(seed=42):
    """Sets the seed for reproducibility."""
    # Python RNG
    random.seed(seed)
    
    # PyTorch RNGs
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
    # Numpy RNG
    np.random.seed(seed)
    
    # OS RNG
    os.environ['PYTHONHASHSEED'] = str(seed)

def worker_init_fn(worker_id):    
    """Ensure that the data loading process is deterministic."""
    np.random.seed(np.random.get_state()[1][0] + worker_id)
    
set_seed(42)  

In [3]:
base_path = "/Users/orcunkarabicak/Documents/JADS/Deep Learning/Project/input/IDC_regular_ps50_idx5/"

In [4]:
folders = [folder for folder in os.listdir(base_path) if not folder.startswith(".")]
print(len(folders))
# 279 patients. MacOS creates a hidden folder .DS_Store breaks the flow.

279


In [5]:
total_images = 0
for n in range(len(folders)):
    patient_id = folders[n]
    for c in [0, 1]:
        patient_path = base_path + patient_id 
        class_path = patient_path + "/" + str(c) + "/"
        subfiles = os.listdir(class_path)
        total_images += len(subfiles)


data = pd.DataFrame(index=np.arange(0, total_images), columns=["patient_id", "path", "target"])

k = 0
for n in range(len(folders)):
    patient_id = folders[n]
    patient_path = base_path + patient_id 
    for c in [0, 1]:
        class_path = patient_path + "/" + str(c) + "/"
        subfiles = os.listdir(class_path)
        for m in range(len(subfiles)):
            image_path = subfiles[m]
            data.loc[k, "path"] = class_path + image_path
            data.loc[k, "target"] = int(c)
            data.loc[k, "patient_id"] = patient_id
            k += 1  




In [6]:
# Custom dataset class

class BreastCancerDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        """
        Args:
            dataframe (Pandas DataFrame): DataFrame containing image paths and labels.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 1]  # Access image path
        image = Image.open(img_path).convert('RGB')  # Load image and ensure RGB
        label = self.dataframe.iloc[idx, 2]  # Access the label

        if self.transform:
            image = self.transform(image)

        return image, label

In [7]:
def calculate_mean_and_variance(loader):
    mean = 0.0
    variance = 0.0
    total_images = 0

    for images, _ in loader:
        # Rearrange batch to be the shape of [B, C, W * H]
        images = images.view(images.size(0), images.size(1), -1)
        # Update total_images
        total_images += images.size(0)
        # Compute mean and variance here
        mean += images.mean(2).sum(0) 
        variance += images.var(2).sum(0)

    # Final mean and variance
    mean /= total_images
    variance /= total_images


    return mean, variance

Our dataset is imbalanced. We will first split our dataset into train and test preserving the class rations. Afterwards, we will use StratifiedKFold in our train dataset.

In [8]:

# Transform and resize the data
base_transform = transforms.Compose([
    transforms.Resize((50, 50)),  # Resize to 50x50
    transforms.ToTensor()  # Convert to tensor
])

In [9]:
# Hyperparameters
batch_size = 16
subset_ratio = 0.03  # Reduced dataset size for quicker training

epochs = 20
learning_rate = 0.001

In [10]:
# Data Pre-Processing
# Assuming "target" is the column containing class labels
le = LabelEncoder()
data["target"] = le.fit_transform(data["target"])
# Create custom datasets for training, validation and testing
full_train_df, test_df = train_test_split(data, test_size=0.2, random_state=42, stratify=data["target"])
test_dataset = BreastCancerDataset(test_df, transform=base_transform) # Test dataset is being held out to check generalizability of the final model 

print(f'Train dataset size: {full_train_df.shape}\n Test dataset size: {test_df.shape}')

Train dataset size: (222019, 3)
 Test dataset size: (55505, 3)


In [11]:
# Perform train-test split to select a subset while preserving class distribution
limited_train_df, _ = train_test_split(full_train_df, train_size=subset_ratio, random_state=42, stratify=full_train_df["target"])

# Check the size of the generated subset
print("Size of Subset:", len(limited_train_df))


Size of Subset: 6660


In [12]:
# Create a custom DataLoader for the training fold
train_dataset = BreastCancerDataset(limited_train_df, transform=base_transform)  # Base transform
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=worker_init_fn)

# Calculate fold-specific mean and standard deviation
mean, variance = calculate_mean_and_variance(train_loader)
std = np.sqrt(variance)
print(f"Mean: {mean}")
print(f"Standard Deviation: {std}")


# Define number of folds
k_folds = 5  # Adjust this value based on your needs

# Create stratified k-fold object
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)


for fold, (train_index, val_index) in enumerate(skf.split(X=limited_train_df, y=limited_train_df["target"])):
    # Create training and testing DataFrames based on the indices
    train_df = limited_train_df.iloc[train_index]
    val_df = limited_train_df.iloc[val_index]
    
    # Update transform with fold-specific mean and standard deviation
    fold_transform = transforms.Compose([
      transforms.Resize((50, 50)),  # Resize to 50x50
      transforms.ToTensor(),  # Convert to tensor
      transforms.Normalize(mean=mean, std=std)  # Fold-specific normalization
    ])

    
    train_dataset = BreastCancerDataset(train_df, transform=fold_transform)  # Updated fold transform
    val_dataset = BreastCancerDataset(val_df, transform=base_transform) # Using base transform
    
    # Create data loaders for training and testing
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=worker_init_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=worker_init_fn)
      
print(f"Full train set size: {len(full_train_df)} - Reduced train set size: {len(train_dataset)} - Validation set size: {len(val_dataset)}")

Mean: tensor([0.8128, 0.6357, 0.7344])
Standard Deviation: tensor([0.1022, 0.1559, 0.1168])
Full train set size: 222019 - Reduced train set size: 5328 - Validation set size: 1332
