In [1]:
import cv2
import matplotlib.pyplot as plt
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
import shutil
from PIL import Image

### Prepare directories

In [8]:
preprocessing_scenarios = {
    "none": None,
    "patches": None,
    "patches_with_random_stride": None,
    "patches_with_random_transformations": None,
    "patches_with_filtered_out_background": None,
}

chosen_scenario = "patches"
zoom = 400

cwd = Path().absolute()
main_dir = Path().absolute().parent.parent
input_path = main_dir / 'breakhis'
output_path = main_dir / 'data' / f'{zoom}x'
output_path_original = output_path / 'original'

print(f"Current working directory: {cwd}")
print(f"Main directory: {main_dir}")
print(f"Input path: {input_path}")
print(f"Output path: {output_path}")
print(f"Output path original: {output_path_original}")



Current working directory: /home/miki/repos/uz/breakhis/vcs/scripts/common
Main directory: /home/miki/repos/uz/breakhis/vcs
Input path: /home/miki/repos/uz/breakhis/vcs/breakhis
Output path: /home/miki/repos/uz/breakhis/vcs/data/400x
Output path original: /home/miki/repos/uz/breakhis/vcs/data/400x/original


In [19]:
shutil.rmtree(output_path, ignore_errors=True)


In [20]:
os.makedirs(output_path_original)
os.makedirs(output_path_original / 'benign')
os.makedirs(output_path_original / 'malignant')


### Load data

In [39]:
input_data_df = pd.read_csv(input_path / 'Folds.csv').rename(columns={"filename": "path"})

input_data_df.head()

Unnamed: 0,fold,mag,grp,path
0,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...
1,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...
2,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...
3,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...
4,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...


In [40]:
input_data_df['path'][1]

'BreaKHis_v1/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-002.png'

In [41]:
input_data_df['filename'] = input_data_df['path'].apply(
    lambda x: x.split("/")[-1])
input_data_df["label"] = input_data_df['path'].apply(lambda x: x.split("/")[3])
input_data_df["patient_id"] = input_data_df['path'].apply(
    lambda x: x.split("/")[-3])

input_data_df.head()

Unnamed: 0,fold,mag,grp,path,filename,label,patient_id
0,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-100-001.png,benign,SOB_B_A_14-22549AB
1,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-100-002.png,benign,SOB_B_A_14-22549AB
2,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-100-003.png,benign,SOB_B_A_14-22549AB
3,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-100-004.png,benign,SOB_B_A_14-22549AB
4,1,100,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-100-005.png,benign,SOB_B_A_14-22549AB


In [42]:
input_data_df = input_data_df[input_data_df.mag == zoom]

### Copy data to new directory
                

In [25]:
for i in range(len(input_data_df)):
    src = input_path / 'BreaKHis_v1' / input_data_df['path'].iloc[i]
    dest = output_path_original / \
        input_data_df["label"].iloc[i] / str(src).split("/")[-1]
    shutil.copyfile(src, dest)

In [43]:
print(f"Benign: {len(os.listdir(output_path_original / 'benign'))}")
print(f"Malignant: {len(os.listdir(output_path_original / 'malignant'))}")


Benign: 588
Malignant: 1232


- All the images are now stores in single folder.

In [44]:
input_data_df['file_loc'] = input_data_df['label'] + \
    "_" + input_data_df['filename']
input_data_df['class'] = input_data_df['label'].apply(
    lambda x: 0 if x == 'benign' else 1)


In [45]:
input_data_df.head()

Unnamed: 0,fold,mag,grp,path,filename,label,patient_id,file_loc,class
91,1,400,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-400-001.png,benign,SOB_B_A_14-22549AB,benign_SOB_B_A-14-22549AB-400-001.png,0
92,1,400,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-400-002.png,benign,SOB_B_A_14-22549AB,benign_SOB_B_A-14-22549AB-400-002.png,0
93,1,400,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-400-003.png,benign,SOB_B_A_14-22549AB,benign_SOB_B_A-14-22549AB-400-003.png,0
94,1,400,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-400-004.png,benign,SOB_B_A_14-22549AB,benign_SOB_B_A-14-22549AB-400-004.png,0
95,1,400,train,BreaKHis_v1/histology_slides/breast/benign/SOB...,SOB_B_A-14-22549AB-400-005.png,benign,SOB_B_A_14-22549AB,benign_SOB_B_A-14-22549AB-400-005.png,0


In [46]:
benign_df = input_data_df[input_data_df['label'] == 'benign']
malignant_df = input_data_df[input_data_df['label'] == 'malignant']

### Benign Samples

In [None]:
# plt.figure(figsize=(30, 10))
# for i in range(0, 40):
#     plt.subplot(4, 10, i+1)
#     img = cv2.imread(os.path.join(
#         output_path, benign_df['label'].iloc[i], benign_df['filename'].iloc[i]), 1)
#     plt.imshow(img)


### Malignant Samples

In [None]:
# plt.figure(figsize=(30, 10))
# for i in range(0, 40):
#     plt.subplot(4, 10, i+1)
#     img = cv2.imread(os.path.join(
#         output_path, malignant_df['label'].iloc[i], malignant_df['filename'].iloc[i]), 1)
#     plt.imshow(img)


**Findings:**

- From the above images there is very little to no difference between malignant and benign samples.
- This might be because we are not the pathologists, That's the original purpose of the detection system.
- Thus it makes it easy in the absence of actual pathologists.

In [47]:
# Creating a new data frame with the file loc as its index, label and class of the patients as its columns.
benign_files = os.listdir(output_path_original / 'benign')
benign_files = [f"data/{zoom}x/original/benign/" +
                file_name for file_name in benign_files]
malignant_files = os.listdir(output_path_original / 'malignant')
malignant_files = [f"data/{zoom}x/original/malignant/" +
                   file_name for file_name in malignant_files]

original_df = pd.DataFrame(benign_files + malignant_files).rename(columns={0: 'file_loc'})
original_df.head()

Unnamed: 0,file_loc
0,data/400x/original/benign/SOB_B_A-14-22549G-40...
1,data/400x/original/benign/SOB_B_TA-14-3411F-40...
2,data/400x/original/benign/SOB_B_F-14-14134E-40...
3,data/400x/original/benign/SOB_B_TA-14-3411F-40...
4,data/400x/original/benign/SOB_B_TA-14-19854C-4...


In [48]:

original_df['label'] = original_df['file_loc'].apply(
    lambda x: 0 if x.split('/')[-1].split("_")[1] == 'B' else 1)
original_df['label_str'] = original_df['file_loc'].apply(
    lambda x: "benign" if x.split('/')[-1].split("_")[1] == 'B' else "malignant")

original_df['patient_id'] = original_df['file_loc'].apply(
    lambda x: "-".join(x.split("-")[:3]).split("/")[-1])
original_df.set_index("file_loc", inplace=True)

original_df.head()

Unnamed: 0_level_0,label,label_str,patient_id
file_loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
data/400x/original/benign/SOB_B_A-14-22549G-400-011.png,0,benign,SOB_B_A-14-22549G
data/400x/original/benign/SOB_B_TA-14-3411F-400-012.png,0,benign,SOB_B_TA-14-3411F
data/400x/original/benign/SOB_B_F-14-14134E-400-003.png,0,benign,SOB_B_F-14-14134E
data/400x/original/benign/SOB_B_TA-14-3411F-400-017.png,0,benign,SOB_B_TA-14-3411F
data/400x/original/benign/SOB_B_TA-14-19854C-400-012.png,0,benign,SOB_B_TA-14-19854C


### Apply data-preparation scenarios

#### Divide images into 224x224 patches

In [122]:
def patches(original_df):
    """
    Update the dataframe to include patches instead of original images.

    :param original_df: Original dataframe.
    :return: Updated dataframe.
    """

    patch_size=224
    stride=224

    # Create a list to store the dataframes for the patches
    patch_dfs = []

    # Loop over each row in the dataframe
    for idx, row in original_df.iterrows():
        # Load the image to get its size
        # img = Image.open(idx)  # idx is the file_loc, which is the index of the dataframe
        img = Image.open(str(main_dir / idx))  # idx is the file_loc, which is the index of the dataframe

        # Get the size of the image
        width, height = img.size

        # Calculate the number of patches in x and y direction
        num_patches_x = width // stride
        num_patches_y = height // stride

        # Loop over the patches
        for i in range(num_patches_x):
            for j in range(num_patches_y):

                # Calculate the coordinates of the patch
                left = i * stride
                upper = j * stride
                right = left + patch_size
                lower = upper + patch_size

                # Extract the patch from the image
                patch = img.crop((left, upper, right, lower))

                # Update the filename and path
                base_filename = os.path.splitext(idx)[0]
                new_filename = f"{base_filename}_{i}_{j}.png".replace("original", "patches")

                # Save the patch
                patch.save(str(main_dir / new_filename))

                # Create a new dataframe for the patch, copying all the information from the original image
                new_df = pd.DataFrame([row.values], columns=row.index, index=[new_filename])

                # Add the new dataframe to the list of patch dataframes
                patch_dfs.append(new_df)
        
    # Concatenate all the patch dataframes
    df = pd.concat(patch_dfs)
    df.index.name = 'file_loc'
    return df


In [123]:
preprocessing_scenarios["patches"] = patches

datasets = {"original": {"df": original_df, "path": output_path_original}}
for scenario_name, scenario_func in preprocessing_scenarios.items():
    if scenario_func is not None:
        shutil.rmtree(output_path / scenario_name, ignore_errors=True)
        os.makedirs(output_path / scenario_name)
        os.makedirs(output_path / scenario_name / 'benign')
        os.makedirs(output_path / scenario_name / 'malignant')
        datasets[scenario_name] = {"df": scenario_func(original_df), "path": output_path / scenario_name}

### Divide dataset into folds

In [127]:
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

def map_nested_indices(nested_indices, original_indices):
    return original_indices[nested_indices]


class StratifiedGroupKFold:
    def __init__(self, n_splits=3, random_state=None):
        self.n_splits = n_splits
        self.random_state = random_state
        self.used_group_ids = []

    def _fill_bucket(self, bucket, class_counts, group_ids, y):
        for group_id, label in zip(group_ids, y):
            if group_id in self.used_group_ids:
                continue
            if class_counts[label] > 0:
                group_indices = np.where(group_ids == group_id)[0]
                bucket[label].extend(group_indices)
                class_counts[label] -= len(group_indices)
                self.used_group_ids.append(group_id)

    def _create_buckets(self, group_ids, y, class_ratios):
        total_samples = len(group_ids)
        samples_per_split = total_samples // self.n_splits

        buckets = []
        for _ in range(self.n_splits):
            bucket = {label: [] for label in class_ratios.keys()}
            class_counts = {label: int(samples_per_split * ratio)
                            for label, ratio in class_ratios.items()}
            self._fill_bucket(bucket, class_counts, group_ids, y)
            buckets.append(bucket)

        return buckets

    def _rotate_buckets(self, buckets):
        return buckets[-1:] + buckets[:-1]

    def _get_indices(self, bucket, group_ids, y):
        indices = []
        for label, groups in bucket.items():
            for group in groups:
                group_indices = np.where(group_ids == group)[0]
                label_indices = np.where(y == label)[0]
                indices.extend(np.intersect1d(group_indices, label_indices))
        return np.array(indices)

    def split(self, X, y, group_ids):
        index_map = np.arange(len(y))
        group_ids_s, y_s, index_map = shuffle(
            group_ids, y, index_map, random_state=self.random_state)

        class_ratios = {label: count / len(y)
                        for label, count in Counter(y).items()}
        buckets = self._create_buckets(group_ids_s, y_s, class_ratios)

        for _ in range(self.n_splits):
            train_buckets = buckets[1:]
            test_bucket = buckets[0]

            train_indices = np.concatenate(
                [np.array(bucket[label]) for bucket in train_buckets for label in bucket])
            test_indices = np.concatenate(
                [np.array(test_bucket[label]) for label in test_bucket])

            train_indices = index_map[train_indices]
            test_indices = index_map[test_indices]
            
            assert len(np.intersect1d(
                np.unique(group_ids.iloc[train_indices]), np.unique(group_ids.iloc[test_indices]))) == 0

            yield train_indices, test_indices
            buckets = self._rotate_buckets(buckets)


##### Split to train&val + test subdatasets (test 20%)

In [None]:
import json


def create_folds(df, output_path, n_splits=5):
    files = df['label']
    labels = df['label']
    patient_ids = df['patient_id']

    # Divide whole dataset into train+val & test subsets ~80%-20% (stratified by label and patient_id)
    sgfk = StratifiedGroupKFold(n_splits=n_splits, random_state=42)
    train_val_index, test_index = next(sgfk.split(files, labels, patient_ids))

    df.iloc[test_index].to_csv(os.path.join(output_path, "test.csv"))

    train_val_files = df['label'].iloc[train_val_index]
    train_val_labels = df['label'].iloc[train_val_index]
    train_val_patient_ids = df['patient_id'].iloc[train_val_index]

    # Divide train+val into train & val subsets ~80%-20% (stratified by label and patient_id)
    sgfk = StratifiedGroupKFold(n_splits=n_splits, random_state=42)

    folds = sgfk.split(train_val_files, train_val_labels, train_val_patient_ids)
    
    print(f"Zoom: {zoom}x")
    print(f"output path: {output_path}\n")

    classes_balance_metadata = {}
    def check_balance(df, index, name):
        m_len = len(df.iloc[index][df.iloc[index].label == 1])
        b_len = len(df.iloc[index][df.iloc[index].label == 0])
        mp = len(df.iloc[index][df.iloc[index].label == 1]) / len(df.iloc[index])
        bp = len(df.iloc[index][df.iloc[index].label == 0]) / len(df.iloc[index])
        print(f"{name} - percent of B vs M samples: {bp:.2f} - {mp:.2f} / count: {b_len} : {m_len}")
        return {"percent_benign": bp, "percent_malignant": mp, "count_benign": b_len, "count_malignant": m_len}


    classes_balance_metadata["all"] = check_balance(df, range(len(df)), "All")
    classes_balance_metadata["test"] = check_balance(df, test_index, "Test")

    test_patient_ids = df.iloc[test_index].patient_id.unique()

    for idx, (train_index, val_index) in enumerate(folds):
        print("=============================================")
        print(f"Saving fold {idx}")
        train_index = map_nested_indices(train_index, train_val_index)
        val_index = map_nested_indices(val_index, train_val_index)

        # Check if there is no intersection between train/val/test datasets in terms of patient IDs
        train_patient_ids = df.iloc[train_index].patient_id.unique()
        val_patient_ids = df.iloc[val_index].patient_id.unique()
        assert intersection(train_patient_ids, val_patient_ids) == [] 
        assert intersection(train_patient_ids, test_patient_ids) == []
        assert intersection(val_patient_ids, test_patient_ids) == []
        
        classes_balance_metadata[f"train_fold_{idx}"] = check_balance(df, train_index, "Train")
        classes_balance_metadata[f"val_fold_{idx}"] = check_balance(df, val_index, "Val")

        df.iloc[train_index].to_csv(os.path.join(output_path, f"train_{idx}.csv"))
        df.iloc[val_index].to_csv(os.path.join(output_path, f"val_{idx}.csv"))

        with open(os.path.join(output_path, "classes_balance_metadata.json"), 'w') as metafile:
            json.dump(classes_balance_metadata, metafile, indent=4)

In [None]:
for dataset_name, dataset_content in datasets.items():
    print(f"Creating folds for {dataset_name}")
    create_folds(dataset_content["df"], dataset_content["path"])