# Breast Cancer Detection based on Histopathology Images

## About the Data:

- **Data Source:** https://www.kaggle.com/code/nasrulhakim86/breast-cancer-histopathology-images-classification/data
- The Breast Cancer Histopathological Image Classification (BreakHis) is composed of 9,109 microscopic images of breast tumor tissue collected from 82 patients.
- The images are collected using different magnifying factors (40X, 100X, 200X, and 400X). 
- To date, it contains 2,480 benign and 5,429 malignant samples (700X460 pixels, 3-channel RGB, 8-bit depth in each channel, PNG format).
- This database has been built in collaboration with the P&D Laboratory – Pathological Anatomy and Cytopathology, Parana, Brazil (http://www.prevencaoediagnose.com.br). 
- Each image filename stores information about the image itself: method of procedure biopsy, tumor class, tumor type, patient identification, and magnification factor. 
- For example, `SOB_B_TA-14-4659-40-001.png` is image 1, at magnification factor 40X, of a benign tumor of type tubular adenoma, originating from slide 14-4659, which was collected by the SOB procedure.

In [None]:
import cv2
import matplotlib.pyplot as plt
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
import shutil


### Prepare directories

In [None]:
# Magnification factor to extract; it also names the output directory.
zoom = 400

# All paths are resolved relative to the directory the notebook runs in.
cwd = Path().absolute()
input_path = cwd.joinpath('breakhis')
output_path = cwd.joinpath(f'breakhis_{zoom}x')

for caption, location in (
    ("Current working directory", cwd),
    ("Input path", input_path),
    ("Output path", output_path),
):
    print(f"{caption}: {location}")


In [None]:
# Start from a clean slate: delete any output directory left by a previous run.
# ignore_errors=True suppresses removal errors, e.g. when the directory does
# not exist yet.
shutil.rmtree(output_path, ignore_errors=True)


In [None]:
# Create the output root followed by one sub-directory per class.
for new_dir in (output_path, output_path / 'benign', output_path / 'malignant'):
    os.makedirs(new_dir)


### Load data

In [None]:
# Load Folds.csv from the dataset root; later cells rely on its "filename"
# and "mag" columns.
input_data_df = pd.read_csv(input_path / 'Folds.csv')


In [None]:
# The CSV's "filename" column is split on "/" further down, i.e. it holds a
# path, so rename it to "path"; a real "filename" column is derived from it
# in a later cell.
input_data_df = input_data_df.rename(columns={"filename": "path"})
# NOTE: a notebook cell only echoes its last expression, so the head(3)
# preview is computed but not displayed; only the row count is shown.
input_data_df.head(3)
len(input_data_df)


In [None]:
# Peek at one raw path to see the "/"-separated layout that the parsing in
# the next cell relies on.
input_data_df['path'][1]


In [None]:
# Derive three columns from the "/"-separated path:
#   filename   - last component
#   label      - component at index 3
#   patient_id - third component counted from the end
path_parts = input_data_df['path'].apply(lambda p: p.split("/"))
input_data_df['filename'] = path_parts.apply(lambda parts: parts[-1])
input_data_df["label"] = path_parts.apply(lambda parts: parts[3])
input_data_df["patient_id"] = path_parts.apply(lambda parts: parts[-3])


In [None]:
# Keep only the rows whose "mag" value equals the selected zoom. Take an
# explicit copy so that the column assignments in later cells operate on an
# independent frame instead of triggering pandas' SettingWithCopyWarning.
input_data_df = input_data_df[input_data_df.mag == zoom].copy()


In [None]:
# NOTE: a notebook cell only echoes its last expression, so the head(3)
# preview is computed but not displayed; only the filtered row count is shown.
input_data_df.head(3)
len(input_data_df)


In [None]:
# Preview the first rows of the filtered frame, including the derived columns.
input_data_df.head()

### Copy data to new directory

- The given data is stored in a deeply nested folder structure.
- The structure is as follows:
    - BreaKHis_v1
        - histology_slides
            - breast
                - **benign**
                    - **SOB**
                        - Type
                            - **patient_id**  
                                - 40x
                                - 100x
                                - 200x
                                - 400x
                - **malignant**
                    - **SOB**
                        - Type
                            - **patient_id**   
                                - 40x
                                - 100x
                                - 200x
                                - 400x
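- To make things simple, the images for the selected magnification are copied, using their exact paths, into a common output folder (`breakhis_<zoom>x`) with one subfolder per class (`benign` and `malignant`).
- Images keep their original filenames, which already encode the tumor class and the slide (patient) identifier.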
                

In [None]:
# Copy every selected image into <output_path>/<label>/, keeping its original
# file name.
for rel_path, label in zip(input_data_df['path'], input_data_df['label']):
    src = input_path / 'BreaKHis_v1' / rel_path
    # src.name is the final path component on every OS. Splitting str(src) on
    # "/" returns the whole path on Windows (separator "\"), which would make
    # the destination resolve to the source file itself.
    dest = output_path / label / src.name
    shutil.copyfile(src, dest)

In [None]:
# Report how many files ended up in each class folder.
for caption, class_dir in (("Benign", 'benign'), ("Malignant", 'malignant')):
    n_files = len(os.listdir(output_path / class_dir))
    print(f"{caption}: {n_files}")


- All the images are now stored in a single output folder (split into `benign` and `malignant` subfolders).

In [None]:
# file_loc: "<label>_<filename>"; class: 0 for 'benign', 1 for any other label.
input_data_df['file_loc'] = input_data_df['label'].str.cat(
    input_data_df['filename'], sep="_")
input_data_df['class'] = input_data_df['label'].ne('benign').astype('int64')


In [None]:
# Preview the frame with the newly added "file_loc" and "class" columns.
input_data_df.head(3)

In [None]:
# Bar chart of the number of samples per label at the selected magnification.
plt.figure(figsize=(10, 6))
sns.set(font_scale=1.5)
sns.set_style("darkgrid")
sns.countplot(x=input_data_df[input_data_df.mag == zoom]['label'])
plt.xlabel("Label")
plt.ylabel("Count")
# Title previously read "'benign' i 'malignant'"; use "and" so it matches the
# second count plot further down.
plt.title(f"Count of 'benign' and 'malignant' samples (zoom {zoom}x)")


- The data is highly imbalanced, as is often the case with real-world data.
- Medical datasets are usually imbalanced by nature.

In [None]:
# One sub-frame per class, selected by the string label.
benign_df, malignant_df = (
    input_data_df[input_data_df['label'] == class_name]
    for class_name in ('benign', 'malignant')
)


### Benign Samples

In [None]:
# Show up to 40 benign samples in a 4 x 10 grid (fewer if the class has fewer
# than 40 images, instead of failing with an out-of-bounds index).
plt.figure(figsize=(30, 10))
for i, (label, filename) in enumerate(
        zip(benign_df['label'].iloc[:40], benign_df['filename'].iloc[:40])):
    plt.subplot(4, 10, i + 1)
    img = cv2.imread(os.path.join(output_path, label, filename), 1)
    # OpenCV decodes images in BGR channel order while matplotlib expects RGB;
    # without this conversion the red and blue channels are displayed swapped.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)


### Malignant Samples

In [None]:
# Show up to 40 malignant samples in a 4 x 10 grid (fewer if the class has
# fewer than 40 images, instead of failing with an out-of-bounds index).
plt.figure(figsize=(30, 10))
for i, (label, filename) in enumerate(
        zip(malignant_df['label'].iloc[:40], malignant_df['filename'].iloc[:40])):
    plt.subplot(4, 10, i + 1)
    img = cv2.imread(os.path.join(output_path, label, filename), 1)
    # OpenCV decodes images in BGR channel order while matplotlib expects RGB;
    # without this conversion the red and blue channels are displayed swapped.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)


**Findings:**

- In the images above, there is little to no visible difference between malignant and benign samples.
- This is likely because we are not pathologists, which is exactly the motivation for an automated detection system.
- Such a system can assist with diagnosis when no pathologist is available.

In [None]:
# Build a dataframe indexed by file location, with the numeric label, the
# string label and the patient id as columns.
#
# os.listdir returns entries in arbitrary order, so the listings are sorted:
# the row order feeds the seeded shuffle of the splitter further down and has
# to be stable for the generated folds to be reproducible across runs and
# machines.
benign_files = [f"breakhis_{zoom}x/benign/" + file_name
                for file_name in sorted(os.listdir(output_path / 'benign'))]
malignant_files = [f"breakhis_{zoom}x/malignant/" + file_name
                   for file_name in sorted(os.listdir(output_path / 'malignant'))]

df = pd.DataFrame({'file_loc': benign_files + malignant_files})

# The second "_"-separated token of the file name is compared against 'B'
# (benign); every other value is treated as malignant.
df['label'] = df['file_loc'].apply(
    lambda x: 0 if x.split('/')[-1].split("_")[1] == 'B' else 1)
df['label_str'] = df['file_loc'].apply(
    lambda x: "benign" if x.split('/')[-1].split("_")[1] == 'B' else "malignant")

# Patient id = the file name up to (not including) its third "-".
df['patient_id'] = df['file_loc'].apply(
    lambda x: "-".join(x.split("-")[:3]).split("/")[-1])
df.set_index("file_loc", inplace=True)

df.head(5)

### Divide images into 224x224 patches

In [None]:
import os
from PIL import Image
import pandas as pd

def update_dataframe_with_patches(df, patch_size=224, stride=224):
    """
    Cut every image referenced by the dataframe into square patches.

    Each image (located through the dataframe index) is cut into
    ``patch_size`` x ``patch_size`` patches on a grid with step ``stride``.
    Only patches that lie completely inside the image are produced. Every
    patch is saved next to its source image as ``<source stem>_<i>_<j>.png``
    (``i`` = grid column, ``j`` = grid row). Once at least one patch has been
    written, the source image is deleted from disk; images too small to yield
    a patch are left untouched and do not appear in the result.

    :param df: Original dataframe, indexed by image path. It is not modified.
    :param patch_size: Size of the patches. Default is 224.
    :param stride: Number of pixels to move along the image after each patch. Default is 224.
    :return: New dataframe indexed by patch path, with the same columns as
        ``df``; each patch row carries the values of its source image's row.
    """
    patch_names = []
    patch_rows = []

    for idx, row in df.iterrows():
        # idx is the file_loc, which is the index of the dataframe
        base_filename = os.path.splitext(idx)[0]
        patches_written = 0

        # The context manager closes the file handle before the source image
        # is removed below (deleting an open file fails on some platforms).
        with Image.open(idx) as img:
            width, height = img.size

            # Number of grid positions at which a full patch still fits.
            # (width // stride would run past the image edge whenever
            # stride < patch_size, producing padded patches.)
            num_patches_x = max(0, (width - patch_size) // stride + 1)
            num_patches_y = max(0, (height - patch_size) // stride + 1)

            for i in range(num_patches_x):
                for j in range(num_patches_y):
                    left = i * stride
                    upper = j * stride
                    patch = img.crop(
                        (left, upper, left + patch_size, upper + patch_size))

                    new_filename = f"{base_filename}_{i}_{j}.png"
                    patch.save(new_filename)

                    patch_names.append(new_filename)
                    patch_rows.append(row.values)
                    patches_written += 1

        # Never delete an image that produced no patch: it would be lost.
        if patches_written:
            os.remove(idx)

    if not patch_rows:
        # Nothing to concatenate: return an empty frame with the same columns.
        return df.iloc[0:0].copy()

    # Build the result in one go instead of concatenating one frame per patch.
    return pd.DataFrame(patch_rows, columns=df.columns, index=patch_names)

# Optional step:
#df = update_dataframe_with_patches(df)

In [None]:
# (Re)label the index as "file_loc": the frame returned by the optional
# patching step above is built with an unnamed index.
df.index.name = "file_loc"
df.head()

In [None]:
# Bar chart of the class distribution in the final dataframe.
plt.figure(figsize=(10, 6))
sns.set(font_scale=1.2)
sns.set_style("darkgrid")
ax = sns.countplot(x=df['label_str'])
ax.set_xlabel("Class")
ax.set_ylabel("Count")
ax.set_title(f"Count of 'benign' and 'malignant' samples (zoom {zoom}x)")

In [None]:
import numpy as np
from collections import Counter
from sklearn.utils import shuffle


def map_nested_indices(nested_indices, original_indices):
    """
    Translate positions inside a subset into the subset's own stored values.

    :param nested_indices: Positions into ``original_indices``.
    :param original_indices: Indexable object (e.g. a numpy array) whose
        elements are selected by ``nested_indices``.
    :return: ``original_indices`` indexed by ``nested_indices``.
    """
    mapped = original_indices[nested_indices]
    return mapped


class StratifiedGroupKFold:
    """
    K-fold splitter that keeps groups intact and roughly preserves class ratios.

    The (shuffled) samples are dealt into ``n_splits`` buckets, one whole
    group at a time. Every bucket gets a per-class budget proportional to the
    overall class frequency; a group is put into the current bucket as long as
    the budget of its label is still positive. Each fold uses one bucket as
    the test set and the remaining buckets as the training set.

    Notes:
        * A group's label is taken from the first of its samples met in the
          shuffled order; all samples of the group follow that label.
        * Groups that are reached only after every bucket's budget for their
          label is used up are not assigned to any bucket, so they appear in
          no fold.
    """

    def __init__(self, n_splits=3, random_state=None):
        """
        :param n_splits: Number of buckets / folds.
        :param random_state: Seed forwarded to the shuffle in ``split``.
        """
        self.n_splits = n_splits
        self.random_state = random_state
        # Groups already assigned to a bucket during the current split.
        self.used_group_ids = []

    def _fill_bucket(self, bucket, class_counts, group_ids, y):
        """
        Move not-yet-used groups into ``bucket`` while class budget remains.

        :param bucket: Dict label -> list of sample positions; extended in place.
        :param class_counts: Dict label -> remaining budget; decremented in place.
        :param group_ids: Array of group ids, one per sample.
        :param y: Array of labels, aligned with ``group_ids``.
        """
        for group_id, label in zip(group_ids, y):
            if group_id in self.used_group_ids:
                continue
            if class_counts[label] > 0:
                # Take the whole group at once so it is never split up.
                group_indices = np.where(group_ids == group_id)[0]
                bucket[label].extend(group_indices)
                class_counts[label] -= len(group_indices)
                self.used_group_ids.append(group_id)

    def _create_buckets(self, group_ids, y, class_ratios):
        """
        Distribute the samples over ``n_splits`` buckets.

        :param group_ids: Group ids, one per sample.
        :param y: Labels, aligned with ``group_ids``.
        :param class_ratios: Dict label -> fraction of samples with that label.
        :return: List of ``n_splits`` dicts label -> list of sample positions.
        """
        # Start every distribution from scratch. Without this reset a second
        # split() on the same instance would see all groups as already used
        # and produce empty buckets.
        self.used_group_ids = []

        group_ids = np.asarray(group_ids)
        y = np.asarray(y)

        total_samples = len(group_ids)
        samples_per_split = total_samples // self.n_splits

        buckets = []
        for _ in range(self.n_splits):
            bucket = {label: [] for label in class_ratios.keys()}
            class_counts = {label: int(samples_per_split * ratio)
                            for label, ratio in class_ratios.items()}
            self._fill_bucket(bucket, class_counts, group_ids, y)
            buckets.append(bucket)

        return buckets

    def _rotate_buckets(self, buckets):
        """Return ``buckets`` rotated right by one (the last one comes first)."""
        return buckets[-1:] + buckets[:-1]

    @staticmethod
    def _collect_indices(buckets):
        """
        Flatten the per-label position lists of ``buckets`` into one array.

        The result always has an integer dtype, even when every list is
        empty, so that it can be used directly as an index array.
        """
        parts = [np.asarray(bucket[label], dtype=np.intp)
                 for bucket in buckets for label in bucket]
        if not parts:
            return np.empty(0, dtype=np.intp)
        return np.concatenate(parts)

    def _get_indices(self, bucket, group_ids, y):
        # NOTE(review): not called anywhere in this class. It treats the
        # bucket values as group ids, whereas _fill_bucket stores sample
        # positions there - confirm intent before using it.
        indices = []
        for label, groups in bucket.items():
            for group in groups:
                group_indices = np.where(group_ids == group)[0]
                label_indices = np.where(y == label)[0]
                indices.extend(np.intersect1d(group_indices, label_indices))
        return np.array(indices)

    def split(self, X, y, group_ids):
        """
        Generate ``n_splits`` (train, test) pairs of sample positions.

        :param X: Unused; accepted so the call looks like other splitters.
        :param y: Labels, one per sample.
        :param group_ids: Group ids, aligned with ``y``.
        :return: Generator of ``(train_indices, test_indices)`` numpy arrays
            holding positions into the inputs as they were passed in.
        """
        # Work on plain arrays so every integer index below is positional,
        # whatever index a pandas input carries.
        y = np.asarray(y)
        group_ids = np.asarray(group_ids)

        index_map = np.arange(len(y))
        group_ids_s, y_s, index_map = shuffle(
            group_ids, y, index_map, random_state=self.random_state)

        class_ratios = {label: count / len(y)
                        for label, count in Counter(y).items()}
        buckets = self._create_buckets(group_ids_s, y_s, class_ratios)

        for _ in range(self.n_splits):
            train_indices = self._collect_indices(buckets[1:])
            test_indices = self._collect_indices(buckets[:1])

            # Map shuffled indices to original ones
            train_indices = index_map[train_indices]
            test_indices = index_map[test_indices]

            # Invariant: no group may appear on both sides of a fold.
            assert len(np.intersect1d(
                np.unique(group_ids[train_indices]), np.unique(group_ids[test_indices]))) == 0

            yield train_indices, test_indices
            buckets = self._rotate_buckets(buckets)


##### Split to train&val + test subdatasets (test 20%)

In [None]:
# The splitter ignores its first argument, but it should still hold the sample
# identifiers (the file locations in the index), not a second copy of the
# labels.
files = df.index.to_series()
labels = df['label']
patient_ids = df['patient_id']

# The first of 5 folds gives a ~20% test set; the rest is train & validation.
sgfk = StratifiedGroupKFold(n_splits=5, random_state=42)
train_val_index, test_index = next(
    sgfk.split(files, labels, patient_ids))

# Positional subsets used as input for the second (train/val) split.
train_val_files = files.iloc[train_val_index]
train_val_labels = labels.iloc[train_val_index]
train_val_patient_ids = patient_ids.iloc[train_val_index]


##### Now split remaining train&val to separate train and val subdatasets (val 20%)

In [None]:
n_splits = 5
# Pass n_splits through instead of repeating the literal, so changing the
# variable above actually changes the number of folds.
sgfk = StratifiedGroupKFold(n_splits=n_splits, random_state=42)

# `folds` is a lazy generator. The indices it yields are positions within the
# train&val subset passed in here, not positions in `df`.
folds = sgfk.split(train_val_files, train_val_labels, train_val_patient_ids)


In [None]:
# Preview the dataset frame before the splits are written to disk.
df.head()

In [None]:
# Persist the held-out test rows next to the copied images.
test_csv_path = os.path.join(output_path, "test.csv")
test_rows = df.iloc[test_index]
test_rows.to_csv(test_csv_path)


In [None]:
def intersection(lst1, lst2):
    """Return the elements present in both iterables (order not preserved)."""
    return list(set(lst1) & set(lst2))


def _print_class_balance(name, subset):
    """Print the benign (0) / malignant (1) shares and counts of ``subset``."""
    b_len = int((subset.label == 0).sum())
    m_len = int((subset.label == 1).sum())
    bp = b_len / len(subset)
    mp = m_len / len(subset)
    print(f"{name} - percent of B vs M samples: {bp:.2f} - {mp:.2f} / count: {b_len} : {m_len}")


print(f"Zoom: {zoom}x")
_print_class_balance("All", df)

test_patients = df.iloc[test_index].patient_id.unique()

for idx, (train_index, val_index) in enumerate(folds):
    print("=============================================")
    print(f"Saving fold {idx}")

    # The folds were generated on the train&val subset, so the yielded values
    # are positions inside that subset. Map them back to row positions in `df`
    # before using df.iloc; otherwise unrelated rows (including rows of the
    # test set) would end up in the train/val files.
    train_index = map_nested_indices(train_index, train_val_index)
    val_index = map_nested_indices(val_index, train_val_index)

    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Guard against patient-level leakage between any two of the subsets.
    train_patients = train_df.patient_id.unique()
    val_patients = val_df.patient_id.unique()
    for pair_name, left, right in (
        ("train/val", train_patients, val_patients),
        ("train/test", train_patients, test_patients),
        ("val/test", val_patients, test_patients),
    ):
        shared = intersection(left, right)
        if shared:
            raise ValueError(
                f"Fold {idx}: patients shared between {pair_name}: {sorted(shared)}")

    _print_class_balance("Train", train_df)
    _print_class_balance("Val", val_df)

    train_df.to_csv(os.path.join(
        output_path, f"train_{idx}.csv"))
    val_df.to_csv(os.path.join(
        output_path, f"val_{idx}.csv"))
