In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# CQ500Dataset (torch.utils.data.Dataset) class
import cv2
import numpy as np
from pandas import DataFrame
import torch
import pydicom
import pydicom.dataset
from pydicom.pixel_data_handlers.util import _apply_modality_lut


# Class
class CQ500Dataset(torch.utils.data.Dataset):
    """
    Study-level PyTorch dataset joining the CQ500 manifest with the label table.

    One item corresponds to one study (not one slice): every DICOM file listed
    for the study's selected series is read, converted with
    `to_windowed_tensor`, and stacked along a new leading slice axis.

    Each item is a tuple `(x, y)`:
        x = stacked per-slice tensors, i.e. [num_slices, C, H, W]
        y = 0-dim float32 tensor holding the study's "ICH-majority" label
    """

    def __init__(
        self, manifest_df: DataFrame, labels_df: DataFrame, transform=None
    ) -> None:
        """
        manifest_df = slice manifest (built from `cq500ct_manifest.parquet`);
                      the columns "name", "series_uid" and "path" are read.
        labels_df   = label table (built from `reads.csv`); the columns "name"
                      and "ICH-majority" are read. Expected to hold one row
                      per study name.
        transform   = optional callable applied to the stacked volume tensor.
        """
        self.mf = manifest_df
        self.lbl = labels_df.set_index("name")  # look labels up by study name
        # Master list of study ids served by the pipeline: only studies that
        # have both pixel data (manifest rows) and a label row are kept.
        self.ids = self.lbl.index.intersection(self.mf["name"].unique())
        self.tf = transform

    def __len__(self) -> int:
        """Report the number of studies (not slices) as `int`."""
        return len(self.ids)

    def __getitem__(self, idx) -> tuple:
        """Load the study at position `idx` and return `(volume, label)`."""
        study = self.ids[idx]  # one study, not one slice

        # Every manifest row that belongs to this study.
        study_rows = self.mf[self.mf["name"] == study]

        # NOTE(review): the series of the study's first manifest row is used;
        # which series that is depends entirely on the manifest's row order.
        series_uid = study_rows["series_uid"].iloc[0]
        series_rows = study_rows[study_rows["series_uid"] == series_uid]

        # Slices are read in manifest row order; no spatial sort is applied here.
        volume = [
            to_windowed_tensor(pydicom.dcmread(path)) for path in series_rows["path"]
        ]
        x = torch.stack(volume)  # [num_slices, C, H, W]

        y = self.lbl.loc[study, "ICH-majority"]  # scalar study-level target

        # Test for presence, not truthiness: a callable that defines `__len__`
        # or `__bool__` may evaluate falsy and must still be applied.
        if self.tf is not None:
            x = self.tf(x)
        return x, torch.tensor(y, dtype=torch.float32)


# Helper
# DICOM to Tensor
def to_windowed_tensor(
    ds: "pydicom.dataset.FileDataset",
    windows: list[tuple[int, int]] = None,
    out_size: tuple[int, int] = None,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """
    Helper: convert one DICOM slice into a torch.Tensor [C x H x W].

    ds        = single CQ500 DICOM slice.
    windows   = CT windows to apply as (level, width) pairs. Each pair becomes
                one output channel. Default = Brain, Subdural, Bone.
    out_size  = (H, W) to resize the slice to. Default = (256, 256).
    dtype     = final tensor precision (float32 recommended).

    Returns   = torch.Tensor of shape C x H x W; every channel is clipped to
                its window and scaled by the window width.
    Raises    = ValueError if any window width is not positive.
    """
    ## Handle None args
    if windows is None:
        windows = [(40, 80), (80, 200), (600, 2800)]
    if out_size is None:
        out_size = (256, 256)
    windows = list(windows)
    out_size = tuple(out_size)  # so the shape comparison below is tuple-vs-tuple

    # A zero width would divide by zero and a negative one would invert the
    # clip bounds; fail loudly instead of emitting inf/NaN channels.
    for level, width in windows:
        if width <= 0:
            raise ValueError(
                f"window width must be positive, got {width} (level={level})"
            )

    ## Raw values > Hounsfield units (HU)
    # NOTE(review): `_apply_modality_lut` is an underscore-prefixed name from
    # pydicom; consider the public `apply_modality_lut` if it is available in
    # the installed version.
    hu: np.ndarray = _apply_modality_lut(ds.pixel_array, ds).astype(np.int16)
    if hu.shape != out_size:
        # cv2 takes the target size as (width, height), hence the reversal.
        hu = cv2.resize(hu, out_size[::-1], interpolation=cv2.INTER_LINEAR)

    # Window in floating point: int16 arithmetic can overflow (or reject
    # out-of-range clip bounds) for wide windows.
    hu = hu.astype(np.float32)

    ## Window / Level > 0-1 float per channel
    chans: list[np.ndarray] = []
    for level, width in windows:
        # Floor division: for an odd width the clipped span is width - 1, so
        # that channel's maximum is slightly below 1.
        half_width = width // 2
        lower = level - half_width
        upper = level + half_width
        clipped = np.clip(hu, lower, upper)
        chans.append(((clipped - lower) / float(width)).astype(np.float32))

    ## Stack and convert to tensor
    arr: np.ndarray = np.stack(chans, axis=0)  # C x H x W
    return torch.from_numpy(arr).to(dtype)


In [4]:
## Load compact_reads and manifest. Load and read data for testing.
import os
import numpy as np
import pandas as pd

# Per-study labels (one "name" per study).
compact_reads = pd.read_csv("compact_reads.csv")

# Slice manifest; study names are cast to plain strings as part of the load.
pq = pd.read_parquet("cq500ct_manifest.parquet").astype({"name": str})

# No transform is passed, so the dataset's default (None) applies.
ds = CQ500Dataset(pq, compact_reads)
print(f"Studies available: {len(ds)}")	# available studies

# Smoke test: pull the first study and inspect what comes back.
x, y = ds[0]
print("tensor shape: ", x.shape)
print("label - ICH: ", y.item())

Studies available: 473
tensor shape:  torch.Size([32, 3, 256, 256])
label - ICH:  1.0


In [6]:
## Attach each study's ICH-majority label to the manifest rows and persist the result.
metadata_df = pd.merge(
    pq,
    compact_reads.loc[:, ["name", "ICH-majority"]],
    how="left",  # keep every manifest row, labelled or not
    on="name",
    validate="many_to_one",  # fail if a name occurs more than once in the labels
    indicator=True,  # adds the "_merge" column recording each row's match status
)
metadata_df.to_parquet("cq500ct_metadata.parquet", index=False)

In [7]:
## Shuffle split to prevent leakage. Split by patient "name"
## CHANGE THIS TO B1 CATEGORY SPLIT ONLY

from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# A single split is requested, so the first yielded pair is the only one.
# Grouping on "name" keeps all rows of one name on the same side of the split.
train_idx, val_idx = next(
    splitter.split(
        metadata_df,
        metadata_df["ICH-majority"],
        metadata_df["name"],
    )
)

In [None]:
## Check for leakage between the train and validation indices
# assert set(metadata_df.iloc[train_idx].name).isdisjoint(metadata_df.iloc[val_idx].name), "Leakage"

In [14]:
## Build the train / validation patient-name lists from the split indices
# The rename makes the label reachable by attribute access (df.ICH_majority),
# e.g. for a quick check such as df.iloc[train_idx].ICH_majority.mean().
df = metadata_df.rename(columns={"ICH-majority": "ICH_majority"})

# NOTE: despite the `_df` suffix these are sorted lists of unique names,
# not DataFrames.
train_df = sorted(df["name"].iloc[train_idx].unique())
val_df = sorted(df["name"].iloc[val_idx].unique())

# No name may appear on both sides of the split.
assert set(train_df).isdisjoint(val_df), "Leakage detected!"
print(f"{len(train_df)=}, {len(val_df)=}")

In [18]:
## Write the train and validation patient names to text files, one name per line
for out_path, patient_names in (
    ("train_patients.txt", train_df),
    ("val_patients.txt", val_df),
):
    # No index and no header: the file holds nothing but the names.
    pd.Series(patient_names).to_csv(out_path, index=False, header=False)