# Dataset Exploration

In [407]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from collections import Counter
from copy import deepcopy
from rich import inspect
from tqdm import tqdm

# torch
import torch

# monai
import monai
from monai.data import ImageDataset, ThreadDataLoader


In [408]:
# rtk
from rtk import repl
from rtk.config import (
    set_hydra_configuration,
    Configuration,
    DatasetConfiguration,
    JobConfiguration,
)
from rtk.utils import login, hydra_instantiate, get_console, get_logger


In [409]:
repl.install(show_locals=False)
console = get_console()
logger = get_logger("rtk.notebook")
monai.config.print_config()

MONAI version: 1.3.0
Numpy version: 1.23.5
Pytorch version: 2.1.0+cu121
MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False
MONAI rev id: 865972f7a791bf7b42efbcd87c8402bd865b329e
MONAI __file__: /home/<username>/anaconda3/envs/rtk/lib/python3.11/site-packages/monai/__init__.py

Optional dependencies:
Pytorch Ignite version: 0.4.13
ITK version: 5.3.0
Nibabel version: 5.1.0
scikit-image version: 0.22.0
scipy version: 1.11.3
Pillow version: 10.0.1
Tensorboard version: NOT INSTALLED or UNKNOWN VERSION.
gdown version: NOT INSTALLED or UNKNOWN VERSION.
TorchVision version: 0.16.0+cu121
tqdm version: 4.66.1
lmdb version: NOT INSTALLED or UNKNOWN VERSION.
psutil version: 5.9.0
pandas version: 2.1.2
einops version: 0.7.0
transformers version: NOT INSTALLED or UNKNOWN VERSION.
mlflow version: 2.8.0
pynrrd version: NOT INSTALLED or UNKNOWN VERSION.
clearml version: NOT INSTALLED or UNKNOWN VERSION.

For details about installing the optional dependencies, please visit:
    ht

### Constant setup

In [410]:
# configs
config_name = "tests"
config_path = "../configs/"
config_dir = os.path.abspath(config_path)
config_dir

In [411]:
cfg: Configuration = set_hydra_configuration(
        config_name=config_name,
        init_method_kwargs={"config_dir": config_dir},
        ConfigurationInstance=Configuration,
    )
dataset_cfg: DatasetConfiguration = cfg.datasets
job_cfg: JobConfiguration = cfg.job
monai.utils.set_determinism(seed=job_cfg.random_state)
cfg

In [412]:
patient_path = dataset_cfg.patient_data
patient_path

In [413]:
scan_path = "/home/nicoleg/workspaces/dissertation/.data/Chest_XRay_14_Kaggle/"
scan_path

In [414]:
# display single image
# depth_slice = scan.shape[2] // 2
# display_scan_slice = scan[:, :, depth_slice].numpy()
# plt.imshow(display_scan_slice, cmap="bone");

# Exploration

In [415]:
# https://stackoverflow.com/questions/339007/how-do-i-pad-a-string-with-zeroes
index = dataset_cfg.index
target = dataset_cfg.target
label_encoding = 1
patient_df = pd.read_csv(patient_path).set_index(index)
label_path = os.path.join(scan_path, f"images_{label_encoding:03}", "images")
os.listdir(label_path)


In [416]:
# subset to the indicated indices in the label encoding
filename_matches = {"image_files": [], index: []}

for filename in os.listdir(label_path):
    filename_matches[index].append(filename)
    filename_matches["image_files"].append(os.path.join(label_path, filename))

filename_matches


## Wrangling data

In [417]:
patient_df.head()

Unnamed: 0_level_0,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [418]:
# drop unnecessary columns

drop_columns = [
    "Follow-up #",
    "OriginalImage[Width",
    "Height]",
    "OriginalImagePixelSpacing[x",
    "y]",
    "Unnamed: 11",
    "View Position",
]
patient_df = patient_df.drop(columns=drop_columns)
patient_df.head()

Unnamed: 0_level_0,Finding Labels,Patient ID,Patient Age,Patient Gender
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000001_000.png,Cardiomegaly,1,58,M
00000001_001.png,Cardiomegaly|Emphysema,1,58,M
00000001_002.png,Cardiomegaly|Effusion,1,58,M
00000002_000.png,No Finding,2,81,M
00000003_000.png,Hernia,3,81,F


In [419]:
def unpack_images():
    """"""

    filename_matches = {"image_files": [], index: []}

    for encoding in range(1, 13):
        label_path = os.path.join(scan_path, f"images_{encoding:03}", "images")

        for filename in os.listdir(label_path):
            filename_matches[index].append(filename)
            filename_matches["image_files"].append(os.path.join(label_path, filename))

    return pd.DataFrame.from_dict(filename_matches, orient="columns").set_index(index)

matches = unpack_images()
len(matches), matches.head()

In [420]:
patient_df = patient_df.merge(matches, on=index, how="inner")
patient_df.head()

Unnamed: 0_level_0,Finding Labels,Patient ID,Patient Age,Patient Gender,image_files
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000001_000.png,Cardiomegaly,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000001_001.png,Cardiomegaly|Emphysema,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000001_002.png,Cardiomegaly|Effusion,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000002_000.png,No Finding,2,81,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000003_000.png,Hernia,3,81,F,/home/nicoleg/workspaces/dissertation/.data/Ch...


In [421]:
with open(os.path.join(scan_path, "train_val_list.txt"), "r") as f:
    train_val_list = f.readlines()

train_val_list

In [422]:
train_val_list = [idx.strip() for idx in train_val_list]
train_val_list

In [423]:
# do the same for the test list
with open(os.path.join(scan_path, "test_list.txt"), "r") as f:
    test_list = f.readlines()
    test_list = [idx.strip() for idx in test_list]

test_list

In [424]:
train_df = patient_df[patient_df.index.isin(train_val_list)]
train_df.head()

Unnamed: 0_level_0,Finding Labels,Patient ID,Patient Age,Patient Gender,image_files
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000001_000.png,Cardiomegaly,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000001_001.png,Cardiomegaly|Emphysema,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000001_002.png,Cardiomegaly|Effusion,1,58,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000002_000.png,No Finding,2,81,M,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000004_000.png,Mass|Nodule,4,82,M,/home/nicoleg/workspaces/dissertation/.data/Ch...


In [425]:
test_df = patient_df[patient_df.index.isin(test_list)]
test_df.head()

Unnamed: 0_level_0,Finding Labels,Patient ID,Patient Age,Patient Gender,image_files
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000003_000.png,Hernia,3,81,F,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000003_001.png,Hernia,3,74,F,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000003_002.png,Hernia,3,75,F,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000003_003.png,Hernia|Infiltration,3,76,F,/home/nicoleg/workspaces/dissertation/.data/Ch...
00000003_004.png,Hernia,3,77,F,/home/nicoleg/workspaces/dissertation/.data/Ch...


## Pneumonia to metaclass

In [426]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

In [427]:
# subsetting
# all_labels = dataset_cfg.all_labels
labels = dataset_cfg.labels
positive_class = dataset_cfg.preprocessing["positive_class"]
console.print(f"Positive class: '{positive_class}'")
console.print(f"New labels:\n{labels}")

In [428]:
# unique

console.print("Data description:")
console.print(patient_df[target].nunique())
patient_target_counts = Counter(patient_df[target])
patient_target_counts

In [429]:
temp = list(patient_target_counts.keys())[:10]
for key in temp:
    classes = key.split("|")
    console.print(classes)

In [430]:
multi_target = f"multi_{target}"
patient_df[multi_target] = patient_df[target].apply(lambda x: x.split("|"))
patient_df[multi_target]

In [431]:
mlb.fit(patient_df[multi_target])
mlb.classes_

In [432]:
class_encoding = {label: i for i, label in enumerate(mlb.classes_)}
console.print(class_encoding)

In [433]:
for row in patient_df[multi_target][:10]:
    row = [row]
    console.print(mlb.transform(row)[0])

## New multi-index

In [434]:
def build_bitstring(
    target: str, df: pd.DataFrame, mlb: MultiLabelBinarizer, inplace=False
):
    logger.info(f"Building bitstring for '{target}'")

    def __bool_list_to_bitstring(x):
        x = mlb.transform([x])[0]
        return x

    transform = df[target].apply(lambda x: __bool_list_to_bitstring(x))
    if inplace:
        df[target] = transform
    else:
        return transform


def build_multiclass_dataframe(cfg: Configuration, df: pd.DataFrame, **kwargs):
    """"""
    dataset_cfg = kwargs.get("dataset_cfg", cfg.datasets)

    mlb = MultiLabelBinarizer()
    labels = dataset_cfg.labels
    positive_class = dataset_cfg.preprocessing["positive_class"]
    logger.info(f"Positive class: '{positive_class}'")
    logger.info(f"New labels:\n{labels}")
    logger.debug(f"{Counter(df[target])}")

    multi_target = f"multi_{target}"
    patient_df[multi_target] = patient_df[target].apply(lambda x: x.split("|"))
    mlb.fit(df[multi_target])
    class_encoding = {label: i for i, label in enumerate(mlb.classes_)}
    logger.info(f"Dataset class encoding:\n{class_encoding}")

    _transforms = df[multi_target].apply(lambda x: mlb.transform([x])[0])
    multiclass_df = pd.DataFrame.from_records(
        _transforms, index=df.index, columns=mlb.classes_
    )

    return multiclass_df, class_encoding


multiclass_df, class_encoding = build_multiclass_dataframe(cfg, patient_df)
multiclass_df.head()

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
00000001_001.png,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
00000001_002.png,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
00000002_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00000003_000.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


#### Individual class counts using multi-index table

In [435]:
for column in multiclass_df.columns:
    console.print(f"'{column} ({class_encoding[column]})': {Counter(multiclass_df[column])}")

#### Data split version 1.0: subset to any sample that contains the positive class

In [436]:
pneumonia_df = multiclass_df[multiclass_df["Pneumonia"] == 1]
pneumonia_df["labelv1"] = pd.Series(np.ones(len(pneumonia_df), dtype=int), index=pneumonia_df.index)
pneumonia_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv1
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00000013_010.png,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1
00000032_012.png,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1
00000056_000.png,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
00000061_012.png,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,1
00000061_015.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [437]:
non_pneumonia_df = multiclass_df.drop(pneumonia_df.index)
non_pneumonia_df["labelv1"] = pd.Series(np.zeros(len(non_pneumonia_df), dtype=int), index=non_pneumonia_df.index)
non_pneumonia_df

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv1
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
00000001_001.png,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
00000001_002.png,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
00000002_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00000003_000.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00030801_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00030802_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00030803_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00030804_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [438]:
console.print(f"Number of pneumonia cases: {len(pneumonia_df)}, {len(pneumonia_df) / len(multiclass_df) * 100:.4f}%")
console.print(f"Number of non-pneumonia cases: {len(non_pneumonia_df)}, {len(non_pneumonia_df) / len(multiclass_df) * 100:.4f}%")
assert len(multiclass_df) == len(pneumonia_df) + len(non_pneumonia_df)

In [439]:
v1_df = pd.concat([pneumonia_df, non_pneumonia_df])
assert len(v1_df) == len(pneumonia_df) + len(non_pneumonia_df)
v1_df.head()

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv1
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00000013_010.png,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1
00000032_012.png,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1
00000056_000.png,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
00000061_012.png,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,1
00000061_015.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


#### Data split version 2.0: subset to samples ONLYING containing the positive class

In [440]:
# TODO: Fix this; this isn't working properly
non_pneumonia_columns = list(set(multiclass_df.columns) - set(["Pneumonia"]))
console.print(non_pneumonia_columns)

In [441]:
_drop_indices = []

for column in non_pneumonia_columns:
    _drop_indices.extend(list(pneumonia_df[pneumonia_df[column] == 1].index))

drop_indices = pd.Index(_drop_indices)
drop_indices

In [442]:
pneumonia_df = pneumonia_df.drop(index=drop_indices)
pneumonia_df["labelv2"] = pd.Series(np.ones(len(pneumonia_df), dtype=int), index=pneumonia_df.index)
pneumonia_df


Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv1,labelv2
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
00000061_015.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00000144_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00000165_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00000193_019.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00000218_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00028924_005.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00029481_004.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00029889_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
00030079_018.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1


In [443]:
non_pneumonia_df = multiclass_df.drop(index=pneumonia_df.index)
non_pneumonia_df["labelv2"] = pd.Series(np.zeros(len(non_pneumonia_df), dtype=int), index=non_pneumonia_df.index)
non_pneumonia_df

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv2
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
00000001_001.png,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
00000001_002.png,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
00000002_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00000003_000.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00030801_001.png,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
00030802_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00030803_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
00030804_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [444]:
console.print(f"Number of pneumonia cases: {len(pneumonia_df)}, {len(pneumonia_df) / len(multiclass_df) * 100:.4f}%")
console.print(f"Number of non-pneumonia cases: {len(non_pneumonia_df)}, {len(non_pneumonia_df) / len(multiclass_df) * 100:.4f}%")
assert len(multiclass_df) == len(pneumonia_df) + len(non_pneumonia_df)

In [445]:
v2_df = pd.concat([pneumonia_df, non_pneumonia_df])
assert len(v2_df) == len(pneumonia_df) + len(non_pneumonia_df)
v2_df.head()

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,labelv1,labelv2
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
00000061_015.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1
00000144_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1
00000165_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1
00000193_019.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1
00000218_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1


### We can form train and test splits now

#### Version 1.0

In [None]:
v1_train_df = patient_df[v1_df.index.isin(train_val_list)]
v1_train_df = pd.concat([v1_train_df, v1_df['labelv1']], axis=1)
v1_train_df

In [None]:
v1_test_df = v1_df[v1_df.index.isin(test_list)]
v1_test_df = pd.concat([v1_test_df, v1_test_df[v1_df['labelv1']]], axis=1)
v1_test_df

In [None]:
assert len(v1_train_df) + len(v1_test_df) == len(v1_df)

#### Version 2.0

In [None]:
v2_train_df = patient_df[v2_df.index.isin(train_val_list)]
v2_train_df = pd.concat([v2_train_df, v2_df["labelv2"]], axis=1)
v2_train_df

In [None]:
console.print(f"{Counter(v2_train_df['labelv2'])}")

In [None]:
v2_test_df = patient_df[v2_df.index.isin(test_list)]
v2_test_df = pd.concat([v2_test_df, v2_df["labelv2"]], axis=1)
v2_test_df

## BitString stuff

In [None]:
from bitstring import BitArray, Bits

### Bitwise operations
Since we now have everything setup as a bitstring, we can use bit operations for subsetting the data rather than the previous way I was doing it. Here's an example:

In [None]:
console.print(class_encoding)

In [None]:
patient_df[multi_target] = build_bitstring(multi_target, patient_df, mlb, inplace=False)

In [None]:
# grab first three rows
a = patient_df.iloc[0]
a

In [None]:
def intlist_to_bitstring(x):
    return BitArray().join([BitArray(bin="1" if b == 1 else "0") for b in x])

In [None]:
# positive-class only bitstring formation

positive_class_encoding = class_encoding[positive_class]
_BITSTRING_POSITIVE_CLASS = np.zeros(len(class_encoding), dtype=int)
_BITSTRING_POSITIVE_CLASS[positive_class_encoding] = 1
BITSTRING_POSITIVE_CLASS = intlist_to_bitstring(_BITSTRING_POSITIVE_CLASS)
# BITSTRING_INVERSE_POSITIVE_CLASS = np.array([0 if i else 1 for i in BITSTRING_POSITIVE_CLASS])

BITSTRING_POSITIVE_CLASS, mlb.inverse_transform(_BITSTRING_POSITIVE_CLASS.reshape(1, -1))
# BITSTRING_INVERSE_POSITIVE_CLASS

In [None]:
def class_to_bitarray(label: str, label_encoding: dict):
    encoding = label_encoding[label]
    _BITSTRING_CLASS = np.zeros(len(class_encoding), dtype=int)
    _BITSTRING_CLASS[encoding] = 1
    BITSTRING_CLASS = intlist_to_bitstring(_BITSTRING_CLASS)

    return BITSTRING_CLASS


pneumonia_bitstring = class_to_bitarray("Pneumonia", class_encoding)
pneumonia_bitstring

In [None]:
_NULL = np.zeros(len(class_encoding), dtype=int)
NULL = intlist_to_bitstring(_NULL)
NULL

In [None]:
# checks if an instance contains only the positive class
non_pneumonia_bitstring = class_to_bitarray("No Finding", class_encoding)
non_pneumonia_bitstring & BITSTRING_POSITIVE_CLASS == NULL

In [None]:
pneumonia_bitstring & BITSTRING_POSITIVE_CLASS == NULL

If we want the Pneumonia ONLY classes to be separated out basically everything else. To do that, we use the inverse of the Pneumonia mask

In [None]:
def subset_to_positive_class(
    df: pd.DataFrame, target: str, inplace=True, version: float = 1.0
):
    positive_class_encoding = class_encoding[positive_class]
    bitstring_positive_class = class_to_bitarray(positive_class, class_encoding)


pneumonia_df = subset_to_positive_class(patient_df, multi_target, inplace=False)
pneumonia_df[[target, multi_target]]

In [None]:
pneumonia_df[multi_target]

### Make into dataset

In [None]:
# visualize
train_dataset = ImageDataset(
    image_files=train_df["image_files"].tolist(), labels=train_df[target].tolist()
)
train_loader = ThreadDataLoader(dataset=train_dataset, batch_size=1, num_workers=4)
iterator = iter(train_loader)

In [None]:
from monai.utils import first

scan, label = next(iterator)
scan.shape, label


In [None]:
scan.squeeze().shape

In [None]:
# patient_id = scan._meta["filename_or_obj"].split("/")[-1].split(".")[0]
plt.title(f"Label: {label[0]}")
display_scan = scan.squeeze().numpy()
# display_scan = np.transpose(display_scan, (1, 2, 0))
plt.imshow(display_scan, cmap="bone");

# Using `RTK`

In [None]:
from omegaconf import DictConfig
import logging

# rtk
from rtk import datasets
from rtk.utils import hydra_instantiate

logging.getLogger("rtk.datasets").setLevel(logging.DEBUG)

dataset = datasets.instantiate_image_dataset(
    cfg=cfg, save_metadata=True
)
train_dataset, test_dataset = dataset[0], dataset[1]

In [None]:
train_dataset[0][0].shape

In [None]:
train_loader = hydra_instantiate(
    cfg=dataset_cfg.dataloader,
    dataset=train_dataset,
    pin_memory=torch.cuda.is_available(),
    shuffle=True,
)
test_loader = hydra_instantiate(
    cfg=dataset_cfg.dataloader,
    dataset=test_dataset,
    pin_memory=torch.cuda.is_available(),
)
iterator = iter(train_loader)


## Get some samples

In [None]:
scan, label = next(iterator)
scan.shape, label

In [None]:
_filename = scan[10]._meta['filename_or_obj'].split('/')[-1]
patient_id = _filename.split('.')[0]
patient_id

In [None]:
plt.title(f"Patient ID: {patient_id}")
display_scan = scan[0].numpy()
display_scan = np.transpose(display_scan, (1, 2, 0))
plt.imshow(display_scan, cmap="bone");

In [None]:
scan, label = datasets.visualize_scan(iterator=iterator)

# Pediatrics age overlap

In [None]:
pneumonia = "Pneumonia"
ped = patient_df[patient_df[target].str.contains(pneumonia)]


ped[ped["Patient Age"] < 18]

In [None]:
print(f"Number of patients with {pneumonia}:", len(ped))
print(f"Number of patients with {pneumonia} and age < 18:", len(ped[ped["Patient Age"] < 18]))
print(f"Number of patients with {pneumonia} and age < 21:", len(ped[ped["Patient Age"] < 21]))

## Combine Chest XRay and Pediatrics data

In [None]:
dataset_cfg.additional_datasets

In [None]:
from rtk.utils import yaml_to_configuration

pediatrics_file_path = dataset_cfg.additional_datasets["dataset_configs"][0]["filepath"]

pediatrics: DatasetConfiguration = yaml_to_configuration(pediatrics_file_path)
pediatrics


In [None]:
from rtk.datasets import load_chest_xray_dataset

ped_train_dataset, ped_test_dataset, ped_train_metadata, ped_test_metadata = load_chest_xray_dataset(
    cfg=cfg,
    dataset_cfg=pediatrics,
    return_metadata=True,
)
ped_train_metadata_subset = ped_train_metadata[ped_train_metadata["labels"] == 1]
ped_test_metadata_subset = ped_test_metadata[ped_test_metadata["labels"] == 1]
ped_train_metadata_subset

In [None]:
vars(cfg)

In [None]:
combined_train_dataset = deepcopy(train_dataset)
combined_test_dataset = deepcopy(test_dataset)
combined_train_dataset.image_files

In [None]:
combined_train_image_files = np.hstack((ped_train_metadata_subset.image_files.values, train_dataset.image_files))
combined_test_image_files = np.hstack((ped_test_metadata_subset.image_files.values, test_dataset.image_files))
combined_train_labels = np.hstack((ped_train_metadata_subset.labels.values, train_dataset.labels))
combined_test_labels = np.hstack((ped_test_metadata_subset.labels.values, test_dataset.labels))


In [None]:
from rtk.utils import hydra_instantiate

combined_train_dataset = hydra_instantiate(
    cfg=dataset_cfg.instantiate,
    image_files=combined_train_image_files,
    labels=combined_train_labels,
    transform=train_dataset.transform,
)
combined_test_dataset = hydra_instantiate(
    cfg=dataset_cfg.instantiate,
    image_files=combined_test_image_files,
    labels=combined_test_labels,
    transform=test_dataset.transform,
)
combined_train_dataset

In [None]:
from collections import Counter

Counter(train_dataset.labels), Counter(combined_train_dataset.labels)

In [None]:
Counter(test_dataset.labels), Counter(combined_test_dataset.labels)

In [None]:
from rtk.datasets import combine_datasets

combined_train_dataset, combined_test_dataset = combine_datasets(train_dataset, test_dataset, dataset_cfg=dataset_cfg)
# combined_test_dataset = combine_datasets(test_dataset, ped_test_dataset, dataset_cfg=dataset_cfg, transform=test_dataset.transform)
combined_train_dataset.image_files

In [None]:
len(combined_train_dataset.image_files)

In [None]:
combined_train_dataloader = hydra_instantiate(
    cfg=dataset_cfg.dataloader,
    dataset=combined_train_dataset,
    pin_memory=torch.cuda.is_available(),
    shuffle=True,
)
combined_test_dataloader = hydra_instantiate(
    cfg=dataset_cfg.dataloader,
    dataset=combined_test_dataset,
    pin_memory=torch.cuda.is_available(),
    shuffle=True,
)
combined_train_iterator = iter(combined_train_dataloader)
combined_test_iterator = iter(combined_test_dataloader)


In [None]:
scan, label = datasets.visualize_scan(iterator=combined_train_iterator)

In [None]:
combined_train_dataset.image_files

## with RTK

In [None]:
Counter(train_dataset.labels), Counter(test_dataset.labels)


In [None]:
train_dataset, test_dataset = datasets.combine_datasets(train_dataset, test_dataset, dataset_cfg=dataset_cfg)

Counter(train_dataset.labels), Counter(test_dataset.labels)
