This notebook analyzes the balance of labales in the dataset

In [1]:
import os
from matplotlib import pyplot as plt
import numpy as np
import nibabel as nib
import polars as pl
from tqdm import tqdm

In [2]:
plt.style.use('seaborn-v0_8-white')

In [3]:
ROOT_PATH = os.getcwd()
TEST_LABELS_PATH = os.path.join(ROOT_PATH, "workspace/test/labelsTs/true")
TRAIN_LABELS_PATH = os.path.join(ROOT_PATH, "workspace/train/labelsTr")

In [4]:
label_annotations = {
    0: "Background",
    1: "PDAC lesion",
    2: "Veins",
    3: "Arteries",
    4: "Pancreas parenchyma",
    5: "Pancreatic duct",
    6: "Common bile duct",
}

In [5]:
def get_label_distribution(label_path):
    data = []
    label_files = [os.path.abspath(os.path.join(label_path, label_name)) for label_name in os.listdir(label_path)]
    label_files = [f for f in label_files if f.endswith(".nii.gz")]
    for label_file in tqdm(label_files, desc="Loading labels"):
        label = nib.load(os.path.realpath(label_file))
        data_arr = np.asanyarray(label.dataobj)
        
        unique, counts = np.unique(data_arr, return_counts=True)
        for label, count in tqdm(zip(unique, counts), desc="Counting pixels", total=len(unique)):
            data.append({
                "file": label,
                "label": int(label),
                "pixel_count": int(count)
            })
    counts_df = pl.DataFrame(data)
    distribution_f = (
        counts_df.group_by("label")
        .agg([
            pl.col("pixel_count").sum().alias("total_pixels"),
            pl.col("file").n_unique().alias("file_count")
        ])
        .sort("label")
    )
    return distribution_f

In [6]:
training_labels = get_label_distribution(TRAIN_LABELS_PATH)
training_labels.write_csv(os.path.join(TRAIN_LABELS_PATH, "training_labels.csv"))

Loading labels:   0%|                                                                                 | 0/719 [00:00<?, ?it/s]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 81783.09it/s]


Loading labels:   0%|                                                                       | 1/719 [00:25<5:10:42, 25.97s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68947.46it/s]


Loading labels:   0%|▏                                                                      | 2/719 [00:30<2:40:31, 13.43s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 88970.08it/s]


Loading labels:   0%|▎                                                                      | 3/719 [00:37<2:05:29, 10.52s/it]




Counting pixels:   0%|                                                                                  | 0/5 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 64329.82it/s]


Loading labels:   1%|▍                                                                      | 4/719 [00:43<1:43:19,  8.67s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 87122.04it/s]


Loading labels:   1%|▍                                                                      | 5/719 [00:47<1:23:29,  7.02s/it]




Counting pixels:   0%|                                                                                  | 0/4 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 45590.26it/s]


Loading labels:   1%|▌                                                                      | 6/719 [00:57<1:36:08,  8.09s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 70747.30it/s]


Loading labels:   1%|▋                                                                      | 7/719 [01:14<2:08:57, 10.87s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 91180.52it/s]


Loading labels:   1%|▊                                                                      | 8/719 [01:28<2:20:46, 11.88s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 84449.07it/s]


Loading labels:   1%|▉                                                                      | 9/719 [01:35<2:02:59, 10.39s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 82782.32it/s]


Loading labels:   1%|▉                                                                     | 10/719 [01:48<2:10:24, 11.04s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 40394.58it/s]


Loading labels:   2%|█                                                                     | 11/719 [01:51<1:41:29,  8.60s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 81442.80it/s]


Loading labels:   2%|█▏                                                                    | 12/719 [01:58<1:36:02,  8.15s/it]




Counting pixels:   0%|                                                                                  | 0/4 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 58867.42it/s]


Loading labels:   2%|█▎                                                                    | 13/719 [02:09<1:47:24,  9.13s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 90524.55it/s]


Loading labels:   2%|█▎                                                                    | 14/719 [02:10<1:19:34,  6.77s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 55188.21it/s]


Loading labels:   2%|█▍                                                                    | 15/719 [02:23<1:41:23,  8.64s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 70239.54it/s]


Loading labels:   2%|█▌                                                                    | 16/719 [02:26<1:20:32,  6.87s/it]




Counting pixels:   0%|                                                                                  | 0/5 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 67650.06it/s]


Loading labels:   2%|█▋                                                                    | 17/719 [02:30<1:08:08,  5.82s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 87381.33it/s]


Loading labels:   3%|█▊                                                                    | 18/719 [02:35<1:06:45,  5.71s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 82510.90it/s]


Loading labels:   3%|█▊                                                                    | 19/719 [02:40<1:03:25,  5.44s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 96262.71it/s]


Loading labels:   3%|██                                                                      | 20/719 [02:43<55:48,  4.79s/it]




Counting pixels:   0%|                                                                                  | 0/7 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 92327.45it/s]


Loading labels:   3%|██                                                                      | 21/719 [02:44<42:29,  3.65s/it]




Counting pixels:   0%|                                                                                  | 0/6 [00:00<?, ?it/s]

[A

Counting pixels: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67832.41it/s]


Loading labels:   3%|██▏                                                                   | 22/719 [03:15<2:16:57, 11.79s/it]

In [None]:
testing_labels = get_label_distribution(TEST_LABELS_PATH)
testing_labels.write_csv(os.path.join(TEST_LABELS_PATH, "testing_labels.csv"))

In [None]:
# load label distributions
training_labels = pl.read_csv(os.path.join(TRAIN_LABELS_PATH, "training_labels.csv"))
testing_labels = pl.read_csv(os.path.join(TEST_LABELS_PATH, "testing_labels.csv"))

In [None]:
# plot label distributions side by side bar-wise
fig, ax = plt.subplots(figsize=(12, 8))
bar_width = 0.35
# Plot training labels
index = np.arange(len(label_annotations))
ax.bar(index, np.log(training_labels["total_pixels"]), bar_width, label="Training", color='b')

# Plot testing labels
ax.bar(index + bar_width, np.log(testing_labels["total_pixels"]), bar_width, label="Testing", color='r')
ax.set_yscale('log')
ax.set_ylabel("Log of Total Pixels")
ax.set_xlabel("Labels")
ax.set_title("Label Distribution in Training and Testing Sets")
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels([label_annotations[i] for i in range(len(label_annotations))])
ax.legend()

plt.tight_layout()
plt.show()