In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from dataclasses import dataclass
import pathlib

In [186]:
@dataclass(frozen=True)
class DatasetConfig:
    """Immutable collection of the seed, cut plane and dataset locations.

    The notebook reads these values as class attributes
    (e.g. ``DatasetConfig.DATA_ROOT_TARGET``); every derived path below is
    built from the root paths at class-definition time.
    """

    # Seed passed as ``random_state`` to the sampling calls in this notebook.
    SEED_VALUE: int = 41
    # Cut-plane label; embedded verbatim in the target directory name.
    CUT_PLANE: str = "sagital"

    # Source dataset root and the metadata CSV located inside it.
    DATA_ROOT_SOURCE_PATH: str = "/home/jovyan/data/auto-pet-iii/2024-05-10_Autopet_v1.1"
    MAIN_DATA_CSV_PATH: str = f"{DATA_ROOT_SOURCE_PATH}/fdg_metadata.csv"

    # Target dataset root (one directory per cut plane) and its sub-paths.
    DATA_ROOT_TARGET: str = "/home/jovyan/data/segmentation/autopet-min-" + CUT_PLANE
    DATA_ROOT_TRAIN: str = f"{DATA_ROOT_TARGET}/Train"
    DATA_ROOT_VALID: str = f"{DATA_ROOT_TARGET}/Valid"
    DATA_ROOT_TEST: str = f"{DATA_ROOT_TARGET}/Test"
    DATA_TEST_GT: str = f"{DATA_ROOT_TARGET}/Test.csv"

In [187]:
# train, val and test split

In [188]:
# Load the description table stored under the target dataset root.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# this path - confirm the CSV has been generated before running the notebook.
data_df = pd.read_csv(f"{DatasetConfig.DATA_ROOT_TARGET}/data_description.csv")
data_df

FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/data/segmentation/autopet-min-sagital/data_description.csv'

In [None]:
# Number of rows per diagnosis.
# NOTE(review): other cells in this notebook use the `condition` column - confirm `diagnosis` exists too.
data_df["diagnosis"].value_counts()

In [None]:
# Copy the column labels and rename the first one to "imageId".
col_names = list(data_df.columns)
col_names[0] = "imageId"
col_names

In [None]:
# Apply the edited labels to the frame in place, then preview the first rows.
data_df.columns = col_names
data_df.head()

In [None]:
# new height will be based on the mean value
# Summary statistics of the `height` column.
data_df["height"].describe()

In [None]:
# Rows whose height exceeds the 395 cut-off.
data_df.loc[data_df["height"] > 395]

In [None]:
# Rows whose height is at most the 395 cut-off.
data_df.loc[data_df["height"] <= 395]

In [None]:
# Box plot of `height` over all rows.
data_df.boxplot(column="height")
plt.show()

In [None]:
# Box plot of `height` restricted to rows at or below the 395 cut-off.
data_df.loc[data_df["height"] <= 395].boxplot(column="height")
plt.show()

In [None]:
# Largest height that survives the 395 cut-off.
data_df.loc[data_df["height"] <= 395, "height"].max()

In [None]:
# Number of rows per condition.
data_df["condition"].value_counts()

In [None]:
# verify distribution of conditions with this filter applied
data_df.loc[data_df["height"] <= 395, "condition"].value_counts()

In [None]:
# keep original df and filter the data_df
# NOTE: this cell rebinds `data_df`; running it a second time would copy the
# already-filtered frame into `original_df`.
original_df = data_df.copy()
data_df = data_df.loc[data_df["height"] <= 395].copy()

# Split and Export Images

In [None]:
@dataclass(frozen=True)
class ImageResizeConfig:
    """Immutable target size used when images are resized before export.

    Both values are read as class attributes by
    ``pre_process_and_export_record`` when it calls ``cv2.resize``.
    """

    # Target number of rows of the exported image.
    height: int = 368
    # Target number of columns of the exported image.
    width: int = 400

In [None]:
def pre_process_and_export_record(row, is_segmentation=False):
    """Resize, min-max normalise and export one image of a record as a PNG.

    The source file is ``DatasetConfig.DATA_ROOT_TARGET`` concatenated with
    ``row["segPath"]`` (segmentation) or ``row["imagePath"]`` (otherwise).
    The result is written to
    ``<DATA_ROOT_TARGET>/<row["subset"]>/<SEG|PET>/<row["imageId"]>.png``;
    the destination directory is created when missing.

    Args:
        row: Mapping-like record (e.g. a pandas Series from ``iterrows``)
            providing ``imagePath``, ``segPath``, ``subset`` and ``imageId``.
        is_segmentation: When True, process the segmentation mask instead of
            the image and export it under the ``SEG`` folder.

    Raises:
        FileNotFoundError: If OpenCV cannot read the source image.
        OSError: If OpenCV reports that the output file was not written.
    """
    source_path = DatasetConfig.DATA_ROOT_TARGET + (row["segPath"] if is_segmentation else row["imagePath"])

    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(source_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {source_path}")

    # cv2.resize expects dsize as (width, height), not (height, width).
    target_size = (ImageResizeConfig.width, ImageResizeConfig.height)
    # Nearest-neighbour keeps mask values intact; linear interpolation would
    # blend neighbouring labels into values that were not in the source mask.
    interpolation = cv2.INTER_NEAREST if is_segmentation else cv2.INTER_LINEAR
    img = cv2.resize(img, target_size, interpolation=interpolation)

    # Stretch intensities to the full 0-255 range.
    img = cv2.normalize(img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

    img_modality = "SEG" if is_segmentation else "PET"

    destination_dir = f"{DatasetConfig.DATA_ROOT_TARGET}/{row['subset']}/{img_modality}"
    pathlib.Path(destination_dir).mkdir(parents=True, exist_ok=True)
    destination_path = f"{destination_dir}/{row['imageId']}.png"

    # cv2.imwrite returns False on failure; surface it instead of dropping the image silently.
    if not cv2.imwrite(destination_path, img):
        raise OSError(f"Could not write image: {destination_path}")

In [None]:
# Add a placeholder `subset` column; the split label is assigned in a later cell.
data_df["subset"] = None

In [None]:
# Preview the first rows of the filtered frame, now including the `subset` column.
data_df.head()

In [None]:
# Rows per condition in the frame that is about to be split.
data_df["condition"].value_counts()

In [None]:
# split needs to take into account the condition of patient
train_df = pd.concat([
            data_df[data_df.condition == "LUNG_CANCER"].sample(frac=0.8, random_state=DatasetConfig.SEED_VALUE),
            data_df[data_df.condition == "LYMPHOMA"].sample(frac=0.8, random_state=DatasetConfig.SEED_VALUE),
            data_df[data_df.condition == "MELANOMA"].sample(frac=0.8, random_state=DatasetConfig.SEED_VALUE)
        ]).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
train_df

In [None]:
# Every row not selected for training forms the test split, shuffled with the shared seed.
test_df = data_df.drop(index=train_df.index).sample(
    frac=1, random_state=DatasetConfig.SEED_VALUE
)
test_df

In [None]:
# val_df = pd.concat([
#             train_df[train_df.condition == "LUNG_CANCER"].sample(frac=0.1, random_state=DatasetConfig.SEED_VALUE),
#             train_df[train_df.condition == "LYMPHOMA"].sample(frac=0.1, random_state=DatasetConfig.SEED_VALUE),
#             train_df[train_df.condition == "MELANOMA"].sample(frac=0.1, random_state=DatasetConfig.SEED_VALUE)
#         ]).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
# val_df

In [None]:
# train_df = train_df.drop(val_df.index, inplace=False)
# train_df

In [None]:
# Rows per condition in the training split.
train_df["condition"].value_counts()

In [None]:
# val_df.diagnosis.value_counts()

In [None]:
# Rows per condition in the test split.
test_df["condition"].value_counts()

In [None]:
# Label every row with the split it belongs to. Item assignment is used
# instead of attribute assignment: `df.subset = ...` only updates a column
# that already exists and otherwise just sets a Python attribute on the frame.
train_df["subset"] = "Train"
# val_df["subset"] = "Valid"
test_df["subset"] = "Test"

In [None]:
# full_data = pd.concat([train_df, val_df, test_df]).sample(frac=1,random_state=DatasetConfig.SEED_VALUE)
# Stack the labelled splits, then shuffle the rows with the shared seed.
full_data = pd.concat([train_df, test_df])
full_data = full_data.sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
full_data

In [None]:
from tqdm import tqdm

In [None]:
# Export each record twice: first the image, then its segmentation mask.
for row_label, record in tqdm(full_data.iterrows(), total=len(full_data)):
    for as_segmentation in (False, True):
        pre_process_and_export_record(record, as_segmentation)