In [53]:
import pandas as pd
import numpy as np
import cv2
from dataclasses import dataclass
import pathlib

In [54]:
@dataclass(frozen=True)
class DatasetConfig:
    """Seed, cut plane and path strings shared by the cells of this notebook.

    The derived path defaults below are computed once, while the class body
    is executed. Passing a different ``CUT_PLANE`` or root path to the
    constructor therefore does NOT change the derived paths.
    """

    # Seed handed to every ``random_state`` argument in the notebook.
    SEED_VALUE: int = 41
    # Cut-plane name; it is embedded in the target directory name below.
    CUT_PLANE: str = "coronal"

    # Source dataset root and the metadata CSV located inside it.
    DATA_ROOT_SOURCE_PATH: str = '/home/jovyan/data/auto-pet-iii/2024-05-10_Autopet_v1.1'
    MAIN_DATA_CSV_PATH: str = f'{DATA_ROOT_SOURCE_PATH}/fdg_metadata.csv'

    # Target root plus one sub-path per subset.
    DATA_ROOT_TARGET: str = '/home/jovyan/data/pre-processed-autopet-min-' + CUT_PLANE
    DATA_ROOT_TRAIN: str = f'{DATA_ROOT_TARGET}/Train'
    DATA_ROOT_VALID: str = f'{DATA_ROOT_TARGET}/Valid'
    DATA_ROOT_TEST: str = f'{DATA_ROOT_TARGET}/Test'
    DATA_TEST_GT: str = f'{DATA_ROOT_TARGET}/Test.csv'

In [55]:
# train, val and test split

In [56]:
# Load the per-image description table stored under the target data root.
data_df = pd.read_csv(f"{DatasetConfig.DATA_ROOT_TARGET}/data_description.csv")
data_df

Unnamed: 0.1,Unnamed: 0,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width
0,0,PETCT_f21755a99b,1.3.6.1.4.1.14519.5.2.1.4219.6651.631135993785...,POSITIVE,196,400,/PETCT_f21755a99b-1.3.6.1.4.1.14519.5.2.1.4219...,284,400
1,1,PETCT_176bdc5388,1.3.6.1.4.1.14519.5.2.1.4219.6651.908703633623...,NEGATIVE,196,400,/PETCT_176bdc5388-1.3.6.1.4.1.14519.5.2.1.4219...,274,400
2,2,PETCT_605369e88d,1.3.6.1.4.1.14519.5.2.1.4219.6651.161111150372...,POSITIVE,207,400,/PETCT_605369e88d-1.3.6.1.4.1.14519.5.2.1.4219...,326,400
3,3,PETCT_846c1af245,1.3.6.1.4.1.14519.5.2.1.4219.6651.158562682786...,NEGATIVE,207,400,/PETCT_846c1af245-1.3.6.1.4.1.14519.5.2.1.4219...,326,400
4,4,PETCT_7ce196485f,1.3.6.1.4.1.14519.5.2.1.4219.6651.226902065774...,POSITIVE,215,400,/PETCT_7ce196485f-1.3.6.1.4.1.14519.5.2.1.4219...,558,400
...,...,...,...,...,...,...,...,...,...
997,997,PETCT_193dea6ac7,1.3.6.1.4.1.14519.5.2.1.4219.6651.224241512383...,NEGATIVE,191,400,/PETCT_193dea6ac7-1.3.6.1.4.1.14519.5.2.1.4219...,242,400
998,998,PETCT_80ccbdadf9,1.3.6.1.4.1.14519.5.2.1.4219.6651.478619815683...,POSITIVE,181,400,/PETCT_80ccbdadf9-1.3.6.1.4.1.14519.5.2.1.4219...,326,400
999,999,PETCT_3bce0eb7aa,1.3.6.1.4.1.14519.5.2.1.4219.6651.222796109751...,NEGATIVE,181,400,/PETCT_3bce0eb7aa-1.3.6.1.4.1.14519.5.2.1.4219...,303,400
1000,1000,PETCT_3ba0277c0c,1.3.6.1.4.1.14519.5.2.1.4219.6651.146051428537...,POSITIVE,212,400,/PETCT_3ba0277c0c-1.3.6.1.4.1.14519.5.2.1.4219...,308,400


In [57]:
# Number of rows per diagnosis label.
data_df["diagnosis"].value_counts()

diagnosis
POSITIVE    501
NEGATIVE    501
Name: count, dtype: int64

In [58]:
# Keep every column name except the first, which is replaced by "imageId".
col_names = ["imageId"] + data_df.columns.to_list()[1:]
col_names

['imageId',
 'Subject ID',
 'Study UID',
 'diagnosis',
 'sliceNum',
 'totalSlices',
 'filePath',
 'height',
 'width']

In [59]:
# Apply the adjusted column labels and preview the first rows.
data_df = data_df.set_axis(col_names, axis="columns")
data_df.head()

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width
0,0,PETCT_f21755a99b,1.3.6.1.4.1.14519.5.2.1.4219.6651.631135993785...,POSITIVE,196,400,/PETCT_f21755a99b-1.3.6.1.4.1.14519.5.2.1.4219...,284,400
1,1,PETCT_176bdc5388,1.3.6.1.4.1.14519.5.2.1.4219.6651.908703633623...,NEGATIVE,196,400,/PETCT_176bdc5388-1.3.6.1.4.1.14519.5.2.1.4219...,274,400
2,2,PETCT_605369e88d,1.3.6.1.4.1.14519.5.2.1.4219.6651.161111150372...,POSITIVE,207,400,/PETCT_605369e88d-1.3.6.1.4.1.14519.5.2.1.4219...,326,400
3,3,PETCT_846c1af245,1.3.6.1.4.1.14519.5.2.1.4219.6651.158562682786...,NEGATIVE,207,400,/PETCT_846c1af245-1.3.6.1.4.1.14519.5.2.1.4219...,326,400
4,4,PETCT_7ce196485f,1.3.6.1.4.1.14519.5.2.1.4219.6651.226902065774...,POSITIVE,215,400,/PETCT_7ce196485f-1.3.6.1.4.1.14519.5.2.1.4219...,558,400


In [60]:
# The resize target height is chosen from the mean of the recorded heights.
data_df["height"].mean()

350.74251497005986

In [61]:
@dataclass(frozen=True)
class ImageResizeConfig:
    """Target image size, in pixels, used when exported images are resized."""

    # Rounded down from the mean of the `height` column (about 350.74).
    height: int = 350
    # Same value as the `width` column shown in the data preview.
    width: int = 400

In [62]:
def pre_process_and_export_record(row):
    """Resize, normalize and export one image into its subset/class folder.

    The image is read from ``DatasetConfig.DATA_ROOT_TARGET + row["filePath"]``
    and written as a PNG to
    ``<DATA_ROOT_TARGET>/<subset>/<class>/<imageId>.png``, where ``<class>``
    is ``"0"`` when the diagnosis is ``"NEGATIVE"`` and ``"1"`` otherwise.

    Args:
        row: A record (for example a pandas Series produced by
            ``DataFrame.iterrows``) exposing ``filePath``, ``diagnosis``,
            ``subset`` and ``imageId``.

    Raises:
        FileNotFoundError: If the source image cannot be read.
        OSError: If the processed image cannot be written.
    """
    image_path = DatasetConfig.DATA_ROOT_TARGET + row["filePath"]
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of deep inside cv2.resize.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # cv2.resize takes dsize as (width, height). Passing (height, width)
    # produced images whose dimensions were swapped relative to the config.
    img = cv2.resize(img, (ImageResizeConfig.width, ImageResizeConfig.height))
    # Min-max rescale of the pixel values to the 0..255 range.
    img = cv2.normalize(img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

    img_class = "0" if row.diagnosis == "NEGATIVE" else "1"

    destination_dir = f"{DatasetConfig.DATA_ROOT_TARGET}/{row.subset}/{img_class}"
    pathlib.Path(destination_dir).mkdir(parents=True, exist_ok=True)
    destination_path = f"{destination_dir}/{row.imageId}.png"

    # cv2.imwrite returns False when the file could not be saved.
    if not cv2.imwrite(destination_path, img):
        raise OSError(f"Could not write image: {destination_path}")

In [63]:
# Placeholder column; the split cells below fill it with the subset name.
data_df = data_df.assign(subset=None)

In [64]:
# Preview the first five rows, now including the empty `subset` column.
data_df.head(n=5)

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
0,0,PETCT_f21755a99b,1.3.6.1.4.1.14519.5.2.1.4219.6651.631135993785...,POSITIVE,196,400,/PETCT_f21755a99b-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
1,1,PETCT_176bdc5388,1.3.6.1.4.1.14519.5.2.1.4219.6651.908703633623...,NEGATIVE,196,400,/PETCT_176bdc5388-1.3.6.1.4.1.14519.5.2.1.4219...,274,400,
2,2,PETCT_605369e88d,1.3.6.1.4.1.14519.5.2.1.4219.6651.161111150372...,POSITIVE,207,400,/PETCT_605369e88d-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
3,3,PETCT_846c1af245,1.3.6.1.4.1.14519.5.2.1.4219.6651.158562682786...,NEGATIVE,207,400,/PETCT_846c1af245-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
4,4,PETCT_7ce196485f,1.3.6.1.4.1.14519.5.2.1.4219.6651.226902065774...,POSITIVE,215,400,/PETCT_7ce196485f-1.3.6.1.4.1.14519.5.2.1.4219...,558,400,


In [65]:
# Stratified training pool: draw 80% of each diagnosis class with the same
# seed, then shuffle the combined rows.
# NOTE(review): rows are sampled individually; if one Subject ID can appear
# in several rows, the same subject may end up in more than one subset.
# TODO confirm whether a subject-level split is needed.
train_df = pd.concat(
    [
        data_df[data_df["diagnosis"] == label].sample(
            frac=0.8, random_state=DatasetConfig.SEED_VALUE
        )
        for label in ("POSITIVE", "NEGATIVE")
    ]
).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
train_df

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
960,960,PETCT_ed9fa4eff1,1.3.6.1.4.1.14519.5.2.1.4219.6651.250121439180...,POSITIVE,165,400,/PETCT_ed9fa4eff1-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
576,576,PETCT_1472967bef,1.3.6.1.4.1.14519.5.2.1.4219.6651.345018069062...,POSITIVE,193,400,/PETCT_1472967bef-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
430,430,PETCT_32aa845af1,1.3.6.1.4.1.14519.5.2.1.4219.6651.111627576176...,POSITIVE,214,400,/PETCT_32aa845af1-1.3.6.1.4.1.14519.5.2.1.4219...,624,400,
599,599,PETCT_d3f13dff4b,1.3.6.1.4.1.14519.5.2.1.4219.6651.225424126913...,NEGATIVE,210,400,/PETCT_d3f13dff4b-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
894,894,PETCT_5255c79083,1.3.6.1.4.1.14519.5.2.1.4219.6651.309952339235...,POSITIVE,193,400,/PETCT_5255c79083-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
...,...,...,...,...,...,...,...,...,...,...
267,267,PETCT_e344879c2e,1.3.6.1.4.1.14519.5.2.1.4219.6651.216424922409...,NEGATIVE,163,400,/PETCT_e344879c2e-1.3.6.1.4.1.14519.5.2.1.4219...,318,400,
707,707,PETCT_b0e002e974,1.3.6.1.4.1.14519.5.2.1.4219.6651.222462861073...,NEGATIVE,201,400,/PETCT_b0e002e974-1.3.6.1.4.1.14519.5.2.1.4219...,368,400,
974,974,PETCT_9c9a347388,1.3.6.1.4.1.14519.5.2.1.4219.6651.877463610141...,POSITIVE,194,400,/PETCT_9c9a347388-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
830,830,PETCT_fde79b6aa9,1.3.6.1.4.1.14519.5.2.1.4219.6651.960887139192...,POSITIVE,202,400,/PETCT_fde79b6aa9-1.3.6.1.4.1.14519.5.2.1.4219...,577,400,


In [66]:
# Every row that was not drawn into the training pool becomes the test set,
# shuffled with the shared seed.
test_df = data_df.drop(index=train_df.index).sample(
    frac=1, random_state=DatasetConfig.SEED_VALUE
)
test_df

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
424,424,PETCT_581fa95eb0,1.3.6.1.4.1.14519.5.2.1.4219.6651.417489576678...,POSITIVE,180,400,/PETCT_581fa95eb0-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
277,277,PETCT_1a90052cb2,1.3.6.1.4.1.14519.5.2.1.4219.6651.326921868047...,NEGATIVE,180,400,/PETCT_1a90052cb2-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
880,880,PETCT_23ed525e82,1.3.6.1.4.1.14519.5.2.1.4219.6651.319700747490...,POSITIVE,209,400,/PETCT_23ed525e82-1.3.6.1.4.1.14519.5.2.1.4219...,577,400,
978,978,PETCT_ac11b344b6,1.3.6.1.4.1.14519.5.2.1.4219.6651.185972599218...,POSITIVE,154,400,/PETCT_ac11b344b6-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
294,294,PETCT_dac5cd2a4d,1.3.6.1.4.1.14519.5.2.1.4219.6651.115036314520...,POSITIVE,198,400,/PETCT_dac5cd2a4d-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
...,...,...,...,...,...,...,...,...,...,...
283,283,PETCT_1cb2d26a19,1.3.6.1.4.1.14519.5.2.1.4219.6651.167566570259...,NEGATIVE,188,400,/PETCT_1cb2d26a19-1.3.6.1.4.1.14519.5.2.1.4219...,330,400,
352,352,PETCT_b6a3c72db6,1.3.6.1.4.1.14519.5.2.1.4219.6651.332328809191...,POSITIVE,173,400,/PETCT_b6a3c72db6-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
730,730,PETCT_a4cd2b10de,1.3.6.1.4.1.14519.5.2.1.4219.6651.162317265485...,POSITIVE,167,400,/PETCT_a4cd2b10de-1.3.6.1.4.1.14519.5.2.1.4219...,368,400,
861,861,PETCT_c2ffda4725,1.3.6.1.4.1.14519.5.2.1.4219.6651.149257052777...,NEGATIVE,198,400,/PETCT_c2ffda4725-1.3.6.1.4.1.14519.5.2.1.4219...,242,400,


In [67]:
# Validation set: 10% of each diagnosis class of the training pool, drawn
# with the shared seed and then shuffled.
val_df = pd.concat(
    [
        train_df[train_df["diagnosis"] == label].sample(
            frac=0.1, random_state=DatasetConfig.SEED_VALUE
        )
        for label in ("POSITIVE", "NEGATIVE")
    ]
).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
val_df

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
233,233,PETCT_e5d50c5569,1.3.6.1.4.1.14519.5.2.1.4219.6651.219869499045...,NEGATIVE,217,400,/PETCT_e5d50c5569-1.3.6.1.4.1.14519.5.2.1.4219...,338,400,
976,976,PETCT_790246c76c,1.3.6.1.4.1.14519.5.2.1.4219.6651.124180203357...,POSITIVE,213,400,/PETCT_790246c76c-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
317,317,PETCT_a1d93ebc74,1.3.6.1.4.1.14519.5.2.1.4219.6651.864336233005...,NEGATIVE,178,400,/PETCT_a1d93ebc74-1.3.6.1.4.1.14519.5.2.1.4219...,535,400,
469,469,PETCT_ca9570a0eb,1.3.6.1.4.1.14519.5.2.1.4219.6651.944693887076...,NEGATIVE,200,400,/PETCT_ca9570a0eb-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
474,474,PETCT_a3df01d3a3,1.3.6.1.4.1.14519.5.2.1.4219.6651.255296228951...,POSITIVE,194,400,/PETCT_a3df01d3a3-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
...,...,...,...,...,...,...,...,...,...,...
866,866,PETCT_ca47fe5e7d,1.3.6.1.4.1.14519.5.2.1.4219.6651.184188057439...,POSITIVE,221,400,/PETCT_ca47fe5e7d-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
951,951,PETCT_4a72eeb991,1.3.6.1.4.1.14519.5.2.1.4219.6651.252474349422...,NEGATIVE,176,400,/PETCT_4a72eeb991-1.3.6.1.4.1.14519.5.2.1.4219...,536,400,
208,208,PETCT_802f19931c,1.3.6.1.4.1.14519.5.2.1.4219.6651.415089458243...,POSITIVE,189,400,/PETCT_802f19931c-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
260,260,PETCT_b510436d83,1.3.6.1.4.1.14519.5.2.1.4219.6651.318475350317...,POSITIVE,222,400,/PETCT_b510436d83-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,


In [68]:
# Remove the validation rows from the training pool so the two do not overlap.
train_df = train_df.drop(index=val_df.index)
train_df

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
960,960,PETCT_ed9fa4eff1,1.3.6.1.4.1.14519.5.2.1.4219.6651.250121439180...,POSITIVE,165,400,/PETCT_ed9fa4eff1-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
576,576,PETCT_1472967bef,1.3.6.1.4.1.14519.5.2.1.4219.6651.345018069062...,POSITIVE,193,400,/PETCT_1472967bef-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
430,430,PETCT_32aa845af1,1.3.6.1.4.1.14519.5.2.1.4219.6651.111627576176...,POSITIVE,214,400,/PETCT_32aa845af1-1.3.6.1.4.1.14519.5.2.1.4219...,624,400,
599,599,PETCT_d3f13dff4b,1.3.6.1.4.1.14519.5.2.1.4219.6651.225424126913...,NEGATIVE,210,400,/PETCT_d3f13dff4b-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
894,894,PETCT_5255c79083,1.3.6.1.4.1.14519.5.2.1.4219.6651.309952339235...,POSITIVE,193,400,/PETCT_5255c79083-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,
...,...,...,...,...,...,...,...,...,...,...
832,832,PETCT_e77c5fca12,1.3.6.1.4.1.14519.5.2.1.4219.6651.290078519905...,POSITIVE,168,400,/PETCT_e77c5fca12-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
883,883,PETCT_63464433c8,1.3.6.1.4.1.14519.5.2.1.4219.6651.988752465833...,NEGATIVE,187,400,/PETCT_63464433c8-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,
267,267,PETCT_e344879c2e,1.3.6.1.4.1.14519.5.2.1.4219.6651.216424922409...,NEGATIVE,163,400,/PETCT_e344879c2e-1.3.6.1.4.1.14519.5.2.1.4219...,318,400,
974,974,PETCT_9c9a347388,1.3.6.1.4.1.14519.5.2.1.4219.6651.877463610141...,POSITIVE,194,400,/PETCT_9c9a347388-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,


In [69]:
# Class balance of the final training set.
train_df["diagnosis"].value_counts()

diagnosis
POSITIVE    361
NEGATIVE    361
Name: count, dtype: int64

In [70]:
# Class balance of the validation set.
val_df["diagnosis"].value_counts()

diagnosis
NEGATIVE    40
POSITIVE    40
Name: count, dtype: int64

In [71]:
# Class balance of the test set.
test_df["diagnosis"].value_counts()

diagnosis
POSITIVE    100
NEGATIVE    100
Name: count, dtype: int64

In [72]:
# Tag every row with the subset it belongs to; the value is used as the
# sub-directory name when the images are exported.
# Bracket assignment is used on purpose: attribute-style assignment
# (`df.subset = ...`) only fills a column when that column already exists;
# otherwise it sets a plain Python attribute and no column is created.
train_df["subset"] = "Train"
val_df["subset"] = "Valid"
test_df["subset"] = "Test"

In [73]:
# Recombine the three labelled subsets into one table and shuffle the rows
# with the shared seed.
full_data = pd.concat(
    (train_df, val_df, test_df)
).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
full_data

Unnamed: 0,imageId,Subject ID,Study UID,diagnosis,sliceNum,totalSlices,filePath,height,width,subset
727,727,PETCT_b66ba83594,1.3.6.1.4.1.14519.5.2.1.4219.6651.242835223533...,NEGATIVE,212,400,/PETCT_b66ba83594-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,Train
940,940,PETCT_63dd9503eb,1.3.6.1.4.1.14519.5.2.1.4219.6651.316550527203...,POSITIVE,201,400,/PETCT_63dd9503eb-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,Test
254,254,PETCT_2e44706eaf,1.3.6.1.4.1.14519.5.2.1.4219.6651.217378964956...,POSITIVE,162,400,/PETCT_2e44706eaf-1.3.6.1.4.1.14519.5.2.1.4219...,493,400,Test
678,678,PETCT_17d334cb6c,1.3.6.1.4.1.14519.5.2.1.4219.6651.143961978386...,POSITIVE,191,400,/PETCT_17d334cb6c-1.3.6.1.4.1.14519.5.2.1.4219...,326,400,Train
247,247,PETCT_b663adb148,1.3.6.1.4.1.14519.5.2.1.4219.6651.316934660962...,NEGATIVE,202,400,/PETCT_b663adb148-1.3.6.1.4.1.14519.5.2.1.4219...,589,400,Train
...,...,...,...,...,...,...,...,...,...,...
180,180,PETCT_2202a936e0,1.3.6.1.4.1.14519.5.2.1.4219.6651.248146545971...,POSITIVE,194,400,/PETCT_2202a936e0-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,Train
154,154,PETCT_f9c4d4f9ab,1.3.6.1.4.1.14519.5.2.1.4219.6651.145948856856...,POSITIVE,174,400,/PETCT_f9c4d4f9ab-1.3.6.1.4.1.14519.5.2.1.4219...,397,400,Test
259,259,PETCT_562294be56,1.3.6.1.4.1.14519.5.2.1.4219.6651.222057902209...,NEGATIVE,155,400,/PETCT_562294be56-1.3.6.1.4.1.14519.5.2.1.4219...,493,400,Test
756,756,PETCT_963a71819a,1.3.6.1.4.1.14519.5.2.1.4219.6651.718616482864...,POSITIVE,191,400,/PETCT_963a71819a-1.3.6.1.4.1.14519.5.2.1.4219...,284,400,Test


In [74]:
from tqdm import tqdm

In [75]:
# Export every record; `total` lets tqdm render a complete progress bar
# because `iterrows()` is a generator without a length.
for index, row in tqdm(full_data.iterrows(), total=full_data.shape[0]):
    pre_process_and_export_record(row)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1002/1002 [00:03<00:00, 293.60it/s]
