In [14]:
from pathlib import Path
import re
import pandas as pd
from collections import Counter
from tqdm import tqdm
import numpy as np
from joblib import Parallel, delayed

In [4]:
# Root folder that is searched recursively for image files.
# NOTE(review): machine-specific absolute path — confirm/adjust before running elsewhere.
data_folder_path = Path("/home/pcallec/NACHOS/data/5_kidneys")

In [6]:
# Recursively collect every *.png below data_folder_path into a list.
# NOTE(review): the result is not sorted, so row order follows whatever order
# rglob yields — confirm nothing downstream relies on a stable order.
l_image_path = list(data_folder_path.rglob("*.png"))

In [8]:
# Peek at the first five collected paths.
l_image_path[:5]

[PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/9298_AJBH331-left_tumor.png'),
 PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/3296_AJBH331-left_tumor.png'),
 PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/9457_AJBH331-left_tumor.png'),
 PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/1823_AJBH331-left_tumor.png'),
 PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/3660_AJBH331-left_tumor.png')]

In [10]:
# Show the first collected path.
l_image_path[0]

PosixPath('/home/pcallec/NACHOS/data/5_kidneys/01/tumor_temp/9298_AJBH331-left_tumor.png')

In [11]:
# Last "_"-separated token of the stem of the first path.
l_image_path[0].stem.split("_")[-1]

'tumor'

In [17]:
def process_image_path(image_path: Path) -> dict:
    """Parse the fold name and category out of an image filename.

    The filename stem is split on "_": the last token is taken as the
    category and the second-to-last as the fold name. For example
    ``9298_AJBH331-left_tumor.png`` gives fold ``AJBH331-left`` and
    category ``tumor``.

    Args:
        image_path: Path to an image file.

    Returns:
        A dict with the keys ``filename`` (name with extension),
        ``absolute_filepath`` (the path exactly as it was passed in),
        ``fold_name`` and ``category``.

    Raises:
        ValueError: If the stem has fewer than two "_"-separated tokens,
            so no fold name / category pair can be extracted.
    """
    parts = image_path.stem.split("_")

    # Without this guard a stray file such as "thumbnail.png" fails with an
    # anonymous "list index out of range", which is hard to trace back to the
    # offending file once the work runs inside parallel workers.
    if len(parts) < 2:
        raise ValueError(
            f"Cannot parse fold name and category from filename: {image_path.name!r}"
        )

    fold_name, category = parts[-2:]

    return {
        "filename": image_path.name,
        "absolute_filepath": image_path,
        "fold_name": fold_name,
        "category": category,
    }
    
# Parse all filenames with joblib; n_jobs=-1 requests every available CPU.
# tqdm wraps the input iterable, so the bar advances as tasks are dispatched.
parse_tasks = (delayed(process_image_path)(path) for path in tqdm(l_image_path))
results = Parallel(n_jobs=-1)(parse_tasks)

# One row per image, one column per key of the parsed dicts.
df_data = pd.DataFrame(results)

100%|██████████| 300000/300000 [00:03<00:00, 80763.05it/s] 


In [18]:
# Preview the first rows of the parsed metadata.
df_data.head()

Unnamed: 0,filename,absolute_filepath,fold_name,category
0,9298_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor
1,3296_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor
2,9457_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor
3,1823_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor
4,3660_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor


In [21]:
# Distinct fold names parsed from the filenames.
df_data["fold_name"].unique()

array(['AJBH331-left', 'AILY469-left', 'AIJV450-right', 'AIKS388-left',
       'AILU486-right'], dtype=object)

In [22]:
# Distinct categories parsed from the filenames.
df_data["category"].unique()

array(['tumor', 'pelvis', 'cortex', 'medulla', 'calyx', 'fat'],
      dtype=object)

In [24]:
# The position of a category in this hand-ordered list is its integer label.
l_category = ["cortex", "medulla", "calyx", "fat", "tumor", "pelvis"]
category_to_label = {category: label for label, category in enumerate(l_category)}

# Series.map yields NaN for any value missing from the mapping, which would
# silently turn the label column into floats with holes. Fail loudly instead.
unknown_categories = sorted(set(df_data["category"]) - set(category_to_label))
if unknown_categories:
    raise ValueError(f"Categories without a label: {unknown_categories}")

# Every category is known at this point, so the integer dtype is safe to pin.
df_data["label"] = df_data["category"].map(category_to_label).astype("int64")

# Make sure the results folder exists so the write cannot fail on a fresh checkout.
metadata_csv_path = Path(
    "/home/pcallec/analyze_images/results/renal_carcinoma/renal_carcinoma_metadata.csv"
)
metadata_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_data.to_csv(metadata_csv_path, index=False)

In [26]:
# Preview the first rows again, now including the label column.
df_data.head()

Unnamed: 0,filename,absolute_filepath,fold_name,category,label
0,9298_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor,4
1,3296_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor,4
2,9457_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor,4
3,1823_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor,4
4,3660_AJBH331-left_tumor.png,/home/pcallec/NACHOS/data/5_kidneys/01/tumor_t...,AJBH331-left,tumor,4


In [27]:
# Number of images per category.
Counter(df_data["category"])

Counter({'tumor': 50000,
         'pelvis': 50000,
         'cortex': 50000,
         'medulla': 50000,
         'calyx': 50000,
         'fat': 50000})

In [28]:
# NOTE(review): this rebinds l_category — previously the hand-ordered list used
# to build category_to_label — to an array of the distinct values in the column.
# Its order differs from the label order, so confirm nothing downstream uses
# positions in l_category as labels.
l_category = df_data["category"].unique()
l_category

array(['tumor', 'pelvis', 'cortex', 'medulla', 'calyx', 'fat'],
      dtype=object)

In [29]:
# Count images for every (fold, category) pair, then flatten the grouped
# result back into ordinary columns with the tally stored under "count".
fold_category_groups = df_data.groupby(["fold_name", "category"])
fold_category_sizes = fold_category_groups.size()
df_data_stats = fold_category_sizes.reset_index(name='count')
df_data_stats.head()

Unnamed: 0,fold_name,category,count
0,AIJV450-right,calyx,10000
1,AIJV450-right,cortex,10000
2,AIJV450-right,fat,10000
3,AIJV450-right,medulla,10000
4,AIJV450-right,pelvis,10000


In [30]:
# Persist the per-fold / per-category counts without the index column.
stats_csv_path = "/home/pcallec/analyze_images/results/renal_carcinoma/renal_carcinoma_properties_stats.csv"
df_data_stats.to_csv(stats_csv_path, index=False)