In [5]:
from pathlib import Path
import re
import pandas as pd
from collections import Counter
from tqdm import tqdm
import numpy as np

In [6]:
data_folder_path = Path("/home/pcallec/Mouse_data_reprezentative_zip")

In [7]:
l_image_path = list(data_folder_path.rglob("*.png"))

In [8]:
l_image_path[:-5]

[PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/167-CON_0002_ModePolarization3D_Total_Intensity_0548.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/154-CON_0002_ModePolarization3D_Total_Intensity_0410.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/154-CON_0002_ModePolarization3D_Total_Intensity_0578.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/176-CON_0002_ModePolarization3D_Total_Intensity_0370.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/168_Control_0003_ModePolarization3D_Total_Intensity_0426.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/168_Control_0003_ModePolarization3D_Total_Intensity_0563.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Images_Mouse_CON/195-CON_0003_ModePolarization3D_Total_Intensity_0665.png'),
 PosixPath('/home/pcallec/Mouse_data_reprezentative_zip/Image

In [9]:


# filename,fold_name,relative_path,absolute_filepath,type
df_data = pd.DataFrame()
for image_path in tqdm(l_image_path):
    filename = image_path.name
    filename_stem = image_path.stem
    
    split_text = re.split(r'[-_]', filename)
    # e.g. ['167', 'CON', '0002', 'ModePolarization3D', 'Total', 'Intensity', '0548']
    
    row_dict = pd.DataFrame({
        "filename":[filename],
        "absolute_filepath": [image_path],
        "mouse_id": [int(split_text[0])],
        "category": [split_text[1]],
        "tumor_id": [int(split_text[2])],
    })
    
    df_data = pd.concat([df_data, row_dict], ignore_index=True)

100%|██████████| 28000/28000 [00:17<00:00, 1590.11it/s]


In [10]:
df_data.head()

Unnamed: 0,filename,absolute_filepath,mouse_id,category,tumor_id
0,167-CON_0002_ModePolarization3D_Total_Intensit...,/home/pcallec/Mouse_data_reprezentative_zip/Im...,167,CON,2
1,154-CON_0002_ModePolarization3D_Total_Intensit...,/home/pcallec/Mouse_data_reprezentative_zip/Im...,154,CON,2
2,154-CON_0002_ModePolarization3D_Total_Intensit...,/home/pcallec/Mouse_data_reprezentative_zip/Im...,154,CON,2
3,176-CON_0002_ModePolarization3D_Total_Intensit...,/home/pcallec/Mouse_data_reprezentative_zip/Im...,176,CON,2
4,168_Control_0003_ModePolarization3D_Total_Inte...,/home/pcallec/Mouse_data_reprezentative_zip/Im...,168,Control,3


In [11]:
Counter(df_data["category"])

Counter({'CIS': 7800, 'COMB': 7500, 'MBZ': 5700, 'CON': 4738, 'Control': 2262})

In [12]:
df_data = df_data.replace("Control", "CON")

In [13]:
l_category = df_data["category"].unique()
l_category

array(['CON', 'COMB', 'CIS', 'MBZ'], dtype=object)

In [14]:
df_data["mouse_id"].unique()

array([167, 154, 176, 168, 195, 156, 185, 171, 188, 138, 162, 130, 172,
       179, 145, 128, 164, 180, 147, 198, 158, 135, 159, 178, 129, 169,
       181, 143, 196, 144, 136, 165, 186, 157])

In [15]:
for category in l_category:
    
    l_mouse_id = df_data.query("category==@category")["mouse_id"].unique()
    for mouse_id in l_mouse_id:
        print(mouse_id, 
              df_data.query("category==@category and mouse_id==@mouse_id")["mouse_id"].count())

167 1048
154 554
176 416
168 1110
195 832
156 1152
185 840
171 424
188 624
138 1234
162 495
130 1117
172 747
179 1175
145 1629
128 603
186 500
164 820
180 730
147 1200
198 970
158 1160
135 800
159 440
178 880
169 800
129 750
181 800
143 1100
196 1000
144 500
136 500
165 550
157 500


In [16]:
df_data_stats = df_data.groupby(["category", "mouse_id", "tumor_id"]).size().reset_index(name='count')
df_data_stats

Unnamed: 0,category,mouse_id,tumor_id,count
0,CIS,135,1,400
1,CIS,135,2,400
2,CIS,147,1,400
3,CIS,147,2,400
4,CIS,147,3,400
...,...,...,...,...
83,MBZ,181,2,250
84,MBZ,181,3,300
85,MBZ,196,1,350
86,MBZ,196,2,350


In [17]:
df_data_stats.to_csv("/home/pcallec/analyze_images/results/mouse_data/mouse_properties_stats.csv", index=False)

In [18]:
df_data.groupby(["category", "mouse_id", "tumor_id"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,filename,absolute_filepath
category,mouse_id,tumor_id,Unnamed: 3_level_1,Unnamed: 4_level_1
CIS,135,1,400,400
CIS,135,2,400,400
CIS,147,1,400,400
CIS,147,2,400,400
CIS,147,3,400,400
...,...,...,...,...
MBZ,181,2,250,250
MBZ,181,3,300,300
MBZ,196,1,350,350
MBZ,196,2,350,350


In [19]:
df_data["tumor_id"].nunique()

5

In [20]:
df_data.to_csv("/home/pcallec/analyze_images/results/mouse_data/mouse_properties.csv", index=False)

In [21]:
# Partition data into train, val, and test

df_data_stats

Unnamed: 0,category,mouse_id,tumor_id,count
0,CIS,135,1,400
1,CIS,135,2,400
2,CIS,147,1,400
3,CIS,147,2,400
4,CIS,147,3,400
...,...,...,...,...
83,MBZ,181,2,250
84,MBZ,181,3,300
85,MBZ,196,1,350
86,MBZ,196,2,350


# CON vs COMB

In [27]:
def create_metadata(
    l_category: list,
    df_data: pd.DataFrame,
    df_data_stats: pd.DataFrame,
    output_path: Path,
    random_seed: int = 42):

    np.random.seed(random_seed)

    partition_dict ={} 

    for category in l_category:
        partition_dict[category] = {}
        l_mouse_id = df_data_stats.query("category==@category")["mouse_id"].unique()
        a_mouse_id = np.array(l_mouse_id)
        np.random.shuffle(a_mouse_id)
        print(category, "mouse_id", a_mouse_id)
        # Compute split indices (60%, 20%, 20%)
        n_total = len(a_mouse_id)
        split1 = int(0.6 * n_total)  # First 60%
        split2 = int(0.8 * n_total)  # Next 20%

        # Split the data into three groups
        partition_dict[category]["training"] = a_mouse_id[:split1]  # 60%
        partition_dict[category]["validation"] = a_mouse_id[split1:split2]  # 20%
        partition_dict[category]["test"] = a_mouse_id[split2:]  # 20%
        
    print(partition_dict)

    for category in partition_dict.keys():
        for partition in partition_dict[category].keys():
            print(category,
                partition,
                df_data.query("category==@category and mouse_id in @partition_dict[@category][@partition]")["filename"].count())
            
    # Create a new column 'fold_name' and initialize with None
    df_data['fold_name'] = None
    # Assign partition category based on dictionary mapping
    for category_key, partitions in partition_dict.items():
        for partition_name, mouse_ids in partitions.items():
            df_data.loc[(df_data['category'] == category_key) & (df_data['mouse_id'].isin(mouse_ids)), 'fold_name'] = partition_name
            
    df_filtered = df_data.dropna(subset=['fold_name']).copy()
    category_to_label = {category: label for label, category in enumerate(l_category)}
    df_filtered['label'] = df_filtered['category'].map(category_to_label)
    df_filtered.to_csv(output_path, index=False)

In [31]:
create_metadata(
    l_category=['CON', 'COMB'],
    df_data=df_data,
    df_data_stats=df_data_stats,
    output_path=Path("/home/pcallec/analyze_images/results/mouse_data/input_mouse_metadata_CON_COMB.csv"),
    random_seed=42,
)

CON mouse_id [188 156 176 154 195 167 171 168 185]
COMB mouse_id [162 130 128 145 172 186 179 138]
{'CON': {'training': array([188, 156, 176, 154, 195]), 'validation': array([167, 171]), 'test': array([168, 185])}, 'COMB': {'training': array([162, 130, 128, 145]), 'validation': array([172, 186]), 'test': array([179, 138])}}
CON training 3578
CON validation 1472
CON test 1950
COMB training 3844
COMB validation 1247
COMB test 2409


In [30]:
create_metadata(
    l_category=['CON', 'MBZ'],
    df_data=df_data,
    df_data_stats=df_data_stats,
    output_path=Path("/home/pcallec/analyze_images/results/mouse_data/input_mouse_metadata_CON_MBZ.csv"),
    random_seed=42,
)

CON mouse_id [188 156 176 154 195 167 171 168 185]
MBZ mouse_id [157 136 129 144 165 196 181 143]
{'CON': {'training': array([188, 156, 176, 154, 195]), 'validation': array([167, 171]), 'test': array([168, 185])}, 'MBZ': {'training': array([157, 136, 129, 144]), 'validation': array([165, 196]), 'test': array([181, 143])}}
CON training 3578
CON validation 1472
CON test 1950
MBZ training 2250
MBZ validation 1550
MBZ test 1900
