In [8]:
from aidream_data.constants import DIR_AIDREAM_ON_NAS
from aidream_data.core import AidreamDatabase

from pathlib import Path
import pandas as pd


In [9]:
%%time
# connect to the AIdream Database : 
aidream_db = AidreamDatabase()
aidream_db.connect()


CPU times: user 203 ms, sys: 45 ms, total: 248 ms
Wall time: 9.04 s


In [10]:
%%time
# Load the cohort info dataframe : 
df_cohort_info = aidream_db.get_cohort_info()
# Load the clinical data dataframe :
df_clinical_data = aidream_db.get_clinical_data()


CPU times: user 17 µs, sys: 3 µs, total: 20 µs
Wall time: 22.2 µs


In [11]:
# Select only patients with perfusion data :
df_cohort_info = df_cohort_info.loc[df_cohort_info['']['Perfusion?'] == 'oui']
list_extended_patients = df_cohort_info['patient id']['patient id'].values.tolist()
df_clinical_data = df_clinical_data.loc[df_clinical_data['id_aidream'].isin(list_extended_patients)]

print(f"Number of patients with perfusion data : {len(list_extended_patients)}")


Number of patients with perfusion data : 234


In [12]:
# Dataframe of extended patients containing the path to the patient's perfusion data :

df_extended_patients = df_clinical_data[["id_aidream", "id_cercare"]].copy()
df_extended_patients["tmp"] = df_extended_patients["id_aidream"].apply(lambda x: int(x.lstrip("AIDREAM_")))
df_extended_patients = df_extended_patients.sort_values(by="tmp").drop(columns=["tmp"])

def get_perfusion_directory(cercare_id):
    
    if cercare_id.startswith("MMI-PROB"):
        
        dir_perfusion = DIR_AIDREAM_ON_NAS / "Cercare_perfusion" / "MMI-PROB" / cercare_id / "PERFUSION"
        if dir_perfusion.exists():
            return str(dir_perfusion)
        else:
            return "Not found"
        
    elif cercare_id.startswith("AIDREAM"):
        
        list_perfusion_directories = list((DIR_AIDREAM_ON_NAS / "Cercare_perfusion" / "AIDREAM" / cercare_id).glob("*/AX PERFUSION GRE GADO"))
        
        if len(list_perfusion_directories) != 1:
            return "Not found"
        else:
            return str(list_perfusion_directories[0])
        
df_extended_patients["Perfusion directory"] = df_extended_patients["id_cercare"].apply(get_perfusion_directory)
df_extended_patients.to_csv("extended_patients.csv", index=False)


In [13]:
set(df_extended_patients["id_cercare"].loc[df_extended_patients["id_cercare"].str.startswith("MMI-PROB")])

{'MMI-PROB_001',
 'MMI-PROB_002',
 'MMI-PROB_021',
 'MMI-PROB_024',
 'MMI-PROB_025',
 'MMI-PROB_027',
 'MMI-PROB_028',
 'MMI-PROB_029',
 'MMI-PROB_030',
 'MMI-PROB_031',
 'MMI-PROB_032',
 'MMI-PROB_033',
 'MMI-PROB_035',
 'MMI-PROB_036',
 'MMI-PROB_041',
 'MMI-PROB_042',
 'MMI-PROB_043',
 'MMI-PROB_046',
 'MMI-PROB_047',
 'MMI-PROB_048',
 'MMI-PROB_049',
 'MMI-PROB_050',
 'MMI-PROB_051',
 'MMI-PROB_052',
 'MMI-PROB_053',
 'MMI-PROB_054',
 'MMI-PROB_057',
 'MMI-PROB_058',
 'MMI-PROB_059',
 'MMI-PROB_060',
 'MMI-PROB_061',
 'MMI-PROB_062',
 'MMI-PROB_063',
 'MMI-PROB_064',
 'MMI-PROB_065',
 'MMI-PROB_066',
 'MMI-PROB_067',
 'MMI-PROB_069',
 'MMI-PROB_070',
 'MMI-PROB_071',
 'MMI-PROB_072',
 'MMI-PROB_073',
 'MMI-PROB_075',
 'MMI-PROB_076',
 'MMI-PROB_077',
 'MMI-PROB_078',
 'MMI-PROB_079',
 'MMI-PROB_080',
 'MMI-PROB_085',
 'MMI-PROB_087',
 'MMI-PROB_088',
 'MMI-PROB_089',
 'MMI-PROB_094',
 'MMI-PROB_095',
 'MMI-PROB_096',
 'MMI-PROB_097',
 'MMI-PROB_098',
 'MMI-PROB_099',
 'MMI-PROB_100

In [15]:
{p.parent.stem for p in (DIR_AIDREAM_ON_NAS / 'Cercare_perfusion' / "MMI-PROB").glob("*/PERFUSION")} - set(df_extended_patients["id_cercare"].loc[df_extended_patients["id_cercare"].str.startswith("MMI-PROB")])


{'MMI-PROB_003',
 'MMI-PROB_004',
 'MMI-PROB_008',
 'MMI-PROB_010',
 'MMI-PROB_011',
 'MMI-PROB_012',
 'MMI-PROB_013',
 'MMI-PROB_014',
 'MMI-PROB_015',
 'MMI-PROB_016',
 'MMI-PROB_017',
 'MMI-PROB_019',
 'MMI-PROB_020',
 'MMI-PROB_022',
 'MMI-PROB_023',
 'MMI-PROB_039',
 'MMI-PROB_040',
 'MMI-PROB_045',
 'MMI-PROB_055',
 'MMI-PROB_056',
 'MMI-PROB_084',
 'MMI-PROB_091',
 'MMI-PROB_110',
 'MMI-PROB_121',
 'MMI-PROB_142',
 'MMI-PROB_158',
 'MMI-PROB_167',
 'MMI-PROB_168',
 'MMI-PROB_176',
 'MMI-PROB_205',
 'MMI-PROB_206',
 'MMI-PROB_216',
 'MMI-PROB_227',
 'MMI-PROB_237',
 'MMI-PROB_243',
 'MMI-PROB_246'}

In [16]:
list_weird_patients = {
    'MMI-PROB_003', 'MMI-PROB_004', 'MMI-PROB_008', 'MMI-PROB_010', 'MMI-PROB_011',
    'MMI-PROB_012', 'MMI-PROB_013', 'MMI-PROB_014', 'MMI-PROB_015', 'MMI-PROB_016',
    'MMI-PROB_017', 'MMI-PROB_019', 'MMI-PROB_020', 'MMI-PROB_022', 'MMI-PROB_023',
    'MMI-PROB_039', 'MMI-PROB_040', 'MMI-PROB_045', 'MMI-PROB_055', 'MMI-PROB_056',
    'MMI-PROB_084', 'MMI-PROB_091', 'MMI-PROB_110', 'MMI-PROB_121', 'MMI-PROB_142',
    'MMI-PROB_158', 'MMI-PROB_167', 'MMI-PROB_168', 'MMI-PROB_176', 'MMI-PROB_205',
    'MMI-PROB_206', 'MMI-PROB_216', 'MMI-PROB_227', 'MMI-PROB_237', 'MMI-PROB_243',
    'MMI-PROB_246'}

In [17]:
df_clinical_data.loc[df_clinical_data["id_cercare"].isin(list_weird_patients)]

Unnamed: 0,id_aidream,Centre,Cohorte,id_cercare,IPP,AIDREAM_ID_300pat,avec_perf,IRM_preRT_ax,IRM_rechute_ax,Reperes_preRT,...,recidive_distance,recidive_distance_date,recidive_infield_distant,CT_date,Dosi_date,IRM__preRT_date,IRM__rechute_date,delai_IRMrechute_IRMpreRT_mois,delai_irmrechute_RTj1_jours,PTV_volume_total_cm3


In [18]:
# Select relevant columns and rename them :

old_cols = [('patient id', 'patient id'), ('', 'Perfusion?'),
                ('pre-RT', 'Autoseg?'), ('pre-RT', 'Label1'), ('pre-RT', 'Label3'), ('pre-RT', 'Obs2_preRT_CS'), ( 'pre-RT', 'Obs3_preRT_CV'),
                ('Relapse', 'Autoseg?'), ('Relapse', 'Label1'), ('Relapse', 'Label3'), ('Relapse', 'OBS2_relapse_CS'), ('Relapse', 'OBS3_relapse_CV')]

new_cols = ["patient id", "Perfusion?","Pre-RT Autoseg?", "Pre-RT Label1", "Pre-RT Label3", "Pre-RT Obs2_preRT_CS", "Pre-RT Obs3_preRT_CV",
            "Relapse Autoseg?", "Relapse Label1", "Relapse Label3", "Relapse OBS2_relapse_CS", "Relapse OBS3_relapse_CV"]

df_cohort_info = pd.DataFrame(df_cohort_info[old_cols].values, columns=new_cols)


In [19]:
df_cohort_info['Pre-RT Autoseg?'].value_counts()

Pre-RT Autoseg?
AC_repATLAS                                  148
GE_repNATIV                                   44
GE_repNativAXIALISEDnew_surAppliGE            25
GE_repNativnew_surAppliGE                     15
Manuel/GE_repNativnew_surAppliGE               1
Manuel/GE_repNativAXIALISEDnew_surAppliGE      1
Name: count, dtype: int64

In [21]:
list_extended_patients = df_cohort_info['patient id'].values.tolist()


In [22]:
df_cohort_info.loc[(df_cohort_info['Pre-RT Label1'] == "")]

Unnamed: 0,patient id,Perfusion?,Pre-RT Autoseg?,Pre-RT Label1,Pre-RT Label3,Pre-RT Obs2_preRT_CS,Pre-RT Obs3_preRT_CV,Relapse Autoseg?,Relapse Label1,Relapse Label3,Relapse OBS2_relapse_CS,Relapse OBS3_relapse_CV
31,AIDREAM_131,oui,GE_repNativAXIALISEDnew_surAppliGE,,1.0,,L3 prêt sur le new autoseg GE,GE_repNativAXIALISEDnew_surAppliGE,1.0,1,,
153,AIDREAM_365,oui,GE_repNativAXIALISEDnew_surAppliGE,,,,,GE_repNativAXIALISEDnew_surAppliGE,,1,,L3 prêt sur le new autoseg GE
155,AIDREAM_367,oui,GE_repNativAXIALISEDnew_surAppliGE,,0.0,,à valider les images,GE_repNativAXIALISEDnew_surAppliGE,,1,,L3 prêt sur le new autoseg GE


In [9]:
df_cohort_info = df_cohort_info.loc[
    (df_cohort_info['Pre-RT Label1'] != "") &
    (df_cohort_info['Pre-RT Label3'] != "") &
    (df_cohort_info['Pre-RT Obs2_preRT_CS'] != "") &
    # (df_cohort_info['Pre-RT Obs3_preRT_CV'] != "") &   
    (df_cohort_info['Relapse Label3'] != "")  &
    (df_cohort_info['Relapse OBS2_relapse_CS'] != "") 
    # (df_cohort_info['Relapse OBS3_relapse_CV'] != "")
]

In [10]:
df_cohort_info.columns

Index(['patient id', 'Perfusion?', 'Pre-RT Autoseg?', 'Pre-RT Label1',
       'Pre-RT Label3', 'Pre-RT Obs2_preRT_CS', 'Pre-RT Obs3_preRT_CV',
       'Relapse Autoseg?', 'Relapse Label1', 'Relapse Label3',
       'Relapse OBS2_relapse_CS', 'Relapse OBS3_relapse_CV'],
      dtype='object')

In [11]:
# Check the annotation referential of the patients at pre-RT: 
df_cohort_info['Pre-RT Autoseg?'].value_counts()


Pre-RT Autoseg?
AC_repATLAS    55
GE_repNATIV     1
Name: count, dtype: int64

In [9]:
list_extended_patients

['AIDREAM_1',
 'AIDREAM_10',
 'AIDREAM_100',
 'AIDREAM_101',
 'AIDREAM_102',
 'AIDREAM_103',
 'AIDREAM_104',
 'AIDREAM_106',
 'AIDREAM_109',
 'AIDREAM_11',
 'AIDREAM_110',
 'AIDREAM_112',
 'AIDREAM_113',
 'AIDREAM_114',
 'AIDREAM_115',
 'AIDREAM_116',
 'AIDREAM_117',
 'AIDREAM_118',
 'AIDREAM_12',
 'AIDREAM_120',
 'AIDREAM_121',
 'AIDREAM_122',
 'AIDREAM_123',
 'AIDREAM_124',
 'AIDREAM_125',
 'AIDREAM_126',
 'AIDREAM_127',
 'AIDREAM_128',
 'AIDREAM_129',
 'AIDREAM_13',
 'AIDREAM_130',
 'AIDREAM_131',
 'AIDREAM_132',
 'AIDREAM_133',
 'AIDREAM_135',
 'AIDREAM_136',
 'AIDREAM_137',
 'AIDREAM_139',
 'AIDREAM_14',
 'AIDREAM_140',
 'AIDREAM_141',
 'AIDREAM_142',
 'AIDREAM_143',
 'AIDREAM_144',
 'AIDREAM_145',
 'AIDREAM_146',
 'AIDREAM_147',
 'AIDREAM_148',
 'AIDREAM_149',
 'AIDREAM_15',
 'AIDREAM_151',
 'AIDREAM_152',
 'AIDREAM_153',
 'AIDREAM_16',
 'AIDREAM_17',
 'AIDREAM_18',
 'AIDREAM_19',
 'AIDREAM_2',
 'AIDREAM_20',
 'AIDREAM_200',
 'AIDREAM_201',
 'AIDREAM_202',
 'AIDREAM_203',
 'AIDRE

In [23]:
df_ref = pd.read_excel(DIR_AIDREAM_ON_NAS / "AIDREAM_All" / "AIDREAM_DonneesCliniques_975Patients_v.31082023.xlsx")
df_ref.rename(columns={'New_ID_AIDREAM':'id_aidream'}, inplace=True)
df_ref = df_ref[['id_aidream', 'ID_local']].loc[df_ref['id_aidream'].isin(list_extended_patients)]


In [32]:
df_available_data = pd.read_csv("extended_patients.csv")
df_ref = df_ref.merge(df_available_data, on='id_aidream', how="left")
df_ref.to_csv("extended_patients.csv", index=False)

In [24]:
df_ref = pd.read_csv("extended_patients.csv")

In [25]:
df_missing_patients_36 = df_ref.loc[df_ref["Perfusion directory"] == "Not found"].reset_index(drop=True)
df_missing_patients_36.rename(columns={"ID_local": "Patient ID"}, inplace=True)
df_missing_patients_36[["Patient ID", "id_aidream"]].to_csv("missing_patients_36.csv", index=False)

KeyError: "['Patient ID'] not in index"

In [26]:
df_ref

Unnamed: 0,id_aidream,id_cercare,Perfusion directory
0,AIDREAM_1,MMI-PROB_140,/run/user/1001/gvfs/smb-share:server=31.10.11....
1,AIDREAM_2,MMI-PROB_209,/run/user/1001/gvfs/smb-share:server=31.10.11....
2,AIDREAM_3,MMI-PROB_180,/run/user/1001/gvfs/smb-share:server=31.10.11....
3,AIDREAM_4,MMI-PROB_069,/run/user/1001/gvfs/smb-share:server=31.10.11....
4,AIDREAM_5,MMI-PROB_138,Not found
...,...,...,...
229,AIDREAM_393,AIDREAM_393,/run/user/1001/gvfs/smb-share:server=31.10.11....
230,AIDREAM_394,AIDREAM_394,/run/user/1001/gvfs/smb-share:server=31.10.11....
231,AIDREAM_397,AIDREAM_397,Not found
232,AIDREAM_399,AIDREAM_399,/run/user/1001/gvfs/smb-share:server=31.10.11....


In [42]:
df_missing_patients_36[["Patient ID", "id_aidream"]].to_csv("missing_patients_36.csv", index=False)

In [1]:
df_ref

NameError: name 'df_ref' is not defined

In [27]:
df_links = pd.read_csv("extended_patients.csv")

In [28]:
from tqdm.auto import tqdm
import shutil

In [29]:
_list_patients = df_links["id_aidream"].loc[df_links["Perfusion directory"] != "Not found"]

for patient in tqdm(_list_patients):
    
    dir_perfusion = Path(df_links.loc[df_links["id_aidream"] == patient, "Perfusion directory"].values[0])
    
    dir_dst = Path("/media/maichi/SSD-IGR/Perfusion data") / patient
    dir_dst.mkdir(parents=True, exist_ok=True)
    
    shutil.copytree(src=dir_perfusion, dst=dir_dst, dirs_exist_ok=True)
    

  0%|          | 0/198 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [30]:
import concurrent.futures

def copy_file(patient):
    dir_perfusion = Path(df_links.loc[df_links["id_aidream"] == patient, "Perfusion directory"].values[0])
    dir_dst = Path("/media/maichi/SSD-IGR/Perfusion data") / patient
    dir_dst.mkdir(parents=True, exist_ok=True)
    shutil.copytree(src=dir_perfusion, dst=dir_dst, dirs_exist_ok=True)

_list_patients = df_links["id_aidream"].loc[df_links["Perfusion directory"] != "Not found"]

# Use a ThreadPoolExecutor to copy files in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    list(executor.map(copy_file, _list_patients))
    


KeyboardInterrupt

