In [2]:
# Résumé des données manquantes par sujet et bundle (avec confounds spécifiques DEP et actimétrie)
import os
import pandas as pd
from os.path import join as opj
from actiDep.set_config import get_HCP_bundle_names
from actiDep.data.loader import Actidep
# Paramètres
DB_ROOT = '/home/ndecaux/NAS_EMPENN/share/projects/actidep/bids'
PIPELINE = 'hcp_association_50pts'  # adapter si besoin
METRIC_COLS = ['FA','MD','RD','AD','IFW','IRF']
CLASSIF_VARS = ['group','apathy']
CORR_VARS = ['ami','aes']
MANDATORY_VARS = list(set(['age','sex','group']))
DEP_VARS = list(set(MANDATORY_VARS + CLASSIF_VARS + CORR_VARS))
POINT_COL_CANDIDATES = ['point','point_id']
ACTIMETRY_XLSX = opj(DB_ROOT,'actimetry_features.xlsx')

ds = Actidep(db_root=DB_ROOT)
all_subjects= ds.subject_ids

csv_files = ds.get_global(pipeline=PIPELINE, extension='csv',datatype='metric')
present_subjects = list(set([f.subject for f in csv_files]))

all_bundles=list(get_HCP_bundle_names().keys())
bundles_in_db = list(set([f.get_entities().get('bundle',None) for f in csv_files if f.bundle in all_bundles]))
print("Sujets totaux dans la BDD :", len(all_subjects))
print("Sujets avec données dans le pipeline :", len(present_subjects))
print("Sujets sans données dans le pipeline :", set(all_subjects)-set(present_subjects))
print(len(csv_files), "fichiers CSV trouvés")

print("Nombre de bundles dans la BDD :", len(bundles_in_db))
print("Nombre de bundles attendus :", len(all_bundles))

Sujets totaux dans la BDD : 61
Sujets avec données dans le pipeline : 60
Sujets sans données dans le pipeline : {'03026'}
4260 fichiers CSV trouvés
Nombre de bundles dans la BDD : 71
Nombre de bundles attendus : 71


In [3]:
for sub in present_subjects:
    sub_files = [f for f in csv_files if f.subject==sub]
    bundles_for_sub = list(set([f.get_entities().get('bundle',None) for f in sub_files if f.bundle in all_bundles]))
    missing_bundles = list(set(all_bundles)-set(bundles_for_sub))
    if len(missing_bundles)>0:
        print(f"Sujet {sub} : {len(missing_bundles)} bundles manquants :", missing_bundles)
    else:
        print(f"Sujet {sub} : tous les bundles sont présents ({len(bundles_for_sub)})")

Sujet 01018 : tous les bundles sont présents (71)
Sujet 01033 : tous les bundles sont présents (71)
Sujet 01029 : tous les bundles sont présents (71)
Sujet 03016 : tous les bundles sont présents (71)
Sujet 01031 : tous les bundles sont présents (71)
Sujet 03003 : tous les bundles sont présents (71)
Sujet 03018 : tous les bundles sont présents (71)
Sujet 01030 : tous les bundles sont présents (71)
Sujet 01028 : tous les bundles sont présents (71)
Sujet 03015 : tous les bundles sont présents (71)
Sujet 03014 : tous les bundles sont présents (71)
Sujet 01022 : tous les bundles sont présents (71)
Sujet 01008 : tous les bundles sont présents (71)
Sujet 03002 : tous les bundles sont présents (71)
Sujet 03012 : tous les bundles sont présents (71)
Sujet 03019 : tous les bundles sont présents (71)
Sujet 03020 : tous les bundles sont présents (71)
Sujet 03008 : tous les bundles sont présents (71)
Sujet 03004 : tous les bundles sont présents (71)
Sujet 03025 : tous les bundles sont présents (71)


### Infos participants manquantes

In [4]:
participant_file = os.path.join(DB_ROOT,'participants_full_info.xlsx')
participants_df = pd.read_excel(participant_file)
print("Informations disponibles dans participants_full_info.xlsx :", participants_df.columns.tolist())
participants_df['subject_id']=participants_df['participant_id'].str.replace('sub-','')
participants_df.set_index('subject_id',inplace=True)
subjects_df = pd.DataFrame(present_subjects,columns=['subject_id'])
participants_df#MANDATORY_VARS]
subjects_df = subjects_df.merge(participants_df, left_on='subject_id', right_on='subject_id', how='left')

mandatory = subjects_df[['subject_id']+MANDATORY_VARS]
#Get lines that contain NaN
mandatory = mandatory[mandatory.isnull().any(axis=1)]

deps = subjects_df[['subject_id']+DEP_VARS][subjects_df['group'].isin(['dep',''])]
#Get lines that contain NaN
deps = deps[deps.isnull().any(axis=1)]
deps



Informations disponibles dans participants_full_info.xlsx : ['participant_id', 'initial', 'city', 'group', 'age', 'sex', 'nse', 'atcd_endoc', 'updrs', 'matthis', 'type_dep', 'duration_dep', 'cgi', 'madrs', 'apathy', 'fatigue', 'aes', 'ami_ba', 'ami_sm', 'ami_es', 'fluency', 'fluency_s', 'stroop', 'tmt_a', 'tmt_ba', 'mcst_cat', 'mcst_error', 'mcst_pers', 'ami', 'acp1_scores_cliniques', 'pc_act1', 'pc_act2', 'pc_act3', 'pc_act4']


Unnamed: 0,subject_id,group,aes,apathy,sex,age,ami
33,3010,dep,45.0,2.0,f,78.0,


In [5]:
acti_df=pd.read_excel(ACTIMETRY_XLSX)
acti_df['subject_id']=acti_df['participant_id'].str.replace('sub-','')
acti_df.set_index('subject_id',inplace=True)
missing_acti = subjects_df.merge(acti_df, left_on='subject_id', right_on='subject_id', how='left')
acti_columns = acti_df.columns.tolist()
acti_columns.remove('participant_id')
missing_acti = missing_acti[['subject_id']+acti_columns]

#List lines that contain NaN
missing_acti = missing_acti[missing_acti.isnull().any(axis=1)]
missing_acti

Unnamed: 0,subject_id,inactivity_mean_3d,inactivity_std_3d,inactivity_min_3d,inactivity_max_3d,activity_mean_3d,activity_std_3d,activity_min_3d,activity_max_3d,freq_mean_3d,...,oadl_fft_min_12h_3,oadl_fft_min_12h_4,oadl_fft_min_12h_5,oadl_fft_min_12h_6,oadl_fft_max_12h_1,oadl_fft_max_12h_2,oadl_fft_max_12h_3,oadl_fft_max_12h_4,oadl_fft_max_12h_5,oadl_fft_max_12h_6
19,3025,,,,,,,,,,...,,,,,,,,,,
29,1035,,,,,,,,,,...,,,,,,,,,,
46,1026,,,,,,,,,,...,,,,,,,,,,
47,1037,,,,,,,,,,...,,,,,,,,,,
54,1006,,,,,,,,,,...,,,,,,,,,,
55,1044,,,,,,,,,,...,,,,,,,,,,


In [None]:
wrong_bundles= []
for sub in present_subjects:
    sub_files = [f for f in csv_files if f.subject==sub]
    for bundle_file in sub_files:
        
        bundle = bundle_file.get_entities().get('bundle',None)
        metric_df= pd.read_csv(bundle_file.path)
        missing_points=metric_df[metric_df['FA'].isnull()]
        missing_points['subject_id']=sub
        missing_points['bundle']=bundle
        wrong_bundles.append(missing_points)


wrong_bundles = pd.concat(wrong_bundles)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_points['subject_id']=sub
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_points['bundle']=bundle
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_points['subject_id']=sub
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 

Unnamed: 0,subject_id,bundle,n_missing_points
0,01001,CGleft,1
1,01001,SCPright,2
2,01001,SLFIIright,13
3,01002,AFright,3
4,01002,CA,1
...,...,...,...
279,03022,FXleft,7
280,03024,AFright,1
281,03024,FXright,3
282,03024,SLFIIIright,1


In [6]:
wrong_bundles_by_sub = wrong_bundles.groupby(['subject_id']).size().reset_index(name='n_missing_points')
wrong_bundles_by_sub_and_bundle = wrong_bundles.groupby(['subject_id','bundle']).size().reset_index(name='n_missing_points')
wrong_bundles_by_bundle = wrong_bundles.groupby(['bundle']).size().reset_index(name='n_missing_points')
wrong_bundles_by_bundle_and_point = wrong_bundles.groupby(['bundle','point_id']).size().reset_index(name='n_missing_points')
wrong_bundles_by_bundle_and_point


Unnamed: 0,bundle,point_id,n_missing_points
0,AFleft,3,2
1,AFleft,4,6
2,AFleft,5,7
3,AFleft,6,4
4,AFleft,7,3
...,...,...,...
392,UFleft,45,1
393,UFleft,46,1
394,UFleft,47,1
395,UFleft,48,1


In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
for bundle in wrong_bundles_by_bundle_and_point['bundle'].unique():
    data=wrong_bundles_by_bundle_and_point[wrong_bundles_by_bundle_and_point['bundle']]
    data.plot()
    
    


    
    

KeyError: "None of [Index(['AFleft', 'AFleft', 'AFleft', 'AFleft', 'AFleft', 'AFleft', 'AFleft',\n       'AFleft', 'AFleft', 'AFleft',\n       ...\n       'TOCCleft', 'TOCCleft', 'TOCCright', 'UFleft', 'UFleft', 'UFleft',\n       'UFleft', 'UFleft', 'UFleft', 'UFleft'],\n      dtype='object', length=397)] are in the [columns]"

In [1]:
# Résumé des données manquantes par sujet et bundle (avec confounds spécifiques DEP et actimétrie)
import os
import pandas as pd
from os.path import join as opj
from actiDep.set_config import get_HCP_bundle_names
from actiDep.data.loader import Actidep

# Paramètres
DB_ROOT = '/home/ndecaux/NAS_EMPENN/share/projects/actidep/bids'
PIPELINE = 'hcp_association_50pts'  # adapter si besoin
METRIC_COLS = ['FA','MD','RD','AD','IFW','IRF']
CLASSIF_VARS = ['group','apathy']
CORR_VARS = ['ami','aes']
CONFOUND_WITHOUT_CONTROL = ['age','sex','city','duration_dep','type_dep']  # confond_variables_without_control
MANDATORY_VARS = list(set(['age','sex','group']) | set(CONFOUND_WITHOUT_CONTROL))
POINT_COL_CANDIDATES = ['point','point_id']
ACTIMETRY_XLSX = opj(DB_ROOT,'actimetry_features.xlsx')
# Chargement actimétrie pour liste colonnes
if os.path.exists(ACTIMETRY_XLSX):
    _actim_df = pd.read_excel(ACTIMETRY_XLSX)
    ACTIM_COLS = [c for c in _actim_df.columns if c not in ['subject_id','participant_id']]
else:
    ACTIM_COLS = []
print(f"Colonnes actimétrie détectées: {len(ACTIM_COLS)}")

ads = Actidep(DB_ROOT)
metric_files = ads.get_global(pipeline=PIPELINE, extension='csv', datatype='metric')
print(f"Nb fichiers métriques: {len(metric_files)}")

bundle_names = list(get_HCP_bundle_names().keys())
records = []

for bundle in bundle_names:
    bundle_files = [f for f in metric_files if f.get_entities().get('bundle') == bundle]
    if not bundle_files:
        continue
    for f in bundle_files:
        try:
            df = pd.read_csv(f.path)
        except Exception as e:
            print(f"Lecture impossible {f.path}: {e}")
            continue
        subj = f.get_full_entities().get('subject')
        # Détection colonne point
        point_col = next((c for c in POINT_COL_CANDIDATES if c in df.columns), None)
        if point_col is None:
            records.append({'subject': subj,'bundle': bundle,'issue_type': 'missing_point_column','detail': 'aucune colonne point'})
            continue
        n_point_na = df[point_col].isna().sum()
        if n_point_na > 0:
            records.append({'subject': subj,'bundle': bundle,'issue_type': 'missing_point_ids','detail': f'{n_point_na} lignes point NA'})
        # Métriques
        for m in METRIC_COLS:
            if m not in df.columns:
                records.append({'subject': subj,'bundle': bundle,'issue_type': 'metric_missing_column','detail': m})
            else:
                n_na = df[m].isna().sum()
                if n_na > 0:
                    records.append({'subject': subj,'bundle': bundle,'issue_type': 'metric_nan_values','detail': f'{m}:{n_na} NA'})
        # Variables mandatoires générales
        for v in MANDATORY_VARS:
            if v not in df.columns:
                records.append({'subject': subj,'bundle': bundle,'issue_type': 'variable_missing_column','detail': v})
            else:
                if df[v].isna().all():
                    records.append({'subject': subj,'bundle': bundle,'issue_type': 'variable_all_nan','detail': v})
                elif df[v].isna().any():
                    n_var_na = df[v].isna().sum()
                    records.append({'subject': subj,'bundle': bundle,'issue_type': 'variable_partial_nan','detail': f'{v}:{n_var_na} NA'})
        # Confounds spécifiques aux sujets DEP
        if 'group' in df.columns and (df['group'] == 'dep').any():
            dep_rows = df[df['group']=='dep']
            for v in CONFOUND_WITHOUT_CONTROL:
                if v not in df.columns:
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'dep_confound_missing_column','detail': v})
                else:
                    # proportion NA sur les lignes dep
                    dep_na = dep_rows[v].isna().sum()
                    if dep_rows.shape[0] > 0 and dep_na == dep_rows.shape[0]:
                        records.append({'subject': subj,'bundle': bundle,'issue_type':'dep_confound_all_nan','detail': v})
                    elif dep_na > 0:
                        records.append({'subject': subj,'bundle': bundle,'issue_type':'dep_confound_partial_nan','detail': f'{v}:{dep_na} NA'})
        # Classification vars présence
        for v in CLASSIF_VARS:
            if v not in df.columns:
                records.append({'subject': subj,'bundle': bundle,'issue_type':'classif_missing_column','detail': v})
            else:
                if df[v].isna().all():
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'classif_all_nan','detail': v})
                elif df[v].isna().any():
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'classif_partial_nan','detail': f'{v}:{df[v].isna().sum()} NA'})
        # Corrélation vars présence
        for v in CORR_VARS:
            if v not in df.columns:
                records.append({'subject': subj,'bundle': bundle,'issue_type':'corr_missing_column','detail': v})
            else:
                if df[v].isna().all():
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'corr_all_nan','detail': v})
                elif df[v].isna().any():
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'corr_partial_nan','detail': f'{v}:{df[v].isna().sum()} NA'})
        # Actimétrie: vérifie présence de colonnes & valeurs
        if ACTIM_COLS:
            present_act_cols = [c for c in ACTIM_COLS if c in df.columns]
            if not present_act_cols:
                records.append({'subject': subj,'bundle': bundle,'issue_type':'actimetry_no_columns','detail': 'aucune colonne actimetry'})
            else:
                # valeurs NA agrégées par sujet (ici fichier déjà par points; on regarde si toutes NA)
                sub_df = df[present_act_cols]
                # si toutes les colonnes NA sur toutes les lignes
                if sub_df.isna().all().all():
                    records.append({'subject': subj,'bundle': bundle,'issue_type':'actimetry_all_nan','detail': f'{len(present_act_cols)} cols'})
                else:
                    # colonnes entièrement NA
                    for col in present_act_cols:
                        if sub_df[col].isna().all():
                            records.append({'subject': subj,'bundle': bundle,'issue_type':'actimetry_column_all_nan','detail': col})
        else:
            records.append({'subject': subj,'bundle': bundle,'issue_type':'actimetry_reference_unavailable','detail': 'xlsx absent'})

# Agrégation finale
if records:
    miss_df = pd.DataFrame(records)
    summary = (miss_df.groupby(['issue_type','detail'])
                        .size()
                        .reset_index(name='count')
                        .sort_values('count', ascending=False))
    print("Résumé par type d'anomalie:")
    display(summary.head(50))
    print("\nExemples bruts:")
    display(miss_df.head())
    out_dir = opj(DB_ROOT, 'quality_reports')
    os.makedirs(out_dir, exist_ok=True)
    miss_df.to_csv(opj(out_dir, f'missing_values_{PIPELINE}.csv'), index=False)
    summary.to_csv(opj(out_dir, f'missing_values_summary_{PIPELINE}.csv'), index=False)
    print(f"CSV enregistrés dans {out_dir}")
else:
    print("Aucune donnée manquante détectée selon les critères fournis.")


Colonnes actimétrie détectées: 181
Nb fichiers métriques: 4260
Nb fichiers métriques: 4260
Résumé par type d'anomalie:
Résumé par type d'anomalie:


Unnamed: 0,issue_type,detail,count
0,actimetry_no_columns,aucune colonne actimetry,4260
1,classif_missing_column,apathy,4260
2,classif_missing_column,group,4260
3,corr_missing_column,aes,4260
4,corr_missing_column,ami,4260
149,variable_missing_column,age,4260
150,variable_missing_column,city,4260
154,variable_missing_column,type_dep,4260
153,variable_missing_column,sex,4260
151,variable_missing_column,duration_dep,4260



Exemples bruts:


Unnamed: 0,subject,bundle,issue_type,detail
0,1034,ORright,metric_nan_values,FA:9 NA
1,1034,ORright,metric_nan_values,MD:9 NA
2,1034,ORright,metric_nan_values,RD:9 NA
3,1034,ORright,metric_nan_values,AD:9 NA
4,1034,ORright,metric_nan_values,IFW:9 NA


CSV enregistrés dans /home/ndecaux/NAS_EMPENN/share/projects/actidep/bids/quality_reports
