In [1]:
import os
from collections import defaultdict
from sklearn.model_selection import KFold



In [2]:
# 1. Load the input files
# Identifiers to exclude: one per non-blank line, whitespace-trimmed, kept as
# a set for fast membership tests.
with open("sarcopenia_subj_DIASEM.txt") as f:
    excluded_subjects = {name for name in map(str.strip, f) if name}

# Every candidate case, in file order (blank lines dropped, duplicates kept).
with open("RF_all_suj_ordonnes.txt") as f:
    all_cases = [name for name in map(str.strip, f) if name]

In [4]:
# 3. Read the subjects already present in the existing fold files.
# Each fold i has a train_cases_{i}.txt and a test_cases_{i}.txt file holding
# one identifier per non-blank line.
used_subjects = set()
n_folds = 5
folds_train = []
folds_test = []

for i in range(1, n_folds + 1):
    with open(f"train_cases_{i}.txt") as f:
        train = [line.strip() for line in f if line.strip()]
    with open(f"test_cases_{i}.txt") as f:
        test = [line.strip() for line in f if line.strip()]
    folds_train.append(train)
    folds_test.append(test)
    used_subjects.update(train)
    used_subjects.update(test)

# 4. Select the remaining subjects: not excluded, not already in any fold, and
# whose identifier does not start with "sujet_" (the ABC123-style ids).
# Building a set removes duplicates; sorting makes the KFold assignment below
# reproducible for a given random_state.
remaining_subjects = sorted(
    {
        case
        for case in all_cases
        if case not in excluded_subjects
        and case not in used_subjects
        and not case.startswith("sujet_")
    }
)

# 5. Spread the remaining subjects over the folds with KFold.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

if not remaining_subjects:
    # This cell rewrites the very files it reads, so on a second run every
    # subject is already "used" and nothing is left to distribute. KFold.split
    # would raise ValueError (n_splits > number of samples) in that case, so
    # leave the fold files untouched instead of crashing.
    folds_distribution = []
    print("ℹ️ Aucun sujet restant à répartir : fichiers folds inchangés.")
else:
    # list() consumes the whole generator up front: if there are fewer
    # remaining subjects than folds, KFold raises here, before any file has
    # been rewritten.
    folds_distribution = list(kf.split(remaining_subjects))

    for fold_idx, (train_idx, test_idx) in enumerate(folds_distribution):
        train_extra = [remaining_subjects[i] for i in train_idx]
        test_extra = [remaining_subjects[i] for i in test_idx]

        # Append the new subjects to the existing folds.
        folds_train[fold_idx].extend(train_extra)
        folds_test[fold_idx].extend(test_extra)

        # Sort for readability of the written files.
        folds_train[fold_idx].sort()
        folds_test[fold_idx].sort()

        # Final save: overwrite the fold files, one identifier per line.
        with open(f"train_cases_{fold_idx+1}.txt", "w") as f:
            f.write("\n".join(folds_train[fold_idx]) + "\n")

        with open(f"test_cases_{fold_idx+1}.txt", "w") as f:
            f.write("\n".join(folds_test[fold_idx]) + "\n")

    print("✅ Fichiers folds complétés (sans double inclusion des sujets `sujet_*`).")

✅ Fichiers folds complétés (sans double inclusion des sujets `sujet_*`).


In [5]:
# Display the subjects selected in the previous cell: those that are not
# excluded, not already in a fold file, and not prefixed with "sujet_".
remaining_subjects

['BA122_RF',
 'BC167_RF',
 'BH116_RF',
 'BJ128_RF',
 'BP149_RF',
 'CG115_RF',
 'CG147_RF',
 'CR159_RF',
 'CS102_RF',
 'DB145_RF',
 'DH156_RF',
 'DJ101_RF',
 'DO125_RF',
 'FD121_RF',
 'FM144_RF',
 'GD123_RF',
 'GG131_RF',
 'GG140_RF',
 'HM124_RF',
 'IJ157_RF',
 'JJ142_RF',
 'LE120_RF',
 'LJ168_RF',
 'MA113_RF',
 'PJ165_RF',
 'PL158_RF',
 'RA126_RF',
 'RD132_RF',
 'RF148_RF',
 'RY166_RF',
 'SM127_RF',
 'SS130_RF',
 'TC152_RF',
 'TT129_RF',
 'ZS153_RF']