In [1]:
import os
from collections import defaultdict
import random
import numpy as np



In [2]:
# Subjects to exclude: one ID per line, blank lines are skipped
with open("sarcopenia_subj_DIASEM.txt") as f:
    diasem_subjects = {subject for subject in map(str.strip, f) if subject}

# Former "problematic" sarcopenic subjects that are nevertheless usable
with open("sarcopenia_subj_DIASEM_PROBLEM.txt") as f:
    problem_subjects = {subject for subject in map(str.strip, f) if subject}

# Recoverable subjects: listed in the PROBLEM file but absent from the current DIASEM file
recoverable_subjects = problem_subjects.difference(diasem_subjects)

# Containers for the per-fold subject lists and the subjects they use (filled when the folds are loaded)
folds, all_used_subjects = {}, set()



In [3]:
# Display the subject IDs read from sarcopenia_subj_DIASEM.txt (the subjects filtered out of the folds)
diasem_subjects

{'AL135_RF',
 'AM112_RF',
 'BA137_RF',
 'BM103_RF',
 'CY164_RF',
 'DI154_RF',
 'DP172_RF',
 'DR160_RF',
 'EM151_RF',
 'EP105_RF',
 'FO117_RF',
 'GD171_RF',
 'LA134_RF',
 'LA162_RF',
 'LJ139_RF',
 'LP133_RF',
 'MA119_RF',
 'MM170_RF',
 'PG161_RF',
 'PP104_RF',
 'QP108_RF',
 'RA150_RF',
 'RJ136_RF',
 'SC146_RF'}

In [4]:
# Display the subject IDs read from sarcopenia_subj_DIASEM_PROBLEM.txt
problem_subjects

{'AM112_RF',
 'BJ128_RF',
 'CR159_RF',
 'CS102_RF',
 'DH156_RF',
 'DJ101_RF',
 'EM151_RF',
 'FM144_RF',
 'HM124_RF',
 'IJ157_RF',
 'JJ142_RF',
 'LA134_RF',
 'LE120_RF',
 'PP104_RF',
 'RA126_RF',
 'RA150_RF',
 'RD132_RF',
 'RF148_RF',
 'RY166_RF',
 'SC146_RF',
 'SM127_RF',
 'TC152_RF',
 'TT129_RF'}

In [5]:
# Show the recoverable subjects, then how many there are (one value per line)
print(recoverable_subjects, len(recoverable_subjects), sep="\n")

{'SM127_RF', 'RF148_RF', 'CS102_RF', 'HM124_RF', 'DJ101_RF', 'CR159_RF', 'BJ128_RF', 'TC152_RF', 'LE120_RF', 'FM144_RF', 'IJ157_RF', 'RA126_RF', 'DH156_RF', 'RD132_RF', 'RY166_RF', 'JJ142_RF', 'TT129_RF'}
17


In [6]:
# Read the train/test subject lists of folds 1-5, leaving out every DIASEM subject
for i in range(1, 6):
    with open(f"train_cases_{i}.txt") as f_train, open(f"test_cases_{i}.txt") as f_test:
        # One subject ID per line; blank lines are ignored
        train_subjects = [entry for entry in map(str.strip, f_train) if entry]
        test_subjects = [entry for entry in map(str.strip, f_test) if entry]
        folds[i] = dict(
            train=[s for s in train_subjects if s not in diasem_subjects],
            test=[s for s in test_subjects if s not in diasem_subjects],
        )
        # Remember every subject that is still part of this fold
        all_used_subjects.update(folds[i]['train'], folds[i]['test'])



In [7]:
# Top up a fold with recoverable subjects, trying to reach the target sizes
def refill_fold(fold_data, used_subjects, recoverable_subjects, target_train_len, target_test_len):
    """Return a copy of one fold topped up with randomly drawn recoverable subjects.

    Args:
        fold_data: dict with a 'train' and a 'test' list of subject IDs.
            It is left untouched; the returned lists are copies.
        used_subjects: set of subject IDs that must not be drawn. It is
            updated in place with every subject drawn by this call, so
            passing the same set to successive calls prevents a subject from
            being drawn twice.
        recoverable_subjects: collection of candidate subject IDs.
        target_train_len: desired length of the train list.
        target_test_len: desired length of the test list.

    Returns:
        dict with 'train' and 'test' lists: the original entries followed by
        the drawn subjects. A list already at or above its target is returned
        unchanged; a list may stay below its target when there are not enough
        candidates.
    """
    # Work on copies so the caller's fold is not mutated (keeps re-runs idempotent).
    new_train = list(fold_data['train'])
    new_test = list(fold_data['test'])

    # Clamp at zero: a negative count would turn the slices below into
    # "everything but the last k" and add far too many subjects.
    needed_train = max(0, target_train_len - len(new_train))
    needed_test = max(0, target_test_len - len(new_test))

    # Never draw a subject that is already used elsewhere or already present
    # in this fold (would create duplicates or train/test leakage).
    excluded = set(used_subjects).union(new_train, new_test)
    # Sort before shuffling so the draw depends only on the RNG state, not on
    # set iteration order. Assumes subject IDs are mutually comparable
    # (they are strings in this notebook).
    available_subjects = sorted(set(recoverable_subjects) - excluded)
    random.shuffle(available_subjects)

    refill_train = available_subjects[:needed_train]
    refill_test = available_subjects[needed_train:needed_train + needed_test]

    new_train.extend(refill_train)
    new_test.extend(refill_test)
    used_subjects.update(refill_train, refill_test)

    return {'train': new_train, 'test': new_test}



In [8]:
# Balance the folds
REFILL_SEED = 42          # arbitrary fixed seed for the random draws below
FOLD_IDS = range(1, 6)    # folds are numbered 1..5

# Start every run from the same RNG state so the shuffles done by refill_fold
# are repeatable. NOTE(review): full run-to-run reproducibility also needs the
# candidate order fed to the shuffle to be deterministic (set iteration order
# of strings can vary between interpreter sessions) - confirm in refill_fold.
random.seed(REFILL_SEED)

final_folds = {}
# Subjects drawn so far; shared across folds so one subject is not drawn twice.
# NOTE(review): all_used_subjects (collected while loading the folds) is not
# consulted here, so this tracking set starts empty - confirm this is intended.
used_subjects_final = set()

# Target sizes: mean train/test size over the folds held in `folds`, rounded down
avg_train_len = sum(len(folds[i]['train']) for i in FOLD_IDS) // len(FOLD_IDS)
avg_test_len = sum(len(folds[i]['test']) for i in FOLD_IDS) // len(FOLD_IDS)

for i in FOLD_IDS:
    final_folds[i] = refill_fold(
        folds[i],
        used_subjects_final,
        recoverable_subjects,
        target_train_len=avg_train_len,
        target_test_len=avg_test_len
    )

# Rewrite the fold files in place (one subject ID per line, trailing newline)
for i in FOLD_IDS:
    with open(f"train_cases_{i}.txt", "w") as f_train:
        f_train.write("\n".join(final_folds[i]['train']) + "\n")
    with open(f"test_cases_{i}.txt", "w") as f_test:
        f_test.write("\n".join(final_folds[i]['test']) + "\n")

print("✅ Les fichiers train/test ont été mis à jour en respectant les contraintes.")


✅ Les fichiers train/test ont été mis à jour en respectant les contraintes.
