In [103]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


In [114]:
region = "STs_right"

labels_path = "/neurospin/dico/rmenasria/Runs/03_main/Input/remote_data/ph_p_dhx.csv"
base_path = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"

#get list of files available from base path
files = os.listdir(base_path)

#select region-specific file
for file in files:
    if file.startswith(region)  and file.endswith('.csv'):
        embedding_file = file
        break
print(f"Selected embedding file: {embedding_file}")

# path to the embedding file
embedding_path = os.path.join(base_path, embedding_file)


Selected embedding file: STs_right_name08-32-58--52_embeddings.csv


In [147]:
labels_df     = pd.read_csv(labels_path, low_memory=False)
embeddings_df = pd.read_csv(embedding_path)




# Standardisation of the embedding IDs
embeddings_df['ID_clean'] = (
    embeddings_df['ID']
    .astype(str)
    .str.strip()
    .str.replace(r"^sub-", "", regex=True)       # remove "sub-"
    .str.replace("_", "", regex=False)           # remove "_"
)

# Standardisation of the labels IDs
labels_df = labels_df.copy()
labels_df['src_subject_id_clean'] = (
    labels_df['src_subject_id']
    .astype(str)
    .str.strip()
    .str.replace("_", "", regex=False)           # retire tous les "_"
)

# Get all the common IDs after cleaning
set_emb = set(embeddings_df['ID_clean'])
set_lab = set(labels_df['src_subject_id_clean'])
common  = set_emb & set_lab
print("After cleaning, ID's in common:", len(common))

# Print a sample of common IDs
print("Examples common IDs :", list(common)[:10])

merged_df = embeddings_df.merge(
    labels_df[['src_subject_id_clean', 'devhx_12a_p', 'devhx_12_p']],
    left_on='ID_clean',
    right_on='src_subject_id_clean',
    how='inner'
).drop_duplicates(subset='src_subject_id_clean')

# Exclude all subjects for which the lprematurity abels are not available
merged_df = merged_df[
    (merged_df['devhx_12a_p'].notna()) &
    (merged_df['devhx_12a_p'] != 999) &
    (merged_df['devhx_12_p']!= 999) 
]

# Final extraction
labels_final     = merged_df['devhx_12a_p'].values
embeddings_final = merged_df.filter(regex=r'^dim').values

print("Final shapes → embeddings:", embeddings_final.shape,
      "labels:", labels_final.shape)


After cleaning, ID's in common: 10126
Examples common IDs : ['NDARINVUZBAVXHW', 'NDARINV0HH094Z9', 'NDARINVVCLE058V', 'NDARINV1VCFJ6CR', 'NDARINVR3NZ8ZKA', 'NDARINVBCEBWGDG', 'NDARINVL8WPT2T3', 'NDARINV891DYM0E', 'NDARINV2RB7270C', 'NDARINVUW2UNAF1']
Final shapes → embeddings: (9985, 32) labels: (9985,)


In [150]:
# Classes of prematurity 
merged_df['gest_age'] = 37 - merged_df['devhx_12_p']
print(merged_df['gest_age'].value_counts())




gest_age
33.0    486
34.0    256
31.0    240
32.0    231
35.0    169
29.0    161
30.0     97
36.0     54
24.0     38
28.0     36
27.0     36
25.0     10
26.0      4
Name: count, dtype: int64


In [151]:
def prem_class_true(ga):
    if ga < 27:
        return '<27'         # extreme preterm
    elif 27<= ga < 32:
        return '27-32'       # very preterm
    elif 32<= ga < 37:
        return '32-37'       # moderate preterm
    else:
        return '>=37'        # full term

In [152]:
merged_df['prem_class'] = merged_df['gest_age'].apply(prem_class_true)

# Checking the counts
print(" Counts of the true gestational age:")
print(merged_df['prem_class'].value_counts(), '\n')


 Counts of the true gestational age:
prem_class
>=37     8167
32-37    1196
27-32     570
<27        52
Name: count, dtype: int64 



In [None]:
export_df = merged_df[[
    'ID_clean',      # clean id
    'devhx_12a_p',   # is preterm
    'devhx_12_p',    # nb weeks before term
    'gest_age',      # gestional age
    'prem_class'     # prematurity class
]].rename(columns={'ID_clean':'src_subject_id'})

out_path = 'prematurity_labels_true_classes.csv'
export_df.to_csv(out_path, index=False)

print(f"CSV sauvegardé : {out_path}")
print(export_df.head())

CSV sauvegardé : prematurity_labels_true_classes.csv
    src_subject_id  devhx_12a_p  devhx_12_p  gest_age prem_class
0  NDARINV003RTV85          0.0         NaN       NaN       >=37
1  NDARINV007W6H7B          0.0         NaN       NaN       >=37
2  NDARINV00BD7VDC          0.0         NaN       NaN       >=37
3  NDARINV00HEV6HB          0.0         NaN       NaN       >=37
4  NDARINV00J52GPG          0.0         NaN       NaN       >=37


In [142]:
# load the labels and embeddings dataframes
labels_df     = pd.read_csv(labels_path, low_memory=False)
embeddings_df = pd.read_csv(embedding_path)

# cleaning same as above
embeddings_df['ID_clean'] = (
    embeddings_df['ID']
    .astype(str).str.strip()
    .str.replace(r"^sub-", "", regex=True)
    .str.replace("_", "", regex=False)
)
# labels cleaning
labels_df = labels_df.copy() 
labels_df['src_subject_id_clean'] = (
    labels_df['src_subject_id']
    .astype(str).str.strip()
    .str.replace("_", "", regex=False)
)

# Merge with all columns from labels_df
merged_df = embeddings_df.merge(
    labels_df,
    left_on='ID_clean',
    right_on='src_subject_id_clean',
    how='inner'
).drop_duplicates(subset='src_subject_id_clean')

print(f"Sujets appariés : {merged_df.shape[0]} lignes × {merged_df.shape[1]} colonnes")

# filtering same as above
merged_df = merged_df[
    merged_df['devhx_12a_p'].notna() &
    (merged_df['devhx_12a_p'] != 999) &
    (merged_df['devhx_12_p'] != 999)
]
print(f"Après exclusion des 999/NaN : {merged_df.shape[0]} sujets")

# GA
merged_df['gest_age'] = 37 - merged_df['devhx_12_p']

def prem_class_true(ga):
    if ga < 27:
        return '<27'
    elif ga < 32:
        return '27-32'
    elif ga < 37:
        return '32-37'
    else:
        return '>=37'

merged_df['prem_class'] = merged_df['gest_age'].apply(prem_class_true)
print("Répartition des vraies classes :")
print(merged_df['prem_class'].value_counts())

# export all columns
orig_label_cols = labels_df.columns.tolist()

# Columns to export
export_cols = ['ID_clean'] + orig_label_cols + ['gest_age', 'prem_class']
export_df   = merged_df[export_cols].rename(columns={'ID_clean':'src_subject_id'})

out_path = 'prematurity_labels_complete.csv'
export_df.to_csv(out_path, index=False)
print(f"\nCSV complet sauvegardé ici : {out_path}")
print(export_df.head())

Sujets appariés : 10126 lignes × 305 colonnes
Après exclusion des 999/NaN : 9985 sujets
Répartition des vraies classes :
prem_class
>=37_sem     8167
32-37_sem    1196
27-32_sem     570
<27_sem        52
Name: count, dtype: int64

CSV complet sauvegardé ici : prematurity_labels_complete.csv
    src_subject_id    src_subject_id              eventname  \
0  NDARINV003RTV85  NDAR_INV003RTV85  baseline_year_1_arm_1   
1  NDARINV007W6H7B  NDAR_INV007W6H7B  baseline_year_1_arm_1   
3  NDARINV00BD7VDC  NDAR_INV00BD7VDC  baseline_year_1_arm_1   
4  NDARINV00HEV6HB  NDAR_INV00HEV6HB  baseline_year_1_arm_1   
6  NDARINV00J52GPG  NDAR_INV00J52GPG  baseline_year_1_arm_1   

   devhx_select_language  devhx_1_p  birth_weight_lbs  devhx_2_p_dk  \
0                      0        1.0               7.0           NaN   
1                      0        0.0               8.0           NaN   
3                      0        1.0               8.0           NaN   
4                      0        1.0          

In [None]:
# 1) Identifiants présents dans embeddings mais pas dans labels
emb_ids   = set(embeddings_df['ID'])
label_ids = set(labels_df['src_subject_id'])
missing_in_labels = sorted(list(emb_ids - label_ids))
print(f"{len(missing_in_labels)} IDs d'embeddings SANS label associé :")
print(missing_in_labels[:20], "…")   # on n’affiche que les 20 premiers

# 2) Identifiants présents dans labels mais pas dans embeddings
missing_in_embeddings = sorted(list(label_ids - emb_ids))
print(f"\n{len(missing_in_embeddings)} IDs de labels SANS embedding :")
print(missing_in_embeddings[:20], "…")