In [1]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats


#savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V0_trained_on_UKB40/'
#savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation_latent_256/'
savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/'
#dfs_dir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V0_trained_on_UKB40/embeddings/ukb40_epoch80_embeddings'
#dfs_dir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation_latent_256/embeddings/ukb40_epoch80_embeddings'
n_dims = 32

regions_to_treat = ['SOr_left',
 'SOr_right',
 'FColl-SRh_left',
 'SFmedian-SFpoltr-SFsup_left',
 'SFinf-BROCA-SPeCinf_left',
 'SPoC_left',
 'fronto-parietal_medial_face_left',
 'FIP_left',
 'CINGULATE_left',
 'SC-SPoC_left',
 'SFinter-SFsup_left',
 'FCMpost-SpC_left',
 'SsP-SPaint_left',
 'SOr-SOlf_left',
 'FPO-SCu-ScCal_left',
 'SFmarginal-SFinfant_left',
 'SFint-FCMant_left',
 'STi-STs-STpol_left',
 'SFint-SR_left',
 'Lobule_parietal_sup_left',
 'STi-SOTlat_left',
 'SPeC_left',
 'STsbr_left',
 'ScCal-SLi_left',
 'STs_left',
 'FCLp-subsc-FCLa-INSULA_left',
 'SC-sylv_left',
 'SC-SPeC_left',
 'OCCIPITAL_left',
 'FColl-SRh_right',
 'SFmedian-SFpoltr-SFsup_right',
 'SFinf-BROCA-SPeCinf_right',
 'SPoC_right',
 'fronto-parietal_medial_face_right',
 'FIP_right',
 'CINGULATE_right',
 'SC-SPoC_right',
 'SFinter-SFsup_right',
 'FCMpost-SpC_right',
 'SsP-SPaint_right',
 'SOr-SOlf_right',
 'FPO-SCu-ScCal_right',
 'SFmarginal-SFinfant_right',
 'SFint-FCMant_right',
 'STi-STs-STpol_right',
 'SFint-SR_right',
 'Lobule_parietal_sup_right',
 'STi-SOTlat_right',
 'SPeC_right',
 'STsbr_right',
 'ScCal-SLi_right',
 'STs_right',
 'FCLp-subsc-FCLa-INSULA_right',
 'SC-sylv_right',
 'SC-SPeC_right',
 'OCCIPITAL_right']

## NB : add LARGE_CINGULATE ?

In [2]:
# select the train val subjects
dataset = 'schiz'
all_subjects = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/all_subjects.csv')
all_subjects.columns = ['ID']
splits_dir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/splits/'
train_val_subjects_dirs = glob.glob(f'{splits_dir}/train_val_*')
# load each split subject file and create cv splits from them
train_val_subjects = []
for i, directory in enumerate(train_val_subjects_dirs):
    train_val_subjects.append(pd.read_csv(directory, sep='\t', header=None))
train_val_subjects = pd.concat(train_val_subjects, axis=0)
train_val_subjects.columns = ['ID']
train_val_subjects['ID'] = train_val_subjects['ID'].astype(str)

# select the test subjects
test_subjects = pd.read_csv(f'{splits_dir}/internal_test.csv', header=None)
test_subjects.columns = ['ID']
test_subjects['ID'] = test_subjects['ID'].astype(str)
# select the test extra
test_extra_subjects = pd.read_csv(f'{splits_dir}/external_test.csv', header=None)
test_extra_subjects.columns = ['ID']
test_extra_subjects['ID'] = test_extra_subjects['ID'].astype(str)

# print lengths
print(f'nb train val : {len(train_val_subjects)}, nb test : {len(test_subjects)}, nb test extra : {len(test_extra_subjects)}')

# load labels
labels1 = pd.read_csv('/neurospin/psy/schizconnect-vip-prague/participants_v-20231108.tsv', usecols=['participant_id', 'diagnosis', 'sex', 'age', 'site'], sep='\t')
labels2 = pd.read_csv('/neurospin/psy/bsnip1/participants_v-20231108.tsv', usecols=['participant_id', 'diagnosis', 'sex', 'age', 'site'], sep='\t')
labels3 = pd.read_csv('/neurospin/psy/candi/participants_v-20231108.tsv', usecols=['participant_id', 'diagnosis', 'sex', 'age'], sep='\t')
labels3['site'] = ['CANDI'] * len(labels3)
labels4 = pd.read_csv('/neurospin/psy/cnp/participants_v-20231108.tsv', usecols=['participant_id', 'diagnosis', 'sex', 'age', 'ScannerSerialNumber'], sep='\t')
labels4['ScannerSerialNumber'] = labels4['ScannerSerialNumber'].apply(lambda x: str(int(x)))
labels4.columns=['participant_id', 'diagnosis', 'sex', 'age', 'site']
labels = pd.concat([labels1, labels2, labels3, labels4], axis=0)
labels.columns = ['ID'] + labels.columns[1:].tolist()
labels.dropna()
label = 'diagnosis'
pathology = 'schizophrenia'
# not only bipolar
print(np.unique(labels['diagnosis'], return_counts=True))
# drop rows for which diagnosis is not control or pathology
labels = labels[labels[label].isin(['control', 'schizoaffective disorder', 'scz'])] # no site info for schizo affective disorder ?
labels['ID'] = labels['ID'].astype(str)
print(f'nb labels : {len(labels)}')

nb train val : 1044, nb test : 118, nb test extra : 130
(array(['F', 'M', 'bd', 'control', 'fep', 'psychotic bd',
       'relative of bipolar disorder',
       'relative of schizoaffective disorder',
       'relative of schizophrenia', 'schizoaffective disorder', 'scz'],
      dtype=object), array([117, 155,  35, 667,  43, 140, 122, 134, 176, 124, 491]))
nb labels : 1282


In [3]:
# function to make correspondance between IDs
def make_correspondance_ids(df1, df2, colname='ID2'):
    """
    Make correspondance between IDs. Adds column ID2 to df1.
    """

    new_df = df1.copy()

    id2_dict = {}
    for id1 in df1['ID'].values:
        filtered_df2 = df2[df2['ID'].apply(lambda x: (x in id1) or (id1 in x))]
        id2 = filtered_df2['ID'].tolist()
        if len(id2) > 0:
            id2 = id2[0]
            id2_dict[id1] = id2
    new_df[colname] = new_df['ID'].map(id2_dict)
    new_df = new_df.dropna()
    return new_df   

In [4]:
## group the train val and test subjects + TEST EXTRA
train_val_test_test_extra_subjects = pd.concat([train_val_subjects, test_subjects, test_extra_subjects], axis=0)
train_val_test_test_extra_subjects['ID'] = train_val_test_test_extra_subjects['ID'].astype(str)
print(len(train_val_test_test_extra_subjects))

1292


In [5]:
# merge train_val_test and labels
df = make_correspondance_ids(labels, train_val_test_test_extra_subjects, colname='ID_train_val_test')
# merge df and all_subjects
df = make_correspondance_ids(df, all_subjects, colname='ID_all_subjects')
# drop
df = df.drop_duplicates()
df = df.dropna()
df.columns = ['ID_labels', 'sex', 'age', 'diagnosis', 'site', 'ID_train_val_test', 'ID']

In [6]:
## REMOVE PRAGUE, CONTAINS CONTROLS ONLY
df = df.loc[df['site']!='PRAGUE']

# Get the embeddings

In [7]:
keywords = ['_left', '_right']
all_matches = []
for var in keywords:
    pattern = f"{savedir}*{var}/*/{dataset}_random_embeddings/full_embeddings.csv" ## TODO : Make sure only one model per region, take highest epoch, and print
    matches = glob.glob(pattern)
    all_matches.extend(matches)
# Optional: remove duplicates
dfs_dirs = list(set(all_matches))
dfs_dirs.sort()

results = {}

In [8]:
## iterate on the regions
embds_dict = {}

p_values = {f'Split{i}': {} for i in range(2)}
r2_values = {f'Split{i}': {} for i in range(2)}
preds = {f'Split{i}': {} for i in range(2)}

for i, directory in enumerate(tqdm(dfs_dirs)):
    region = directory.split('/')[-4]
    print(f'Treating {region}')
    embd=pd.read_csv(directory)
    embd=pd.merge(embd, df, on='ID')
    
    # fit standard scaler on embd and transform
    std = StandardScaler()
    embd_matrix = embd.loc[:, embd.columns.str.startswith('dim')]
    std_embds = std.fit_transform(embd_matrix)
    embd.loc[:, embd.columns.str.startswith('dim')] = std_embds
    print('Length embeddings : ', embd.shape[0])
    embds_dict[region]=np.array(embd_matrix)

  5%|▌         | 3/56 [00:00<00:02, 22.32it/s]

Treating CINGULATE_left
Length embeddings :  1029
Treating CINGULATE_right
Length embeddings :  1029
Treating FCLp-subsc-FCLa-INSULA_left
Length embeddings :  1029
Treating FCLp-subsc-FCLa-INSULA_right
Length embeddings :  1029
Treating FCMpost-SpC_left


 11%|█         | 6/56 [00:00<00:03, 12.84it/s]

Length embeddings :  1029
Treating FCMpost-SpC_right
Length embeddings :  1029
Treating FColl-SRh_left
Length embeddings :  1029
Treating FColl-SRh_right


 14%|█▍        | 8/56 [00:00<00:05,  8.14it/s]

Length embeddings :  1029
Treating FIP_left
Length embeddings :  1029
Treating FIP_right


 21%|██▏       | 12/56 [00:01<00:04,  9.22it/s]

Length embeddings :  1029
Treating FPO-SCu-ScCal_left
Length embeddings :  1029
Treating FPO-SCu-ScCal_right
Length embeddings :  1029
Treating Lobule_parietal_sup_left


 25%|██▌       | 14/56 [00:01<00:04,  9.37it/s]

Length embeddings :  1029
Treating Lobule_parietal_sup_right
Length embeddings :  1029
Treating OCCIPITAL_left
Length embeddings :  1029
Treating OCCIPITAL_right


 32%|███▏      | 18/56 [00:01<00:03, 11.96it/s]

Length embeddings :  1029
Treating SC-SPeC_left
Length embeddings :  1029
Treating SC-SPeC_right
Length embeddings :  1029
Treating SC-SPoC_left


 43%|████▎     | 24/56 [00:01<00:01, 18.43it/s]

Length embeddings :  1029
Treating SC-SPoC_right
Length embeddings :  1029
Treating SC-sylv_left
Length embeddings :  1029
Treating SC-sylv_right
Length embeddings :  1029
Treating SFinf-BROCA-SPeCinf_left
Length embeddings :  1029
Treating SFinf-BROCA-SPeCinf_right
Length embeddings :  1029
Treating SFint-FCMant_left
Length embeddings :  1029
Treating SFint-FCMant_right


 48%|████▊     | 27/56 [00:02<00:01, 18.27it/s]

Length embeddings :  1029
Treating SFint-SR_left
Length embeddings :  1029
Treating SFint-SR_right
Length embeddings :  1029
Treating SFinter-SFsup_left


 55%|█████▌    | 31/56 [00:02<00:01, 15.22it/s]

Length embeddings :  1029
Treating SFinter-SFsup_right
Length embeddings :  1029
Treating SFmarginal-SFinfant_left
Length embeddings :  1029
Treating SFmarginal-SFinfant_right


 61%|██████    | 34/56 [00:02<00:01, 16.10it/s]

Length embeddings :  1029
Treating SFmedian-SFpoltr-SFsup_left
Length embeddings :  1029
Treating SFmedian-SFpoltr-SFsup_right
Length embeddings :  1029
Treating SOr-SOlf_left
Length embeddings :  1029
Treating SOr-SOlf_right
Length embeddings :  1029
Treating SOr_left


 68%|██████▊   | 38/56 [00:02<00:00, 18.56it/s]

Length embeddings :  1029
Treating SOr_right
Length embeddings :  1029
Treating SPeC_left
Length embeddings :  1029
Treating SPeC_right


 75%|███████▌  | 42/56 [00:02<00:00, 17.51it/s]

Length embeddings :  1029
Treating SPoC_left
Length embeddings :  1029
Treating SPoC_right
Length embeddings :  1029
Treating STi-SOTlat_left
Length embeddings :  1029
Treating STi-SOTlat_right
Length embeddings :  1029
Treating STi-STs-STpol_left


 80%|████████  | 45/56 [00:03<00:00, 19.92it/s]

Length embeddings :  1029
Treating STi-STs-STpol_right
Length embeddings :  1029
Treating STs_left
Length embeddings :  1029
Treating STs_right


 89%|████████▉ | 50/56 [00:03<00:00, 13.32it/s]

Length embeddings :  1029
Treating STsbr_left
Length embeddings :  1029
Treating STsbr_right
Length embeddings :  1029
Treating ScCal-SLi_left


 93%|█████████▎| 52/56 [00:03<00:00, 11.17it/s]

Length embeddings :  1029
Treating ScCal-SLi_right
Length embeddings :  1029
Treating SsP-SPaint_left


100%|██████████| 56/56 [00:04<00:00, 13.69it/s]

Length embeddings :  1029
Treating SsP-SPaint_right
Length embeddings :  1029
Treating fronto-parietal_medial_face_left
Length embeddings :  1029
Treating fronto-parietal_medial_face_right
Length embeddings :  1029





# concat the embeddings to get N regions * latent size input tensors

In [9]:
X = np.stack([embds_dict[reg] for reg in regions_to_treat], axis=-1)

In [10]:
Y = embd['diagnosis'].tolist()
Y = np.array([1 if diag!='control' else 0 for diag in Y])

In [11]:
# define train / val / test
train_idxs = embd.loc[embd['ID_train_val_test'].isin(train_val_subjects['ID'])].index.to_numpy().astype(int)
test_idxs = embd.loc[embd['ID_train_val_test'].isin(test_subjects['ID'])].index.to_numpy().astype(int)
X_train, Y_train = X[train_idxs], Y[train_idxs]
X_test, Y_test = X[test_idxs], Y[test_idxs]

In [12]:
X_train.shape, X_test.shape

((804, 32, 56), (95, 32, 56))

# Non Linear / Linear classifiers

In [None]:
# use cross validation

# Custom MLP

In [150]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomConcatClassifier(nn.Module):
    def __init__(self, N, d, p):
        super(CustomConcatClassifier, self).__init__()
        self.N = N
        self.d = d
        self.p = p

        # One unique linear layer per input vector
        self.individual_linears = nn.ModuleList([nn.Linear(d, p) for _ in range(N)])

        # activation
        self.activation = nn.LeakyReLU()

        # Final classifier: from concatenated vector (N*p) to 1
        self.classifier = nn.Linear(N * p, 1)

        self.m = nn.Sigmoid()

    def forward(self, x):
        """
        x: Tensor of shape (batch_size, N, d)
        """
        assert x.shape[1:] == (self.d, self.N), f"Expected input shape (batch_size, {self.d}, {self.N})"

        # Apply each individual linear to its corresponding vector
        features = []
        for i in range(self.N):
            vec = x[:, :, i]                     # shape: (batch_size, d)
            out = self.individual_linears[i](vec)  # shape: (batch_size, p)
            features.append(out)

        # Concatenate all transformed vectors: shape (batch_size, N*p)
        concatenated = torch.cat(features, dim=1)

        # activation
        concatenated = self.activation(concatenated)

        # Final classification layer
        logits = self.classifier(concatenated)   # shape: (batch_size, 1)

        # Binary classification using softmax over 2 outputs: [-logit, +logit]
        #probs = F.softmax(torch.cat([-logits, logits], dim=1), dim=1)  # shape: (batch_size, 2)
        probs = self.m(logits)

        return probs

In [151]:
# If your data is in NumPy arrays
X_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_tensor = torch.tensor(Y_train, dtype=torch.float32)
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_tensor, Y_tensor)
from torch.utils.data import DataLoader
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

In [152]:
X_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_tensor = torch.tensor(Y_test, dtype=torch.float32)
test_dataset = TensorDataset(X_tensor, Y_tensor)
testloader = DataLoader(test_dataset, batch_size=1, shuffle=False, pin_memory=True)

In [153]:
model = CustomConcatClassifier(N=56, d=32, p=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CustomConcatClassifier(
  (individual_linears): ModuleList(
    (0-55): 56 x Linear(in_features=32, out_features=3, bias=True)
  )
  (activation): LeakyReLU(negative_slope=0.01)
  (classifier): Linear(in_features=168, out_features=1, bias=True)
  (m): Sigmoid()
)

In [154]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

In [156]:
X_train.shape

(804, 32, 56)

In [None]:
for epoch in tqdm(range(10)):  # loop over the dataset multiple times

    loss_epoch = 0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs.reshape(-1), labels)
        loss.backward()
        optimizer.step()

        loss_epoch += loss
    
    print(loss_epoch)

print('Finished Training')

  3%|▎         | 1/30 [00:00<00:23,  1.25it/s]

tensor(67.5984, device='cuda:0', grad_fn=<AddBackward0>)


  7%|▋         | 2/30 [00:01<00:21,  1.31it/s]

tensor(39.0828, device='cuda:0', grad_fn=<AddBackward0>)


 10%|█         | 3/30 [00:02<00:20,  1.32it/s]

tensor(27.0276, device='cuda:0', grad_fn=<AddBackward0>)


 13%|█▎        | 4/30 [00:02<00:18,  1.42it/s]

tensor(20.9990, device='cuda:0', grad_fn=<AddBackward0>)


 17%|█▋        | 5/30 [00:03<00:17,  1.43it/s]

tensor(16.0949, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 6/30 [00:04<00:16,  1.49it/s]

tensor(13.2307, device='cuda:0', grad_fn=<AddBackward0>)


 23%|██▎       | 7/30 [00:04<00:14,  1.60it/s]

tensor(11.4017, device='cuda:0', grad_fn=<AddBackward0>)


 27%|██▋       | 8/30 [00:05<00:12,  1.70it/s]

tensor(10.2459, device='cuda:0', grad_fn=<AddBackward0>)


 30%|███       | 9/30 [00:05<00:11,  1.78it/s]

tensor(8.7094, device='cuda:0', grad_fn=<AddBackward0>)


 33%|███▎      | 10/30 [00:06<00:10,  1.83it/s]

tensor(8.4718, device='cuda:0', grad_fn=<AddBackward0>)


 37%|███▋      | 11/30 [00:06<00:10,  1.80it/s]

tensor(7.4544, device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 12/30 [00:07<00:10,  1.65it/s]

tensor(6.5185, device='cuda:0', grad_fn=<AddBackward0>)


 43%|████▎     | 13/30 [00:08<00:10,  1.65it/s]

tensor(5.2930, device='cuda:0', grad_fn=<AddBackward0>)


 47%|████▋     | 14/30 [00:08<00:09,  1.64it/s]

tensor(4.7016, device='cuda:0', grad_fn=<AddBackward0>)


 50%|█████     | 15/30 [00:09<00:09,  1.62it/s]

tensor(4.2804, device='cuda:0', grad_fn=<AddBackward0>)


 53%|█████▎    | 16/30 [00:09<00:08,  1.67it/s]

tensor(3.6443, device='cuda:0', grad_fn=<AddBackward0>)


 57%|█████▋    | 17/30 [00:10<00:07,  1.64it/s]

tensor(3.7764, device='cuda:0', grad_fn=<AddBackward0>)


 60%|██████    | 18/30 [00:11<00:07,  1.65it/s]

tensor(3.2513, device='cuda:0', grad_fn=<AddBackward0>)


 63%|██████▎   | 19/30 [00:11<00:06,  1.66it/s]

tensor(2.7947, device='cuda:0', grad_fn=<AddBackward0>)


 67%|██████▋   | 20/30 [00:12<00:05,  1.69it/s]

tensor(2.5120, device='cuda:0', grad_fn=<AddBackward0>)


 70%|███████   | 21/30 [00:12<00:05,  1.69it/s]

tensor(2.4787, device='cuda:0', grad_fn=<AddBackward0>)


 73%|███████▎  | 22/30 [00:13<00:04,  1.70it/s]

tensor(1.8829, device='cuda:0', grad_fn=<AddBackward0>)


 77%|███████▋  | 23/30 [00:14<00:04,  1.70it/s]

tensor(1.6961, device='cuda:0', grad_fn=<AddBackward0>)


 80%|████████  | 24/30 [00:14<00:03,  1.69it/s]

tensor(1.4959, device='cuda:0', grad_fn=<AddBackward0>)


 83%|████████▎ | 25/30 [00:15<00:02,  1.68it/s]

tensor(1.4244, device='cuda:0', grad_fn=<AddBackward0>)


 87%|████████▋ | 26/30 [00:15<00:02,  1.69it/s]

tensor(1.3354, device='cuda:0', grad_fn=<AddBackward0>)


 90%|█████████ | 27/30 [00:16<00:01,  1.69it/s]

tensor(1.1492, device='cuda:0', grad_fn=<AddBackward0>)


 93%|█████████▎| 28/30 [00:17<00:01,  1.70it/s]

tensor(1.0306, device='cuda:0', grad_fn=<AddBackward0>)


 97%|█████████▋| 29/30 [00:17<00:00,  1.67it/s]

tensor(0.9331, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 30/30 [00:18<00:00,  1.63it/s]

tensor(0.8665, device='cuda:0', grad_fn=<AddBackward0>)
Finished Training





In [158]:
model.eval()

CustomConcatClassifier(
  (individual_linears): ModuleList(
    (0-55): 56 x Linear(in_features=32, out_features=3, bias=True)
  )
  (activation): LeakyReLU(negative_slope=0.01)
  (classifier): Linear(in_features=168, out_features=1, bias=True)
  (m): Sigmoid()
)

In [159]:
outs = []
for data in testloader:
    inputs, _ = data
    inputs = inputs.to(device)
    outputs = model(inputs)
    out = outputs.detach().cpu().numpy()
    outs.append(out)

In [160]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(Y_test.reshape(-1,1), np.array(outs).reshape(-1,1))
roc_auc

0.616279069767442