In [48]:
import os
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

# Create numpy files to pass to compute_pca

## Filter subsets (train and other)

In [49]:
# List of training subjects. The file has no header row, so it is read raw
# and its single column is named afterwards.
train_subjects_csv = "/neurospin/dico/data/deep_folding/current/datasets/schiz/train_subjects.csv"

train_subjects = pd.read_csv(train_subjects_csv, header=None)
train_subjects.columns = ['Subject']
train_subjects

Unnamed: 0,Subject
0,ESOC10104
1,A00014522
2,A00001243
3,or130001
4,A00028405
...,...
923,HC001
924,HC027
925,SS086
926,HC023


In [50]:
# Subjects for which a crop exists, read with the default integer index.
# NOTE(review): this is the R-side subject list, while the arrays indexed further
# down are loaded per `side` -- presumably both sides share the same subject
# order; confirm.
used_subjects_csv = "/neurospin/dico/data/deep_folding/current/datasets/schiz/crops/2mm/CINGULATE/mask/Rskeleton_subject.csv"

used_subjects = pd.read_csv(used_subjects_csv)
used_subjects

Unnamed: 0,Subject
0,INV056VJPL6
1,INV05AFGN2Z
2,INV07WT2ZL3
3,INV0AL14J6U
4,INV0CF8E46F
...,...
1287,st110288
1288,va110289
1289,ye110322
1290,yh100442


In [51]:
# Boolean mask: True for the used subjects that also appear in the train list
in_train = used_subjects["Subject"].isin(train_subjects["Subject"])

# "fit" = used subjects in the train list, "transform" = every other used subject
fit_subjects = used_subjects[in_train]
transform_subjects = used_subjects[~in_train]

# Index labels of each subset; they are used below to select rows of the numpy arrays
fit_indices = fit_subjects.index.values
transform_indices = transform_subjects.index.values

print(len(fit_indices), fit_indices[:10])
print(len(transform_indices), transform_indices[:10])

928 [ 0  1  2  3  7  8 10 11 14 15]
364 [ 4  5  6  9 12 13 17 19 24 27]


In [42]:
# Save the fit and transform subject lists next to the PCA inputs
save_path = "/neurospin/dico/data/deep_folding/current/datasets/schiz/crops/2mm/CINGULATE/mask/pca"

# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing (idempotent thanks to exist_ok=True).
os.makedirs(save_path, exist_ok=True)

# header=False: no header row is written; the DataFrame index is still written
# as the first column (to_csv default).
fit_subjects.to_csv(os.path.join(save_path, "fit_subject.csv"), header=False)
transform_subjects.to_csv(os.path.join(save_path, "transform_subject.csv"), header=False)

## Create the relevant numpy arrays

In [43]:
# Hemisphere prefix used in the file names below
side = 'L'

skeleton_npy = f"/neurospin/dico/data/deep_folding/current/datasets/schiz/crops/2mm/CINGULATE/mask/{side}skeleton.npy"
full_arr = np.load(skeleton_npy)
full_arr.shape

(1292, 18, 41, 38, 1)

In [44]:
# Select the rows (first axis) of the full array belonging to each subset
fit_arr, transform_arr = (full_arr[idx] for idx in (fit_indices, transform_indices))

print(fit_arr.shape, transform_arr.shape)

(928, 18, 41, 38, 1) (364, 18, 41, 38, 1)


In [45]:
# exist_ok=True makes the directory creation idempotent and removes the
# check-then-create race of `if not exists: makedirs`.
os.makedirs(save_path, exist_ok=True)

# One file per subset, prefixed by the current hemisphere (`side`)
np.save(os.path.join(save_path, f"fit_{side}skeleton.npy"), fit_arr)
np.save(os.path.join(save_path, f"transform_{side}skeleton.npy"), transform_arr)

### Create the fused (both sides) version to compare with the both-sides SimCLR

In [46]:
def _load_flat(filename):
    """Load an array stored under `save_path` and flatten every axis but the first."""
    arr = np.load(os.path.join(save_path, filename))
    n_rows = arr.shape[0]
    return arr.reshape(n_rows, np.prod(arr.shape[1:]))


# NOTE: the R files are only written when the saving cell above has been run
# with side = 'R'; this cell expects both sides to be on disk already.
# Arrays are flattened to 2D so that the two sides can be concatenated.
R_fit = _load_flat("fit_Rskeleton.npy")
L_fit = _load_flat("fit_Lskeleton.npy")
R_transform = _load_flat("transform_Rskeleton.npy")
L_transform = _load_flat("transform_Lskeleton.npy")

# Side-by-side features: R columns first, then L columns
both_fit = np.concatenate((R_fit, L_fit), axis=1)
both_transform = np.concatenate((R_transform, L_transform), axis=1)

print(both_fit.shape, both_transform.shape)

(928, 53884) (364, 53884)


In [47]:
# Persist the concatenated (both sides) flattened arrays
fit_both_path = os.path.join(save_path, "fit_both_flat_skeleton.npy")
transform_both_path = os.path.join(save_path, "transform_both_flat_skeleton.npy")

np.save(fit_both_path, both_fit)
np.save(transform_both_path, both_transform)

# Compute PCA

In [91]:
# Folder receiving the PCA outputs; created only when it is missing
embeddings_dir = save_path+'/embeddings'
if not os.path.exists(embeddings_dir):
    os.mkdir(embeddings_dir)

In [92]:
def compute_pca(X_fit, X_transform, name, n_components=10, random_state=0):
    """Fit a PCA on `X_fit`, project both inputs and save the projections.

    The projections are written to `<save_path>/embeddings/<name>_fit.npy` and
    `<save_path>/embeddings/<name>_transform.npy`, where `save_path` is the
    notebook-level variable.

    Parameters
    ----------
    X_fit : 2D array
        Data the PCA is fitted on; also projected.
    X_transform : 2D array
        Data projected with the PCA fitted on `X_fit` (not used for fitting).
    name : str
        Prefix of the saved file names.
    n_components : int, default 10
        Number of principal components kept.
    random_state : int or None, default 0
        Seed forwarded to `PCA`. For large inputs scikit-learn may select the
        randomized SVD solver, whose output changes from run to run unless a
        seed is given. Pass None to get the unseeded behaviour.

    Returns
    -------
    (Y_fit, Y_transform) : tuple of 2D arrays
        Projections of `X_fit` and `X_transform`.
    """
    pca = PCA(n_components=n_components, random_state=random_state)

    pca.fit(X_fit)
    Y_fit = pca.transform(X_fit)
    Y_transform = pca.transform(X_transform)

    # Make the function self-sufficient: np.save fails if the folder is missing
    embeddings_dir = os.path.join(save_path, "embeddings")
    os.makedirs(embeddings_dir, exist_ok=True)

    np.save(os.path.join(embeddings_dir, f"{name}_fit.npy"), Y_fit)
    np.save(os.path.join(embeddings_dir, f"{name}_transform.npy"), Y_transform)

    return Y_fit, Y_transform

In [93]:
# The per-side inputs are the flattened R_* / L_* arrays built when creating both_fit
pca_R_fit, pca_R_transform = compute_pca(X_fit=R_fit, X_transform=R_transform, name='pca_R')
pca_L_fit, pca_L_transform = compute_pca(X_fit=L_fit, X_transform=L_transform, name='pca_L')
pca_both_fit, pca_both_transform = compute_pca(
    X_fit=both_fit,
    X_transform=both_transform,
    name='pca_both',
)

In [82]:
# Quick look at the identifiers of the transform subjects
transform_subjects["Subject"].values

array(['INV0CF8E46F', 'INV0HGGUV9D', 'INV0KXCWYBC', 'INV0PHJ7BTW',
       'INV11JMTY1C', 'INV132W9267', 'INV17T0KUU8', 'INV1EP7FFT6',
       'INV22KFTD88', 'INV2C75ML18', 'INV2E4J99YU', 'INV2LR63NG1',
       'INV2TA9J0N9', 'INV2ZMHWNKA', 'INV3GFP1EVM', 'INV3WWL2XEH',
       'INV47EVTF68', 'INV497MJ3L8', 'INV4AA7CCGL', 'INV4FRH8AGB',
       'INV4XHTBW9J', 'INV55KY3YYJ', 'INV5EFE93NB', 'INV5FCF312N',
       'INV5HDUG6R8', 'INV5LXHAXWU', 'INV5TDTH4X9', 'INV5VN6XHAU',
       'INV5WFNEU0F', 'INV5XA7G6NY', 'INV64AL1N24', 'INV64HRZ0ZP',
       'INV65MBF5YK', 'INV6BKWREW3', 'INV6CH6YU04', 'INV6GG8BG9T',
       'INV6HPWCVVA', 'INV6JL83AF9', 'INV6NFZR9Z7', 'INV6NYR91TX',
       'INV6T5U9DNA', 'INV6YJ1X3KF', 'INV6ZREV8DN', 'INV7PN5J29U',
       'INV7V2ACR47', 'INV7VJB9UG1', 'INV80RD6NKC', 'INV812HU2ZP',
       'INV85VZYDRM', 'INV89625TBA', 'INV89B0GTD8', 'INV8DVCNM0U',
       'INV8E40CF4H', 'INV8E62WHVE', 'INV8KECEAVD', 'INV8RFEMPC8',
       'INV8TAB1G3H', 'INV937EPU3T', 'INV95JFLU47', 'INV99H2F7

## Transform the result numpy arrays into embeddings

In [94]:
def numpy_to_embs(pca_numpy, subjects, save_path=None):
    """Build an embeddings table (subject ID + one column per dimension).

    Parameters
    ----------
    pca_numpy : 2D array, shape (n_subjects, n_dims)
        Embedding values, one row per subject.
    subjects : pandas.DataFrame
        One row per subject, in the same order as the rows of `pca_numpy`.
        Its 'Subject' column is renamed to 'ID'; its index is discarded.
    save_path : str or None, default None
        When given, the table is also written to this CSV file (without index).

    Returns
    -------
    pandas.DataFrame
        The columns of `subjects` followed by 'dim_1' ... 'dim_<n_dims>'.

    Raises
    ------
    ValueError
        If `subjects` and `pca_numpy` do not have the same number of rows.
    """
    n_rows, n_dims = pca_numpy.shape

    # pd.concat(axis=1) would silently pad the shorter input with NaN and
    # misalign IDs and embeddings, so refuse mismatched lengths up front.
    if len(subjects) != n_rows:
        raise ValueError(
            f"subjects has {len(subjects)} rows but pca_numpy has {n_rows} rows"
        )

    columns = ["dim_"+str(i+1) for i in range(n_dims)]
    pca_embeddings = pd.DataFrame(pca_numpy, columns=columns)

    # add subjects; the index is reset so that rows are aligned by position
    subjects_new = subjects.reset_index(drop=True)
    subjects_new = subjects_new.rename(columns={'Subject': 'ID'})
    pca_embeddings = pd.concat([subjects_new, pca_embeddings], axis=1)

    # save the embeddings if required
    if save_path is not None:
        pca_embeddings.to_csv(save_path, index=False)

    return pca_embeddings

In [95]:
# Folder holding every embeddings CSV
embeddings_csv_dir = save_path+'/embeddings'

# fit subjects: right side, left side, then both sides
numpy_to_embs(pca_R_fit, fit_subjects,
              save_path=embeddings_csv_dir+'/pca_R_fit_embeddings.csv')
numpy_to_embs(pca_L_fit, fit_subjects,
              save_path=embeddings_csv_dir+'/pca_L_fit_embeddings.csv')
numpy_to_embs(pca_both_fit, fit_subjects,
              save_path=embeddings_csv_dir+'/pca_both_fit_embeddings.csv')

# transform subjects, same order; the last call is left as a bare expression
# so that the notebook displays its table
numpy_to_embs(pca_R_transform, transform_subjects,
              save_path=embeddings_csv_dir+'/pca_R_transform_embeddings.csv')
numpy_to_embs(pca_L_transform, transform_subjects,
              save_path=embeddings_csv_dir+'/pca_L_transform_embeddings.csv')
numpy_to_embs(pca_both_transform, transform_subjects,
              save_path=embeddings_csv_dir+'/pca_both_transform_embeddings.csv')

Unnamed: 0,ID,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10
0,INV0CF8E46F,305.195801,-80.943886,247.671134,-68.466904,5.378986,146.719367,62.118010,-70.431311,80.566163,97.304679
1,INV0HGGUV9D,-83.993981,63.229387,36.605117,207.476647,-236.915436,13.037100,-99.199741,-27.061061,4.116672,77.406326
2,INV0KXCWYBC,256.678417,-259.989763,-137.905382,-151.621715,48.059500,-39.866986,-152.261413,-33.003600,209.273682,13.230429
3,INV0PHJ7BTW,271.064856,-156.400676,17.997159,250.668260,-53.425662,6.647447,-97.697806,170.856311,1.285498,26.193134
4,INV11JMTY1C,291.374197,-31.341903,40.679763,-144.928868,-3.894235,-164.397336,15.070019,203.452546,-30.955545,135.634891
...,...,...,...,...,...,...,...,...,...,...,...
359,jp100096,246.479524,284.457441,-26.534381,181.876100,96.670908,-154.453979,-309.537351,194.539051,7.722173,-150.682872
360,ks100063,-40.614663,161.667889,-332.621048,-189.329395,-77.587934,44.629927,87.695436,-9.775877,111.057478,1.904023
361,mn100399,-277.521452,-94.358357,-16.349846,-30.474885,-96.064432,276.507680,84.742400,93.659757,-61.980005,-36.808249
362,ra120073,139.050257,-405.279273,195.736125,-134.691583,8.149286,-72.130600,175.833507,-67.940715,137.655370,166.838637


The actual classification results are produced by train_multiple_classifiers.