In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Output directory: every table, subject list and merged crop produced by this
# notebook is written below this path.
save_path = "/neurospin/dico/data/deep_folding/current/datasets/schiz/"

### Useful functions

In [3]:
def remove_sub(liste):
    """Strip a leading 'sub-' prefix from every identifier.

    Parameters
    ----------
    liste : iterable of str
        Subject identifiers (list, numpy array or pandas Series of strings).

    Returns
    -------
    list of str
        Same identifiers, in the same order, without the leading 'sub-'.
        Identifiers that do not start with 'sub-' are returned unchanged, even
        if 'sub-' appears somewhere else in the string.
    """
    prefix = 'sub-'
    # Test the start of the string only: a containment test would chop the
    # first characters of ids that merely contain 'sub-' further along.
    return [name[len(prefix):] if name.startswith(prefix) else name for name in liste]


def remove_suffixes(liste):
    """Strip a trailing session suffix ('_ses-1' or '_ses-v1') from every identifier.

    Parameters
    ----------
    liste : iterable of str
        Subject identifiers (list, numpy array or pandas Series of strings).

    Returns
    -------
    list of str
        Same identifiers, in the same order, without the trailing session
        suffix. Identifiers that do not end with one of the suffixes are
        returned unchanged.
    """
    suffixes = ('_ses-1', '_ses-v1')
    cured_list = []
    for name in liste:
        for suffix in suffixes:
            # Test the end of the string only: a containment test would cut
            # the tail of ids where the session marker sits in the middle.
            if name.endswith(suffix):
                name = name[:-len(suffix)]
                break
        cured_list.append(name)
    return cured_list

In [4]:
# Sanity check of the two helpers, applied separately and then chained,
# on made-up identifiers mixing the 'sub-' prefix and both session suffixes.

test_list = ['tranquille', 'sub-gros_sac', 'pasb1_ses-1', 'pasb2_ses-v1', 'sub-demoniac_ses-v1']

print(remove_sub(test_list))
print(remove_suffixes(test_list))
print(remove_sub(remove_suffixes(test_list)))

['tranquille', 'gros_sac', 'pasb1_ses-1', 'pasb2_ses-v1', 'demoniac_ses-v1']
['tranquille', 'sub-gros_sac', 'pasb1', 'pasb2', 'sub-demoniac']
['tranquille', 'gros_sac', 'pasb1', 'pasb2', 'demoniac']


# Load and merge the participants.tsv files

Use the participants.tsv directly from psy_sbox

## Load

In [5]:
# Participant tables of the four studies, read from psy_sbox.
# All of them are tab-separated, including the CANDI file despite its .csv extension.
bsnip_subjects = pd.read_csv("/neurospin/psy_sbox/bsnip1/participants.tsv", sep='\t')
candi_subjects = pd.read_csv("/neurospin/psy_sbox/candi/CANDI_t1mri_mwp1_participants.csv", sep='\t')
cnp_subjects = pd.read_csv("/neurospin/psy_sbox/cnp/participants.tsv", sep='\t')
schizconnect_subjects = pd.read_csv("/neurospin/psy_sbox/schizconnect-vip-prague/participants.tsv", sep='\t')

## Preprocess

### bsnip

In [6]:
bsnip_subjects

Unnamed: 0,participant_id,sex,age,diagnosis,study,site,phenotype
0,INV07WT2ZL3,0.0,29.0,control,BSNIP,Dallas,Control
1,INV0AL14J6U,0.0,43.0,schizophrenia,BSNIP,Dallas,Case
2,INV0B7HKFAY,0.0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas,Relative
3,INV0G5RXTCC,1.0,34.0,schizoaffective disorder,BSNIP,Dallas,Case
4,INV0WW10RBB,1.0,42.0,schizoaffective disorder,BSNIP,Dallas,Case
...,...,...,...,...,...,...,...
1089,INVZBW8CC34,0.0,22.0,control,BSNIP,Hartford,Control
1090,INVZJDY9PR0,1.0,57.0,relative of proband with psychotic bipolar dis...,BSNIP,Hartford,Relative
1091,INVZR467366,1.0,44.0,relative of proband with schizoaffective disorder,BSNIP,Hartford,Relative
1092,INVZTBFRT07,1.0,54.0,control,BSNIP,Hartford,Control


In [7]:
# Subjects flagged qc == 1 in the deep_folding QC table are the ones to keep;
# their ids are stored without the 'sub-' prefix.
bsnip_qc = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/bsnip1/qc.tsv", sep='\t')
bsnip_kept_sub = remove_sub(
    bsnip_qc.loc[bsnip_qc['qc'] == 1, 'participant_id'].astype(str).to_numpy()
)

print(len(bsnip_kept_sub))
bsnip_kept_sub[:5]

1080


['INV1BKERGYD', 'INVJ8GHEEJK', 'INVW5TVLKH1', 'INV77JBGWZY', 'INVYYBGWY27']

In [8]:
# Drop the 'phenotype' column and keep only the subjects that passed the deep_folding QC.
# The 0/1 -> M/F mapping of 'sex' is not known, so that column is left as is for now.
bsnip_subjects_processed = (
    bsnip_subjects
    .drop(columns='phenotype')
    .loc[lambda df: df['participant_id'].isin(bsnip_kept_sub)]
)

bsnip_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0.0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0.0,43.0,schizophrenia,BSNIP,Dallas
2,INV0B7HKFAY,0.0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas
3,INV0G5RXTCC,1.0,34.0,schizoaffective disorder,BSNIP,Dallas
4,INV0WW10RBB,1.0,42.0,schizoaffective disorder,BSNIP,Dallas
...,...,...,...,...,...,...
1089,INVZBW8CC34,0.0,22.0,control,BSNIP,Hartford
1090,INVZJDY9PR0,1.0,57.0,relative of proband with psychotic bipolar dis...,BSNIP,Hartford
1091,INVZR467366,1.0,44.0,relative of proband with schizoaffective disorder,BSNIP,Hartford
1092,INVZTBFRT07,1.0,54.0,control,BSNIP,Hartford


### candi

In [9]:
candi_subjects.columns

Index(['participant_id', 'session', 'TIV', 'CSF_Vol', 'GM_Vol', 'WM_Vol',
       'l3thVen_GM_Vol', 'r3thVen_GM_Vol', 'l4thVen_GM_Vol', 'r4thVen_GM_Vol',
       ...
       'age', 'Handedness', 'Handed_extended', 'Weight (lbs)', 'Height (in)',
       'Head_Circumference (cm)', 'Tanner_Stage', 'diagnosis', 'site',
       'study'],
      dtype='object', length=302)

No QC for candi

In [10]:
# Keep only the demographic columns shared with the other studies.
# 'sex' stays encoded as 0/1: the mapping to M/F is not known yet.
candi_columns = ['participant_id', 'sex', 'age', 'study', 'site', 'diagnosis']
candi_subjects_processed = candi_subjects.loc[:, candi_columns].copy()

candi_subjects_processed

Unnamed: 0,participant_id,sex,age,study,site,diagnosis
0,BPDwoPsy040,0,11.4,CANDI,CANDI,bipolar disorder without psychosis
1,HC017,0,9.0,CANDI,CANDI,control
2,HC019,1,14.7,CANDI,CANDI,control
3,BPDwoPsy056,0,8.1,CANDI,CANDI,bipolar disorder without psychosis
4,SS097,1,15.4,CANDI,CANDI,schizophrenia
...,...,...,...,...,...,...
98,HC014,0,8.0,CANDI,CANDI,control
99,BPDwoPsy030,1,9.4,CANDI,CANDI,bipolar disorder without psychosis
100,HC025,0,9.6,CANDI,CANDI,control
101,BPDwoPsy058,1,9.1,CANDI,CANDI,bipolar disorder without psychosis


### cnp

In [11]:
cnp_subjects

Unnamed: 0,participant_id,diagnosis,age,gender,bart,bht,dwi,pamenc,pamret,rest,scap,stopsignal,T1w,taskswitch,ScannerSerialNumber,ghost_NoGhost
0,sub-10159,CONTROL,30,F,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
1,sub-10171,CONTROL,24,M,1.0,1.0,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
2,sub-10189,CONTROL,49,M,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
3,sub-10193,CONTROL,40,M,1.0,,1.0,,,,,,1.0,,35343.0,No_ghost
4,sub-10206,CONTROL,21,M,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,sub-70079,ADHD,21,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
268,sub-70080,ADHD,48,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
269,sub-70081,ADHD,50,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
270,sub-70083,ADHD,46,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost


In [12]:
# Subjects flagged qc == 1 in the deep_folding QC table are the ones to keep
# (the ids still carry their 'sub-' prefix at this point).
cnp_qc = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/cnp/qc.tsv", sep='\t')
cnp_kept_sub = cnp_qc.loc[cnp_qc['qc'] == 1, 'participant_id'].astype(str).to_numpy()

print(len(cnp_kept_sub))
cnp_kept_sub[:5]

264


array(['sub-60043', 'sub-70007', 'sub-10235', 'sub-11142', 'sub-10719'],
      dtype=object)

In [13]:
# Align the CNP table with the other studies: 'gender' becomes 'sex', and the
# constant 'site' / 'study' columns are added.
cnp_subjects_processed = (
    cnp_subjects[['participant_id', 'gender', 'age', 'diagnosis']]
    .rename(columns={'gender': 'sex'})
    .assign(site='CNP', study='CNP')
)

# Keep only the subjects that passed the deep_folding QC
cnp_subjects_processed = cnp_subjects_processed[cnp_subjects_processed['participant_id'].isin(cnp_kept_sub)]

cnp_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,site,study
0,sub-10159,F,30,CONTROL,CNP,CNP
1,sub-10171,M,24,CONTROL,CNP,CNP
2,sub-10189,M,49,CONTROL,CNP,CNP
3,sub-10193,M,40,CONTROL,CNP,CNP
4,sub-10206,M,21,CONTROL,CNP,CNP
...,...,...,...,...,...,...
267,sub-70079,M,21,ADHD,CNP,CNP
268,sub-70080,M,48,ADHD,CNP,CNP
269,sub-70081,M,50,ADHD,CNP,CNP
270,sub-70083,F,46,ADHD,CNP,CNP


In [14]:
# Remove 'sub-' from the participant ids of the CNP table.

# cnp_kept_sub is redefined here as the ids that are both QC-approved and
# present in participants.tsv; they still carry 'sub-' (a later cell strips it).
cnp_kept_sub = cnp_subjects_processed.participant_id.values

cured_cnp_sub = remove_sub(cnp_kept_sub)

# assign() returns a new frame instead of writing into the boolean-filtered
# frame built by the previous cell, which avoids pandas' SettingWithCopyWarning.
cnp_subjects_processed = cnp_subjects_processed.assign(participant_id=cured_cnp_sub)

cnp_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,site,study
0,10159,F,30,CONTROL,CNP,CNP
1,10171,M,24,CONTROL,CNP,CNP
2,10189,M,49,CONTROL,CNP,CNP
3,10193,M,40,CONTROL,CNP,CNP
4,10206,M,21,CONTROL,CNP,CNP
...,...,...,...,...,...,...
267,70079,M,21,ADHD,CNP,CNP
268,70080,M,48,ADHD,CNP,CNP
269,70081,M,50,ADHD,CNP,CNP
270,70083,F,46,ADHD,CNP,CNP


### schizconnect

In [15]:
print(schizconnect_subjects.shape[0])
schizconnect_subjects.columns

738


Index(['participant_id', 'sex', 'age', 'diagnosis', 'study', 'site',
       'medication', 'cannabis_last_month', 'tobacco_last_month',
       'alcohol_last_month', 'BPRS', 'PANSS_total', 'PANSS_positive',
       'PANSS_negative', 'PANSS_psychopatho', 'PANSS_desorganisation', 'SANS',
       'SAPS', 'MADRS', 'SOFAS', 'NSS', 'irm', 'Age of Onset', 'Alcohol',
       'Anticonvulsants', 'Antidepressants', 'Antipsychotics', 'BD Type',
       'Density of Episodes', 'Depression Scale', 'Depression Score',
       'Illness Duration', 'Lithium', 'Mania Scale', 'Mania Score',
       'Mood Phase', 'Number of Depressive Episodes',
       'Number of Manic Episodes', 'Onset Time', 'Psychotic', 'Severity',
       'Total Episodes', 'ymrstot', 'psysoc_65', 'psychosis_lt', 'phenotype',
       'session', 'path', 'TIV', 'CSF_Vol', 'GM_Vol', 'WM_Vol'],
      dtype='object')

In [16]:
# Subjects flagged qc == 1 in the deep_folding QC table are the ones to keep;
# their ids are stored without the 'sub-' prefix.
schizconnect_qc = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague/qc.tsv", sep='\t')
schizconnect_kept_sub = remove_sub(
    schizconnect_qc.loc[schizconnect_qc['qc'] == 1, 'participant_id'].astype(str).to_numpy()
)

print(len(schizconnect_kept_sub))
schizconnect_kept_sub[:5]

734


['A00027391', 'mw130109', 'A00018979', 'dk100082', 'ESOP00056']

In [17]:
# Keep the shared demographic columns and only the QC-approved subjects
schizconnect_subjects_processed = schizconnect_subjects.loc[
    schizconnect_subjects['participant_id'].isin(schizconnect_kept_sub),
    ['participant_id', 'sex', 'age', 'diagnosis', 'study', 'site'],
]

schizconnect_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,A00000300,0.0,36.0,control,SCHIZCONNECT-VIP,MRN
1,A00000368,0.0,52.0,schizophrenia,SCHIZCONNECT-VIP,MRN
2,A00000456,0.0,53.0,schizophrenia,SCHIZCONNECT-VIP,MRN
3,A00000838,0.0,29.0,schizophrenia,SCHIZCONNECT-VIP,MRN
4,A00000909,0.0,27.0,schizophrenia,SCHIZCONNECT-VIP,MRN
...,...,...,...,...,...,...
733,ESOP00043,1.0,25.0,FEP,PRAGUE,PRAGUE
734,ESOP00086,0.0,27.0,FEP,PRAGUE,PRAGUE
735,ESOC10063,0.0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1.0,27.0,control,PRAGUE,PRAGUE


## Fusion

In [18]:
# Stack the four per-study tables. The row labels of each table are kept as is,
# so the index of the merged table is not unique.
schiz_subjects_full = pd.concat([bsnip_subjects_processed, candi_subjects_processed, cnp_subjects_processed, schizconnect_subjects_processed])

schiz_subjects_full

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0,43.0,schizophrenia,BSNIP,Dallas
2,INV0B7HKFAY,0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas
3,INV0G5RXTCC,1,34.0,schizoaffective disorder,BSNIP,Dallas
4,INV0WW10RBB,1,42.0,schizoaffective disorder,BSNIP,Dallas
...,...,...,...,...,...,...
733,ESOP00043,1,25.0,FEP,PRAGUE,PRAGUE
734,ESOP00086,0,27.0,FEP,PRAGUE,PRAGUE
735,ESOC10063,0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1,27.0,control,PRAGUE,PRAGUE


In [19]:
# Homogenize the labels for schizophrenia and control (other diagnoses are left untouched).
# The replacement is restricted to the 'diagnosis' column so that an identical
# string in any other column can never be rewritten by accident.
print(schiz_subjects_full.diagnosis.unique())
schiz_subjects_full = schiz_subjects_full.assign(
    diagnosis=schiz_subjects_full.diagnosis.replace({'CONTROL': 'control', 'SCHZ': 'schizophrenia'})
)
print(schiz_subjects_full.diagnosis.unique())

['control' 'schizophrenia'
 'relative of proband with schizoaffective disorder'
 'schizoaffective disorder' 'psychotic bipolar disorder'
 'relative of proband with schizophrenia'
 'relative of proband with psychotic bipolar disorder'
 'bipolar disorder without psychosis' 'bipolar disorder with psychosis'
 'CONTROL' 'SCHZ' 'BIPOLAR' 'ADHD' 'FEP']
['control' 'schizophrenia'
 'relative of proband with schizoaffective disorder'
 'schizoaffective disorder' 'psychotic bipolar disorder'
 'relative of proband with schizophrenia'
 'relative of proband with psychotic bipolar disorder'
 'bipolar disorder without psychosis' 'bipolar disorder with psychosis'
 'BIPOLAR' 'ADHD' 'FEP']


#### Small checks

In [20]:
# Rows lost between the raw participant tables and the merged table
n_removed = sum(len(raw) for raw in (bsnip_subjects, candi_subjects, cnp_subjects, schizconnect_subjects)) - len(schiz_subjects_full)
print(f"{n_removed} subjects have been removed")

26 subjects have been removed


In [21]:
# NOTE: 'sex' is not homogenized across studies: the merged column mixes
# 0/1 codes with 'F'/'M' strings (the 0/1 -> M/F mapping is not known).
print(schiz_subjects_full.sex.unique())
print(schiz_subjects_full.diagnosis.unique())

[0.0 1.0 'F' 'M']
['control' 'schizophrenia'
 'relative of proband with schizoaffective disorder'
 'schizoaffective disorder' 'psychotic bipolar disorder'
 'relative of proband with schizophrenia'
 'relative of proband with psychotic bipolar disorder'
 'bipolar disorder without psychosis' 'bipolar disorder with psychosis'
 'BIPOLAR' 'ADHD' 'FEP']


#### Save database

In [22]:
# Write the merged table (every diagnosis, before restriction to the train/val/test subjects)
schiz_subjects_full.to_csv(save_path+"schiz_participants_full.csv", index=False)

# Separate train-val-tests

### Load the set defining pickle

In [23]:
# The pickle holds a dict mapping split names to DataFrames
# with columns participant_id, session and study.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted location.
train_val_tests_sets = pd.read_pickle("/neurospin/psy_sbox/analyses/201906_schizconnect-vip-prague-bsnip-biodb-icaar-start_assemble-all/data/train_val_test_test-intra_scz_stratified.pkl")

train_val_tests_sets

{'train':     participant_id  session             study
 0        ESOC10104      1.0            PRAGUE
 1        A00014522      1.0  SCHIZCONNECT-VIP
 2        A00001243      1.0  SCHIZCONNECT-VIP
 3         or130001      1.0  SCHIZCONNECT-VIP
 4        A00028405      1.0  SCHIZCONNECT-VIP
 ..             ...      ...               ...
 928          HC001      1.0             CANDI
 929          HC027      1.0             CANDI
 930          SS086      1.0             CANDI
 931          HC023      1.0             CANDI
 932          SS095      1.0             CANDI
 
 [933 rows x 3 columns],
 'validation':     participant_id  session             study
 0           NM2020      1.0  SCHIZCONNECT-VIP
 1        A00036844      1.0  SCHIZCONNECT-VIP
 2        ESOC10040      1.0            PRAGUE
 3           NM1069      1.0  SCHIZCONNECT-VIP
 4        ESOC10106      1.0            PRAGUE
 ..             ...      ...               ...
 111          10316      1.0               CNP
 112      

In [24]:
# Unpack the four splits from the dict
train_set, val_set, test_intra_set, test_set = (
    train_val_tests_sets[split_name]
    for split_name in ('train', 'validation', 'test_intra', 'test')
)

# Participant ids are converted to str (done in place on each split)
for split_frame in (train_set, val_set, test_intra_set, test_set):
    split_frame['participant_id'] = split_frame['participant_id'].astype(str)

split_sizes = [len(split_frame) for split_frame in (train_set, val_set, test_intra_set, test_set)]
print("shapes", split_sizes)
print(np.sum(split_sizes))

shapes [933, 116, 118, 133]
1300


We need to remove the subjects that did not pass the deep_folding QC or that were not processed by Morphologist.

In [25]:
# CANDI has no deep_folding QC: every participant of the table is kept
candi_kept_sub = candi_subjects.participant_id.values

# remove 'sub-' from cnp_kept_sub (the cnp preprocessing cells left the prefix on it)
cnp_kept_sub = remove_sub(cnp_kept_sub)

# Ids of all the subjects kept across the four studies
kept_sub = np.concatenate([bsnip_kept_sub, cnp_kept_sub, candi_kept_sub, schizconnect_kept_sub])

print(len(kept_sub)) # should be equal to 2181

# Restrict every split to the kept subjects
train_set = train_set[train_set.participant_id.isin(kept_sub)]
val_set = val_set[val_set.participant_id.isin(kept_sub)]
test_intra_set = test_intra_set[test_intra_set.participant_id.isin(kept_sub)]
test_set = test_set[test_set.participant_id.isin(kept_sub)]

shapes = [train_set.shape[0], val_set.shape[0], test_intra_set.shape[0], test_set.shape[0]]

print(shapes)
print(np.sum(shapes)) # should be equal to 1292

2181
[928, 116, 118, 130]
1292


### Create train-val-tests csv files

In [26]:
# One participant id per line, without header, for each split
for split_file, split_table in (
    ('train_subjects.csv', train_set),
    ('val_subjects.csv', val_set),
    ('test_intra_subjects.csv', test_intra_set),
    ('test_subjects.csv', test_set),
):
    split_table['participant_id'].astype(str).to_csv(save_path + split_file, index=False, header=False)

### Create a file with all used subjects

In [27]:
# Stack the four filtered splits; each split keeps its own row labels
all_sets = pd.concat([train_set, val_set, test_intra_set, test_set])
all_sets

Unnamed: 0,participant_id,session,study
0,ESOC10104,1.0,PRAGUE
1,A00014522,1.0,SCHIZCONNECT-VIP
2,A00001243,1.0,SCHIZCONNECT-VIP
3,or130001,1.0,SCHIZCONNECT-VIP
4,A00028405,1.0,SCHIZCONNECT-VIP
...,...,...,...
128,INV64AL1N24,1.0,BSNIP
129,INVBEKLL87A,1.0,BSNIP
130,INV11JMTY1C,1.0,BSNIP
131,INVK4HB8HEX,1.0,BSNIP


The 'sub-' prefix is missing for some individuals.

The affected subjects are the ones that are not common to 'all_sets' and 'schiz_subjects_full'.

In [28]:
# add 'sub-' for the subjects that need it
#all_sets.loc[~all_sets.participant_id.isin(schiz_subjects_full.participant_id), 'participant_id'] = 'sub-' + all_sets.loc[~all_sets.participant_id.isin(schiz_subjects_full.participant_id), 'participant_id']

#### Filter schiz_subjects_full by keeping only the actually used subjects (schizophrenia and control)

In [29]:
# Rows of the merged table whose participant belongs to one of the splits
is_used = schiz_subjects_full['participant_id'].isin(all_sets['participant_id'])
schiz_subjects = schiz_subjects_full.loc[is_used]
schiz_subjects

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0,43.0,schizophrenia,BSNIP,Dallas
6,INV14XK7P6E,0,30.0,control,BSNIP,Dallas
7,INV1HXNTXYF,1,18.0,control,BSNIP,Dallas
9,INV1XCNF4J5,1,25.0,control,BSNIP,Dallas
...,...,...,...,...,...,...
729,ESOC10060,1,26.0,control,PRAGUE,PRAGUE
731,ESOC10019,1,28.0,control,PRAGUE,PRAGUE
735,ESOC10063,0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1,27.0,control,PRAGUE,PRAGUE


In [30]:
# Save the table of used subjects to csv (row labels are not written)
schiz_subjects.to_csv(save_path + 'used_schiz_subjects.csv', index=False)

# Fusion numpy & Rskeletons.csv

In [183]:
# Reload the table of used subjects from the csv written above
# (presumably so that this section can be run without re-running the first part).
schiz_subjects = pd.read_csv(save_path + 'used_schiz_subjects.csv')
schiz_subjects

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0.0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0.0,43.0,schizophrenia,BSNIP,Dallas
2,INV14XK7P6E,0.0,30.0,control,BSNIP,Dallas
3,INV1HXNTXYF,1.0,18.0,control,BSNIP,Dallas
4,INV1XCNF4J5,1.0,25.0,control,BSNIP,Dallas
...,...,...,...,...,...,...
1287,ESOC10060,1.0,26.0,control,PRAGUE,PRAGUE
1288,ESOC10019,1.0,28.0,control,PRAGUE,PRAGUE
1289,ESOC10063,0.0,31.0,control,PRAGUE,PRAGUE
1290,ESOC10098,1.0,27.0,control,PRAGUE,PRAGUE


Don't forget to run the cells below at least twice, once for each data type (and once for each side if required).

In [481]:
# Parameters of the crop to assemble: they select the input files of the four
# studies and the output folder used by all the cells below.
region_name = 'Sc.Cal.-S.Li.'
save_path_numpy = save_path + f'crops/2mm/{region_name}/mask/'
side = 'R'
data_type = 'skeleton'
# data_type is either 'label' or 'skeleton'

## Remove the very annoying prefixes and suffixes from deep_folding files

In [482]:
# Paths to the per-study subject csv files of deep_folding for the selected
# region, side and data type
bsnip_path = f"/neurospin/dico/data/deep_folding/current/datasets/bsnip1/crops/2mm/{region_name}/mask/{side}{data_type}_subject.csv"
candi_path = f"/neurospin/dico/data/deep_folding/current/datasets/candi/crops/2mm/{region_name}/mask/{side}{data_type}_subject.csv"
cnp_path = f"/neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/{region_name}/mask/{side}{data_type}_subject.csv"
schizconnect_path = f"/neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague/crops/2mm/{region_name}/mask/{side}{data_type}_subject.csv"

In [483]:
# load, remove sub and co, and save a copy of a targeted df
def cure_df(csv_path, keep_sub=False, save_path=None):
    """Load a subject csv and clean its identifiers.

    Parameters
    ----------
    csv_path : str
        Path to a csv file with a 'Subject' column.
    keep_sub : bool, default False
        If False, the 'sub-' prefix is removed with remove_sub.
    save_path : str or None, default None
        If given, the cleaned table is also written to this path (without row labels).

    Returns
    -------
    pandas.DataFrame
        Single-column ('Subject') frame holding the cleaned identifiers, in
        the same order as in the input file.
    """
    df = pd.read_csv(csv_path)
    participants = df.Subject
    if not keep_sub:
        participants = remove_sub(participants)
    # Session suffixes are always removed, whatever keep_sub is
    participants = remove_suffixes(participants)

    cured_df = pd.DataFrame(participants, columns=['Subject'])

    # Identity comparison is the idiomatic way to test for None
    if save_path is not None:
        cured_df.to_csv(save_path, index=False)

    return cured_df

In [484]:
# Clean the ids of each study; the result is also written next to the source
# file, with '_cured.csv' replacing the last four characters ('.csv') of its path.
cured_bsnip = cure_df(bsnip_path, save_path=bsnip_path[:-4]+"_cured.csv")
cured_candi = cure_df(candi_path, save_path=candi_path[:-4]+"_cured.csv")
cured_cnp = cure_df(cnp_path, save_path=cnp_path[:-4]+"_cured.csv")
cured_schizconnect = cure_df(schizconnect_path, save_path=schizconnect_path[:-4]+"_cured.csv")

In [485]:
# Number of subjects per study in the cleaned csv files
print([cured_bsnip.shape[0], cured_candi.shape[0], cured_cnp.shape[0], cured_schizconnect.shape[0]]) # sum should be equal to about 2100

[1080, 103, 264, 734]


## Load the processed files

In [486]:
# load deep_folding numpy files (one array per study)
bsnip_npy = np.load(f"/neurospin/dico/data/deep_folding/current/datasets/bsnip1/crops/2mm/{region_name}/mask/{side}{data_type}.npy")
candi_npy = np.load(f"/neurospin/dico/data/deep_folding/current/datasets/candi/crops/2mm/{region_name}/mask/{side}{data_type}.npy")
cnp_npy = np.load(f"/neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/{region_name}/mask/{side}{data_type}.npy")
schizconnect_npy = np.load(f"/neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague/crops/2mm/{region_name}/mask/{side}{data_type}.npy")

# load associated csv (the '_cured' versions, i.e. with cleaned subject ids)
bsnip_csv = pd.read_csv(f"/neurospin/dico/data/deep_folding/current/datasets/bsnip1/crops/2mm/{region_name}/mask/{side}{data_type}_subject_cured.csv")
candi_csv = pd.read_csv(f"/neurospin/dico/data/deep_folding/current/datasets/candi/crops/2mm/{region_name}/mask/{side}{data_type}_subject_cured.csv")
# Warning: the cleaned cnp ids are purely numeric, so they would be parsed as ints;
# dtype=str keeps them as strings so that they can be matched against the
# participant ids of schiz_subjects.
cnp_csv = pd.read_csv(f"/neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/{region_name}/mask/{side}{data_type}_subject_cured.csv", dtype=str)
schizconnect_csv = pd.read_csv(f"/neurospin/dico/data/deep_folding/current/datasets/schizconnect-vip-prague/crops/2mm/{region_name}/mask/{side}{data_type}_subject_cured.csv")

In [487]:
# checks: the first dimension of each array should match the number of rows of
# its subject csv, since csv row labels are used below to index the arrays
print(bsnip_npy.shape, candi_npy.shape, cnp_npy.shape, schizconnect_npy.shape)
print(bsnip_csv.shape, candi_csv.shape, cnp_csv.shape, schizconnect_csv.shape)

(1080, 26, 42, 44, 1) (103, 26, 42, 44, 1) (264, 26, 42, 44, 1) (734, 26, 42, 44, 1)
(1080, 1) (103, 1) (264, 1) (734, 1)


## Keep only the used subjects (otherwise it causes a bug)

We'll treat the bases separately because of their poor homogenisation (some have 'sub-', others have '_ses-v1', others have '_ses-1').

### bsnip

In [488]:
# Ids of the used subjects, taken from the table reloaded above
subs = np.copy(schiz_subjects.participant_id.values)

# Disabled code kept for reference: it rebuilt the raw bsnip ids ('sub-' prefix);
# presumably superseded by the '_cured' csv files, which hold bare ids.
# modified_subs = []

# for i, sub in enumerate(subs):
#     modified_subs.append('sub-' + sub)

# print(modified_subs[:5])

In [489]:
# Row labels of the bsnip crops whose subject is actually used
bsnip_kept_subjects = bsnip_csv.loc[bsnip_csv['Subject'].isin(subs)].index
bsnip_kept_subjects

Index([   1,    2,    4,    7,    9,   13,   14,   16,   17,   19,
       ...
       1059, 1063, 1064, 1066, 1069, 1070, 1074, 1075, 1076, 1077],
      dtype='int64', length=385)

### candi

In [490]:
# Ids of the used subjects, taken from the table reloaded above
subs = np.copy(schiz_subjects.participant_id.values)

# Disabled code kept for reference: it rebuilt the raw candi ids ('sub-' prefix
# and '_ses-1' suffix); presumably superseded by the '_cured' csv files.
# modified_subs = []

# for i, sub in enumerate(subs):
#     modified_subs.append('sub-' + sub + '_ses-1')

# print(modified_subs[:5])

In [491]:
# Row labels of the candi crops whose subject is actually used
candi_kept_subjects = candi_csv.loc[candi_csv['Subject'].isin(subs)].index
candi_kept_subjects

Index([ 54,  55,  58,  59,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,
        86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
       100, 101, 102],
      dtype='int64')

### cnp

In [492]:
# Ids of the used subjects, taken from the table reloaded above
subs = np.copy(schiz_subjects.participant_id.values)

# Disabled code kept for reference: it rebuilt the raw cnp ids ('_ses-1' suffix);
# presumably superseded by the '_cured' csv files.
# modified_subs = []

# for i, sub in enumerate(subs):
#     modified_subs.append(sub + '_ses-1')

# print(modified_subs[:5])
subs

array(['INV07WT2ZL3', 'INV0AL14J6U', 'INV14XK7P6E', ..., 'ESOC10063',
       'ESOC10098', 'ESOC10106'], dtype=object)

In [493]:
# Row labels of the cnp crops whose subject is actually used
cnp_kept_subjects = cnp_csv.loc[cnp_csv['Subject'].isin(subs)].index
cnp_kept_subjects

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       164, 165, 166, 167, 168, 169, 170, 171, 172, 173],
      dtype='int64', length=172)

### schizconnect

In [494]:
# Ids of the used subjects, taken from the table reloaded above
subs = np.copy(schiz_subjects.participant_id.values)

# Disabled code kept for reference: it rebuilt the raw schizconnect ids ('sub-'
# prefix and '_ses-v1' suffix); presumably superseded by the '_cured' csv files.
# modified_subs = []

# for i, sub in enumerate(subs):
#     modified_subs.append('sub-' + sub + '_ses-v1')

# print(modified_subs[:5])

In [495]:
# Row labels of the schizconnect crops whose subject is actually used
schizconnect_kept_subjects = schizconnect_csv.loc[schizconnect_csv['Subject'].isin(subs)].index
schizconnect_kept_subjects

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       724, 725, 726, 727, 728, 729, 730, 731, 732, 733],
      dtype='int64', length=690)

### checks

In [496]:
# Total number of kept crops over the four studies
kept_counts = [len(kept) for kept in (bsnip_kept_subjects, candi_kept_subjects, cnp_kept_subjects, schizconnect_kept_subjects)]
print(np.sum(kept_counts)) # should be equal to 1292

1292


### Finally, restrict the numpy arrays and the csv files

In [497]:
# Keep only the rows of the used subjects, in the arrays and in the csv tables.
# NOTE: this cell overwrites its own inputs while the *_kept_subjects labels refer
# to the unfiltered data, so it must be run only once after the loading cell.
bsnip_npy = bsnip_npy[bsnip_kept_subjects]
candi_npy = candi_npy[candi_kept_subjects]
cnp_npy = cnp_npy[cnp_kept_subjects]
schizconnect_npy = schizconnect_npy[schizconnect_kept_subjects]

bsnip_csv = bsnip_csv.loc[bsnip_kept_subjects, :]
candi_csv = candi_csv.loc[candi_kept_subjects, :]
cnp_csv = cnp_csv.loc[cnp_kept_subjects, :]
schizconnect_csv = schizconnect_csv.loc[schizconnect_kept_subjects, :]

## Concat and save the results

In [498]:
# Stack the arrays and the subject lists in the same study order
schiz_npy = np.concatenate((bsnip_npy, candi_npy, cnp_npy, schizconnect_npy))
print(schiz_npy.shape)

schiz_csv = pd.concat([bsnip_csv, candi_csv, cnp_csv, schizconnect_csv])['Subject']
print(schiz_csv.shape)

(1292, 26, 42, 44, 1)
(1292,)


In [499]:
# save

# create the output folder (and any missing parent) if it doesn't exist already;
# exist_ok=True replaces the separate existence test, which could race with
# another process creating the folder in between
os.makedirs(save_path_numpy, exist_ok=True)

# save the numpy and csv
np.save(save_path_numpy + f'{side}{data_type}.npy', schiz_npy)
schiz_csv.to_csv(save_path_numpy + f'{side}{data_type}_subject.csv', index=False)

# create an (empty) folder named '<side>crops' for skeletons, '<side>labels' otherwise
truc = 'crops' if data_type == 'skeleton' else 'labels'
os.makedirs(save_path_numpy+f'{side}{truc}', exist_ok=True)

In [500]:
# checks: reload the files that were just written
loaded_npy = np.load(save_path_numpy + f'{side}{data_type}.npy')
loaded_csv = pd.read_csv(save_path_numpy + f'{side}{data_type}_subject.csv')

print(loaded_npy.shape)
print(loaded_csv.shape)

# Python types found in the Subject column.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas.
print(loaded_csv.Subject.map(type).unique())

(1292, 26, 42, 44, 1)
(1292, 1)
[<class 'str'>]
