In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
save_path = "/neurospin/dico/data/deep_folding/current/datasets/schiz/"

# Load and fusion participants.tsv

## Load

In [3]:
bsnip_subjects = pd.read_csv("/neurospin/psy_sbox/bsnip1/participants.tsv", sep='\t')
candi_subjects = pd.read_csv("/neurospin/psy_sbox/candi/CANDI_t1mri_mwp1_participants.csv", sep='\t')
cnp_subjects = pd.read_csv("/neurospin/psy_sbox/cnp/participants.tsv", sep='\t')
schizconnect_subjects = pd.read_csv("/neurospin/psy_sbox/schizconnect-vip-prague/participants.tsv", sep='\t')

## Preprocess

### bsnip

In [4]:
bsnip_subjects

Unnamed: 0,participant_id,sex,age,diagnosis,study,site,phenotype
0,INV07WT2ZL3,0.0,29.0,control,BSNIP,Dallas,Control
1,INV0AL14J6U,0.0,43.0,schizophrenia,BSNIP,Dallas,Case
2,INV0B7HKFAY,0.0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas,Relative
3,INV0G5RXTCC,1.0,34.0,schizoaffective disorder,BSNIP,Dallas,Case
4,INV0WW10RBB,1.0,42.0,schizoaffective disorder,BSNIP,Dallas,Case
...,...,...,...,...,...,...,...
1089,INVZBW8CC34,0.0,22.0,control,BSNIP,Hartford,Control
1090,INVZJDY9PR0,1.0,57.0,relative of proband with psychotic bipolar dis...,BSNIP,Hartford,Relative
1091,INVZR467366,1.0,44.0,relative of proband with schizoaffective disorder,BSNIP,Hartford,Relative
1092,INVZTBFRT07,1.0,54.0,control,BSNIP,Hartford,Control


In [5]:
bsnip_subjects_processed = bsnip_subjects.drop('phenotype', axis=1)
# conversion from 0/1 to M/F not known -> skip it for now
bsnip_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0.0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0.0,43.0,schizophrenia,BSNIP,Dallas
2,INV0B7HKFAY,0.0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas
3,INV0G5RXTCC,1.0,34.0,schizoaffective disorder,BSNIP,Dallas
4,INV0WW10RBB,1.0,42.0,schizoaffective disorder,BSNIP,Dallas
...,...,...,...,...,...,...
1089,INVZBW8CC34,0.0,22.0,control,BSNIP,Hartford
1090,INVZJDY9PR0,1.0,57.0,relative of proband with psychotic bipolar dis...,BSNIP,Hartford
1091,INVZR467366,1.0,44.0,relative of proband with schizoaffective disorder,BSNIP,Hartford
1092,INVZTBFRT07,1.0,54.0,control,BSNIP,Hartford


### candi

In [6]:
candi_subjects.columns

Index(['participant_id', 'session', 'TIV', 'CSF_Vol', 'GM_Vol', 'WM_Vol',
       'l3thVen_GM_Vol', 'r3thVen_GM_Vol', 'l4thVen_GM_Vol', 'r4thVen_GM_Vol',
       ...
       'age', 'Handedness', 'Handed_extended', 'Weight (lbs)', 'Height (in)',
       'Head_Circumference (cm)', 'Tanner_Stage', 'diagnosis', 'site',
       'study'],
      dtype='object', length=302)

In [7]:
candi_subjects_processed = candi_subjects[['participant_id', 'sex', 'age', 'study', 'site', 'diagnosis']].copy()
# conversion from 0/1 to M/F not known -> skip it for now

candi_subjects_processed

Unnamed: 0,participant_id,sex,age,study,site,diagnosis
0,BPDwoPsy040,0,11.4,CANDI,CANDI,bipolar disorder without psychosis
1,HC017,0,9.0,CANDI,CANDI,control
2,HC019,1,14.7,CANDI,CANDI,control
3,BPDwoPsy056,0,8.1,CANDI,CANDI,bipolar disorder without psychosis
4,SS097,1,15.4,CANDI,CANDI,schizophrenia
...,...,...,...,...,...,...
98,HC014,0,8.0,CANDI,CANDI,control
99,BPDwoPsy030,1,9.4,CANDI,CANDI,bipolar disorder without psychosis
100,HC025,0,9.6,CANDI,CANDI,control
101,BPDwoPsy058,1,9.1,CANDI,CANDI,bipolar disorder without psychosis


### cnp

In [8]:
cnp_subjects

Unnamed: 0,participant_id,diagnosis,age,gender,bart,bht,dwi,pamenc,pamret,rest,scap,stopsignal,T1w,taskswitch,ScannerSerialNumber,ghost_NoGhost
0,sub-10159,CONTROL,30,F,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
1,sub-10171,CONTROL,24,M,1.0,1.0,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
2,sub-10189,CONTROL,49,M,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
3,sub-10193,CONTROL,40,M,1.0,,1.0,,,,,,1.0,,35343.0,No_ghost
4,sub-10206,CONTROL,21,M,1.0,,1.0,,,1.0,1.0,1.0,1.0,1.0,35343.0,No_ghost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,sub-70079,ADHD,21,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
268,sub-70080,ADHD,48,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
269,sub-70081,ADHD,50,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost
270,sub-70083,ADHD,46,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35426.0,No_ghost


In [9]:
cnp_subjects_processed = cnp_subjects[['participant_id', 'gender', 'age', 'diagnosis']].copy()
cnp_subjects_processed.rename(columns={'gender': 'sex'}, inplace=True)
cnp_subjects_processed['site'] = ['CNP' for i in range(cnp_subjects_processed.shape[0])]
cnp_subjects_processed['study'] = ['CNP' for i in range(cnp_subjects_processed.shape[0])]

cnp_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,site,study
0,sub-10159,F,30,CONTROL,CNP,CNP
1,sub-10171,M,24,CONTROL,CNP,CNP
2,sub-10189,M,49,CONTROL,CNP,CNP
3,sub-10193,M,40,CONTROL,CNP,CNP
4,sub-10206,M,21,CONTROL,CNP,CNP
...,...,...,...,...,...,...
267,sub-70079,M,21,ADHD,CNP,CNP
268,sub-70080,M,48,ADHD,CNP,CNP
269,sub-70081,M,50,ADHD,CNP,CNP
270,sub-70083,F,46,ADHD,CNP,CNP


### schizconnect

In [10]:
schizconnect_subjects.columns

Index(['participant_id', 'sex', 'age', 'diagnosis', 'study', 'site',
       'medication', 'cannabis_last_month', 'tobacco_last_month',
       'alcohol_last_month', 'BPRS', 'PANSS_total', 'PANSS_positive',
       'PANSS_negative', 'PANSS_psychopatho', 'PANSS_desorganisation', 'SANS',
       'SAPS', 'MADRS', 'SOFAS', 'NSS', 'irm', 'Age of Onset', 'Alcohol',
       'Anticonvulsants', 'Antidepressants', 'Antipsychotics', 'BD Type',
       'Density of Episodes', 'Depression Scale', 'Depression Score',
       'Illness Duration', 'Lithium', 'Mania Scale', 'Mania Score',
       'Mood Phase', 'Number of Depressive Episodes',
       'Number of Manic Episodes', 'Onset Time', 'Psychotic', 'Severity',
       'Total Episodes', 'ymrstot', 'psysoc_65', 'psychosis_lt', 'phenotype',
       'session', 'path', 'TIV', 'CSF_Vol', 'GM_Vol', 'WM_Vol'],
      dtype='object')

In [11]:
schizconnect_subjects_processed = schizconnect_subjects[['participant_id', 'sex', 'age', 'diagnosis', 'study', 'site']]
schizconnect_subjects_processed

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,A00000300,0.0,36.0,control,SCHIZCONNECT-VIP,MRN
1,A00000368,0.0,52.0,schizophrenia,SCHIZCONNECT-VIP,MRN
2,A00000456,0.0,53.0,schizophrenia,SCHIZCONNECT-VIP,MRN
3,A00000838,0.0,29.0,schizophrenia,SCHIZCONNECT-VIP,MRN
4,A00000909,0.0,27.0,schizophrenia,SCHIZCONNECT-VIP,MRN
...,...,...,...,...,...,...
733,ESOP00043,1.0,25.0,FEP,PRAGUE,PRAGUE
734,ESOP00086,0.0,27.0,FEP,PRAGUE,PRAGUE
735,ESOC10063,0.0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1.0,27.0,control,PRAGUE,PRAGUE


## Fusion

In [12]:
schiz_subjects_full = pd.concat([bsnip_subjects_processed, candi_subjects_processed, cnp_subjects_processed, schizconnect_subjects_processed])

schiz_subjects_full

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0,43.0,schizophrenia,BSNIP,Dallas
2,INV0B7HKFAY,0,39.0,relative of proband with schizoaffective disorder,BSNIP,Dallas
3,INV0G5RXTCC,1,34.0,schizoaffective disorder,BSNIP,Dallas
4,INV0WW10RBB,1,42.0,schizoaffective disorder,BSNIP,Dallas
...,...,...,...,...,...,...
733,ESOP00043,1,25.0,FEP,PRAGUE,PRAGUE
734,ESOP00086,0,27.0,FEP,PRAGUE,PRAGUE
735,ESOC10063,0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1,27.0,control,PRAGUE,PRAGUE


#### Small checks

In [13]:
print(np.sum([bsnip_subjects.shape[0], candi_subjects.shape[0], cnp_subjects.shape[0], schizconnect_subjects.shape[0]]))
print(schiz_subjects_full.sex.unique())

2207
[0.0 1.0 'F' 'M']


In [14]:
print(schiz_subjects_full.diagnosis.unique())

['control' 'schizophrenia'
 'relative of proband with schizoaffective disorder'
 'schizoaffective disorder' 'psychotic bipolar disorder'
 'relative of proband with schizophrenia'
 'relative of proband with psychotic bipolar disorder'
 'bipolar disorder without psychosis' 'bipolar disorder with psychosis'
 'CONTROL' 'SCHZ' 'BIPOLAR' 'ADHD' 'FEP']


#### Save database

In [15]:
schiz_subjects_full.to_csv(save_path+"schiz_participants_full.csv", index=False)

# Separate train-val-tests

### Load the set defining pickle

In [16]:
train_val_tests_sets = pd.read_pickle("/neurospin/psy_sbox/analyses/201906_schizconnect-vip-prague-bsnip-biodb-icaar-start_assemble-all/data/train_val_test_test-intra_scz_stratified.pkl")

train_val_tests_sets

{'train':     participant_id  session             study
 0        ESOC10104      1.0            PRAGUE
 1        A00014522      1.0  SCHIZCONNECT-VIP
 2        A00001243      1.0  SCHIZCONNECT-VIP
 3         or130001      1.0  SCHIZCONNECT-VIP
 4        A00028405      1.0  SCHIZCONNECT-VIP
 ..             ...      ...               ...
 928          HC001      1.0             CANDI
 929          HC027      1.0             CANDI
 930          SS086      1.0             CANDI
 931          HC023      1.0             CANDI
 932          SS095      1.0             CANDI
 
 [933 rows x 3 columns],
 'validation':     participant_id  session             study
 0           NM2020      1.0  SCHIZCONNECT-VIP
 1        A00036844      1.0  SCHIZCONNECT-VIP
 2        ESOC10040      1.0            PRAGUE
 3           NM1069      1.0  SCHIZCONNECT-VIP
 4        ESOC10106      1.0            PRAGUE
 ..             ...      ...               ...
 111          10316      1.0               CNP
 112      

In [17]:
# load data
train_set = train_val_tests_sets['train']
val_set = train_val_tests_sets['validation']
test_intra_set = train_val_tests_sets['test_intra']
test_set = train_val_tests_sets['test']

# put the right data type
train_set.participant_id = train_set.participant_id.astype(str)
val_set.participant_id = val_set.participant_id.astype(str)
test_intra_set.participant_id = test_intra_set.participant_id.astype(str)
test_set.participant_id = test_set.participant_id.astype(str)

print("shapes", [train_set.shape[0], val_set.shape[0], test_intra_set.shape[0], test_set.shape[0]])
print(np.sum([train_set.shape[0], val_set.shape[0], test_intra_set.shape[0], test_set.shape[0]]))

shapes [933, 116, 118, 133]
1300


### Create train-val-tests csv files

In [18]:
train_set.participant_id.astype(str).to_csv(save_path+'train_subjects.csv', index=False, header=False)
val_set.participant_id.astype(str).to_csv(save_path+'val_subjects.csv', index=False, header=False)
test_intra_set.participant_id.astype(str).to_csv(save_path+'test_intra_subjects.csv', index=False, header=False)
test_set.participant_id.astype(str).to_csv(save_path+'test_subjects.csv', index=False, header=False)

### Create a file with all used subjects

In [19]:
all_sets = pd.concat([train_set, val_set, test_intra_set, test_set])
all_sets

Unnamed: 0,participant_id,session,study
0,ESOC10104,1.0,PRAGUE
1,A00014522,1.0,SCHIZCONNECT-VIP
2,A00001243,1.0,SCHIZCONNECT-VIP
3,or130001,1.0,SCHIZCONNECT-VIP
4,A00028405,1.0,SCHIZCONNECT-VIP
...,...,...,...
128,INV64AL1N24,1.0,BSNIP
129,INVBEKLL87A,1.0,BSNIP
130,INV11JMTY1C,1.0,BSNIP
131,INVK4HB8HEX,1.0,BSNIP


The 'sub-' are missing for some individuals ><'.

The ones concerned are the ones that are not common between 'all_sets' and 'schiz_subjects_full'

In [20]:
# add 'sub-' for the subjects that need it
all_sets.loc[~all_sets.participant_id.isin(schiz_subjects_full.participant_id), 'participant_id'] = 'sub-' + all_sets.loc[~all_sets.participant_id.isin(schiz_subjects_full.participant_id), 'participant_id']

#### Filter schiz_subjects_full by keeping only the actually used subjects (schizophrenia and control)

In [21]:
schiz_subjects = schiz_subjects_full[schiz_subjects_full.participant_id.isin(all_sets.participant_id)]
schiz_subjects

Unnamed: 0,participant_id,sex,age,diagnosis,study,site
0,INV07WT2ZL3,0,29.0,control,BSNIP,Dallas
1,INV0AL14J6U,0,43.0,schizophrenia,BSNIP,Dallas
6,INV14XK7P6E,0,30.0,control,BSNIP,Dallas
7,INV1HXNTXYF,1,18.0,control,BSNIP,Dallas
9,INV1XCNF4J5,1,25.0,control,BSNIP,Dallas
...,...,...,...,...,...,...
729,ESOC10060,1,26.0,control,PRAGUE,PRAGUE
731,ESOC10019,1,28.0,control,PRAGUE,PRAGUE
735,ESOC10063,0,31.0,control,PRAGUE,PRAGUE
736,ESOC10098,1,27.0,control,PRAGUE,PRAGUE


In [22]:
print(schiz_subjects.diagnosis.unique())
# need to homogenize that

['control' 'schizophrenia' 'CONTROL' 'SCHZ']


In [23]:
schiz_subjects = schiz_subjects.replace({'CONTROL': 'control', 'SCHZ': 'schizophrenia'})
schiz_subjects.diagnosis.unique()

array(['control', 'schizophrenia'], dtype=object)

In [24]:
# save it to csv
schiz_subjects.to_csv(save_path + 'used_schiz_subjects.csv', index=False)