This notebook produces a train/val/test stratifying according to zygocity and gender

In [1]:
import glob

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.options.display.max_rows = 20

# Gets subjects treated with morphologist with relevant columns

In [2]:
participants_file = "/neurospin/dico/data/bv_databases/human/not_labeled/hcp/participants.csv"
participants_unrestricted = pd.read_csv(participants_file)
participants_unrestricted = participants_unrestricted[['Subject', "Gender"]]
participants_unrestricted.head()


Unnamed: 0,Subject,Gender
0,100004,M
1,100206,M
2,100307,F
3,100408,M
4,100610,M


In [3]:
len(participants_unrestricted)

1206

In [4]:
participants_file = "~/RESTRICTED_jchavas_1_18_2022_3_17_51.csv"
participants = pd.read_csv(participants_file)
len(participants)

1206

In [5]:
participants = participants[['Subject', 'ZygosityGT', 'Family_ID']]
participants.head()

Unnamed: 0,Subject,ZygosityGT,Family_ID
0,100004,,52259_82122
1,100206,,56037_85858
2,100307,MZ,51488_81352
3,100408,MZ,51730_81594
4,100610,DZ,52813_82634


In [6]:
participants = pd.merge(participants_unrestricted, participants)
participants.head()

Unnamed: 0,Subject,Gender,ZygosityGT,Family_ID
0,100004,M,,52259_82122
1,100206,M,,56037_85858
2,100307,F,MZ,51488_81352
3,100408,M,MZ,51730_81594
4,100610,M,DZ,52813_82634


In [7]:
participants.loc[(participants['ZygosityGT']== " "), 'ZygosityGT'] = 'NotTwin'  
participants['Subject'] = participants['Subject'].astype('string')


In [8]:
print(participants.dtypes)
participants.head()

Subject       string[python]
Gender                object
ZygosityGT            object
Family_ID             object
dtype: object


Unnamed: 0,Subject,Gender,ZygosityGT,Family_ID
0,100004,M,NotTwin,52259_82122
1,100206,M,NotTwin,56037_85858
2,100307,F,MZ,51488_81352
3,100408,M,MZ,51730_81594
4,100610,M,DZ,52813_82634


In [9]:
treated_subjects = glob.glob("/neurospin/dico/data/bv_databases/human/not_labeled/hcp/hcp/*[!.minf]")
treated_subjects = [x.split('/')[-1] for x in treated_subjects]
treated_subjects = [x for x in treated_subjects if 'database' not in x]
print(treated_subjects[:5])
len(treated_subjects)

['210112', '579665', '922854', '517239', '329440']


1114

In [10]:
participants.dtypes

Subject       string[python]
Gender                object
ZygosityGT            object
Family_ID             object
dtype: object

In [11]:
participants = participants[participants['Subject'].isin(treated_subjects)]

In [12]:
len(participants)

1113

In [13]:
set(treated_subjects) - set(participants['Subject'])

{'142626'}

# Stratify according to gender and zygocity

In [14]:
set(participants['ZygosityGT'].tolist())

{'DZ', 'MZ', 'NotTwin'}

In [15]:
def create_subset(df, gender, zygosityGT):
    """Creates a subset of participants.
    
    The subset corresponds to the values of column gender and zgosityGT"""
    subset = df[(df['Gender']==gender) & (df['ZygosityGT']==zygosityGT)]
    subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)

    return subset

In [16]:
M_MZ = create_subset(participants, 'M', 'MZ')
F_MZ = create_subset(participants, 'F', 'MZ')
M_DZ = create_subset(participants, 'M', 'DZ')
F_DZ = create_subset(participants, 'F', 'DZ')
M_NotTwin = create_subset(participants, 'M', 'NotTwin')
F_NotTwin = create_subset(participants, 'F', 'NotTwin')
M_MZ.head()

  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)
  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)
  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)
  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)
  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)
  subset = subset.groupby(by='Family_ID', as_index=False).apply(lambda x: x)


Unnamed: 0,Unnamed: 1,Subject,Gender,ZygosityGT,Family_ID
0,632,233326,M,MZ,51698_81562
0,719,352738,M,MZ,51698_81562
1,253,146432,M,MZ,51702_81566
1,1077,818859,M,MZ,51702_81566
2,155,128935,M,MZ,51707_81571


In [19]:
print(len(participants))
print(len(M_MZ)+len(F_MZ)+len(M_DZ)+len(F_DZ)+len(M_NotTwin)+len(F_NotTwin))

1113
1113


In [17]:
len(M_MZ)

118

In [130]:
partition = [0.7,0.15,0.15]

In [176]:
def get_train(df, partition):
    df = df.copy(deep=True)
    upper = int(len(df)*partition[0]/2)*2
    return df[0:upper]

def get_val(df, partition):
    df = df.copy(deep=True)
    lower = int(len(df)*partition[0]/2)*2
    upper = int(len(df)*(partition[0]+partition[1])/2)*2
    return df[lower:upper]

def get_test(df, partition):
    df = df.copy(deep=True)
    lower = int(len(df)*(partition[0]+partition[1])/2)*2
    return df[lower:]


In [177]:
participants.dtypes

Subject       string[python]
Gender                object
ZygosityGT            object
Family_ID             object
dtype: object

In [206]:
def partition_subset(df, partition):
    train = get_train(df, partition)
    val = get_val(df, partition)
    test = get_test(df, partition)
    print(f"lengths: {len(df)} = {len(train) + len(val) + len(test)} = {len(train)} + {len(val)} + {len(test)}")
    return train, val, test

In [207]:
def put_together(df, partition, alls):
    train, val, test = partition_subset(df, partition)
    alls['train'] = pd.concat([alls['train'], train], ignore_index=True)
    alls['val']   = pd.concat([alls['val'], val], ignore_index=True)
    alls['test']  = pd.concat([alls['test'], test], ignore_index=True)
    return alls

In [208]:
alls = {}
alls['train'] = pd.DataFrame(columns=participants.columns).astype(participants.dtypes.to_dict())
alls['val']   = pd.DataFrame(columns=participants.columns).astype(participants.dtypes.to_dict())
alls['test']  = pd.DataFrame(columns=participants.columns).astype(participants.dtypes.to_dict())



In [209]:
alls = put_together(M_MZ, partition, alls)
alls = put_together(F_MZ, partition, alls)
alls = put_together(M_DZ, partition, alls)
alls = put_together(F_DZ, partition, alls)
alls = put_together(M_NotTwin, partition, alls)
alls = put_together(F_NotTwin, partition, alls)
alls['train_val'] = pd.concat([alls['train'], alls['val']], ignore_index=True)

lengths: 118 = 118 = 82 + 18 + 18
lengths: 168 = 168 = 116 + 26 + 26
lengths: 64 = 64 = 44 + 10 + 10
lengths: 106 = 106 = 74 + 16 + 16
lengths: 325 = 325 = 226 + 50 + 49
lengths: 332 = 332 = 232 + 50 + 50


In [210]:
alls['test'].head()

Unnamed: 0,Subject,Gender,ZygosityGT,Family_ID
0,995174,M,MZ,55923_85743
1,146129,M,MZ,55952_85772
2,783462,M,MZ,55952_85772
3,164636,M,MZ,55955_85775
4,214524,M,MZ,55955_85775


In [211]:
len(alls['train']) + len(alls['val']) + len(alls['test'])

1113

In [215]:
len(alls['train_val'])

944

In [216]:
len(alls['val'])

170

In [217]:
len(alls['test'])

169

In [234]:
output_dir = "/neurospin/dico/data/deep_folding/current/datasets/hcp-top-separated/"
train_file = f"{output_dir}train_subjects.csv"
val_file = f"{output_dir}val_subjects.csv"
test_file = f"{output_dir}test_subjects.csv"
alls['train']['Subject'].to_csv(train_file, index=False, header=False)
alls['val']['Subject'].to_csv(val_file, index=False, header=False)
alls['test']['Subject'].to_csv(test_file, index=False, header=False)

In [235]:
!more {train_file} | wc -l

774


In [236]:
!more {val_file} | wc -l

170


In [237]:
!more {test_file} | wc -l

169
