## Set up twin table

The goal of this notebook is to create a data table that records which participants are twins of each other.
A comparable control datasheet has also been prepared so that an age- and sex-matched control can be selected for every twin.

#### Import modules

In [1]:
import pandas as pd
import os, itertools, random
import numpy as np

#### Load the participants file and the available session-2 movie-viewing data

In [2]:
# Read the tab-separated participants table and preview its first rows
participants_tsv = '../data/00_dataset_info/participants.tsv'
df = pd.read_csv(participants_tsv, sep='\t')
df.head()

Unnamed: 0,participant_id,family_id,age,age_ses02,sex,handedness,ses01_protocol,ses02_protocol,group
0,sub-0001,fam-0001,12,14.0,M,Right,1b,2b,
1,sub-0002,fam-0154,9,,F,Right,1d,,
2,sub-0003,fam-0116,12,14.0,M,Right,1d,2c,
3,sub-0004,fam-0002,10,11.0,M,Right,1d,2c,
4,sub-0005,fam-0088,10,12.0,F,Right,1d,2c,


#### Select right-handed participants only

Note: we could also examine the left-handed participants (though that sample is small), or we could ignore handedness altogether.

In [3]:
# Keep only the rows whose `handedness` entry is exactly 'Right',
# then report how many participants remain.
is_right_handed = df['handedness'] == 'Right'
df_select_right = df.loc[is_right_handed]
len(df_select_right)

344

In [4]:
# Keep only those participants for which extracted neuroimaging data exist.
# Every entry of the extraction folder whose name starts with 'sub' contributes
# its first eight characters, which are matched against `participant_id`
# (ids such as 'sub-0001' are eight characters long).
extracted_dir = '../data/03_extract_ts/'
subs_extracted = []
for entry in os.listdir(extracted_dir):
    if entry.startswith('sub'):
        subs_extracted.append(entry[:8])

has_extracted_data = df_select_right['participant_id'].isin(subs_extracted)
f = df_select_right[has_extracted_data]
f.head()

Unnamed: 0,participant_id,family_id,age,age_ses02,sex,handedness,ses01_protocol,ses02_protocol,group
0,sub-0001,fam-0001,12,14.0,M,Right,1b,2b,
2,sub-0003,fam-0116,12,14.0,M,Right,1d,2c,
3,sub-0004,fam-0002,10,11.0,M,Right,1d,2c,
4,sub-0005,fam-0088,10,12.0,F,Right,1d,2c,
5,sub-0006,fam-0178,13,15.0,M,Right,1b,2a,


In [5]:
# select the twin pairs and set up a dataframe
#
# A family is treated as a twin pair when exactly two of its members are
# present in `f`. The member that comes first in `f` is stored as twin A,
# the other as twin B. Ages are taken from the `age_ses02` column.
twin_columns = ['family_id',
                'twin_A_id', 'twin_A_age', 'twin_A_sex',
                'twin_B_id', 'twin_B_age', 'twin_B_sex']

twin_rows = []
for current_family in f['family_id']:
    # Filter the family once per participant instead of once per extracted field.
    members = f[f['family_id'] == current_family]
    if len(members) != 2:
        continue

    ids = members['participant_id'].values
    ages = members['age_ses02'].values
    sexes = members['sex'].values
    twin_rows.append((current_family,
                      ids[0], ages[0], sexes[0],
                      ids[1], ages[1], sexes[1]))

# The loop visits every participant, so each pair is recorded once per twin
# (i.e. twice); the rows are kept in that order here and the duplicates are
# removed with drop_duplicates() in the following cell.
# Building the frame from row records (rather than stacking everything into a
# single numpy array) keeps the age columns numeric instead of turning every
# value into a string.
new_df = pd.DataFrame(twin_rows, columns=twin_columns)
new_df.head()

Unnamed: 0,family_id,twin_A_id,twin_A_age,twin_A_sex,twin_B_id,twin_B_age,twin_B_sex
0,fam-0001,sub-0001,14.0,M,sub-0018,14.0,M
1,fam-0116,sub-0003,14.0,M,sub-0233,14.0,F
2,fam-0002,sub-0004,11.0,M,sub-0420,11.0,M
3,fam-0088,sub-0005,12.0,F,sub-0178,12.0,F
4,fam-0178,sub-0006,15.0,M,sub-0329,15.0,F


In [12]:
# Each twin pair was recorded once per twin, so identical rows occur twice.
# Keep the first occurrence of every row, print the number of remaining
# pairs and preview the table.
is_repeated_row = new_df.duplicated()
new_df = new_df.loc[~is_repeated_row]
print(new_df.shape[0])
new_df.head()

100


Unnamed: 0,family_id,twin_A_id,twin_A_age,twin_A_sex,twin_B_id,twin_B_age,twin_B_sex
0,fam-0001,sub-0001,14.0,M,sub-0018,14.0,M
1,fam-0116,sub-0003,14.0,M,sub-0233,14.0,F
2,fam-0002,sub-0004,11.0,M,sub-0420,11.0,M
3,fam-0088,sub-0005,12.0,F,sub-0178,12.0,F
4,fam-0178,sub-0006,15.0,M,sub-0329,15.0,F


In [13]:
# Write the twin table to disk; with the default settings the row index is
# written as the first (unnamed) column of the CSV file.
twin_table_csv = '../data/00_dataset_info/twin_df.csv'
new_df.to_csv(twin_table_csv)

In [15]:
# NOTE(review): leftover, disabled scratch code. `twin_list` is not defined in
# any of the cells shown above, and duplicate pairs are already removed from
# `new_df` with drop_duplicates() -- TODO confirm this cell can be deleted.
#twin_list.sort()
#twin_list = list(twin_list for twin_list,_ in itertools.groupby(twin_list))
#print(len(twin_list))