In [87]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.spatial import distance

For details about the contents of the HCP database, see the data dictionary below.

In [56]:
# Load the HCP S1200 data dictionary CSV and display it.
description_csv = '/neurospin/dico/adufournet/HCP_S1200_DataDictionary_Oct_30_2023.csv'
HCP_description = pd.read_csv(description_csv)
HCP_description

Unnamed: 0,fullDisplayName,category,assessment,columnHeader,description
0,Subject,Subject Information,Demographics,Subject,HCP Subject ID
1,Quarter Released,Subject Information,Demographics,Release,HCP data release in which this subject's data ...
2,Acquisition Quarter,Subject Information,Demographics,Acquisition,Quarter in which this subject's 3T and behavio...
3,Gender,Subject Information,Demographics,Gender,Gender of Subject
4,Age Range,Subject Information,Demographics,Age,"Age group of Participant, banded in five-year ..."
...,...,...,...,...,...
808,MOV4 TRfrac,7T Eye Tracker Metadata,MOV,MOV4_TRFRAC,The value of TRfrac (percentage of total scan ...
809,MOV eyetrack compl,7T Eye Tracker Metadata,MOV,MOV_EYETRACK_COMPL,TRfrac (percentage of total scan time that had...
810,REST trackfrac min,7T Eye Tracker Metadata,REST,REST_TRACKFRAC_MIN,The minimum value for Trackfrac (percentage of...
811,REST TRfrac min,7T Eye Tracker Metadata,REST,REST_TRFRAC_MIN,The minimum value for TRfrac (percentage of to...


In [57]:
# Per-subject table; later cells read its 'Subject', 'ZygosityGT' and 'Family_ID' columns.
# NOTE(review): the file name suggests restricted-access data — confirm before sharing rendered outputs.
restricted_csv = '/neurospin/dico/jchavas/RESTRICTED_jchavas_1_18_2022_3_17_51.csv'
HCP_info = pd.read_csv(restricted_csv)

In [58]:
# Subject identifiers: the 'IID' column of the prediction table, cast to int.
iid_csv = '/neurospin/dico/adufournet/Runs/01_Heritability_Right_PCS_HCP/Heritability/data/HCP_prob_pred_PCS.csv'
HCP_IID = pd.read_csv(iid_csv)['IID'].astype(int)
HCP_IID.values

array([100206, 100307, 100408, ..., 994273, 995174, 996782])

In [59]:
# Keep only the rows whose 'Subject' appears in the HCP_IID list.
in_iid_list = HCP_info['Subject'].isin(HCP_IID.values)
HCP_info = HCP_info.loc[in_iid_list]
HCP_info

Unnamed: 0,Subject,Age_in_Yrs,HasGT,ZygositySR,ZygosityGT,Family_ID,Mother_ID,Father_ID,TestRetestInterval,Race,...,SSAGA_Times_Used_Illicits,SSAGA_Times_Used_Cocaine,SSAGA_Times_Used_Hallucinogens,SSAGA_Times_Used_Opiates,SSAGA_Times_Used_Sedatives,SSAGA_Times_Used_Stimulants,SSAGA_Mj_Use,SSAGA_Mj_Ab_Dep,SSAGA_Mj_Age_1st_Use,SSAGA_Mj_Times_Used
1,100206,27,True,NotTwin,,56037_85858,56037,85858,,White,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,100307,27,True,NotMZ,MZ,51488_81352,51488,81352,,White,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0
3,100408,33,True,MZ,MZ,51730_81594,51730,81594,,White,...,4.0,5.0,5.0,5.0,0.0,0.0,1.0,1.0,2.0,4.0
4,100610,27,True,NotMZ,DZ,52813_82634,52813,82634,,White,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
5,101006,35,True,MZ,,51283_52850_81149,51283,81149,,Black or African Am.,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201,992774,35,True,NotTwin,,51345_81210,51345,81210,,White,...,2.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,2.0
1202,993675,29,True,NotTwin,,55800_85621,55800,85621,,White,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1203,994273,30,True,NotTwin,,52364_82227,52364,82227,,White,...,4.0,0.0,5.0,5.0,5.0,0.0,1.0,1.0,2.0,5.0
1204,995174,25,True,MZ,MZ,55923_85743,55923,85743,,White,...,5.0,0.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0,5.0


#### Check that I can select only the monozygous or only the dizygous twins

In [72]:
# Rows whose 'ZygosityGT' is 'MZ' (monozygous twins), and their subject IDs
HCP_info_MZ = HCP_info[HCP_info['ZygosityGT'] == 'MZ']
MZ_ID = HCP_info_MZ['Subject'].values
print(MZ_ID.shape)
print()
# Number of distinct family IDs among these rows (a missing ID counts as one value)
print(HCP_info_MZ['Family_ID'].nunique(dropna=False))

(286,)

148


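As we can see, there are 286 monozygous subjects but 148 distinct family IDs, and 148*2=296 > 286. If every subject had their co-twin under the same family ID we would count exactly 286/2=143 families, so some twins do not share a family ID with a co-twin in this subset and are therefore considered as alone. In the best case, we are dealing with 286//2=143 pairs.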

In [70]:
# Rows whose 'ZygosityGT' is 'DZ' (dizygous twins), and their subject IDs
HCP_info_DZ = HCP_info[HCP_info['ZygosityGT'] == 'DZ']
DZ_ID = HCP_info_DZ['Subject'].values
print(DZ_ID.shape)
print()
# Number of distinct family IDs among these rows (a missing ID counts as one value)
print(HCP_info_DZ['Family_ID'].nunique(dropna=False))

(170,)

91


Same here: there are 170 dizygous subjects but 91 distinct family IDs, and 91*2=182 > 170, which means that some of the dizygous twins do not share a family ID with a co-twin in this subset, i.e. they are on their own.

#### Get the twin pairs depending on the Family_ID

In [85]:
# Build the list of monozygous twin pairs: one array of two subject IDs per
# family that contains exactly two monozygous subjects.
# A single groupby pass replaces the per-family scan of the whole table.
# sort=False keeps the families in order of first appearance, and rows with a
# missing Family_ID are left out (groupby drops NaN keys by default), so two
# unrelated subjects without a family ID can never be paired together.
list_MZ_twin_id = [
    family_rows['Subject'].values
    for _, family_rows in HCP_info_MZ.groupby('Family_ID', sort=False)
    if len(family_rows) == 2
]

len(list_MZ_twin_id)

138

In [86]:
# Build the list of dizygous twin pairs: one array of two subject IDs per
# family that contains exactly two dizygous subjects.
# A single groupby pass replaces the per-family scan of the whole table.
# sort=False keeps the families in order of first appearance, and rows with a
# missing Family_ID are left out (groupby drops NaN keys by default), so two
# unrelated subjects without a family ID can never be paired together.
list_DZ_twin_id = [
    family_rows['Subject'].values
    for _, family_rows in HCP_info_DZ.groupby('Family_ID', sort=False)
    if len(family_rows) == 2
]

len(list_DZ_twin_id)

79

#### Load an embedded space of dimension 256 for the left CINGULATE region from HCP

In [44]:
# Embeddings table: the first CSV column is used as the row index.
embeddings_csv = "/neurospin/dico/adufournet/Runs/02_Heritability_Left_PCS_HCP/Program/Output/2024-05-13/09-33-29_206/hcp_epoch60_embeddings/full_embeddings.csv"
embeddings_HCP = pd.read_csv(embeddings_csv, index_col=0)
embeddings_HCP

Unnamed: 0_level_0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim247,dim248,dim249,dim250,dim251,dim252,dim253,dim254,dim255,dim256
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100206,-5.876003,-0.845744,-12.204170,10.151408,-18.957836,-29.795780,12.937123,-7.214358,15.110987,10.792451,...,-15.116919,33.840183,-19.154482,12.863470,-20.392763,-25.990793,4.171558,-21.955696,-0.573337,-4.697568
100307,-19.529308,-7.768003,-16.622694,15.692203,-32.981106,-20.219995,-20.074820,3.648118,4.962884,28.067015,...,-2.882752,35.724358,-11.517761,25.840712,-23.715145,-31.383976,7.298614,-4.549547,22.780191,-29.140709
100408,-24.607979,2.905064,-9.724290,35.772090,-38.557724,5.821410,-11.363668,18.840570,-9.526131,34.880870,...,5.701094,47.347880,-1.811345,-25.851000,2.173108,10.323122,-16.164135,32.424270,-1.851322,15.037244
100610,-8.027011,0.322856,-15.113009,3.927100,-22.753002,-6.738647,10.846553,47.801365,18.840824,19.917740,...,-4.859896,60.652016,10.704345,-9.717216,-4.234411,-0.872676,-14.263339,-9.549306,8.477722,4.729099
101006,-25.976790,-5.441645,-10.610805,-9.526792,14.037749,-4.063296,12.062643,-15.676752,12.239485,5.389635,...,-7.835833,56.926426,6.239031,-7.533917,2.079582,8.960461,12.988546,-5.051035,25.262693,-14.783053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992774,17.165743,-36.441390,-22.935884,-14.686571,-9.867681,-7.135174,26.120441,-12.414014,16.909752,-9.952538,...,-1.832878,45.708260,21.413220,-16.648903,1.558122,-29.973095,20.565409,-38.839497,-4.218242,-22.351929
993675,-37.006813,28.016909,8.215394,-3.528147,10.978013,-11.092725,-10.434095,10.676113,-4.587836,-6.940797,...,-30.646410,43.642662,6.041584,-11.614371,-13.727970,-7.503685,-14.109009,10.194918,11.565282,11.615798
994273,-35.012260,7.191190,18.193295,-24.775734,10.159574,-2.134576,-51.921300,11.000490,4.337312,6.170895,...,-14.010004,36.660385,-4.164273,-3.856230,29.882395,-28.443563,-18.309708,16.012150,5.939688,-5.448430
995174,-33.374450,3.822559,8.150050,16.612328,-21.112999,-24.347690,-4.773570,-5.935398,5.431972,6.468930,...,19.732710,34.926727,-17.297129,17.589705,-4.477038,-14.648837,-8.435144,-10.012508,21.747760,-11.948552


In [45]:
# Embedding rows of the monozygous subjects, looked up by index label.
embeddings_HCP.loc[MZ_ID, :]

Unnamed: 0_level_0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim247,dim248,dim249,dim250,dim251,dim252,dim253,dim254,dim255,dim256
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100307,-19.529308,-7.768003,-16.622694,15.692203,-32.981106,-20.219995,-20.074820,3.648118,4.962884,28.067015,...,-2.882752,35.724358,-11.517761,25.840712,-23.715145,-31.383976,7.298614,-4.549547,22.780191,-29.140709
100408,-24.607979,2.905064,-9.724290,35.772090,-38.557724,5.821410,-11.363668,18.840570,-9.526131,34.880870,...,5.701094,47.347880,-1.811345,-25.851000,2.173108,10.323122,-16.164135,32.424270,-1.851322,15.037244
102311,-49.370926,11.047486,9.791389,-1.740883,-8.253458,-8.209259,-37.996940,19.523218,-2.086056,10.054957,...,-21.417790,16.500212,7.179546,-40.797596,17.280964,-3.077050,-19.795439,36.210613,-11.182676,-6.539130
102816,-45.134743,5.836784,46.896057,-4.740401,6.534274,-7.610570,-9.100664,15.250102,23.708466,-12.496902,...,-8.831315,28.772512,-3.389005,-19.156067,5.163394,-19.247955,-13.671687,14.122709,11.701907,-3.849965
103010,-19.532782,36.963530,9.198071,-0.250177,-19.366909,-3.100672,-30.202974,-5.610012,3.908733,19.716177,...,-15.076990,16.637657,-24.203821,-32.125770,20.503872,8.191055,-21.036220,7.089907,-36.372562,30.535755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951457,-11.197046,15.945390,-7.822735,5.361244,-5.988432,-4.684734,-24.976837,-25.812449,3.384443,12.109298,...,-8.016763,40.145485,-29.814005,7.953031,0.030010,-3.834618,7.303320,1.404764,-11.006529,-21.452850
969476,-14.371459,-32.201640,-9.769582,-4.832557,-4.457313,-2.332178,-0.518999,-1.780267,4.463874,-5.415079,...,-4.825460,54.748110,14.497149,-0.008737,6.154894,-26.746216,7.875990,-12.498604,24.116000,-11.937415
971160,-29.159193,1.885226,5.484753,7.527430,-0.984326,-11.456272,-16.741371,-18.689064,22.222822,15.781728,...,0.445878,51.532690,-6.050777,6.578206,16.972713,-25.056103,-5.070956,-3.533446,21.689419,-4.805591
973770,-26.779343,-12.743442,-0.334544,4.370245,-7.457458,-26.709127,-1.129493,19.687523,-2.419286,-1.394417,...,-1.526262,44.311092,39.789936,4.813432,13.828463,-61.205864,0.821003,-8.044207,5.010616,-5.931845


### Calculation of the distances between the different twins

First, the monozygous twins.

In [112]:
# Average distance between the two members of each monozygous twin pair.
# The 'sqeuclidean' metric is the SQUARED Euclidean distance, so the printed
# message labels it as such. MZ_mean_eucli_dist keeps its historical name
# because other cells may refer to it, but it holds the mean squared distance.
MZ_pair_cos_dist = []
MZ_pair_sqeucli_dist = []

for twin0, twin1 in list_MZ_twin_id:
    # Look up both embeddings once and reuse them for the two metrics
    pair_embeddings = [embeddings_HCP.loc[twin0].values, embeddings_HCP.loc[twin1].values]
    # pdist on two rows returns a one-element array: keep that single distance
    MZ_pair_cos_dist.append(distance.pdist(pair_embeddings, 'cosine')[0])
    MZ_pair_sqeucli_dist.append(distance.pdist(pair_embeddings, 'sqeuclidean')[0])

# Separate names for the per-pair lists and the scalar means, so a name never
# changes type within the cell
MZ_mean_cos_dist = np.mean(MZ_pair_cos_dist)
MZ_mean_eucli_dist = np.mean(MZ_pair_sqeucli_dist)

print(f'The average cosine distance for the monozygous twin in the 256 dim latent space is {MZ_mean_cos_dist}')
print(f'The average squared euclidean distance for the monozygous twin in the 256 dim latent space is {MZ_mean_eucli_dist}')

The average cosine distance for the monozygous twin in the 256 dim latent space is 0.612746545700147
The average euclidean distance for the monozygous twin in the 256 dim latent space is 110179.09417381036


Then, the dizygous twins.

In [113]:
# Average distance between the two members of each dizygous twin pair.
# The 'sqeuclidean' metric is the SQUARED Euclidean distance, so the printed
# message labels it as such. DZ_mean_eucli_dist keeps its historical name
# because other cells may refer to it, but it holds the mean squared distance.
DZ_pair_cos_dist = []
DZ_pair_sqeucli_dist = []

for twin0, twin1 in list_DZ_twin_id:
    # Look up both embeddings once and reuse them for the two metrics
    pair_embeddings = [embeddings_HCP.loc[twin0].values, embeddings_HCP.loc[twin1].values]
    # pdist on two rows returns a one-element array: keep that single distance
    DZ_pair_cos_dist.append(distance.pdist(pair_embeddings, 'cosine')[0])
    DZ_pair_sqeucli_dist.append(distance.pdist(pair_embeddings, 'sqeuclidean')[0])

# Separate names for the per-pair lists and the scalar means, so a name never
# changes type within the cell
DZ_mean_cos_dist = np.mean(DZ_pair_cos_dist)
DZ_mean_eucli_dist = np.mean(DZ_pair_sqeucli_dist)

print(f'The average cosine distance for the dizygous twin in the 256 dim latent space is {DZ_mean_cos_dist}')
print(f'The average squared euclidean distance for the dizygous twin in the 256 dim latent space is {DZ_mean_eucli_dist}')

The average cosine distance for the dizygous twin in the 256 dim latent space is 0.5938378898840727
The average euclidean distance for the dizygous twin in the 256 dim latent space is 108341.2933354252


Then, we can get the average distance between all the representations.

In [104]:
# Pairwise distances between every pair of rows of the embeddings table.
# Note: 'sqeuclidean' is the squared Euclidean distance.
all_embeddings = embeddings_HCP.values
cosine_dist_overall = distance.pdist(all_embeddings, metric='cosine')
eucli_dist_overall = distance.pdist(all_embeddings, metric='sqeuclidean')

In [110]:
# Mean of all pairwise cosine distances.
all_cos_dist_mean = cosine_dist_overall.mean()
all_cos_dist_mean

0.6553706780484043

In [109]:
# Mean of all pairwise 'sqeuclidean' (squared Euclidean) distances.
all_eucli_dist_mean = eucli_dist_overall.mean()
all_eucli_dist_mean

119289.95689916781

And compute the relative difference between the average distance and the specific twins' distances. 

In [114]:
# Relative reduction of the mean cosine distance for monozygous pairs
# compared with the mean over all pairs of subjects.
MZ_cos_gap = all_cos_dist_mean - MZ_mean_cos_dist
MZ_cos_gap / all_cos_dist_mean

0.0650382047533552

In [115]:
# Relative reduction of the mean cosine distance for dizygous pairs
# compared with the mean over all pairs of subjects.
DZ_cos_gap = all_cos_dist_mean - DZ_mean_cos_dist
DZ_cos_gap / all_cos_dist_mean

0.09389005371367397