In [1]:
import os
import torch
import json
import numpy as np
import numpy.random as rd
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')

from pandas.plotting import scatter_matrix
from scipy.spatial import distance
from scipy.stats import percentileofscore

## Load the embeddings

In [4]:
# Location of the first latent space (a CSV file; its first column is used as the row index).
path1 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/pca/30/Run1/pca_embeddings.csv"

embeddings1 = pd.read_csv(path1, index_col=0)

print(embeddings1.head())

# Second "space" for a sanity check: the sign-flipped copy of the first one.
# Negating every coordinate leaves all pairwise Euclidean distances unchanged,
# so a distance-based comparison of the two spaces is expected to report 0.
embeddings2 = -embeddings1
embeddings2.head()

                    dim1        dim2        dim3        dim4        dim5  \
Subject                                                                    
1-03HUBJO_t0  -53.579821  283.674235 -289.220584  -39.377640   31.373385   
1-08ANDTI_t0  304.421732  221.746751  212.028836   33.319015 -129.834469   
1-11LEBJO_t0  229.580833  287.437274   26.512003 -120.822792   20.726943   
1-15LEHMI_t0  111.156533    3.509716  -62.315849  -17.678537 -288.433520   
1-17COLMA_t0 -155.806756 -150.651878 -204.962518 -183.430474 -253.461319   

                    dim6        dim7        dim8        dim9       dim10  ...  \
Subject                                                                   ...   
1-03HUBJO_t0 -158.339627    9.670317  152.284108  -65.771396 -149.046171  ...   
1-08ANDTI_t0   21.031527  119.441280  -10.692840   71.951305  124.713433  ...   
1-11LEBJO_t0    5.467387  187.373954  122.830636  -23.879877  -63.080933  ...   
1-15LEHMI_t0    9.222002  -45.920334  106.299119  199.909948  

Unnamed: 0_level_0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim21,dim22,dim23,dim24,dim25,dim26,dim27,dim28,dim29,dim30
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-03HUBJO_t0,53.579821,-283.674235,289.220584,39.37764,-31.373385,158.339627,-9.670317,-152.284108,65.771396,149.046171,...,-87.509182,-53.117284,24.258174,48.82337,-3.300542,-7.509744,63.925171,72.504643,-102.564855,89.569466
1-08ANDTI_t0,-304.421732,-221.746751,-212.028836,-33.319015,129.834469,-21.031527,-119.44128,10.69284,-71.951305,-124.713433,...,22.659114,-77.277498,-63.190227,85.281956,116.815156,28.785544,19.209134,-141.514044,138.735023,57.747222
1-11LEBJO_t0,-229.580833,-287.437274,-26.512003,120.822792,-20.726943,-5.467387,-187.373954,-122.830636,23.879877,63.080933,...,50.180483,-72.835848,-171.984233,119.051368,50.734883,-130.860103,-42.828971,141.733466,123.745309,17.192685
1-15LEHMI_t0,-111.156533,-3.509716,62.315849,17.678537,288.43352,-9.222002,45.920334,-106.299119,-199.909948,-192.257399,...,37.666366,51.47822,109.85124,106.756577,60.081274,-94.72926,40.706943,17.842731,84.573856,24.888063
1-17COLMA_t0,155.806756,150.651878,204.962518,183.430474,253.461319,89.578872,43.5014,11.313596,-115.759039,-53.291902,...,63.945028,59.527924,44.080342,7.834308,-55.770208,86.571211,54.449158,69.07724,24.373003,-94.710483


In [13]:
# Inspect the full embedding vector (all columns) of one subject in the first space.
embeddings1.loc['1-17COLMA_t0', :]

dim1    -155.806756
dim2    -150.651878
dim3    -204.962518
dim4    -183.430474
dim5    -253.461319
dim6     -89.578872
dim7     -43.501400
dim8     -11.313596
dim9     115.759039
dim10     53.291902
dim11   -199.851824
dim12    -38.700631
dim13     67.739853
dim14    -59.144676
dim15    179.745536
dim16     37.512961
dim17    -53.069641
dim18    110.252272
dim19    129.498434
dim20     66.243626
dim21    -63.945028
dim22    -59.527924
dim23    -44.080342
dim24     -7.834308
dim25     55.770208
dim26    -86.571211
dim27    -54.449158
dim28    -69.077240
dim29    -24.373003
dim30     94.710483
Name: 1-17COLMA_t0, dtype: float64

In [8]:
# get the subject list

# small check before: both spaces must be indexed by exactly the same subjects.
# Note: `Index.all()` only tells whether every label is truthy, so comparing the
# two `.all()` results can never detect a mismatch; compare the labels themselves.
_same_subjects = (
    len(embeddings1.index) == len(embeddings2.index)
    and embeddings1.index.symmetric_difference(embeddings2.index).empty
)
if not _same_subjects:
    raise ValueError("The two spaces don't contain the same subjects")

list_subjects = embeddings1.index.astype('str')

list_subjects

Index(['1-03HUBJO_t0', '1-08ANDTI_t0', '1-11LEBJO_t0', '1-15LEHMI_t0',
       '1-17COLMA_t0', '1-18ROUCO_t0', '1-20FORHU_t0', '1-23CREES_t0',
       '1-40AYMJU_t0', '2-02FERMA_t0',
       ...
       'sub-inhibition20_ses-pretraining_T1w',
       'sub-inhibition21_ses-pretraining_T1w',
       'sub-inhibition22_ses-pretraining_T1w',
       'sub-inhibition23_ses-pretraining_T1w',
       'sub-inhibition24_ses-pretraining_T1w',
       'sub-inhibition25_ses-pretraining_T1w',
       'sub-inhibition26_ses-pretraining_T1w',
       'sub-inhibition28_ses-pretraining_T1w',
       'sub-inhibition29_ses-pretraining_T1w',
       'sub-inhibition31_ses-pretraining_T1w'],
      dtype='object', name='Subject', length=341)

In [24]:
def get_distance_matrix(emb, verbose=False):
    """Compute the pairwise Euclidean distance matrix between the rows of `emb`.

    Parameters
    ----------
    emb : pandas.DataFrame
        One row per subject and one numeric column per latent dimension.
        It should not contain the `nn` nor the `min_dist` columns.
    verbose : bool, default False
        If True, print every row of `emb` (as a one-row DataFrame).

    Returns
    -------
    pandas.DataFrame
        Square frame whose rows and columns are both labelled with `emb.index`.
        Entry `[i, j]` is the Euclidean distance between subjects `i` and `j`;
        the column of a subject therefore holds its distances to everybody.
        An empty DataFrame is returned when `emb` has no rows.
    """
    if len(emb.index) == 0:
        return pd.DataFrame()

    if verbose:
        for idx in emb.index:
            print(emb[emb.index == idx])

    # One vectorised call instead of n * n Python-level euclidean() calls.
    # It also avoids passing a 2-D (1, d) array to distance.euclidean, which
    # only accepts 1-D vectors in recent SciPy versions.
    coords = emb.to_numpy(dtype=float)
    pairwise = distance.cdist(coords, coords, metric="euclidean")

    return pd.DataFrame(pairwise, index=emb.index, columns=list(emb.index))

In [25]:
# Pairwise distance matrices between subjects, one per latent space.
dist_mat_1 = get_distance_matrix(embeddings1)
dist_mat_2 = get_distance_matrix(embeddings2)

In [29]:
# Subjects ordered by increasing distance to '1-03HUBJO_t0' in the first space.
dist_mat_1['1-03HUBJO_t0'].sort_values().index

Index(['1-03HUBJO_t0', 'e0071_t1_s03', 'nih_chp_01746_t1', 'nih_chp_05041_t1',
       'nih_chp_05167_t1', 'a0011_t1_s03', 'nih_chp_04282_t1',
       'nih_chp_05651_t1', 'nih_chp_05192_t1', 'nih_chp_04633_t1',
       ...
       'sub-inhibition20_ses-pretraining_T1w', '1-20FORHU_t0',
       'nih_chp_02741_t1', 'sub-inhibition25_ses-pretraining_T1w',
       'a0024_t1_s03', 'a0039_t1_s03', 'nih_chp_01161_t1', 'a0022_t1_s03',
       'nih_chp_04572_t1', 'nih_chp_00901_t1'],
      dtype='object', name='Subject', length=341)

In [52]:
def get_percentile_matrix(dist_mat, verbose=False):
    """Turn a distance matrix into a matrix of neighbour-rank percentiles.

    The rankings for a given subject are stored in a column (not a line):
    in column `sj`, the row `neighbour` holds the rank of `neighbour` among
    all subjects sorted by increasing distance to `sj`, rescaled to
    `rank * 100 / n_subjects` (the closest entry gets 0).

    Parameters
    ----------
    dist_mat : pandas.DataFrame
        Distance matrix; each column holds the distances from one subject
        to every subject (rows).
    verbose : bool, default False
        Accepted for signature consistency with the other helpers; unused.

    Returns
    -------
    pandas.DataFrame
        Frame with the same row and column labels as `dist_mat`.

    Notes
    -----
    The subject labels are taken from `dist_mat` itself rather than from a
    notebook-level variable, so the result cannot go stale when other
    embeddings are loaded. Ties are broken by row order (stable sort).
    """
    n_sj = len(dist_mat.index)
    values = dist_mat.to_numpy(dtype=float)

    # argsort gives, per column, the row positions sorted by distance;
    # the argsort of that permutation is its inverse, i.e. the rank of each row.
    order = np.argsort(values, axis=0, kind="stable")
    ranks = np.argsort(order, axis=0, kind="stable")

    ranking_mat = pd.DataFrame(ranks, index=dist_mat.index, columns=dist_mat.columns)

    return ranking_mat * 100 / n_sj

In [51]:
# Convert each distance matrix into neighbour-rank percentiles, then display the first one.
perc_mat_1 = get_percentile_matrix(dist_mat_1)
perc_mat_2 = get_percentile_matrix(dist_mat_2)
perc_mat_1

Subject,1-03HUBJO_t0,1-08ANDTI_t0,1-11LEBJO_t0,1-15LEHMI_t0,1-17COLMA_t0,1-18ROUCO_t0,1-20FORHU_t0,1-23CREES_t0,1-40AYMJU_t0,2-02FERMA_t0,...,sub-inhibition20_ses-pretraining_T1w,sub-inhibition21_ses-pretraining_T1w,sub-inhibition22_ses-pretraining_T1w,sub-inhibition23_ses-pretraining_T1w,sub-inhibition24_ses-pretraining_T1w,sub-inhibition25_ses-pretraining_T1w,sub-inhibition26_ses-pretraining_T1w,sub-inhibition28_ses-pretraining_T1w,sub-inhibition29_ses-pretraining_T1w,sub-inhibition31_ses-pretraining_T1w
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-03HUBJO_t0,0.000000,93.255132,50.439883,94.134897,56.011730,36.363636,97.653959,43.401760,66.275660,86.803519,...,95.014663,68.914956,43.988270,89.736070,96.774194,99.120235,90.322581,70.674487,96.480938,68.035191
1-08ANDTI_t0,92.668622,0.000000,2.052786,19.354839,88.563050,96.774194,25.806452,60.997067,9.677419,80.351906,...,82.111437,42.228739,28.739003,83.870968,64.222874,49.560117,95.014663,91.788856,7.038123,89.149560
1-11LEBJO_t0,46.041056,3.519062,0.000000,49.560117,80.645161,83.284457,39.882698,34.604106,7.917889,64.222874,...,97.360704,49.266862,38.709677,67.155425,65.982405,87.096774,97.067449,95.307918,73.020528,72.140762
1-15LEHMI_t0,85.923754,23.753666,47.800587,0.000000,8.797654,64.222874,39.002933,31.964809,5.571848,31.671554,...,22.580645,69.208211,63.049853,46.627566,19.941349,18.768328,31.378299,64.516129,43.695015,85.337243
1-17COLMA_t0,71.260997,95.601173,90.615836,13.489736,0.000000,0.293255,74.486804,54.545455,65.395894,92.375367,...,26.686217,94.721408,95.894428,80.645161,82.697947,69.501466,20.821114,90.615836,87.683284,74.193548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-inhibition25_ses-pretraining_T1w,97.947214,49.853372,84.457478,18.768328,43.988270,66.862170,26.979472,22.287390,2.052786,42.815249,...,39.882698,74.780059,77.419355,82.111437,41.935484,0.000000,3.225806,85.630499,22.287390,93.255132
sub-inhibition26_ses-pretraining_T1w,93.255132,97.653959,98.826979,35.483871,10.557185,16.422287,50.733138,78.005865,72.727273,24.926686,...,43.401760,88.269795,90.322581,69.794721,77.126100,1.759531,0.000000,75.659824,68.914956,44.868035
sub-inhibition28_ses-pretraining_T1w,4.692082,12.903226,14.662757,1.759531,4.985337,4.398827,11.436950,7.624633,4.105572,6.451613,...,1.173021,4.105572,14.076246,1.173021,1.466276,7.331378,5.865103,0.000000,1.759531,3.225806
sub-inhibition29_ses-pretraining_T1w,89.442815,6.451613,52.785924,31.378299,51.319648,55.718475,56.011730,45.747801,62.170088,63.636364,...,18.475073,58.651026,34.604106,35.483871,14.956012,11.436950,47.507331,35.190616,0.000000,90.909091


In [59]:
def get_distance(ranking_matrix_1, ranking_matrix_2, fct=np.sqrt, ponderation=None, verbose=False):
    """Mean absolute gap between two ranking matrices after applying `fct`.

    Parameters
    ----------
    ranking_matrix_1, ranking_matrix_2 : pandas.DataFrame
        Ranking (percentile) matrices to compare.
    fct : callable, default np.sqrt
        Transformation applied to each matrix before taking the difference.
    ponderation, verbose :
        Accepted but currently not used by this function.

    Returns
    -------
    float
        Mean over all entries of `|fct(m1) - fct(m2)|`.
    """
    transformed_1, transformed_2 = fct(ranking_matrix_1), fct(ranking_matrix_2)
    absolute_gap = np.abs(transformed_1 - transformed_2)
    return absolute_gap.values.mean()

In [56]:
# Compare the two percentile matrices; the recorded result below is 0.0.
get_distance(perc_mat_1, perc_mat_2)

0.0

In [None]:
def custom_ln(x):
    """Return ln(1 + x), element-wise for array-like inputs.

    Unlike `np.sqrt`, this transformation is also defined (and equal to 0)
    at x = 0. `np.log1p` is used instead of `np.log(x + 1)` because it stays
    accurate when `x` is very small, where `x + 1` would round to exactly 1.
    """
    return np.log1p(x)

In [60]:
def latent_space_distance(emb1, emb2, fct=np.sqrt, ponderation=None, verbose=False):
    """Distance between two latent spaces based on their neighbour rankings.

    For each space, the pairwise distance matrix between subjects is computed
    and converted into neighbour-rank percentiles; the two percentile matrices
    are then compared with `get_distance`.

    Parameters
    ----------
    emb1, emb2 : pandas.DataFrame
        Embeddings of the same subjects in the two spaces (one row per
        subject, indexed by subject).
    fct : callable, default np.sqrt
        Transformation forwarded to `get_distance`.
    ponderation :
        Forwarded to `get_distance`.
    verbose : bool, default False
        Forwarded to the helper functions.

    Returns
    -------
    float
        The value returned by `get_distance` for the two percentile matrices.

    Raises
    ------
    ValueError
        If the two embeddings are not indexed by the same subjects.
    """
    same_subjects = (
        len(emb1.index) == len(emb2.index)
        and emb1.index.symmetric_difference(emb2.index).empty
    )
    if not same_subjects:
        raise ValueError("The two spaces don't contain the same subjects")

    # Use the arguments, not the notebook-level `embeddings1` / `embeddings2`,
    # so that the result actually depends on what the caller passes in.
    dist_mat_1 = get_distance_matrix(emb1, verbose=verbose)
    dist_mat_2 = get_distance_matrix(emb2, verbose=verbose)

    perc_mat_1 = get_percentile_matrix(dist_mat_1, verbose=verbose)
    perc_mat_2 = get_percentile_matrix(dist_mat_2, verbose=verbose)

    return get_distance(perc_mat_1, perc_mat_2, fct=fct, ponderation=ponderation, verbose=verbose)

In [61]:
# End-to-end run of the whole pipeline on the two spaces; the recorded result below is 0.0.
latent_space_distance(embeddings1, embeddings2)

0.0

In [62]:
# Compare the embeddings stored under two PCA folders (Run1 vs Run2).
path1 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/pca/30/Run1/pca_embeddings.csv"
path2 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/pca/30/Run2/pca_embeddings.csv"

# The first CSV column is used as the row index in both frames.
embeddings1 = pd.read_csv(path1, index_col=0)
embeddings2 = pd.read_csv(path2, index_col=0)

latent_space_distance(embeddings1, embeddings2)

0.12437126816237146

In [63]:
# for a good net
path1 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/convnet/no_foldlabel_30/13-15-48/cingulate_ACCpatterns_embeddings/full_embeddings.csv"
path2 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/convnet/no_foldlabel_30/16-39-35/cingulate_ACCpatterns_embeddings/full_embeddings.csv"

embeddings1 = pd.read_csv(path1, index_col=0)
embeddings2 = pd.read_csv(path2, index_col=0)

latent_space_distance(embeddings1, embeddings2)

1.296603049512705

In [64]:
# for a bad net
path1 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/densenet2/no_foldlabel_4/11-41-24/cingulate_ACCpatterns_embeddings/full_embeddings.csv"
path2 = "/neurospin/dico/agaudin/Runs/03_monkeys/Output/analysis_folders/densenet2/no_foldlabel_4/15-20-30/cingulate_ACCpatterns_embeddings/full_embeddings.csv"

embeddings1 = pd.read_csv(path1, index_col=0)
embeddings2 = pd.read_csv(path2, index_col=0)

latent_space_distance(embeddings1, embeddings2)

2.633784175344139