In [4]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score

def simulate_nd_space_and_regression(n_dimensions, p_points):
    # Simulate an N-dimensional space with p points, each dimension normally distributed
    X = np.random.normal(size=(p_points, n_dimensions))
    
    # Simulate a response variable Y, which is also normally distributed and independent from X
    Y = np.random.normal(size=p_points)

    # Perform linear regression
    #model = LinearRegression()
    model = SVR(kernel='linear',max_iter=1000,
                        C=0.01)
    model.fit(X, Y)

    # Predict and compute the R^2 score (correlation coefficient squared)
    Y_pred = model.predict(X)
    r2 = r2_score(Y, Y_pred)
    
    # Compute the correlation coefficient (sqrt of R^2)
    correlation_coefficient = np.sqrt(r2)
    
    return r2, correlation_coefficient

# Example usage
n_dimensions = 256  # Number of dimensions
p_points = 300    # Number of points

r2, correlation_coefficient = simulate_nd_space_and_regression(n_dimensions, p_points)

print(f"R^2 Score: {r2}")
print(f"Correlation Coefficient: {correlation_coefficient}")

R^2 Score: 0.6116277684913793
Correlation Coefficient: 0.7820663453258805




In [None]:
## NB : les dimensions ne sont pas complètement indépendantes
## -> Use the real distribution and shuffle the labels.
## -> Is it better with V0 than V1 ?
## But there is a train / test and a cross validation ...

In [20]:
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

def simulate_nd_space_and_regression(n_dimensions, p_points):
    # Simulate an N-dimensional space with p points, each dimension normally distributed
    X = np.random.normal(size=(p_points, n_dimensions))
    
    # Simulate a response variable Y, which is also normally distributed and independent from X
    Y = np.random.normal(size=p_points)

    # Perform regression using Support Vector Regression (SVR)
    model = SVR(kernel='linear')

    # Perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        r2 = r2_score(Y_test, Y_pred)
        r2_scores.append(r2)
        print(r2)

    # Compute the mean R^2 score
    mean_r2 = np.mean(r2_scores)

    return mean_r2

# Example usage
n_dimensions = 256  # Number of dimensions
p_points = 300    # Number of points

mean_r2 = simulate_nd_space_and_regression(n_dimensions, p_points)

print(f"Mean R^2 Score from 10-fold CV: {mean_r2}")


-22.691949519898582
-17.56000277866854
-4.69426493175534
-8.425072063814703
-8.21757808731576
-5.158715697133647
-11.166528390839716
-6.078174740630961
-5.979388901874851
-16.37862284398496
Mean R^2 Score from 10-fold CV: -10.635029795591706


In [13]:
## perform CCA !
import pandas as pd
import os
import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA

In [17]:
#embs = pd.read_csv('/neurospin/dico/jlaval/Output/SC-sylv_12-16/16-38-32_170/troiani_custom_embeddings/custom_cross_val_embeddings.csv')
#embs = pd.read_csv('/neurospin/dico/jlaval/Output/SC-sylv_12-16/16-39-27_30/troiani_custom_embeddings/custom_cross_val_embeddings.csv')
#embs = pd.read_csv('/neurospin/dico/jlaval/Output/SC-sylv_left_v1/keep_bottom/troiani_custom_embeddings/custom_cross_val_embeddings.csv') ## V1 before even changing cutin prop
embs = pd.read_csv('/neurospin/dico/jlaval/Output/SC-sylv_left_v1/no_keep_bottom/troiani_custom_embeddings/custom_cross_val_embeddings.csv') ## same here = best ?
#embs = pd.read_csv('/volatile/jl277509/Runs/02_STS_babies/Program/Output/SC-sylv_isomaps/16-40-54_148/troiani_embeddings/custom_cross_val_embeddings.csv')

In [18]:
isomap = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/hcp/hcp_isomap_labels.csv')
embs.columns=['Subject']+embs.columns[1:].tolist()
merged = pd.merge(isomap, embs)
cols_embs = [f'dim{k}' for k in range(1,33)]
cols_iso = [f'Isomap_central_left_dim{k}' for k in range(1,7)]
embs, isomap = merged[cols_embs].to_numpy(), merged[cols_iso].to_numpy()

In [153]:
# standardized ?
for k in range(6):
    print(np.mean(isomap[:,k]), np.std(isomap[:,k]))

-0.3810063460197253 2.8785028180120236
-0.5561476711573823 2.568573698964332
-0.6039396072569791 2.2065305357893705
-0.36907427158893086 2.024278734702382
0.18555799005356918 1.6367642162976548
0.09503153515159224 1.5360945034042632


In [154]:
pca = PCA(n_components=32)
arr = pca.fit_transform(embs)
pearsonr(arr[:, 0],arr[:, 26])

(-3.469446951953614e-18, 0.9999999999997334)

NB : Isomap components are not standardized -> should they be ?
does the variance reflect the importance of the component ?

In [155]:
# are they uncorrelated ?
# and : does orthogonal directions imply independance of the obtained variables ?
from scipy.stats import pearsonr
pearsonr(isomap[:,0], isomap[:,1])
# they seem uncorelated

(-0.0651325446315982, 0.22286584578684657)

In [19]:
cca = CCA(n_components=len(cols_iso), scale=False) ## need to set scale to false !!
cca.fit_transform(embs, isomap)
print(cca.score(embs, isomap))
## NB: faut-il faire un train val test pour avoir le vrai score ??

0.5576301757276653


In [6]:
isomap.shape

(352, 6)

In [157]:
# compare to a random shuffle of the labels
scores = []
for k in range(100):
    np.random.shuffle(isomap)
    cca.fit_transform(embs, isomap)
    scores.append(cca.score(embs, isomap))
print(np.mean(scores), np.std(scores), np.quantile(scores, 0.95))

0.06584258093794765 0.005963065767634256 0.07521839539895217


In [105]:
## Try only one component = find best correlation between one direction in latent space and one direction in isomap space
cca = CCA(n_components=1, scale=False)
cca.fit(embs, isomap)
print(cca.score(embs, isomap))

embs_c, isomap_c = cca.transform(embs, isomap)
print(pearsonr(embs_c.reshape(-1), isomap_c.reshape(-1)))

0.02489570350657903
(0.4289645935117821, 3.4407495237434334e-17)


In [22]:
np.std(embs_c)

0.22627644121106594

In [240]:
len(cols_iso)data/deep_folding/current/datasets/hcp/hcp_isomap_labels.csv')
    embs.columns=['Subject']+embs.columns[1:].tolist()
    merged = pd.merge(isomap, embs)
    cols_embs = [f'dim{k}' for k in range(1,33)]
    cols_iso = [f'Isomap_central_left_dim{k}' for k in range(1,7)]
    embs, isomap = merged[cols_embs].to_numpy(), merged[cols_iso].to_numpy()
    cca = CCA(n_components=len(cols_iso))
    cca.fit_transform(embs, isomap)
    print(cca.score(embs, isomap))

    # compare to a random shuffle of the labels
    scores = []
    for k in range(100):
        np.random.shuffle(isomap)
        cca.fit_transform(embs, isomap)
        scores.append(cca.score(embs, isomap))
    print(np.mean(scores), np.std(scores), np.quantile(scores, 0.95))

0.02745890704373823
0.00974655562484734 0.0008706855078084492 0.011143903083616112
0.20040945337058655




0.028225937581078372 0.0024735042060606473 0.032371931838130426
0.1643822995104394
0.02826037617676207 0.002653058806466211 0.03351423360543796
0.18972458796250047
0.028944428950306113 0.0028124928247359984 0.034052614511771356
0.18483657268917217
0.03045062065944876 0.0029466681966070314 0.03587173810740223
0.21535903953819746
0.032956817245809254 0.003251310003655106 0.03786984345905486


In [153]:
## What happens if perfect correlation ?
#embs[:, :6] = isomap
# get a perfect score of 1. (checked)

In [7]:
## CCA between latent spaces ?
embs1 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/no_cutout/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()
embs2 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/no_cutout_4/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()

embs1 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/all_augms_0/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()
embs2 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/all_augms_4/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()

embs1 = pd.read_csv('/neurospin/dico/jlaval/Output/FIP_right_regular_augm/cutin_80_2/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()
embs2 = pd.read_csv('/neurospin/dico/jlaval/Output/FIP_right_regular_augm/cutin_80_3/FIP_right_custom_embeddings/custom_cross_val_embeddings.csv').to_numpy()

#embs1 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/no_cutout/42433_ukb_FIP_right_random_embeddings/full_embeddings.csv').to_numpy()[:,1:]
#embs2 = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/no_cutout_2/42433_ukb_FIP_right_random_embeddings/full_embeddings.csv').to_numpy()[:,1:]

In [8]:
cca = CCA(n_components=32)
cca.fit(embs1, embs2)
print(cca.score(embs1, embs2))

scores=[]
for k in range(1):
    np.random.shuffle(embs1)
    cca.fit(embs1, embs2)
    scores.append(cca.score(embs1, embs2))
print(np.mean(scores))



0.8923692710910487
-0.7826809260576133


In [None]:
## A calculer sur UKB plutôt !!

In [2]:
## CCA between latent space and morphometric measurements
# NB : in high dimension, is imposing full orthogonality between variables the best way to proceed ? Will we underestimate the correlation ?
# CCA seems better than linear CKA for this specific study ! CKA is better for comparing latent spaces. What about isomap ? Unclear yet.
# indeed, for morpho, the morpho params may not be of high importance in the latent space, so the rescaling seems important to catch them ?

# seems like the values are very low anyway, so it is not interesting to compare the spaces linearly ...
# no significant decorrelation with depth using trimdepth ...

In [64]:
morpho = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/UkBioBank/sulcal_morphometry/UKB_21045_FIP_right.csv')

In [233]:
embeddings = pd.read_csv('/neurospin/dico/jlaval/Output/ABLATION_FIP/no_trimdepth_4/42433_ukb_FIP_right_random_embeddings/full_embeddings.csv')

In [234]:
embeddings = embeddings.loc[embeddings['ID'].isin(morpho['Subject'])]
morpho = morpho.loc[morpho['Subject'].isin(embeddings['ID'])]

In [235]:
X = embeddings.to_numpy()[: ,1:]
y = morpho.to_numpy()[:, 4:].astype('float')
#y = morpho[['surface_talairach', 'meandepth_talairach', 'hull_junction_length_talairach']].to_numpy()
#y = morpho[['meandepth_talairach']].to_numpy()

In [236]:
cca = CCA(n_components=y.shape[1], scale=True)
cca.fit(X, y)
print(cca.score(X, y))

-0.25296534738801907


In [209]:
# each var separately
for var in morpho.columns[4:]:
    print(var)
    y = morpho[[var]].to_numpy()
    cca = CCA(n_components=y.shape[1], scale=True)
    cca.fit(X, y)
    print(cca.score(X, y))

surface_talairach
0.024227285168421542
surface_native
0.02662625490004411
maxdepth_talairach
0.04313500532838188
maxdepth_native
0.03441156259702827
meandepth_talairach
0.1550896720193533
meandepth_native
0.13559090919978345
hull_junction_length_talairach
0.05405337534768939
hull_junction_length_native
0.06274162118002058
GM_thickness
0.012143048865606909
opening
0.011950139905762769


In [80]:
# now restrain to some variables
y.shape

(21001, 3)