In [1]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import PredefinedSplit
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn import metrics

import numpy as np
import scipy.io as sio
import h5py

import time

## Prepare the data

config the path to data

In [21]:
all_mats_dir = '/home/marilia/Área de Trabalho/dados_acerta/projeto-cpm/matlab_matrix.mat'
# all_behav_dir = '/Users/siyuangao/Working_Space/fmri/data/HCP515/all_behav.mat'

In [23]:
# import mat4py as mat
# f = mat.loadmat(all_mats_dir)

file_temp = h5py.File(all_mats_dir, 'r') # this deals with -v7.3 .mat file
all_mats = file_temp['all_mats'] # all_mats here is assumed to be node*node*sub*task
all_mats = np.array(all_mats)
all_mats = np.transpose(all_mats,(3, 2, 1, 0))

file_temp = sio.loadmat(all_behav_dir)
all_behav = file_temp['all_behav']
all_behav = np.reshape(all_behav, (-1,)) # convert all_behav to (num_sub, ) to remove the warning when testing the model

print(np.shape(all_mats))

OSError: Unable to open file (file signature not found)

remove missing nodes

In [4]:
missing_nodes = np.array([249, 239, 243, 129, 266, 109, 115, 118, 250]) - 1
all_mats = np.delete(all_mats, missing_nodes, 0)
all_mats = np.delete(all_mats, missing_nodes, 1)

get the task subset

In [5]:
task_subset = [0, 3, 4, 5, 6, 7, 8]
all_mats = all_mats[:, :, :, task_subset]

In [6]:
num_node = np.shape(all_mats)[0]
num_sub = np.shape(all_mats)[2]
num_task = np.shape(all_mats)[3]
num_edge = num_node * (num_node-1) // 2
num_edge_total = num_edge * num_task

vectorize the matrix to edge vector

In [7]:
all_edges = np.zeros([num_edge, num_sub, num_task])
iu1 = np.triu_indices(num_node, 1)
for i in range(num_sub):
    for j in range(num_task):
            all_edges[:, i, j] = all_mats[iu1[0], iu1[1], i, j]
all_edges = np.transpose(all_edges, (2, 0, 1))
all_edges = np.reshape(all_edges, [-1, num_sub])

data size

In [8]:
print(all_edges.shape)
print(all_behav.shape)

(233877, 515)
(515,)


# cross-validation on the model specified

model parameter

In [35]:
pct = 0.1 # percent of edges kept in feature selection
alphas = 10**np.linspace(10,-2,100)*0.5 # specify alphas to search

ridgeCPM pipeline

In [36]:
rg_grid = GridSearchCV(Ridge(normalize=False), cv=10, param_grid={'alpha':alphas}, iid=False)

reg = Pipeline([
  ('feature_selection', SelectPercentile(f_regression, percentile=pct)),
  ('regression', rg_grid)
])

cv10 = KFold(n_splits=10, random_state=665)
# rpcv10 = RepeatedKFold(n_splits=3,n_repeats=3, random_state=665) # repeated kfolds

actually run the model

n_jobs specify how many cpus to use

In [38]:
start = time.time() # time the function
all_pred = cross_val_predict(reg, all_edges.T, all_behav, cv=cv10, n_jobs=4)
# all_score = cross_val_score(reg, all_edges.T, all_behav, cv=rpcv10, n_jobs=1) # repeated kfolds
end = time.time()
print(end - start) # print function running time

63.56466507911682


calculate $r_{pearson}$

In [20]:
print(np.corrcoef(all_pred.T, all_behav.T)[0, 1])

0.35748236279252066


$\sqrt{R^2_CV}$

In [29]:
print(np.sqrt(1-sum((all_pred-all_behav)**2)/sum((all_behav-np.mean(all_behav))**2)))

0.32447572830356003


prediction value

In [30]:
print(all_pred)

[20.03525739 15.90117497 15.03081482 18.46382109 14.17526729 17.64310324
 17.58901274 17.73034345 18.49797281 16.79043392 18.27437123 16.67557594
 22.88498476 23.1190766  18.81550042 15.68032845 15.97972081 19.96044685
 19.13332332 18.4704561  18.67846594 22.33777777 17.3352461  16.46982015
 12.64962264 12.39842116 17.62353429 21.48962885 19.40773916 15.0545446
 16.51471586 14.41225957 20.45695746 19.78434082 15.78096675 17.42646539
 20.9865226  16.58951984 18.87677541 15.1536608  18.46408798 15.66404946
 16.07909494 18.4802629  16.04743349 15.01889166 18.19052867 12.2435229
 15.00114537 21.82317657 17.00633924 15.71616336 19.57276883 15.85589195
 18.23546357 21.10437257 18.58829871 16.21936743 13.67410831 13.67119696
 15.09569609 16.24075323 20.62751228 18.26220947 19.05424565 13.41354428
 20.52551669 16.44259875 18.79561997 16.49914302 18.35991186 18.43281382
 16.97333867 15.58414593 18.07898256 18.83760181 23.12626833 18.63136855
 15.64890149 14.31566541 19.45004092 19.81222772 15.9

# Backup test code

specify cv index for test only (optional)

In [None]:
# cv_idx=[5,9,1,2,7,7,4,7,1,2,4,3,5,7,4,3,1,8,7,2,7,3,9,6,7,6,3,2,6,1,4,5,6,4,3,2,8,5,3,4,4,9,4,7,2,1,6,2,1,5,4,4,2,9,5,1,9,3,2,7,7,4,7,2,10,9,3,4,6,3,5,10,9,8,7,10,6,1,5,7,4,7,3,3,1,1,9,5,2,4,2,3,4,3,1,2,2,10,10,2,8,6,9,3,7,5,3,3,10,4,4,3,2,6,7,2,1,10,8,6,5,9,8,3,2,8,9,8,2,3,7,7,4,10,6,2,10,7,10,10,3,10,3,8,1,10,6,2,10,8,4,9,7,10,6,6,9,7,5,6,7,10,6,6,2,5,9,6,3,4,8,7,2,9,9,5,8,10,1,3,4,2,7,8,1,7,4,8,5,9,3,10,10,3,3,8,5,7,9,8,8,10,3,8,3,1,5,4,4,5,4,8,7,8,7,1,7,6,2,7,10,9,1,6,1,8,4,4,5,3,10,10,1,5,10,8,10,4,5,2,9,8,8,5,2,6,3,6,1,10,4,1,9,6,9,8,5,1,4,3,8,1,2,6,6,1,3,5,7,10,9,6,4,9,4,7,9,10,2,3,8,10,1,1,7,6,1,9,10,9,8,9,1,6,6,2,1,1,8,8,9,4,1,6,5,9,9,10,5,1,1,10,4,7,6,9,5,3,7,6,1,6,10,8,4,5,2,2,3,6,2,3,3,5,10,1,5,7,9,7,10,2,3,5,7,1,6,9,1,9,10,3,5,5,1,9,1,5,9,7,7,3,6,5,8,9,7,4,7,6,8,2,8,8,5,10,3,6,8,8,5,10,2,1,10,1,1,4,9,4,4,1,8,1,8,2,3,5,2,1,7,8,9,5,10,8,8,10,10,5,4,4,9,2,3,6,2,7,6,3,7,7,9,4,9,10,6,8,5,8,9,6,4,9,5,4,1,6,7,2,4,2,2,9,7,8,9,8,9,5,1,10,7,4,4,10,10,5,8,8,6,2,6,1,8,1,10,4,3,6,3,10,3,10,10,4,2,9,5,8,1,5,5,3,10,5,2,2,4,2,9,3,6,5,8,8,3,7,3,6,6,4,7,3,5,2,6,2,4,10,6,9,2,5,1]
# cv_idx = [temp-1 for temp in cv_idx]
# ps = PredefinedSplit(cv_idx)