In [63]:
#from gcca import GCCA
import logging
# set log level
logging.root.setLevel(level=logging.INFO)
import numpy as np
from scipy import linalg,stats
from scipy.sparse.linalg import svds
# from sklearn.decomposition import TruncatedSVD
import glob
from tqdm import tqdm
from sklearn.preprocessing import normalize

from pathlib import Path
import re
import os
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## See https://github.com/rupy/GCCA

## Load Data

In [64]:
# Locations are expressed relative to the notebook's working directory.
basedir = Path('..')
# get_files() lists this directory and the loading cell reads its CSV files.
datadir = basedir.joinpath('data', 'raw')
correlation_dir = basedir.joinpath('data', 'interim', 'latents')

In [65]:
## Grab filenames
def get_files(level='(e|n)',
              subject='[0-9]{3}',
              task='.+?'):
    """List the CSV files in ``datadir`` that match a level/subject/task.

    File names are expected to look like
    ``<level>_sub-<subject>_ses-1_task-<task>.csv``.  Every argument is a
    regular-expression fragment that is spliced into that template, so a
    literal value (``'e'``) and a pattern (``'(e|n)'``) both work.

    Parameters
    ----------
    level : str
        Regex fragment for the prefix in front of ``_sub-``.
    subject : str
        Regex fragment for the subject identifier.
    task : str
        Regex fragment for the task name.

    Returns
    -------
    list of (str, str)
        ``(file_name, subject_id)`` pairs, sorted by file name.

    Notes
    -----
    Reads the module-level ``datadir``; it must be defined before calling.
    """
    # Raw f-string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape.  ``level`` is wrapped in a
    # non-capturing group and the subject gets a *named* group: with a
    # capturing ``level`` such as the default '(e|n)', ``group(1)`` would
    # otherwise return the level rather than the subject id.
    pattern = re.compile(
        rf'^(?:{level})_sub-(?P<subject>{subject})_ses-1_task-{task}\.csv'
    )

    files = []
    # os.listdir() returns entries in arbitrary order; sort so that slices of
    # the result (e.g. ``paths[:2]``) are reproducible across machines.
    for f in sorted(os.listdir(datadir)):
        match = pattern.search(f)
        if match:
            files.append((f, match.group('subject')))

    return files

In [4]:
# File-name prefixes and task labels that get_files() can be queried with.
levels = ['e', 'n']
tasks = ['restingstate', 'openmonitoring', 'compassion']

# Only the first level/task pair is loaded for now.  Sweeping every
# (task, level) combination, e.g. by looping over both lists, is still to do.
paths = get_files(task=tasks[0], level=levels[0])

In [5]:
# Read the first two matched files (parsed without a header row) into NumPy
# arrays, keeping the subject id of each one in a parallel list.
data, subjs = [], []
for path, subj in paths[:2]:
    data.append(pd.read_csv(datadir / path, header=None).to_numpy())
    subjs.append(subj)

## GCCA

In [None]:
def preprocess(x):
    """Z-score every row of ``x``, then remove the mean of every column.

    The rows are standardised with ``scipy.stats.zscore(x, axis=1)``; the
    column-wise mean of that standardised array is subtracted afterwards.
    Rows with zero variance come out of ``zscore`` as NaN and are not
    handled here.
    """
    row_scaled = stats.zscore(x, axis=1)
    column_means = np.mean(row_scaled, axis=0)
    return row_scaled - column_means

def gcca(data, rank_tolerance=None, n_components=None):
    """Project several views of the same samples onto shared GCCA components.

    Each view is preprocessed with ``preprocess`` (NaNs produced there are
    set to 0), reduced with a thin SVD, and the left singular vectors of all
    views are concatenated.  The leading right singular vectors of that
    concatenation define one projection matrix per view.

    Parameters
    ----------
    data : sequence of 2-D arrays
        One array per view with samples on the rows; every view must have
        the same number of rows (the per-view bases are concatenated
        column-wise).
    rank_tolerance : float, optional
        Keep, per view, the singular values strictly greater than this
        threshold.  Takes precedence over ``n_components``.
    n_components : int, optional
        Number of singular vectors to keep per view when ``rank_tolerance``
        is not given (capped at the number available).  If neither argument
        is given, all singular vectors are kept.

    Returns
    -------
    list of ndarray
        One ``(n_samples, d)`` projection per view, where ``d`` is the
        smallest rank retained across the views.  Columns are ordered from
        the largest to the smallest shared singular value.

    Raises
    ------
    ValueError
        If ``data`` is empty or a view retains no singular values.
    """
    if len(data) == 0:
        raise ValueError('gcca needs at least one view')
    n = data[0].shape[0]

    views = []  # preprocessed matrices; the projections are applied to these
    Uall = []
    Sall = []
    Vall = []
    ranks = []
    for x in tqdm(data):
        # Preprocess
        x = preprocess(x)
        x[np.isnan(x)] = 0
        views.append(x)

        # Thin SVD x = u @ diag(s) @ vt: u spans sample space (rows of x),
        # vt.T spans feature space (columns of x).
        u, s, vt = linalg.svd(x, full_matrices=False)

        # Dimensions to reduce to.  `is not None` so that a tolerance of 0
        # is honoured instead of being treated as "not given".
        if rank_tolerance is not None:
            rank = int(np.sum(s > rank_tolerance))
        elif n_components is not None:
            rank = min(int(n_components), len(s))
        else:
            rank = len(s)
        if rank < 1:
            raise ValueError(
                'no singular values retained for view %d' % len(ranks)
            )

        ranks.append(rank)
        Sall.append(s[:rank])
        Vall.append(vt.T[:, :rank])
        Uall.append(u[:, :rank])

    d = min(ranks)

    # Create a concatenated view of Us
    Uall_c = np.concatenate(Uall, axis=1)

    # SVD of the concatenated Us.  svds needs k < min(shape), so fall back to
    # a dense SVD when every available component is requested.
    if d < min(Uall_c.shape):
        _, sv, VVt = svds(Uall_c, d)
    else:
        _, sv, VVt = linalg.svd(Uall_c, full_matrices=False)
    # svds does not guarantee any ordering of its output; sort explicitly so
    # column 0 is the component with the largest singular value.
    order = np.argsort(sv)[::-1][:d]
    VV = VVt[order].T

    idx_end = 0
    projX = []
    for i, x in enumerate(views):
        # Rows of VV that belong to view i (each view owns ranks[i] rows).
        idx_start = idx_end
        idx_end = idx_start + ranks[i]
        # NOTE(review): sklearn's normalize defaults to axis=1, i.e. this
        # scales *rows* of the block; unit-norm components would need
        # axis=0 -- confirm which one is intended.
        VVi = normalize(VV[idx_start:idx_end, :], 'l2')
        # Compute the canonical projections
        A = np.sqrt(n - 1) * Vall[i]
        A = A @ linalg.solve(np.diag(Sall[i]), VVi)
        # Project the preprocessed view: A was derived from it, not from the
        # raw input.
        projX.append(x @ A)

    return projX

In [67]:
num_views = 2 # Number of subjects (views) to consider
rank_tolerance = 300 # NOTE(review): presumably the singular-value cutoff for gcca's rank_tolerance argument -- confirm at the call site