# Generalized Canonical Correlation Analysis Demo Pipeline

Note: this notebook requires the [mvlearn](https://mvlearn.netlify.com/) package. To install the most up-to-date version from source, run the following commands (this assumes that `git` and `pip` are installed):

- `git clone https://github.com/neurodata/mvlearn.git`
- `cd mvlearn`
- `pip install -e .`

In [34]:
import numpy as np
from mvlearn.embed.gcca import GCCA
from pathlib import Path
import pandas as pd
import os
import re
import sys

In [19]:
## Returns files in a folder that end in '.<ftype>'. Drops the extension letters but keeps the trailing '.'
def get_files(path, ftype='csv'):
    """
    List the files in ``path`` whose name ends with ``.{ftype}``.

    Parameters
    ----------
    path : str or path-like
        Directory to search (passed to ``os.listdir``).
    ftype : str, default='csv'
        File extension to match, without the leading dot.

    Returns
    -------
    files : list of str
        Sorted matching file names with the ``ftype`` characters removed.
        The trailing '.' is kept (e.g. 'sub1.csv' -> 'sub1.'), so the
        original file name is recovered with ``f'{name}{ftype}'``.
    """
    # Match the extension literally and only at the very end of the name, so
    # that names such as 'sub1.csv.bak' are not picked up and mis-truncated.
    suffix = f'.{ftype}'
    # os.listdir gives no ordering guarantee; sort for a reproducible order.
    files = sorted(f[:-len(ftype)] for f in os.listdir(path) if f.endswith(suffix))
    return files

In [20]:
## Loads one headerless CSV file as a numpy array
def read_file(path):
    """
    Read the CSV file at ``path`` and return its contents as a numpy array.

    The file is parsed with ``header=None``, so the first line is treated as
    data rather than as column names.
    """
    frame = pd.read_csv(path, header=None)
    return frame.to_numpy()

In [21]:
## Scree plot cutoff; passed to GCCA as n_elbows
n_elbows = 3

## Define paths to data
## NOTE: machine-specific absolute path; edit basedir to point at your own data
basedir = Path('/mnt/ssd3/ronan/data')
datadir = basedir / 'raw'

## Output folder for the per-subject gradients
savedir = basedir / 'Demo_Gradients'

## Create the output folder (and any missing parents); no error if it already exists
savedir.mkdir(parents=True, exist_ok=True)

## Load filenames and raw data

In [23]:
## Extension of the raw data files
ftype = 'csv'
## Names come back with the trailing '.', so appending ftype rebuilds each file name
filenames = get_files(path=datadir, ftype=ftype)
## One array per file, in the same order as filenames
raw_data = list(map(read_file, (datadir / f'{stem}{ftype}' for stem in filenames)))

In [33]:
## Summarise the loaded data: number of files read and the shape of the first matrix
print(f'Number of subjects is {len(raw_data)}')
print(
    f'Each subject data matrix has {raw_data[0].shape[0]} rows (cortical vertices) '
    f'and {raw_data[0].shape[1]} columns (timesteps)'
)

Number of subjects is 228
Each subject data matrix has 18715 rows (cortical vertices) and 300 columns (timesteps)


## Calculate gradients per subject and save them

In [35]:
## Create a GCCA object using the n_elbows cutoff, then fit it to the
## list of subject matrices and transform them in a single call
gcca = GCCA(n_elbows=n_elbows)
gradients = gcca.fit_transform(raw_data)

## Report the smallest value in the fitted ranks_ attribute
print(f'Min rank: {min(gcca.ranks_)}')

Min rank: 4


In [None]:
## Write each gradient matrix to savedir as a comma-delimited text file
for fname, gradient in zip(filenames, gradients):
    # fname is joined directly with 'csv'; no separator is added here
    save_path = savedir.joinpath(f'{fname}csv')
    np.savetxt(save_path, gradient, delimiter=',')