## Pipeline Script 


Inputs: Group CEST and nmap data output from pyGluCEST as well as demographic data from _________. 
Outputs: Compiled dataframes with GluCEST and nmap data. Trimmed based on number of people with sufficient data.

    Trimmed subject-wise dfs: e.g., cestmat (outpath + 'trimmed_cestmat' + dataset + atlas + '.csv')
    Long form dfs: e.g., long_df (outpath + 'longform_grpdf' + dataset + '_' + atlas + '.csv')
         Also have version with standard nmap values
    Mean dfs: e.g., grouped_df (outpath + 'means_' + dataset + '_' + atlas + '.csv')


### Import Packages

In [14]:
import os
import glob
import numpy as np
import pandas as pd
#import network_fcon as fc
import scipy as sp
from scipy.stats import pearsonr
from scipy.stats import linregress
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nilearn.datasets import fetch_atlas_schaefer_2018

### Define paths and variables

In [35]:
# Set variables
dataset = 'longglucest_outputmeasures2'
atlas = 'Schaefer2018_1000Parcels_17Networks'
nmaps = ["NMDA", "mGluR5", "GABA"]
maps = ["cest", "NMDA", "mGluR5", "GABA"]
normalize_cest = True

# Set paths
inpath = "/Users/pecsok/Desktop/ImageData/PMACS_remote/data/nmaps/" + dataset
outpath = "/Users/pecsok/Desktop/ImageData/PMACS_remote/data/nmaps/analyses/" + atlas

# Read in data
cestmat = pd.read_csv(inpath + "/all_subs_GluCEST_" + atlas + "_UNI.csv", sep=',')
NMDAmat = pd.read_csv(inpath + "/all_subs_NMDA_normalized_" + atlas + "_UNI.csv", sep=',')
mGluR5mat = pd.read_csv(inpath + "/all_subs_mGluR5_normalized_" + atlas + "_UNI.csv", sep=',')
GABAmat = pd.read_csv(inpath + "/all_subs_GABA_normalized_" + atlas + "_UNI.csv", sep=',')

# Set indices and correct column names
cestmat.set_index('Subject', inplace = True)
NMDAmat.set_index('Subject', inplace = True)
GABAmat.set_index('Subject', inplace = True)
mGluR5mat.set_index('Subject', inplace = True)
dfs = [cestmat, NMDAmat, mGluR5mat, GABAmat]

# Load in standardized nmap data for alternative approach.
receptor_df = pd.read_csv("/Users/pecsok/projects/Neuromaps/pecsok_pfns/neuromaps/results/receptor_data_scale1000_17.csv", sep=',')


In [11]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#print(grp_df)

## Trim Data

In [36]:
# ID parcels with < 20 voxels* 
for i, col in enumerate(cestmat.columns):
    if 'NZcount' in col:
        # Set mean col to nan
        mean_col = cestmat.columns[i - 1]
        sigma_col = cestmat.columns[i + 1]
        cestmat[mean_col] = np.where(cestmat[col] < 20, np.nan, cestmat[mean_col])
        cestmat[sigma_col] = np.where(cestmat[col] < 20, np.nan, cestmat[sigma_col])
        cestmat[col] = np.where(cestmat[col] < 20, np.nan, cestmat[col])       
columns = cestmat.columns[cestmat.notnull().sum() > len(cestmat)*.75]
print(cestmat.shape)

# Trim all dfs based on column filter
cestmat= cestmat[columns]
NMDAmat= NMDAmat[columns]
GABAmat= GABAmat[columns]
mGluR5mat= mGluR5mat[columns]
print(cestmat.shape)

# ID subjects missing >65% of remaining GluCEST parcels
sparse_subjs = cestmat[cestmat.isna().sum(axis=1) > cestmat.shape[1] * 0.65].index

# Trim all dfs based on row filter
cestmat = cestmat.drop(index=sparse_subjs)
NMDAmat = NMDAmat.drop(index=sparse_subjs)
GABAmat = GABAmat.drop(index=sparse_subjs)
mGluR5mat = mGluR5mat.drop(index=sparse_subjs)
print(cestmat.shape)

#for df in dfs: Fix, put this back into a loop later
#    df = df[columns]
#    print(df.shape)

# Temporary: Remove mysterious zeros in nmap dataframes
dfs = [NMDAmat, mGluR5mat, GABAmat]
for i in range(len(dfs)):
    df = dfs[i]
    df.replace(0, np.nan, inplace=True)

# Save trimmed dfs
cestmat.to_csv(outpath + '/trimmed_cestmat' + dataset + atlas + '.csv', index=True)
NMDAmat.to_csv(outpath + '/trimmed_NMDAmat' + dataset + atlas + '.csv', index=True)
GABAmat.to_csv(outpath + '/trimmed_GABAmat' + dataset + atlas + '.csv', index=True)
mGluR5mat.to_csv(outpath + '/trimmed_mGluR5mat' + dataset + atlas + '.csv', index=True)

(176, 830)
(176, 203)
(172, 203)


## Normalize GluCEST

In [37]:
# Step 1: Select columns that contain 'NZMean'
if normalize_cest:
    
    nzmean_columns = [col for col in cestmat.columns if 'NZMean' in col]
    
    # Step 2: Calculate mean and std deviation for each subject (row-wise) across selected columns
    cestmat['Subject_Avg_NZMean'] = cestmat[nzmean_columns].mean(axis=1)
    cestmat['Subject_Std_NZMean'] = cestmat[nzmean_columns].std(axis=1)
    
    # Step 3: Calculate z-scores for all selected columns at once and store them in a new dataframe
    zscore_df = (cestmat[nzmean_columns].sub(cestmat['Subject_Avg_NZMean'], axis=0)
                 .div(cestmat['Subject_Std_NZMean'], axis=0))
    
    # Rename z-score columns
    #zscore_df.columns = [col + '_Zscore' for col in zscore_df.columns]
    #print(zscore_df.size)
    #print(zscore_df)
    # Step 4: Concatenate the z-scores dataframe to the original cestmat dataframe
    cestmat = pd.concat([cestmat['group'], zscore_df], axis=1)
    cestmat.to_csv(outpath + '/grp_df_means_std_normalized_' + dataset + '_' + atlas + '.csv', index=False)


  cestmat['Subject_Avg_NZMean'] = cestmat[nzmean_columns].mean(axis=1)
  cestmat['Subject_Std_NZMean'] = cestmat[nzmean_columns].std(axis=1)


## Make classic grp_df

In [38]:
# First, add datatype to column names so can be distinguished later.
cestmat2=cestmat.copy()
NMDAmat2=NMDAmat.copy()
GABAmat2=GABAmat.copy()
mGluR5mat2=mGluR5mat.copy()
cestmat2.columns = [f"GluCEST_{col}" if "NZ" in col else col for col in cestmat2.columns]
NMDAmat2.columns = [f"NMDA_{col}" if "NZ" in col else col for col in NMDAmat2.columns]
GABAmat2.columns = [f"GABA_{col}" if "NZ" in col else col for col in GABAmat2.columns]
mGluR5mat2.columns = [f"mGluR5_{col}" if "NZ" in col else col for col in mGluR5mat2.columns]

# Align dataframes by "Subject" index and concatenate along columns
grp_df = cestmat2.join(NMDAmat2.filter(like='NZ'), how='left')
grp_df = grp_df.join(GABAmat2.filter(like='NZ'), how='left')
grp_df = grp_df.join(mGluR5mat2.filter(like='NZ'), how='left')

# Save grp_df
grp_df.to_csv(outpath + '/grp_df_' + dataset + atlas + '.csv', index=True)
#print(cestmat)

## Make longform group df

In [39]:
# Make longform group df
# Get list of parcel names
parcels = cestmat.filter(like="NZMean").columns.tolist()

# Melt cestmat to get Glu data in long format
cestlong = cestmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                      var_name='Parcel', value_name='GluCEST')

# Melt nmap data. Fix!! turn into loop later.
NMDAlong = NMDAmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='NMDA')
GABAlong = GABAmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='GABA')
mGluR5long = mGluR5mat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='mGluR5')

# Merge the long-form dataframes based on Subject and Parcel
long_df = pd.merge(cestlong, NMDAlong,  on=['Subject', 'Parcel'])
long_df = pd.merge(long_df, GABAlong,  on=['Subject', 'Parcel'])
long_df = pd.merge(long_df, mGluR5long,  on=['Subject', 'Parcel'])

# Add diagnostic group
diag_df = cestmat['group']
long_df = pd.merge(long_df, diag_df, on='Subject')
long_df['hstatus'] = np.where(long_df['group'].isin(['TD/NC']), 'HC', 'PSY')
#print(long_df)

# Save longformdf
long_df.to_csv(outpath + '/longform_grpdf_' + dataset + atlas + '.csv', index=True)

### Make mean group dfs by diagnosis

In [41]:
# Make mean group dfs by diagnosis
grouped_subj = long_df.groupby(['Parcel', 'hstatus']).agg(
    CEST_avg=('GluCEST', 'mean'),
    NMDA=('NMDA', 'mean'),
    mGluR5=('mGluR5', 'mean'),
    GABA=('GABA', 'mean')
).reset_index()
print(grouped_subj)
# Save
grouped_subj.to_csv(outpath + '/means_subjectnmaps_' + dataset + '_' + atlas + '.csv', index=False)

                                      Parcel hstatus  CEST_avg      NMDA  \
0         17Networks_RH_ContA_Cingm_1 NZMean      HC -0.239851 -0.515943   
1         17Networks_RH_ContA_Cingm_1 NZMean     PSY -0.235165 -0.478070   
2         17Networks_RH_ContA_Cingm_2 NZMean      HC -0.255432 -0.662924   
3         17Networks_RH_ContA_Cingm_2 NZMean     PSY -0.260417 -0.608125   
4         17Networks_RH_ContB_PFCmp_1 NZMean      HC -0.102085  0.665146   
..                                       ...     ...       ...       ...   
129  17Networks_RH_VisPeri_ExStrSup_7 NZMean     PSY  0.450614  0.819882   
130   17Networks_RH_VisPeri_StriCal_3 NZMean      HC  1.342156 -0.013643   
131   17Networks_RH_VisPeri_StriCal_3 NZMean     PSY  1.096243  0.003004   
132   17Networks_RH_VisPeri_StriCal_5 NZMean      HC  1.024538  0.177231   
133   17Networks_RH_VisPeri_StriCal_5 NZMean     PSY  0.917279  0.039007   

       mGluR5      GABA  
0   -0.290452 -0.823318  
1   -0.286724 -0.835636  
2   -0.35

# Repeat using standard nmaps

In [42]:
# Import and add parcel labels to standard receptor_df
schaefer = fetch_atlas_schaefer_2018(n_rois=1000, yeo_networks=17)
labels = schaefer.labels
labels = [label.decode('utf-8') for label in labels]
receptor_df.index = labels
receptor_df.index.name = 'Parcel'

# Chop up receptor_df by map
NMDAmat = receptor_df[["NMDA"]]
GABAmat = receptor_df[["GABAa"]]
mGluR5mat = receptor_df[["mGluR5"]]

### Make classic grp_df

In [43]:
# Transpose receptor maps.
nmda = NMDAmat.T
gaba = GABAmat.T
mglur5 = mGluR5mat.T

# Keep only parcels contained in cestmat
cestmat_regions = [col.replace(' NZMean', '') for col in cestmat.columns if ' NZMean' in col]
nmda_filtered = nmda[[col for col in nmda.columns if col in cestmat_regions]]
gaba_filtered = gaba[[col for col in gaba.columns if col in cestmat_regions]]
mglur5_filtered = mglur5[[col for col in mglur5.columns if col in cestmat_regions]]

# Filtered columns
nmda_filtered.columns = [f"NMDA_{col}" for col in nmda_filtered.columns]
gaba_filtered.columns = [f"GABA_{col}" for col in gaba_filtered.columns]
mglur5_filtered.columns = [f"mGluR5_{col}" for col in mglur5_filtered.columns]

# Repeat values for length of cestmat
nmda_repeated = pd.concat([nmda_filtered] * len(cestmat), ignore_index=True)
gaba_repeated = pd.concat([gaba_filtered] * len(cestmat), ignore_index=True)
mglur5_repeated = pd.concat([mglur5_filtered] * len(cestmat), ignore_index=True)

# Concatenate
grp_df_std = pd.concat([cestmat2, nmda_repeated, gaba_repeated, mglur5_repeated], axis=1)

# Save grp_df_std
grp_df_std.to_csv(outpath + '/grp_df_std' + dataset + atlas + '.csv', index=True)
print(grp_df_std)

                group  GluCEST_17Networks_RH_VisCent_Striate_2 NZMean  \
100522_12003    TD/NC                                             NaN   
100522_12371    TD/NC                                             NaN   
100522_12783    TD/NC                                             NaN   
102041_12037  PRO/CHR                                       -0.764391   
102041_12500  PRO/CHR                                        1.143908   
...               ...                                             ...   
167               NaN                                             NaN   
168               NaN                                             NaN   
169               NaN                                             NaN   
170               NaN                                             NaN   
171               NaN                                             NaN   

              GluCEST_17Networks_RH_VisCent_ExStr_14 NZMean  \
100522_12003                                       2.117882 

### Make Longform df

In [44]:
# Make longform df using standardized nmaps values

# Get list of parcel names
parcels = cestmat.filter(like="NZMean").columns.tolist()

# Keep relevant columns from long_df and rename parcels
longdf_cest = long_df[["Subject","Parcel","GluCEST","group","hstatus"]]
longdf_cest["Parcel"] = longdf_cest["Parcel"].str.replace(' NZMean', '', regex=False)

# Convert receptor_df from wide to long format for merging
nmda_long = NMDAmat.reset_index().melt(id_vars='Parcel', var_name='Receptor1', value_name='NMDA_standard')
gaba_long = GABAmat.reset_index().melt(id_vars='Parcel', var_name='Receptor2', value_name='GABA_standard')
mglur5_long = mGluR5mat.reset_index().melt(id_vars='Parcel', var_name='Receptor3', value_name='mGluR5_standard')

# Combine dfs
long_df_std = pd.merge(longdf_cest, nmda_long, on='Parcel', how='left')
long_df_std = pd.merge(long_df_std, gaba_long, on='Parcel', how='left')
long_df_std = pd.merge(long_df_std, mglur5_long, on='Parcel', how='left')
# Remove unnecessary columns
long_df_std = long_df_std.drop(columns=long_df_std.filter(like="Receptor").columns)
#print(long_df_std)

# Save the longform dataframe to a CSV
long_df_std.to_csv(outpath + '/longform_grpdf_std_' + dataset + '_' + atlas + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  longdf_cest["Parcel"] = longdf_cest["Parcel"].str.replace(' NZMean', '', regex=False)


### Make mean df by diagnosis

In [46]:
# Make mean group dfs by diagnosis
# Standard nmap data
grouped_std = long_df_std.groupby(['Parcel', 'hstatus']).agg(
    CESTavg=('GluCEST', 'mean'),
    NMDA=('NMDA_standard', 'mean'),
    mGluR5=('mGluR5_standard', 'mean'),
    GABA=('GABA_standard', 'mean')
).reset_index()
print(grouped_std)
grouped_std.to_csv(outpath + '/means_std_' + dataset + '_' + atlas + '.csv', index=False)

                               Parcel hstatus   CESTavg      NMDA    mGluR5  \
0         17Networks_RH_ContA_Cingm_1      HC -0.239851 -0.861131 -1.573938   
1         17Networks_RH_ContA_Cingm_1     PSY -0.235165 -0.861131 -1.573938   
2         17Networks_RH_ContA_Cingm_2      HC -0.255432 -0.970507 -1.450434   
3         17Networks_RH_ContA_Cingm_2     PSY -0.260417 -0.970507 -1.450434   
4         17Networks_RH_ContB_PFCmp_1      HC -0.102085  1.266783  1.540954   
..                                ...     ...       ...       ...       ...   
129  17Networks_RH_VisPeri_ExStrSup_7     PSY  0.450614  0.725344  0.649158   
130   17Networks_RH_VisPeri_StriCal_3      HC  1.342156  1.396886 -0.190620   
131   17Networks_RH_VisPeri_StriCal_3     PSY  1.096243  1.396886 -0.190620   
132   17Networks_RH_VisPeri_StriCal_5      HC  1.024538  1.191815 -0.072134   
133   17Networks_RH_VisPeri_StriCal_5     PSY  0.917279  1.191815 -0.072134   

         GABA  
0   -0.950374  
1   -0.950374  
2  

### Data Imputation

In [213]:
# Now, for the long_dfs, impute data based on average across participants for that parcel
# Subject-wise data
merged_df = pd.merge(long_df, grouped_df2[['Parcel', 'hstatus', 'GluCEST_avg']], on=['Parcel', 'hstatus'], how='left')
merged_df['GluCEST'] = merged_df['GluCEST'].fillna(merged_df['GluCEST_avg'])
#print(merged_df)
imputed_df = merged_df.drop(columns=['GluCEST_avg'])

# Standard data
# First, rename parcels
long_df_std["Parcel"] = long_df_std["Parcel"].str.replace(' NZMean', '', regex=False)
#print(long_df_std)
merged_df_std = pd.merge(long_df_std, grouped_df[['Parcel', 'hstatus', 'GluCEST_avg']], on=['Parcel', 'hstatus'], how='left')
merged_df_std['GluCEST'] = merged_df_std ['GluCEST'].fillna(merged_df_std ['GluCEST_avg'])
#print(merged_df_std)
imputed_df_std = merged_df_std.drop(columns=['GluCEST_avg'])

imputed_df.to_csv(outpath + '/imputed_long_df_' + dataset + '_' + atlas + '.csv', index=False)
imputed_df_std.to_csv(outpath + '/imputed_long_df_standardnmaps_' + dataset + '_' + atlas + '.csv', index=False)

### Normalize GluCEST values

In [12]:
# Make mean group dfs by diagnosis
# Standard nmap data
grouped_std = long_df_std.groupby(['Parcel', 'hstatus']).agg(
    GluCEST_avg=('CEST', 'mean'),
    NMDA_avg=('NMDA_standard', 'mean'),
    mGluR5_avg=('mGluR5_standard', 'mean'),
    GABA_avg=('GABA_standard', 'mean')
).reset_index()
#print(grouped_std)
grouped_std.to_csv(outpath + '/means_std_normalized_cest_' + dataset + '_' + atlas + '.csv', index=False)

In [13]:
# Step 1: Select columns that contain 'NZMean'
nzmean_columns = [col for col in cestmat.columns if 'NZMean' in col]

# Step 2: Calculate mean and std deviation for each subject (row-wise) across selected columns
cestmat['Subject_Avg_NZMean'] = cestmat[nzmean_columns].mean(axis=1)
cestmat['Subject_Std_NZMean'] = cestmat[nzmean_columns].std(axis=1)
print(cestmat)

# Step 3: Calculate z-scores for all selected columns at once and store them in a new dataframe
zscore_df = (cestmat[nzmean_columns].sub(cestmat['Subject_Avg_NZMean'], axis=0)
             .div(cestmat['Subject_Std_NZMean'], axis=0))

# Rename z-score columns
zscore_df.columns = [col + '_Zscore' for col in zscore_df.columns]
#print(zscore_df.size)
#print(zscore_df)
# Step 4: Concatenate the z-scores dataframe to the original cestmat dataframe
zcestmat = pd.concat([cestmat['group'], zscore_df], axis=1)

zcestmat.to_csv(outpath + '/grp_df_means_std_normalized_' + dataset + '_' + atlas + '.csv', index=False)
print(zcestmat)


  cestmat['Subject_Avg_NZMean'] = cestmat[nzmean_columns].mean(axis=1)
  cestmat['Subject_Std_NZMean'] = cestmat[nzmean_columns].std(axis=1)


              Unnamed: 0    group  17Networks_RH_VisCent_Striate_2 NZMean  \
Subject                                                                     
100522_12003           0    TD/NC                                     NaN   
100522_12371           1    TD/NC                                     NaN   
100522_12783           2    TD/NC                                     NaN   
102041_12037           3  PRO/CHR                                6.896060   
102041_12500           4  PRO/CHR                                9.101267   
...                  ...      ...                                     ...   
96902_11903          171    TD/NC                               10.768049   
96902_12440          172    TD/NC                                     NaN   
96902_12788          173    TD/NC                                     NaN   
98370_12558          174    TD/NC                                8.194758   
98370_12952          175    TD/NC                                7.623180   

In [None]:
cestmat2=cestmat.copy()
NMDAmat2=NMDAmat.copy()
GABAmat2=GABAmat.copy()
mGluR5mat2=mGluR5mat.copy()
cestmat2.columns = [f"GluCEST_{col}" if "NZ" in col else col for col in cestmat2.columns]
NMDAmat2.columns = [f"NMDA_{col}" if "NZ" in col else col for col in NMDAmat2.columns]
GABAmat2.columns = [f"GABA_{col}" if "NZ" in col else col for col in GABAmat2.columns]
mGluR5mat2.columns = [f"mGluR5_{col}" if "NZ" in col else col for col in mGluR5mat2.columns]

# Align dataframes by "Subject" index and concatenate along columns
grp_df = cestmat2.join(NMDAmat2.filter(like='NZ'), how='left')
grp_df = grp_df.join(GABAmat2.filter(like='NZ'), how='left')
grp_df = grp_df.join(mGluR5mat2.filter(like='NZ'), how='left')

# Save grp_df
grp_df.to_csv(outpath + '/grp_df_' + dataset + atlas + '.csv', index=True)

In [None]:
# Start with Loop through subjects. 
# Find path to 

I have parcel-wise glucest values and I want to normalize based on that subject's average glucest value.
Loop through rows of df
subj= the index of that row. 
Get average glucest value from path that includes subj in the path name

# Transpose receptor maps.
nmda = NMDAmat.T
gaba = GABAmat.T
mglur5 = mGluR5mat.T

# Keep only parcels contained in cestmat
cestmat_regions = [col.replace(' NZMean', '') for col in cestmat.columns if ' NZMean' in col]
nmda_filtered = nmda[[col for col in nmda.columns if col in cestmat_regions]]
gaba_filtered = gaba[[col for col in gaba.columns if col in cestmat_regions]]
mglur5_filtered = mglur5[[col for col in mglur5.columns if col in cestmat_regions]]

# Filtered columns
nmda_filtered.columns = [f"NMDA_{col}" for col in nmda_filtered.columns]
gaba_filtered.columns = [f"GABA_{col}" for col in gaba_filtered.columns]
mglur5_filtered.columns = [f"mGluR5_{col}" for col in mglur5_filtered.columns]

# Repeat values for length of cestmat
nmda_repeated = pd.concat([nmda_filtered] * len(cestmat), ignore_index=True)
gaba_repeated = pd.concat([gaba_filtered] * len(cestmat), ignore_index=True)
mglur5_repeated = pd.concat([mglur5_filtered] * len(cestmat), ignore_index=True)

# Concatenate
grp_df_std = pd.concat([cestmat2, nmda_repeated, gaba_repeated, mglur5_repeated], axis=1)

# Save grp_df_std
grp_df_std.to_csv(outpath + '/grp_df_std' + dataset + atlas + '.csv', index=True)
print(grp_df_std)