## Pipeline Script 


Inputs: Group CEST and nmap data output from pyGluCEST as well as demographic data from _________. 
Outputs: Compiled dataframes with GluCEST and nmap data. Trimmed based on number of people with sufficient data.

    Trimmed subject-wise dfs: e.g., cestmat (outpath + 'trimmed_cestmat' + dataset + atlas + '.csv')
    Long form dfs: e.g., long_df (outpath + 'longform_grpdf' + dataset + '_' + atlas + '.csv')
         Also have version with standard nmap values
    Mean dfs: e.g., grouped_df (outpath + 'means_' + dataset + '_' + atlas + '.csv')


### Import Packages

In [187]:
import os
import glob
import numpy as np
import pandas as pd
#import network_fcon as fc
import scipy as sp
from scipy.stats import pearsonr
from scipy.stats import linregress
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nilearn.datasets import fetch_atlas_schaefer_2018

### Define paths and variables

In [188]:
# Set variables
dataset = 'longglucest_outputmeasures2'
atlas = 'Schaefer2018_1000Parcels_17Networks'
nmaps = ["NMDA", "mGluR5", "GABA"]
maps = ["cest", "NMDA", "mGluR5", "GABA"]

# Set paths
inpath = "/Users/pecsok/Desktop/ImageData/PMACS_remote/data/nmaps/" + dataset
outpath = "/Users/pecsok/Desktop/ImageData/PMACS_remote/data/nmaps/analyses/" + atlas

# Read in data
cestmat = pd.read_csv(inpath + "/all_subs_GluCEST_" + atlas + "_UNI.csv", sep=',')
NMDAmat = pd.read_csv(inpath + "/all_subs_NMDA_normalized_" + atlas + "_UNI.csv", sep=',')
mGluR5mat = pd.read_csv(inpath + "/all_subs_mGluR5_normalized_" + atlas + "_UNI.csv", sep=',')
GABAmat = pd.read_csv(inpath + "/all_subs_GABA_normalized_" + atlas + "_UNI.csv", sep=',')

# Set indices and correct column names
cestmat.set_index('Subject', inplace = True)
NMDAmat.set_index('Subject', inplace = True)
GABAmat.set_index('Subject', inplace = True)
mGluR5mat.set_index('Subject', inplace = True)
dfs = [cestmat, NMDAmat, mGluR5mat, GABAmat]

# Load in standardized nmap data for alternative approach.
receptor_df = pd.read_csv("/Users/pecsok/projects/Neuromaps/pecsok_pfns/neuromaps/results/receptor_data_scale1000_17.csv", sep=',')


In [11]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#print(grp_df)

## Trim Data

In [189]:
# ID parcels with < 20 voxels* 
for i, col in enumerate(cestmat.columns):
    if 'NZcount' in col:
        # Set mean col to nan
        mean_col = cestmat.columns[i - 1]
        sigma_col = cestmat.columns[i + 1]
        cestmat[mean_col] = np.where(cestmat[col] < 20, np.nan, cestmat[mean_col])
        cestmat[sigma_col] = np.where(cestmat[col] < 20, np.nan, cestmat[sigma_col])
        cestmat[col] = np.where(cestmat[col] < 20, np.nan, cestmat[col])       
columns = cestmat.columns[cestmat.notnull().sum() > len(cestmat)*.75]
print(cestmat.shape)

# Trim all dfs based on column filter
cestmat= cestmat[columns]
NMDAmat= NMDAmat[columns]
GABAmat= GABAmat[columns]
mGluR5mat= mGluR5mat[columns]
print(cestmat.shape)

# ID subjects missing >65% of remaining GluCEST parcels
sparse_subjs = cestmat[cestmat.isna().sum(axis=1) > cestmat.shape[1] * 0.65].index

# Trim all dfs based on row filter
cestmat = cestmat.drop(index=sparse_subjs)
NMDAmat = NMDAmat.drop(index=sparse_subjs)
GABAmat = GABAmat.drop(index=sparse_subjs)
mGluR5mat = mGluR5mat.drop(index=sparse_subjs)
print(cestmat.shape)

#for df in dfs: Fix, put this back into a loop later
#    df = df[columns]
#    print(df.shape)

(176, 830)
(176, 203)
(172, 203)


In [190]:
# Temporary: Remove mysterious zeros in nmap dataframes
dfs = [NMDAmat, mGluR5mat, GABAmat]
for i in range(len(dfs)):
    df = dfs[i]
    df.replace(0, np.nan, inplace=True)

In [191]:
# Save trimmed dfs
cestmat.to_csv(outpath + 'trimmed_cestmat' + dataset + atlas + '.csv', index=True)
NMDAmat.to_csv(outpath + 'trimmed_NMDAmat' + dataset + atlas + '.csv', index=True)
GABAmat.to_csv(outpath + 'trimmed_GABAmat' + dataset + atlas + '.csv', index=True)
mGluR5mat.to_csv(outpath + 'trimmed_mGluR5mat' + dataset + atlas + '.csv', index=True)

## Make longform group df

In [192]:
# Make longform group df
# Get list of parcel names
parcels = cestmat.filter(like="NZMean").columns.tolist()

# Melt cestmat to get Glu data in long format
cestlong = cestmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                      var_name='Parcel', value_name='GluCEST')

# Melt nmap data. Fix!! turn into loop later.
NMDAlong = NMDAmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='NMDA')
GABAlong = GABAmat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='GABA')
mGluR5long = mGluR5mat.reset_index().melt(id_vars='Subject', value_vars=parcels, 
                                       var_name='Parcel', value_name='mGluR5')

# Merge the long-form dataframes based on Subject and Parcel
long_df = pd.merge(cestlong, NMDAlong,  on=['Subject', 'Parcel'])
long_df = pd.merge(long_df, GABAlong,  on=['Subject', 'Parcel'])
long_df = pd.merge(long_df, mGluR5long,  on=['Subject', 'Parcel'])

# Add diagnostic group
diag_df = cestmat['group']
long_df = pd.merge(long_df, diag_df, on='Subject')
long_df['hstatus'] = np.where(long_df['group'].isin(['TD/NC']), 'HC', 'PSY')

# Display the long-form dataframe
print(long_df)

# Save longformdf
long_df.to_csv(outpath + '/longform_grpdf' + dataset + atlas + '.csv', index=True)

            Subject                                    Parcel   GluCEST  \
0      100522_12003    17Networks_RH_VisCent_Striate_2 NZMean       NaN   
1      100522_12371    17Networks_RH_VisCent_Striate_2 NZMean       NaN   
2      100522_12783    17Networks_RH_VisCent_Striate_2 NZMean       NaN   
3      102041_12037    17Networks_RH_VisCent_Striate_2 NZMean  6.896060   
4      102041_12500    17Networks_RH_VisCent_Striate_2 NZMean  9.101267   
...             ...                                       ...       ...   
11519   96902_11903  17Networks_RH_DefaultA_pCunPCC_12 NZMean  8.674353   
11520   96902_12440  17Networks_RH_DefaultA_pCunPCC_12 NZMean       NaN   
11521   96902_12788  17Networks_RH_DefaultA_pCunPCC_12 NZMean       NaN   
11522   98370_12558  17Networks_RH_DefaultA_pCunPCC_12 NZMean  8.809414   
11523   98370_12952  17Networks_RH_DefaultA_pCunPCC_12 NZMean  7.371335   

           NMDA      GABA    mGluR5    group hstatus  
0           NaN       NaN       NaN    TD/NC

#### Repeat using standard nmaps

In [193]:
# Import and add parcel labels to standard receptor_df
schaefer = fetch_atlas_schaefer_2018(n_rois=1000, yeo_networks=17)
labels = schaefer.labels
labels = [label.decode('utf-8') for label in labels]
receptor_df.index = labels
receptor_df.index.name = 'Parcel'
print(receptor_df)

                                     NMDA    mGluR5     GABAa        D2
Parcel                                                                 
17Networks_LH_VisCent_Striate_1  0.222071 -1.676590  0.454865 -1.793413
17Networks_LH_VisCent_Striate_2 -0.498181 -1.582004  0.778041 -1.725353
17Networks_LH_VisCent_Striate_3  0.901576 -0.612121  2.141349 -1.079382
17Networks_LH_VisCent_Striate_4 -1.250976 -2.019832 -0.681790 -1.734788
17Networks_LH_VisCent_ExStr_1    0.924391 -0.341650  0.853886  0.268522
...                                   ...       ...       ...       ...
17Networks_RH_TempPar_18         0.906115  0.748027  0.435516  0.148707
17Networks_RH_TempPar_19        -0.093347  0.810461 -0.015695  0.217057
17Networks_RH_TempPar_20         0.859066  0.746511  0.069180  0.635338
17Networks_RH_TempPar_21         0.825170  1.057345  0.635444  0.371140
17Networks_RH_TempPar_22         0.219509  1.022109  0.470702  0.717365

[1000 rows x 4 columns]


In [197]:
# Make longform df using standardized nmaps values

# Get list of parcel names
parcels = cestmat.filter(like="NZMean").columns.tolist()

# Keep relevant columns from long_df and rename parcels
longdf_cest = long_df[["Subject","Parcel","GluCEST","group","hstatus"]]
longdf_cest["Parcel"] = longdf_cest["Parcel"].str.replace(' NZMean', '', regex=False)

# Chop up receptor_df by map
NMDAmat = receptor_df["NMDA"]
GABAmat = receptor_df["GABAa"]
mGluR5mat = receptor_df["mGluR5"]

# Convert receptor_df from wide to long format for merging
nmda_long = NMDAmat.reset_index().melt(id_vars='Parcel', var_name='Receptor1', value_name='NMDA_standard')
gaba_long = GABAmat.reset_index().melt(id_vars='Parcel', var_name='Receptor2', value_name='GABA_standard')
mglur5_long = mGluR5mat.reset_index().melt(id_vars='Parcel', var_name='Receptor3', value_name='mGluR5_standard')

# Combine dfs
long_df_std = pd.merge(longdf_cest, nmda_long, on='Parcel', how='left')
long_df_std = pd.merge(long_df_std, gaba_long, on='Parcel', how='left')
long_df_std = pd.merge(long_df_std, mglur5_long, on='Parcel', how='left')
# Remove unnecessary columns
long_df_std = long_df_std.drop(columns=long_df_std.filter(like="Receptor").columns)
print(long_df_std)

# Save the longform dataframe to a CSV
long_df_std.to_csv(outpath + '/longform_grpdf_standardnmaps_' + dataset + '_' + atlas + '.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  longdf_cest["Parcel"] = longdf_cest["Parcel"].str.replace(' NZMean', '', regex=False)


            Subject                             Parcel   GluCEST    group  \
0      100522_12003    17Networks_RH_VisCent_Striate_2       NaN    TD/NC   
1      100522_12371    17Networks_RH_VisCent_Striate_2       NaN    TD/NC   
2      100522_12783    17Networks_RH_VisCent_Striate_2       NaN    TD/NC   
3      102041_12037    17Networks_RH_VisCent_Striate_2  6.896060  PRO/CHR   
4      102041_12500    17Networks_RH_VisCent_Striate_2  9.101267  PRO/CHR   
...             ...                                ...       ...      ...   
11519   96902_11903  17Networks_RH_DefaultA_pCunPCC_12  8.674353    TD/NC   
11520   96902_12440  17Networks_RH_DefaultA_pCunPCC_12       NaN    TD/NC   
11521   96902_12788  17Networks_RH_DefaultA_pCunPCC_12       NaN    TD/NC   
11522   98370_12558  17Networks_RH_DefaultA_pCunPCC_12  8.809414    TD/NC   
11523   98370_12952  17Networks_RH_DefaultA_pCunPCC_12  7.371335    TD/NC   

      hstatus  NMDA_standard  GABA_standard  mGluR5_standard  
0          H

In [208]:
# Make mean group dfs by diagnosis
# Subject-wise data
grouped_df2 = long_df.groupby(['Parcel', 'hstatus']).agg(
    GluCEST_avg=('GluCEST', 'mean'),
    NMDA_avg=('NMDA', 'mean'),
    mGluR5_avg=('mGluR5', 'mean'),
    GABA_avg=('GABA', 'mean')
).reset_index()
print(grouped_df2)
grouped_df2.to_csv(outpath + '/means_subjectnmaps_' + dataset + '_' + atlas + '.csv', index=False)

# Standard nmap data
grouped_df = long_df_std.groupby(['Parcel', 'hstatus']).agg(
    GluCEST_avg=('GluCEST', 'mean'),
    NMDA_avg=('NMDA_standard', 'mean'),
    mGluR5_avg=('mGluR5_standard', 'mean'),
    GABA_avg=('GABA_standard', 'mean')
).reset_index()
print(grouped_df)
grouped_df.to_csv(outpath + '/means_standardnmaps_' + dataset + '_' + atlas + '.csv', index=False)

                                      Parcel hstatus  GluCEST_avg  NMDA_avg  \
0         17Networks_RH_ContA_Cingm_1 NZMean      HC     7.348242 -0.515943   
1         17Networks_RH_ContA_Cingm_1 NZMean     PSY     7.536327 -0.478070   
2         17Networks_RH_ContA_Cingm_2 NZMean      HC     7.400822 -0.662924   
3         17Networks_RH_ContA_Cingm_2 NZMean     PSY     7.629667 -0.608125   
4         17Networks_RH_ContB_PFCmp_1 NZMean      HC     7.561262  0.665146   
..                                       ...     ...          ...       ...   
129  17Networks_RH_VisPeri_ExStrSup_7 NZMean     PSY     8.500733  0.819882   
130   17Networks_RH_VisPeri_StriCal_3 NZMean      HC     9.569096 -0.013643   
131   17Networks_RH_VisPeri_StriCal_3 NZMean     PSY     9.336277  0.003004   
132   17Networks_RH_VisPeri_StriCal_5 NZMean      HC     9.077889  0.177231   
133   17Networks_RH_VisPeri_StriCal_5 NZMean     PSY     9.030074  0.039007   

     mGluR5_avg  GABA_avg  
0     -0.290452 -0.8233

### Data Imputation

In [213]:
# Now, for the long_dfs, impute data based on average across participants for that parcel

# Subject-wise data
merged_df = pd.merge(long_df, grouped_df2[['Parcel', 'hstatus', 'GluCEST_avg']], on=['Parcel', 'hstatus'], how='left')
merged_df['GluCEST'] = merged_df['GluCEST'].fillna(merged_df['GluCEST_avg'])
#print(merged_df)
imputed_df = merged_df.drop(columns=['GluCEST_avg'])

# Standard data
# First, rename parcels
long_df_std["Parcel"] = long_df_std["Parcel"].str.replace(' NZMean', '', regex=False)
#print(long_df_std)
merged_df_std = pd.merge(long_df_std, grouped_df[['Parcel', 'hstatus', 'GluCEST_avg']], on=['Parcel', 'hstatus'], how='left')
merged_df_std['GluCEST'] = merged_df_std ['GluCEST'].fillna(merged_df_std ['GluCEST_avg'])
#print(merged_df_std)
imputed_df_std = merged_df_std.drop(columns=['GluCEST_avg'])

imputed_df.to_csv(outpath + '/imputed_long_df_' + dataset + '_' + atlas + '.csv', index=False)
imputed_df_std.to_csv(outpath + '/imputed_long_df_standardnmaps_' + dataset + '_' + atlas + '.csv', index=False)