## Pipeline Script 
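This script builds a per-subject group data frame (`grp_df`) and a diagnosis data frame (`diag_df`) by looping over the xcp_d output folders and, depending on which modules are switched on, adding network functional connectivity, CNB scores, diagnosis, demographics, and GluCEST measures for each subject.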

### Import Packages

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import pearsonr
from scipy.stats import linregress
import seaborn as sns
import matplotlib.pyplot as plt
import re

### Define paths and variables

In [2]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# All inputs and outputs live under a single project root. The root is expanded
# once so that every path below is absolute and works with os/glob as well as
# pandas (an unexpanded "~" is only understood by pandas).
project_root = os.path.expanduser("~/Desktop/ImageData/PMACS_remote")

fcpath = os.path.join(project_root, "data/fmri/postprocessed/7T/xcp_d")
outpath = os.path.join(project_root, "analysis/postprocessing/")
clinpath = os.path.join(project_root, "data/clinical")
# Trailing slash is required: cestpath is combined with file names by string
# concatenation further down the notebook.
cestpath = os.path.join(project_root, "data/cest/output_measures/UNI/")

# ---------------------------------------------------------------------------
# Metrics to analyse
# ---------------------------------------------------------------------------
networks = ["SomMot"]
CESTnetworks = ["avgCEST_SomMot", "ctCEST_SomMot"]
CNB_scores = ["tap_tot"]
# CNB_valids[i] names the validity column that belongs to CNB_scores[i].
CNB_valids = ["tap_valid"]
diag_scores = ["hstatus"]
demo_scores = ["sex", "age", "race", "ethnic", "dateDiff"]
diag_details = ["axis1_desc1", "axis1_desc2", "axis1_desc3", "axis1_desc4", "axis1_desc5", "axis1_desc6"]

# ---------------------------------------------------------------------------
# Empty result tables: one row per subject is appended in Stage 1
# ---------------------------------------------------------------------------
grp_df = pd.DataFrame(columns=["BBLID"] + ["Session"] + demo_scores + networks + CESTnetworks + CNB_scores + diag_scores)
diag_df = pd.DataFrame(columns=["BBLID"] + ["Session"] + diag_scores + diag_details)
print(grp_df)

# Running lists of the subject and session IDs seen in Stage 1.
bblids = []
sesids = []

# ---------------------------------------------------------------------------
# Group-level input tables
# ---------------------------------------------------------------------------
subjlist = pd.read_csv(os.path.join(project_root, "data/subject_list_111623.csv"), sep=',')
# Clinical tables are indexed by integer bblid so subjects can be looked up directly.
cnbmat = pd.read_csv(clinpath + "/cnb.csv", sep=',').set_index('bblid')
diagmat = pd.read_csv(clinpath + "/diagnosis.csv", sep=',').set_index('bblid')
demomat = pd.read_csv(clinpath + "/demographics.csv", sep=',').set_index('bblid')
# TODO: load the group CEST map here (cestmat).

# Mapping from Schaefer ROI label to ROI name, used to rename CEST columns.
schaefer_indices = pd.read_csv(
    os.path.join(
        project_root,
        "github/glucest-rsfmri/Schaefer2018_100Parcels_17Networks_order_FSLMNI152_2mm.Centroid_RAS.csv",
    ),
    sep=',',
)
schaefer_dict = dict(zip(schaefer_indices['ROI Label'], schaefer_indices['ROI Name']))

Empty DataFrame
Columns: [BBLID, Session, sex, age, race, ethnic, dateDiff, SomMot, avgCEST_SomMot, ctCEST_SomMot, tap_tot, hstatus]
Index: []


FileNotFoundError: [Errno 2] No such file or directory: '/Users/pecsok/Desktop/ImageData/PMACS_remote/data/subject_list_111623.csv'

### Choose which modules to run

In [71]:
# Switches for the Stage 1 modules. Each flag gates one section of the subject loop:
#   runfcon  - network functional connectivity
#   runCNB   - CNB scores
#   rundiag  - diagnosis (hstatus and axis-1 descriptions)
#   rundemo  - demographics
#   runcest  - GluCEST measures
# run_grpanalysis is not read by the Stage 1 loop; presumably it gates Stage 2 - TODO confirm.
rundiag = run_grpanalysis = True
runfcon = runCNB = rundemo = runcest = False

## Stage 1: Create Group Data Frame
### Import data, loop through subjects, and establish file paths


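**TODO: fix this warning.** It is a pandas `FutureWarning`, not an error yet: a text diagnosis such as `'PSY'` is being written into a `grp_df` column whose dtype is `float64`.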
/var/folders/ls/hy_z7hgd4_13km3h7j84vqh40000gp/T/ipykernel_77945/3898733492.py:72: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'PSY' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  grp_df.loc[grp_df['BBLID'].astype(str) == bblid, grp_df.columns == diag_score] = diagnosis

In [82]:
# Stage 1 driver: walk the xcp_d output tree, make sure every subject has one row
# in grp_df and diag_df, and fill in the columns of whichever modules are enabled.


def _ensure_row(frame, ids):
    """Append an all-NaN row starting with `ids` ([bblid, session]) to `frame`.

    Nothing is appended when the BBLID already has a row, so re-running this
    cell does not duplicate subjects.
    """
    if (frame['BBLID'].astype(str) == ids[0]).any():
        return
    frame.loc[len(frame)] = ids + [float('nan')] * (len(frame.columns) - len(ids))


def _set_value(frame, bblid, column, value):
    """Write `value` into `column` for every row of `frame` whose BBLID equals `bblid`."""
    # Rows are created filled with NaN, so pandas infers float64 for the empty
    # columns. Writing text (e.g. 'PSY') into a float64 column triggers the
    # "Setting an item of incompatible dtype" FutureWarning; widen to object first.
    if isinstance(value, str) and frame[column].dtype != object:
        frame[column] = frame[column].astype(object)
    frame.loc[frame['BBLID'].astype(str) == bblid, column] = value


# Subjects that still get a row in both tables but are skipped by every module.
excluded_bblids = {"20902", "93242", "20754", "127065"}

# Every directory directly under fcpath.
folder_names = [folder for folder in glob.glob(os.path.join(fcpath, "*")) if os.path.isdir(folder)]

for subj_path in folder_names:
    # Parse the ID from the folder name only, so a '-' in a parent directory
    # cannot shift the split.
    subj_folder = os.path.basename(subj_path)
    if not subj_folder.startswith("sub-"):
        continue
    bblid = subj_folder.split('-')[1]
    print("Processing subject " + bblid)

    # NOTE(review): only the first "ses*" entry returned by os.listdir is used;
    # listdir order is arbitrary, so confirm each subject has a single session.
    ses_folder = [item for item in os.listdir(subj_path) if item.startswith("ses")]
    if not ses_folder:
        print("No session folder found for subject " + bblid + "; skipping")
        continue
    ses = ses_folder[0].split('-')[1]
    ses_path = os.path.join(subj_path, ses_folder[0])  # full path to session

    # Add to the running lists of IDs for group analysis later.
    if bblid not in bblids:
        bblids.append(bblid)
        sesids.append(ses)

    # Start a row for this subject in both tables.
    ids = [bblid, ses]
    _ensure_row(grp_df, ids)
    _ensure_row(diag_df, ids)

    if bblid in excluded_bblids:
        continue

    ##################################################################################################
    ## FC
    ##################################################################################################
    if runfcon:
        fcmat_glob = f"{ses_path}/func/*Schaefer117_measure-pearsoncorrelation_conmat.tsv"
        fcmat_files = glob.glob(fcmat_glob)
        # An empty match list means this session has no connectivity matrix.
        if fcmat_files and os.path.isfile(fcmat_files[0]):
            fcmat = pd.read_csv(fcmat_files[0], sep='\t')
            fcmat.set_index('Node', inplace=True)
            for network in networks:
                print("Running " + network + " fcon")
                # Rows and columns whose node name contains the network name.
                network_fc = fcmat.loc[fcmat.index.str.contains(network), fcmat.columns[fcmat.columns.str.contains(network)]]
                # NOTE(review): the mean is taken over the whole sub-matrix, so it
                # includes the diagonal and both triangles - confirm this is intended.
                _set_value(grp_df, bblid, network, network_fc.values.mean())

    ##################################################################################################
    ## CNB
    ##################################################################################################
    if runCNB:
        subj_key = int(bblid)  # clinical tables are indexed by integer bblid
        for CNB_score, CNB_valid in zip(CNB_scores, CNB_valids):
            scores = cnbmat[CNB_score]
            if subj_key in scores.index:
                score = scores[subj_key]
                valid = str(cnbmat[CNB_valid][subj_key])
                # Only keep scores whose validity code contains 'V'.
                if 'V' in valid:
                    _set_value(grp_df, bblid, CNB_score, score)

    ##################################################################################################
    ## Diagnosis
    ##################################################################################################
    if rundiag:
        subj_key = int(bblid)
        # Overall status (hstatus) goes into both tables.
        for diag_score in diag_scores:
            diagnoses = diagmat[diag_score]
            diagnosis = diagnoses[subj_key] if subj_key in diagnoses.index else "Unknown"
            _set_value(grp_df, bblid, diag_score, diagnosis)
            _set_value(diag_df, bblid, diag_score, diagnosis)

        # Detailed axis-1 descriptions go into diag_df only.
        for diag_detail in diag_details:
            comorbidities = diagmat[diag_detail]
            comorbidity = comorbidities[subj_key] if subj_key in comorbidities.index else "Unknown"
            _set_value(diag_df, bblid, diag_detail, comorbidity)

    ##################################################################################################
    ## Demographics
    ##################################################################################################
    if rundemo:
        subj_key = int(bblid)
        for demo_score in demo_scores:
            scores = demomat[demo_score]
            if subj_key in scores.index:
                _set_value(grp_df, bblid, demo_score, scores[subj_key])

    ##################################################################################################
    ## CEST
    ##################################################################################################
    if runcest and bblid != "88760":  # 88760's CEST output is empty for some reason.
        print("Processing " + bblid + "'s CEST data'")
        # Look up the GluCEST scan ID for this subject.
        cest_scanids = subjlist.loc[subjlist['BBLID'].astype(str) == bblid, 'SCANID_CEST']
        if len(cest_scanids) > 0:
            gluses = str(cest_scanids.values[0])
            cestid = bblid + "_" + gluses
            for network in networks:
                cest_pattern = cestpath + cestid + "/" + cestid + "-2d-GluCEST-s100_7-" + network + "-measures_UNI.tsv"
                for file in glob.glob(cest_pattern):
                    if not os.path.isfile(file):
                        continue
                    cestmat = pd.read_csv(file, sep='\t')
                    col_name = "avgCEST_" + network  # for grp_df
                    ct_name = "ctCEST_" + network  # for grp_df
                    means = []
                    counts = []
                    for index, value in enumerate(cestmat.loc[0, :]):
                        if "Mean" in cestmat.columns[index] and not np.isnan(value):
                            means.append(cestmat.at[0, cestmat.columns[index]])
                            # assumes the column right after each "Mean" column holds its count - TODO confirm
                            counts.append(cestmat.at[0, cestmat.columns[index + 1]])
                    if sum(counts) == 0:
                        # No usable parcels: mark BOTH columns as missing, using a real NaN
                        # so the columns stay numeric.
                        _set_value(grp_df, bblid, col_name, np.nan)
                        _set_value(grp_df, bblid, ct_name, np.nan)
                    else:
                        # NOTE(review): col_name is called "avgCEST" but stores the plain sum
                        # of the parcel means (no count weighting, no division) - confirm.
                        _set_value(grp_df, bblid, col_name, sum(means))
                        _set_value(grp_df, bblid, ct_name, sum(counts))


print(grp_df)

Processing subject 20645
Processing subject 120217
Processing subject 125511
Processing subject 105176
Processing subject 132179
Processing subject 15305
Processing subject 17621
Processing subject 98831
Processing subject 116019
Processing subject 17648
Processing subject 96902
Processing subject 118864
Processing subject 20325
Processing subject 80557
Processing subject 20642
Processing subject 94028
Processing subject 125073
Processing subject 121085
Processing subject 85369
Processing subject 93292
Processing subject 116354
Processing subject 90281
Processing subject 96659
Processing subject 93274
Processing subject 20902
Processing subject 93242
Processing subject 106880
Processing subject 94288
Processing subject 90077
Processing subject 20754
Processing subject 21874
Processing subject 121407
Processing subject 102041
Processing subject 88760
Processing subject 20303
Processing subject 119791
Processing subject 106057
Processing subject 127065
Processing subject 20871
Processing

## Stage 2: Group Comparisons and Regressions

In [85]:
# Save the diagnosis table to the working directory, keeping the row index as the first column.
# NOTE(review): the file name says 3T while fcpath points at the 7T xcp_d output - confirm.
# The matching grp_df export is currently disabled:
#grp_df.to_csv('grp_df_3T_012624.csv', index=True)
diag_df.to_csv(path_or_buf='diag_df_3T_012624.csv', index=True)

In [84]:
# Reloading a previously saved group table is currently disabled:
#grp_df = pd.read_csv('012624_grp_df_3T.csv')
# Remove the row limit so DataFrames are displayed in full.
pd.options.display.max_rows = None


### Data Trimming

In [86]:
#CLUNKIER COMPREHENSIVE VERSION:




In [87]:
#BBS

In [34]:
# Shell escape: export motor_pipeline.ipynb to HTML as motor_pipeline_3T.html.
!jupyter nbconvert --to html motor_pipeline.ipynb --output motor_pipeline_3T.html

[NbConvertApp] Converting notebook motor_pipeline.ipynb to html
[NbConvertApp] Writing 1146801 bytes to motor_pipeline_3T.html
