## Code to generate mr_proc_manifest.csv
- Based on this [schema](https://www.neurobagel.org/documentation/mr_proc/configs/)

### QPN naming convention
- `visit`: timepoint at which **ANY** clinical data (e.g. UPDRS, MoCA, Neuropsy) were collected
    - e.g. V01, V02 etc. 
- `session`: timepoint of MRI collection
    - e.g. ses-01, ses-02 etc.
- `event`: timepoint relative to a consensus baseline (used for inter-modality harmonization, e.g. MRI vs. clinical data)
    - e.g. baseline, m06, m12 etc. 

In [8]:
import pandas as pd
import numpy as np

### Paths

In [9]:
# Root directory holding one sub-directory per data release
releases_dir = "/home/nikhil/projects/Parkinsons/qpn/releases/"

# Release labels (sub-directory names under releases_dir)
# NOTE(review): "Apr_2022" predates "Nov_2022" -- confirm the current release label is not a typo
previous_release = "Nov_2022"
current_release = "Apr_2022"

# Tabular-data file names shared by the paths below
_manifest_name = "mr_proc_manifest.csv"
_recruit_name = "QPN_Imaging_Codes.xlsx"

# mr_proc manifest of the previous release
previous_mr_proc_manifest_csv = releases_dir + previous_release + "/tabular/" + _manifest_name

# mr_proc manifest of the current release
current_mr_proc_manifest_csv = releases_dir + current_release + "/tabular/" + _manifest_name

# Recruitment spreadsheet of the current release
current_recruit_manifest_xls = releases_dir + current_release + "/tabular/" + _recruit_name

### Read recruitment manifest from previous release

In [10]:
# Load the manifest CSV stored under the previous release directory
previous_recruit_manifest_df = pd.read_csv(previous_mr_proc_manifest_csv)

# Strip surrounding whitespace from the IDs so they compare cleanly across releases
_previous_ids = previous_recruit_manifest_df["participant_id"].str.strip()
previous_recruit_manifest_df["participant_id"] = _previous_ids

# Unique, non-missing participant IDs of the previous release
mr_proc_participants_previous = _previous_ids.dropna().unique()
n_mr_proc_participants_previous = len(mr_proc_participants_previous)

print(f"number of participants from previous mr_proc release: {n_mr_proc_participants_previous}")
previous_recruit_manifest_df.head()

number of participants from previous mr_proc release: 208


Unnamed: 0,participant_id,sex,dob,group,visit_01,visit_02,bids_id
0,MNI0056,M,1942-05-21,PD,2021-08-18,,sub-MNI0056D864854
1,MNI0058,M,1964-03-14,PD,2021-08-18,,sub-MNI0058D197308
2,MNI0068,M,1952-08-05,PD,2021-08-27,,sub-MNI0068D842090
3,MNI0079,F,1971-11-25,PD,2021-12-22,,sub-MNI0079D760662
4,MNI0103,M,1939-08-17,PD,2021-08-11,,sub-MNI0103D369057


### Read latest recruitment manifest
- Using group as a longitudinal clinical var 
- This allows inclusion of participants with demographic info but without specific clinical scores

In [11]:
# Sheet of the recruitment workbook to read
sheet_name = "Dashboard"  # alternative sheet: "recruit_manifest"

# Assessment names grouped by modality; each name is the prefix of a renamed
# "<assessment>_V<visit>" date column
assessment_dict = {
    "MRI": ["MRI"],
    "clinical": ["Neuropsy", "TUG", "UPDRS", "MoCA"],
}

# Spreadsheet header -> short column name
useful_cols = {
    "MRI Time 1": "MRI_V1",
    "MRI Time 2": "MRI_V2",
    "Neuropsy Time 1": "Neuropsy_V1",
    "Neuropsy Time 2": "Neuropsy_V2",
    "TUG Time 1": "TUG_V1",
    "TUG Time 2": "TUG_V2",
    "UPDRS (part III) Time 1": "UPDRS_V1",
    "UPDRS (part III) Time 2": "UPDRS_V2",
    "MoCA Time 1": "MoCA_V1",
    "MoCA Time 2": "MoCA_V2",
    "MoCA time 3": "MoCA_V3",
}

# Only these columns are read from the sheet
column_list = ["participant_id", "group", *useful_cols]

dashboard_df = pd.read_excel(
    current_recruit_manifest_xls,
    sheet_name=sheet_name,
    engine="openpyxl",
    usecols=column_list,
)

# Drop fully empty rows, tidy the IDs, then switch to the short column names
dashboard_df = dashboard_df.dropna(how="all", axis=0)
dashboard_df["participant_id"] = dashboard_df["participant_id"].str.strip()
dashboard_df = dashboard_df.rename(columns=useful_cols)

# Parse every assessment column as a day-first date; unparseable cells become NaT.
# Note: format="%d-%m-%Y" skips MRI02 dates
for date_col in useful_cols.values():
    dashboard_df[date_col] = pd.to_datetime(dashboard_df[date_col], errors="coerce", dayfirst=True)

# Unique, non-missing participant IDs of the current release
mr_proc_participants_current = dashboard_df["participant_id"].dropna().unique()
n_mr_proc_participants_current = len(mr_proc_participants_current)
print(f"number of participants from current mr_proc release: {n_mr_proc_participants_current}")

# Number of rows with a parsed MRI date at time 1 / time 2
n_MRI01 = int(dashboard_df["MRI_V1"].notna().sum())
n_MRI02 = int(dashboard_df["MRI_V2"].notna().sum())
print(f"n_MRI01: {n_MRI01}, n_MRI02: {n_MRI02}")
dashboard_df.head()

number of participants from current mr_proc release: 208
n_MRI01: 208, n_MRI02: 16


Unnamed: 0,participant_id,group,MRI_V1,MRI_V2,Neuropsy_V1,Neuropsy_V2,UPDRS_V1,UPDRS_V2,MoCA_V1,MoCA_V2,MoCA_V3,TUG_V1,TUG_V2
0,MNI0056,PD,2021-08-18,NaT,2021-07-30,NaT,2021-06-11,NaT,2021-06-11 00:00:00,,NaT,2021-09-03,NaT
1,MNI0058,PD,2021-08-18,NaT,2021-08-18,NaT,2021-07-23,NaT,2021-07-23 00:00:00,,NaT,2021-07-23,NaT
2,MNI0068,PD,2021-08-27,NaT,2021-08-18,NaT,2021-08-27,NaT,,,NaT,NaT,NaT
3,MNI0079,PD,2021-12-22,NaT,2022-01-21,NaT,2022-01-21,NaT,2021-12-22 00:00:00,,NaT,2021-07-02,NaT
4,MNI0103,PD,2021-11-08,NaT,2021-10-01,NaT,2021-10-01,NaT,2021-11-08 00:00:00,,NaT,NaT,NaT


### Get new participants

In [12]:
# Participants present in the current release but absent from the previous one.
# Sorted so the list order is reproducible across kernel restarts
# (plain set iteration order is arbitrary).
partcipants_additions = sorted(set(mr_proc_participants_current) - set(mr_proc_participants_previous))

print(f"number of new participants: {len(partcipants_additions)}")

number of new participants: 2


### Generate current_mr_proc_manifest_df

- Populate manifest with available clinical visits / MRI sessions
- Datatypes: Assuming QPN has all 4 BIDS datatypes: ["anat","dwi","fmap","func"]

- Sample `mr_proc_manifest.csv`

| participant_id | participant_dicom_dir | visit | session | datatype                     | bids_id |
|----------------|-----------------------|-------|---------|------------------------------|---------|
| 001            | MyStudy_001_V2021      | V01   | ses-01  | ["anat","dwi","fmap","func"] | sub-001 |
| 001            | MyStudy_001_V2022      | V02   | ses-02  | ["anat"]                     | sub-001 |
| 002            | MyStudy_002_V2021      | V01   | ses-01  | ["anat","dwi"]               | sub-002 |
| 002            | MyStudy_002_V2024      | V03   | ses-03  | ["anat","dwi"]               | sub-002 |

In [13]:
visit_ids = ["1","2"]
manifest_cols = ["visit","session","datatype"]
avail_datatypes = ["anat","dwi","fmap","func"]

# One frame per visit (indexed by participant_id), combined once after the loop.
# DataFrame.append was removed in pandas 2.0, so collect the frames and use pd.concat.
_visit_frames = []

for visit_id in visit_ids:
    _df = pd.DataFrame(index=mr_proc_participants_current, columns=manifest_cols)
    for k, v in assessment_dict.items():
        assessment_cols = [f"{vx}_V{visit_id}" for vx in v]

        # Participants with at least one non-missing date among this group's assessments.
        # Rows with a missing participant_id are dropped: the _df index is built from
        # non-missing IDs only, so a NaN label would raise a KeyError in .loc below.
        has_assessment = dashboard_df[assessment_cols].notna().any(axis=1)
        avail_ids = dashboard_df.loc[has_assessment, "participant_id"].dropna()

        # Any available assessment (clinical or MRI) marks the visit as present
        _df.loc[avail_ids, "visit"] = f"V0{visit_id}"

        if k == "MRI":
            # Only MRI availability defines an imaging session and its datatypes
            _df.loc[avail_ids, "session"] = f"ses-0{visit_id}"

            for pid in avail_ids:
                # .at stores the list as a single cell value; copy it so rows
                # do not all share one mutable list object
                _df.at[pid, "datatype"] = list(avail_datatypes)

    _visit_frames.append(_df)

current_mr_proc_manifest_df = pd.concat(_visit_frames)

# Remove participants with all missing visits/session
current_mr_proc_manifest_df = current_mr_proc_manifest_df.dropna(axis=0, how="all")
current_mr_proc_manifest_df.index = current_mr_proc_manifest_df.index.str.strip()
current_mr_proc_manifest_df.head()

Unnamed: 0,visit,session,datatype
MNI0056,V01,ses-01,"[anat, dwi, fmap, func]"
MNI0058,V01,ses-01,"[anat, dwi, fmap, func]"
MNI0068,V01,ses-01,"[anat, dwi, fmap, func]"
MNI0079,V01,ses-01,"[anat, dwi, fmap, func]"
MNI0103,V01,ses-01,"[anat, dwi, fmap, func]"


### Save updated CSV

In [15]:
# Set to True to (over)write the manifest CSV of the current release
save_current_mr_proc_manifest = False
if save_current_mr_proc_manifest:
    print(f"Saving new mr_proc manifest here: {current_mr_proc_manifest_csv}")
    # Write the index as the leading "participant_id" column without reassigning the
    # frame: the in-memory manifest keeps the same shape whether or not it is saved,
    # and re-running this cell cannot create a duplicate participant_id column.
    current_mr_proc_manifest_df.to_csv(
        current_mr_proc_manifest_csv,
        index=True,
        index_label="participant_id",
    )

Saving new mr_proc manifest here: /home/nikhil/projects/Parkinsons/qpn/releases/Apr_2022/tabular/mr_proc_manifest.csv
