## Code to generate manifest.csv
- Based on this [schema](https://www.neurobagel.org/documentation/mr_proc/configs/)

### QPN naming convention
- `visit`: timepoint of **ANY** clinical (i.e. UPDRS, MoCA, Neuropsy) data collected 
    - e.g. V01, V02 etc. 
- `session`: timepoint of MRI collection
    - e.g. ses-01, ses-02 etc.
- `event`: timepoint relative to a consensus baseline (used for inter-modality i.e. MRI vs clinical data harmonization) 
    - e.g. baseline, m06, m12 etc. 

In [1]:
import pandas as pd
import numpy as np

### Paths

In [3]:
releases_dir = "/home/nikhil/projects/Parkinsons/qpn/releases/"

previous_release = "Jan_2024"
current_release = "March_2024" # No new data yet (20 March 2024)

# Previous mr_proc manifest
previous_manifest_csv = f"{releases_dir}{previous_release}/tabular/manifest.csv"

# Current mr_proc manifest
current_manifest_csv = f"{releases_dir}{current_release}/tabular/manifest.csv"

# Current recruit manifest
current_recruit_manifest_xls = f"{releases_dir}{current_release}/tabular/demographics/Suivi_RPQ.xlsx"

### Read recruitment manifest from previous release

In [5]:
previous_recruit_manifest_df = pd.read_csv(previous_manifest_csv)
previous_recruit_manifest_df["participant_id"] = previous_recruit_manifest_df["participant_id"].str.strip()
nipoppy_participants_previous = previous_recruit_manifest_df["participant_id"].dropna().unique()
n_nipoppy_participants_previous = len(nipoppy_participants_previous)

print(f"number of participants from previous nipoppy release: {n_nipoppy_participants_previous}")
previous_recruit_manifest_df.head()

number of participants from previous nipoppy release: 298


Unnamed: 0,participant_id,visit,session,datatype
0,PD00016,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
1,PD00020,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
2,PD00032,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
3,PD00048,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
4,PD00119,MRI_v1,ses-01,"['anat','dwi','fmap','func']"


### Read latest recruitment manifest

In [7]:
suivi_df

Unnamed: 0,subj_id,Group,code_external_C-OPN,IRM01\n(J-M-A),#IRM 1\n PD,#IRM 1\n CTRL,# IRM 1\n RBD,# IRM 1\nOTHER,IRM 2 \n(J-M-A),#IRM 2\n PD,#IRM 2\n CTRL,# IRM 2\n RBD,# IRM 2 OTHER,IRM 3\n(J-M-A)
0,Calculs,,,,211.0,73.0,14.0,,,38,13.0,1.0,,
1,PD00002,PD,0,0,,,,,0,,,,,0
2,PD00003,PD,0,0,,,,,0,,,,,0
3,PD00004,PD,0,0,,,,,0,,,,,0
4,PD00005,PD,0,0,,,,,0,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3124,,,,,,,,,,,,,,
3125,,,,,,,,,,,,,,
3126,,,,,,,,,,,,,,
3127,,,,,,,,,,,,,,


In [8]:
col_range = "A:N"

col_rename_dict = {"subj_id":"participant_id",
                "IRM01\n(J-M-A)":"IRM01_date", "#IRM 1\n PD":"IRM01_PD", "#IRM 1\n CTRL":"IRM01_CTRL", "# IRM 1\n RBD":"IRM01_RBD",
                "IRM 2 \n(J-M-A)":"IRM02_date", "#IRM 2\n PD":"IRM02_PD", "#IRM 2\n CTRL":"IRM02_CTRL", "# IRM 2\n RBD":"IRM02_RBD"}

useful_cols = col_rename_dict.values()

suivi_df = pd.read_excel(current_recruit_manifest_xls,sheet_name="En cours", engine='openpyxl', usecols=col_range)
suivi_df = suivi_df.rename(columns=col_rename_dict)[useful_cols].copy()

# remove the row with tally
suivi_df = suivi_df.drop([0])

# remove rows without participant_id
suivi_df = suivi_df.dropna(axis=0, subset=["participant_id"])
suivi_df = suivi_df[~suivi_df["participant_id"].astype(str).isin(["0"])] 

# remove subjects without imaging data
suivi_df = suivi_df[(suivi_df["IRM01_PD"] == 1) | (suivi_df["IRM01_CTRL"] == 1) | (suivi_df["IRM01_RBD"] == 1) | 
                    (suivi_df["IRM02_PD"] == 1) | (suivi_df["IRM02_CTRL"] == 1) | (suivi_df["IRM02_RBD"] == 1)]


# fix participant_id formatting issues
# Some rows have Dx in participant_id and one participant with two IDs with "="
possible_delimiters = [" ", "(", "="]
for delim in possible_delimiters:
    suivi_df["participant_id"] = suivi_df["participant_id"].str.split(pat=delim, n=1, expand=True)[0]

# nipoppy_participants_current
nipoppy_participants_current = suivi_df["participant_id"].dropna().unique()

suivi_df

Unnamed: 0,participant_id,IRM01_date,IRM01_PD,IRM01_CTRL,IRM01_RBD,IRM02_date,IRM02_PD,IRM02_CTRL,IRM02_RBD
15,PD00016,26-07-2019,1.0,0.0,0.0,0,,,
19,PD00020,2018-05-12 00:00:00,1.0,0.0,0.0,24-11-2022,1,0.0,0.0
31,PD00032,24-07-2019,1.0,0.0,0.0,0,,,
47,PD00048,21-08-2019,1.0,0.0,0.0,0,,,
117,PD00119,13-08-2018,1.0,0.0,0.0,0,,,
...,...,...,...,...,...,...,...,...,...
2229,MNI0565,2023-09-11 00:00:00,0.0,1.0,0.0,0,,,
2231,MNI0602,2023-11-22 00:00:00,1.0,0.0,0.0,0,,,
2234,MNI0605,15-12-2023,0.0,1.0,0.0,0,,,
2236,MNI0607,00:00:00,1.0,0.0,0.0,0,,,


### Set date columns and check visit order

In [9]:
# set date columns to datetime
# has mixed types, but auto formatting + coerce works fine here

# explicitely set 0 to nan to avoid origin issues (date: 0 is 1970-01-01)
suivi_df["IRM01_date"] = suivi_df["IRM01_date"].replace(0, np.nan)
suivi_df["IRM02_date"] = suivi_df["IRM02_date"].replace(0, np.nan)

suivi_df["IRM01_date"] = pd.to_datetime(suivi_df["IRM01_date"], errors="coerce")
suivi_df["IRM02_date"] = pd.to_datetime(suivi_df["IRM02_date"], errors="coerce")

# Check visit orders
suivi_df["visit_interval (V2-V1) in days"] = suivi_df["IRM02_date"] - suivi_df["IRM01_date"]
suivi_df["visit_interval (V2-V1) in days"] = suivi_df["visit_interval (V2-V1) in days"].dt.days
visits_wit_wrong_order_df = suivi_df[suivi_df["visit_interval (V2-V1) in days"] < 0]
print(f"Participants with wrong visit order: ({len(visits_wit_wrong_order_df)}) :{visits_wit_wrong_order_df['participant_id'].values}")


Participants with wrong visit order: (0) :[]


  suivi_df["IRM01_date"] = pd.to_datetime(suivi_df["IRM01_date"], errors="coerce")
  suivi_df["IRM02_date"] = pd.to_datetime(suivi_df["IRM02_date"], errors="coerce")


### Get new participants per session
This is based on a valid date for MRI visit

In [10]:
total_partcipants_additions = list(set(nipoppy_participants_current) - set(nipoppy_participants_previous))
print(f"number of new participants: {len(total_partcipants_additions)}")

MRI_v1_participants = suivi_df[(suivi_df["IRM01_PD"] == 1) | (suivi_df["IRM01_CTRL"] == 1) | (suivi_df["IRM01_RBD"] == 1)]["participant_id"].dropna().unique()
MRI_v2_participants = suivi_df[(suivi_df["IRM02_PD"] == 1) | (suivi_df["IRM02_CTRL"] == 1) | (suivi_df["IRM02_RBD"] == 1)]["participant_id"].dropna().unique()

visit_participant_dict = {"MRI_v1": MRI_v1_participants, "MRI_v2": MRI_v2_participants}

print(f"MRI_v1_participants: {len(MRI_v1_participants)}, MRI_v2_participants: {len(MRI_v2_participants)}")

number of new participants: 0
MRI_v1_participants: 298, MRI_v2_participants: 52


### Generate current_manifest_df

- Populate manifest with available clinical visits / MRI sessions
- Datatypes: Assuming QPN has all 4 BIDS datatypes: ["anat","dwi","fmap","func"]

- Sample `manifest.csv`

| participant_id | participant_dicom_dir | visit | session | datatype                     | bids_id |
|----------------|-----------------------|-------|---------|------------------------------|---------|
| 001            | MyStudy_001_V2021      | V01   | ses-01  | ["anat","dwi","fmap","func"] | sub-001 |
| 001            | MyStudy_001_V2022      | V02   | ses-02  | ["anat"]                     | sub-001 |
| 002            | MyStudy_002_V2021      | V01   | ses-01  | ["anat","dwi"]               | sub-002 |
| 002            | MyStudy_002_V2024      | V03   | ses-03  | ["anat","dwi"]               | sub-002 |

In [11]:
visit_labels = visit_participant_dict.keys()
visit_session_dict = {"MRI_v1": "ses-01", "MRI_v2": "ses-02"}

manifest_cols = ["visit","session","datatype"]
avail_datatypes = "['anat','dwi','fmap','func']"

current_manifest_df = pd.DataFrame()

for visit_label in visit_labels:
    visit_participant_ids = visit_participant_dict[visit_label]
    print(f"visit_id: {visit_label}, n_participants: {len(visit_participant_ids)}")

    _df = pd.DataFrame(index=visit_participant_ids, columns=manifest_cols)

    session = visit_session_dict[visit_label]

    _df.loc[visit_participant_ids,"visit"] = visit_label
    _df.loc[visit_participant_ids,"session"] = session
    _df.loc[visit_participant_ids,"datatype"] = avail_datatypes
    
    current_manifest_df = pd.concat([current_manifest_df, _df], axis=0)

current_manifest_df

visit_id: MRI_v1, n_participants: 298
visit_id: MRI_v2, n_participants: 52


Unnamed: 0,visit,session,datatype
PD00016,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
PD00020,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
PD00032,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
PD00048,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
PD00119,MRI_v1,ses-01,"['anat','dwi','fmap','func']"
...,...,...,...
MNI0301,MRI_v2,ses-02,"['anat','dwi','fmap','func']"
MNI0324,MRI_v2,ses-02,"['anat','dwi','fmap','func']"
MNI0335,MRI_v2,ses-02,"['anat','dwi','fmap','func']"
MNI0336,MRI_v2,ses-02,"['anat','dwi','fmap','func']"


### Save update CSV

In [None]:
save_current_manifest = False
if save_current_manifest:
    print(f"Saving new nipoppy manifest here: {current_manifest_csv}")
    current_manifest_df = current_manifest_df.reset_index().rename(columns={"index":"participant_id"})
    current_manifest_df.to_csv(current_manifest_csv,index=None)