## Notebook to generate participant IDs and populate `participants.csv` for the NIMHANS SERB PD dataset
- This avoids using NIMHANS-generated IDs (UHID) in downstream processing and anonymizes subject names
- The mapping between the generated `participant_id` and `UHID` is retained in `uhid_dicom_map.csv`

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import re

### Paths

In [None]:
# Dataset root. It ends with "/" because every path below is built by
# appending a relative path directly to it.
DATASET_ROOT = "/home/nimhans/projects/data/PD_SERB/"

# Scratch area: raw DICOM directories plus the input / intermediate tables.
raw_dicom_dir = f"{DATASET_ROOT}scratch/raw_dicom/"
demographics_file = f"{DATASET_ROOT}scratch/demographics.csv"
uhid_dicom_map_file = f"{DATASET_ROOT}scratch/uhid_dicom_map.csv"

# Participants table that this notebook reads and then overwrites.
participants_csv = f"{DATASET_ROOT}tabular/demographics/participants.csv"

### Read demographics file

In [None]:
# Load the demographics table, using the first CSV column as the row index.
# UHID is read explicitly as str: the UHIDs parsed from the raw DICOM directory
# names are always strings, so a numeric-looking UHID column inferred as an
# integer (which would also drop leading zeros) could never match them in the
# set comparison and the merge on "UHID" performed later in this notebook.
demo_df = pd.read_csv(demographics_file, index_col=[0], dtype={"UHID": str})

# Row count; used later to size the zero-padding of the generated participant ids.
n_participants = len(demo_df)
print(f"Number of participants in demographics file: {n_participants}")
demo_df.head()

### Identify DICOM filenames and Diagnosis based on data on disk

In [None]:
def parse_raw_dicom_filenames(raw_dicom_dir):
    """List the entries of a raw DICOM directory together with their UHID.

    The UHID of an entry is the leading token of its name, i.e. everything
    before the first ``-``, ``_`` or space (the whole name if none occurs).

    Args:
        raw_dicom_dir: Path of the directory whose entries are listed.

    Returns:
        pandas.DataFrame with one row per directory entry, sorted by entry
        name, and two columns: ``UHID`` (str) and ``raw_dir_name``
        (``raw_dicom_dir`` joined with the entry name).

    Raises:
        OSError: If ``raw_dicom_dir`` cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sort so that reruns
    # produce the same table.
    dir_names = sorted(os.listdir(raw_dicom_dir))

    # re.split on a str always yields at least one element, so no exception
    # handling is needed here. The previous print-and-continue handler would
    # have paired a directory with the UHID of the preceding one.
    uhid_list = [re.split(r"-|_| ", dir_name)[0] for dir_name in dir_names]

    # os.path.join avoids a doubled separator when raw_dicom_dir already ends
    # with one.
    raw_dir_name_list = [os.path.join(raw_dicom_dir, dir_name) for dir_name in dir_names]

    return pd.DataFrame({"UHID": uhid_list, "raw_dir_name": raw_dir_name_list})

In [None]:
# --- Controls: list the raw DICOM directories and tag them with their group ---
control_participants_dir = f"{raw_dicom_dir}/controls/"
control_raw_dicom_df = parse_raw_dicom_filenames(control_participants_dir).assign(group="control")
n_raw_dicom_dirs = control_raw_dicom_df.shape[0]
print(f"Number of control raw_dicom_dirs: {n_raw_dicom_dirs}")

# --- PD: same listing for the patient directories ---
pd_participants_dir = f"{raw_dicom_dir}/pd/"
pd_raw_dicom_df = parse_raw_dicom_filenames(pd_participants_dir).assign(group="PD")
n_raw_dicom_dirs = pd_raw_dicom_df.shape[0]
print(f"Number of pd raw_dicom_dirs: {n_raw_dicom_dirs}")

# Stack both groups row-wise (controls first); each part keeps its own row labels.
raw_dicom_df = pd.concat([control_raw_dicom_df, pd_raw_dicom_df])

raw_dicom_df.head()

### Compare UHID lists from demographics and DICOM data

In [None]:
# Unique UHIDs listed in the demographics table and found on disk.
demo_uhid = set(demo_df["UHID"])
raw_dicom_uhid = set(raw_dicom_df["UHID"])

# UHIDs present on one side only.
demo_minus_dicom_uhid = demo_uhid.difference(raw_dicom_uhid)
dicom_minus_demo_uhid = raw_dicom_uhid.difference(demo_uhid)

# Report how many (and which) UHIDs have demographics but no DICOM directory ...
print(f"{len(demo_minus_dicom_uhid)} demo_missing_dicom_uhid:\n{demo_minus_dicom_uhid}")

# ... and which have a DICOM directory but no demographics row.
print(f"\n{len(dicom_minus_demo_uhid)} dicom_missing_demo_uhid:\n{dicom_minus_demo_uhid}")


### Append demo df with
- Newly generated `participant_id`
- Group column (i.e. Dx for the participant)

In [None]:
# Participant ids look like "<prefix><zero-padded running number>", where the
# padding width is the number of digits of the demographics row count.
ds_prefix = "SERB"
n_participants_len = len(str(n_participants))
participant_ids = np.arange(1, n_participants + 1)
participant_ids_str = [
    f"{ds_prefix}{str(idx).zfill(n_participants_len)}" for idx in participant_ids
]

# Ids are assigned in demographics row order *before* the merge, so rows that
# have no DICOM directory still consume an id.
demo_df["participant_id"] = participant_ids_str

# Keep only the participants that also have a raw DICOM directory; this adds
# the raw_dir_name and group columns.
demo_df = demo_df.merge(raw_dicom_df, how="inner", on="UHID")

print(f"Number participants with available MR data: {len(demo_df)}")
demo_df.head()

### Save uhid_dicom_map file 
- `participant_id` --> UHID --> `raw_dir_name` mapping; the raw DICOM directories are later renamed with [organize_dicoms.py](./organize_dicoms.py)
- Note that UHID is still part of the DICOM header, but it can be stripped when converting to NIfTI

In [None]:
# Keep only the columns that link the generated id to the UHID and to the
# raw DICOM directory.
uhid_dicom_map_df = demo_df.loc[:, ["participant_id", "UHID", "raw_dir_name"]]

# Write the mapping without the row index.
uhid_dicom_map_df.to_csv(uhid_dicom_map_file, index=False)
print(f"Saving uhid_dicom_map here: {uhid_dicom_map_file}")

uhid_dicom_map_df.head()

### Populate `participants.csv` for `mr_proc`-based organization and processing

In [None]:
# Start from the existing participants.csv and fill its columns from demo_df.
# NOTE(review): each assignment aligns demo_df's values on the row index of
# participant_df. If the file on disk already holds rows, confirm that its row
# count matches demo_df, otherwise values are dropped or left missing.
participant_df = pd.read_csv(participants_csv)

# participants.csv column -> demo_df column it is copied from
column_sources = {
    "participant_id": "participant_id",
    "age": "Age",
    "sex": "Gender (0_F 1_M)",
    "group": "group",
}
for target_col, source_col in column_sources.items():
    participant_df[target_col] = demo_df[source_col]

# Recode sex from the 0/1 coding of the demographics file to F/M.
participant_df["sex"] = participant_df["sex"].replace({0: "F", 1: "M"})

print(f"Number of participants: {len(participant_df)}")
participant_df.head()

### Save participants.csv

In [None]:
# Overwrite participants.csv with the populated table, without the row index.
participant_df.to_csv(path_or_buf=participants_csv, index=False)
print(f"Saving file here: {participants_csv}")