## Notebook to generate participant ids and populate `participants.csv` for nimhans serb pd dataset
- This avoids using nimhans generated ids (UHID) in the downstream processing and anonymizes subject names
- The mapping between genearated `participant_id` and `UHID` is retained in the uhid_dicom_map.csv

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re

### Paths

In [2]:
DATASET_ROOT = "/home/nimhans/projects/data/PD_SERB/"
demographics_file = f"{DATASET_ROOT}scratch/demographics.csv"
uhid_dicom_map_file = f"{DATASET_ROOT}scratch/uhid_dicom_map.csv"
participants_csv = f"{DATASET_ROOT}tabular/demographics/participants.csv"

raw_dicom_dir = f"{DATASET_ROOT}scratch/raw_dicom/"

### Read demographics file

In [3]:
demo_df = pd.read_csv(demographics_file,index_col=[0])
n_participants = len(demo_df)
print(f"Number of participants: {n_participants}")
demo_df.head()

Number of participants: 102


Unnamed: 0_level_0,Name,UHID,D.O.A,Age,Gender (0_F 1_M),Education,participant_id
SL.NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Omana P,20190051791,04.05.2019,44.0,0.0,12.0,SERB001
2,Tapan Kumar,20130087469,06.06.2019,45.0,1.0,12.0,SERB002
3,Braj Bhushan Prasad,20170107419,14.05.2019,66.0,1.0,15.0,SERB003
4,Ramesh R,20190032049,19.07.2019,42.0,1.0,10.0,SERB004
5,Samsani Naga Brahmananda Rao,20180094521,24.07.2019,62.0,1.0,11.0,SERB005


### Identify DICOM filenames and Diagnosis based on data on disk

In [4]:
def parse_raw_dicom_filenames(raw_dicom_dir):
    raw_dicom_dir_names = os.listdir(raw_dicom_dir)

    raw_dir_name_list = []
    UHID_list = []
    for dir_name in raw_dicom_dir_names:
        try:
            UHID = re.split('-|_| ', dir_name)[0]
        except Exception as ex:
            print(ex)
            
        #Names have either "_" or " " as sep
        # try:
        #     UHID, raw_dir_name = dir_name.split("_",1) 
        # except Exception as ex:
        #     print(ex)
        #     print(f"Trying space as sep for {dir_name}")
        #     try:
        #         UHID, _ = dir_name.split(" ",1)
        #     except Exception as ex:
        #         print(ex)
            
        raw_dir_name_list.append(dir_name)    
        UHID_list.append(UHID)

    raw_dicom_df = pd.DataFrame()
    raw_dicom_df["UHID"] = UHID_list
    raw_dicom_df["raw_dir_name"] = raw_dir_name_list

    return raw_dicom_df

In [5]:
control_participants_dir = f"{raw_dicom_dir}/controls/"

control_raw_dicom_df = parse_raw_dicom_filenames(control_participants_dir)
control_raw_dicom_df["group"] = "control"
n_raw_dicom_dirs = len(control_raw_dicom_df)
print(f"Number of control raw_dicom_dirs: {n_raw_dicom_dirs}")

pd_participants_dir = f"{raw_dicom_dir}/pd/"

pd_raw_dicom_df = parse_raw_dicom_filenames(pd_participants_dir)
n_raw_dicom_dirs = len(pd_raw_dicom_df)
print(f"Number of pd raw_dicom_dirs: {n_raw_dicom_dirs}")
pd_raw_dicom_df["group"] = "PD"

raw_dicom_df = pd.concat([control_raw_dicom_df,pd_raw_dicom_df],axis=0)

raw_dicom_df.head()

Number of control raw_dicom_dirs: 23
Number of pd raw_dicom_dirs: 70


Unnamed: 0,UHID,raw_dir_name,group
0,PJ19001297,PJ19001297_ALAMGIR_PARVEZ,control
1,MANOHAR,MANOHAR VICTOR,control
2,PJ21000356,PJ21000356_KHALEEL_PASHA_13.07.2021,control
3,PJ19001294,PJ19001294_NIKHIL_KUMAR_SAXENA_COM,control
4,PJ22000050,PJ22000050_VIJAYA KUMARI,control


### Compare UHID lists from demographics and DICOM data

In [6]:
raw_dicom_uhid = set(raw_dicom_df["UHID"])
demo_uhid = set(demo_df["UHID"])

demo_minus_dicom_uhid = demo_uhid - raw_dicom_uhid
dicom_minus_demo_uhid = raw_dicom_uhid - demo_uhid

print(f"{len(demo_minus_dicom_uhid)} demo_missing_dicom_uhid:\n{demo_minus_dicom_uhid}")

print(f"\n{len(dicom_minus_demo_uhid)} dicom_missing_demo_uhid:\n{dicom_minus_demo_uhid}")


25 demo_minus_dicom_uhid:
{'20190142414', 'PJ19001653', '20210030843', '2017007959', 'PJ21000348', '20150074405', 'PJ21000332', '2020785098', 'PJ2100199', '20170107419', '20190107946', '2021097509', '2019417864', '20210043218', 'PJ19001582', '20210006263', '2021', 'PJ21000335', '20180094521', '20210056722', '20190114475', '20210017670', '20190032049', '20170133455', '20150094898'}

16 dicom_minus_demo_uhid:
{'20210005183', '20210006323', 'MANOHAR', '20210084672', 'PJ22000051', '20190041062', '20170079594', 'PJ21000199', '20210055462', '20190109746', 'PJ19001554', 'PJ22000050', '20180033695', 'PJ21000464', '20210069906', '20150060949'}


### Append demo df with 
    - Newly generated participant_id 
    - Group column (i.e. Dx for the participant)

In [7]:
ds_prefix = "SERB"
n_participants_len = len(str(n_participants))
participant_ids = np.arange(1, n_participants+1)
participant_ids_str = [ds_prefix + str(idx).zfill(n_participants_len) for idx in participant_ids]        

demo_df["participant_id"] = participant_ids_str
demo_df = pd.merge(demo_df, raw_dicom_df, on="UHID", how="left")
demo_df.head()

Unnamed: 0,Name,UHID,D.O.A,Age,Gender (0_F 1_M),Education,participant_id,raw_dir_name,group
0,Omana P,20190051791,04.05.2019,44.0,0.0,12.0,SERB001,20190051791_OMANA_P,PD
1,Tapan Kumar,20130087469,06.06.2019,45.0,1.0,12.0,SERB002,20130087469_TAPAN KUMAR PATNAIK_COM,PD
2,Braj Bhushan Prasad,20170107419,14.05.2019,66.0,1.0,15.0,SERB003,,
3,Ramesh R,20190032049,19.07.2019,42.0,1.0,10.0,SERB004,,
4,Samsani Naga Brahmananda Rao,20180094521,24.07.2019,62.0,1.0,11.0,SERB005,,


### Save uhid_dicom_map file 
- participants_id --> UHID mapping --> raw_dicom_dir names to be renamed with [organize_dicoms.py](./organize_dicoms.py)
- Note that UHID is still part of the dicom header, but that can be stripped when converting to Niftis

In [8]:
uhid_dicom_map_df = demo_df[["participant_id","UHID","raw_dir_name"]]

uhid_dicom_map_df.to_csv(uhid_dicom_map_file, index=None)
print(f"Saving uhid_dicom_map here: {uhid_dicom_map_file}")

uhid_dicom_map_df.head()

Saving uhid_dicom_map here: /home/nimhans/projects/data/PD_SERB/scratch/uhid_dicom_map.csv


Unnamed: 0,participant_id,UHID,raw_dir_name
0,SERB001,20190051791,20190051791_OMANA_P
1,SERB002,20130087469,20130087469_TAPAN KUMAR PATNAIK_COM
2,SERB003,20170107419,
3,SERB004,20190032049,
4,SERB005,20180094521,


### Populate participants.csv for `mr_proc` based organization and processing

In [9]:
participant_df = pd.read_csv(participants_csv)
participant_df["participant_id"] = demo_df["participant_id"]
participant_df["age"] = demo_df["Age"]
participant_df["sex"] = demo_df["Gender (0_F 1_M)"]
participant_df["sex"] = participant_df["sex"].replace({0:"F", 1:"M"})
participant_df["group"] = demo_df["group"]

participant_df.head()

Unnamed: 0,participant_id,age,sex,group
0,SERB001,44.0,F,PD
1,SERB002,45.0,M,PD
2,SERB003,66.0,M,
3,SERB004,42.0,M,
4,SERB005,62.0,M,


### Save participants.csv

In [10]:
participant_df.to_csv(participants_csv, index=None)
print(f"Saving file here: {participants_csv}")

Saving file here: /home/nimhans/projects/data/PD_SERB/tabular/demographics/participants.csv
