## Notebook to generate participant ids and populate `participants.csv` for nimhans serb pd dataset
- This avoids using nimhans generated ids (UHID) in the downstream processing and anonymizes subject names
- The mapping between genearated `participant_id` and `UHID` is retained in the uhid_dicom_map.csv

### Important notes
- Groundtruth UHID will be in the dicom file
- If `organize_dicoms.py` is already run on your dataset which renames the raw_dicoms into mr_proc organization based on genearated `participant_id`, then DONOT use this notebook to re-create `participants_csv` and `uhid_dicom_map_file`. This may break the correspondance between existing renamed dicoms in mr_proc and `UHID` and consequently demographic data. If you have only a few new subjects to add, do it manually inside `participants_csv` and `uhid_dicom_map_file`. This notebooks is meant to work on a completely "new" dataset and requires running of `organize_dicoms.py` after new participant_ids are generated. 

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
from pathlib import Path

### Paths

In [2]:
DATASET_ROOT = "/home/nimhans/projects/data/PD_SERB/"
demographics_file = f"{DATASET_ROOT}scratch/demographics.csv"
uhid_dicom_map_file = f"{DATASET_ROOT}scratch/uhid_dicom_map.csv"
participants_csv = f"{DATASET_ROOT}tabular/demographics/participants.csv"

raw_dicom_dir = f"{DATASET_ROOT}scratch/raw_dicom/"

### Read demographics file

In [3]:
demo_df = pd.read_csv(demographics_file,index_col=[0])
n_participants = len(demo_df)
print(f"Number of participants in demographics file: {n_participants}")
demo_df.head()

Number of participants in demographics file: 102


Unnamed: 0_level_0,Name,UHID,D.O.A,Age,Gender (0_F 1_M),Education
SL.NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Omana P,20190051791,04.05.2019,44.0,0.0,12.0
2,Tapan Kumar,20130087469,06.06.2019,45.0,1.0,12.0
3,Braj Bhushan Prasad,20170107419,14.05.2019,66.0,1.0,15.0
4,Ramesh R,20190032049,19.07.2019,42.0,1.0,10.0
5,Samsani Naga Brahmananda Rao,20180094521,24.07.2019,62.0,1.0,11.0


### Check duplicates in demographics

In [4]:
duplicate_uhid = demo_df[demo_df["UHID"].duplicated()]["UHID"]
duplicate_names = demo_df[demo_df["UHID"].isin(duplicate_uhid)]["Name"].values
print(f"Duplicate Names: {duplicate_names}\nDuplicate UHIDs: {list(duplicate_uhid)}")
demo_df[demo_df["UHID"].isin(duplicate_uhid)]

Duplicate Names: ['Raj Kumar Das' 'Rajkumar Das' 'Biman Roy ' 'Asit Mirdha ']
Duplicate UHIDs: ['20190133585', '20140044001']


Unnamed: 0_level_0,Name,UHID,D.O.A,Age,Gender (0_F 1_M),Education
SL.NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18,Raj Kumar Das,20190133585,17.10.2019,47.0,1.0,10.0
35,Rajkumar Das,20190133585,17.10.2019,47.0,1.0,10.0
45,Biman Roy,20140044001,31.07.2019,61.0,1.0,15.0
47,Asit Mirdha,20140044001,05.08.2019,50.0,1.0,15.0


### Remove duplicates from demographics

In [5]:
## Asit Mirdha	20140044001 is wrong. Need to update that in the demographics file.

exclusion_list = ["Rajkumar Das","Biman Roy ","Asit Mirdha "]
demo_df = demo_df[~demo_df["Name"].isin(exclusion_list)].copy()

n_demo_participants = len(demo_df)
print(f"Number of participants in demographics file after duplicate removal: {n_demo_participants}")
demo_df.head()

Number of participants in demographics file after duplicate removal: 99


Unnamed: 0_level_0,Name,UHID,D.O.A,Age,Gender (0_F 1_M),Education
SL.NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Omana P,20190051791,04.05.2019,44.0,0.0,12.0
2,Tapan Kumar,20130087469,06.06.2019,45.0,1.0,12.0
3,Braj Bhushan Prasad,20170107419,14.05.2019,66.0,1.0,15.0
4,Ramesh R,20190032049,19.07.2019,42.0,1.0,10.0
5,Samsani Naga Brahmananda Rao,20180094521,24.07.2019,62.0,1.0,11.0


### Identify DICOM filenames and Diagnosis based on data on disk

In [6]:
def parse_raw_dicom_filenames(raw_dicom_dir):
    raw_dicom_dir_names = os.listdir(raw_dicom_dir)

    raw_dir_name_list = []
    UHID_list = []
    name_list = []
    for dir_name in raw_dicom_dir_names:
        try:
            dir_token = re.split('-|_| ', dir_name)
            UHID = dir_token[0]
            name = ""
            for s in dir_token[1:]:
                name = f"{name}{s}" 
            name = re.sub('[^A-Za-z]+', '', name).lower()

        except Exception as ex:
            print(ex)
            
        raw_dir_name_list.append(f"{raw_dicom_dir}/{dir_name}")
        UHID_list.append(UHID)
        name_list.append(name)

    raw_dicom_df = pd.DataFrame()
    raw_dicom_df["UHID"] = UHID_list
    raw_dicom_df["raw_dir_name"] = raw_dir_name_list
    raw_dicom_df["name"] = name_list

    return raw_dicom_df

In [7]:
control_participants_dir = f"{raw_dicom_dir}/controls/"
# control_participants_dir = "/media/nimhans/Extreme SSD/nimhans_colab/controls/"

control_raw_dicom_df = parse_raw_dicom_filenames(control_participants_dir)
control_raw_dicom_df["group"] = "control"
n_raw_dicom_dirs = len(control_raw_dicom_df)
print(f"Number of control raw_dicom_dirs: {n_raw_dicom_dirs}")

pd_participants_dir = f"{raw_dicom_dir}/PD/"
# pd_participants_dir = "/media/nimhans/Extreme SSD/nimhans_colab/PD/"

pd_raw_dicom_df = parse_raw_dicom_filenames(pd_participants_dir)
n_raw_dicom_dirs = len(pd_raw_dicom_df)
print(f"Number of pd raw_dicom_dirs: {n_raw_dicom_dirs}")
pd_raw_dicom_df["group"] = "PD"

raw_dicom_df = pd.concat([control_raw_dicom_df,pd_raw_dicom_df],axis=0)
print(f"Number of total raw_dicom_dirs: {len(raw_dicom_df)}")

raw_dicom_df.head()

Number of control raw_dicom_dirs: 22
Number of pd raw_dicom_dirs: 69
Number of total raw_dicom_dirs: 91


Unnamed: 0,UHID,raw_dir_name,name,group
0,PJ19001297,/home/nimhans/projects/data/PD_SERB/scratch/ra...,alamgirparvez,control
1,MANOHAR,/home/nimhans/projects/data/PD_SERB/scratch/ra...,victor,control
2,PJ21000356,/home/nimhans/projects/data/PD_SERB/scratch/ra...,khaleelpasha,control
3,PJ19001294,/home/nimhans/projects/data/PD_SERB/scratch/ra...,nikhilkumarsaxenacom,control
4,PJ22000050,/home/nimhans/projects/data/PD_SERB/scratch/ra...,vijayakumari,control


### Check duplicates in dicoms

In [8]:
duplicate_uhid = raw_dicom_df[raw_dicom_df["UHID"].duplicated()]["UHID"]
duplicate_names = raw_dicom_df[raw_dicom_df["UHID"].isin(duplicate_uhid)]["raw_dir_name"].values
print(f"Duplicate Names: {duplicate_names}\nDuplicate UHIDs: {list(duplicate_uhid)}")
raw_dicom_df[raw_dicom_df["UHID"].isin(duplicate_uhid)]

Duplicate Names: []
Duplicate UHIDs: []


Unnamed: 0,UHID,raw_dir_name,name,group


## Compare demographics and dicom lists

In [9]:
# Compare UHID
raw_dicom_uhid = set(raw_dicom_df["UHID"])
demo_uhid = set(demo_df["UHID"])

demo_minus_dicom_uhid = demo_uhid - raw_dicom_uhid
dicom_minus_demo_uhid = raw_dicom_uhid - demo_uhid

print(f"{len(demo_minus_dicom_uhid)} subject uhids in demographics file are missing dicoms:\n{demo_minus_dicom_uhid}")

print(f"\n{len(dicom_minus_demo_uhid)} subject uhids who have dicoms are missing demographic uhid:\n{dicom_minus_demo_uhid}")

25 subject uhids in demographics file are missing dicoms:
{'20180094521', '20170133455', 'PJ2100199', '20210056722', '2021', 'PJ21000335', 'PJ21000348', '20190107946', '2020785098', '20150074405', '20210006263', '20190032049', '20170107419', 'PJ19001653', '20190114475', '20210017670', '20210043218', 'PJ19001582', '2017007959', '20150094898', '20190142414', 'PJ21000332', '20210030843', '2019417864', '2021097509'}

17 subject uhids who have dicoms are missing demographic uhid:
{'PJ21000199', '20190109746', '20210055462', '20170079594', '20190041062', '20210069906', 'PJ21000464', 'PJ19001554', '20140044001', 'PJ22000050', '20150060949', '20180033695', '20210006323', '20210005183', 'PJ22000051', 'MANOHAR', '20210084672'}


### Merge and create single dataframe with matched uhid and names in dicom_dir and demographic file

#### Groundtruth UHID will be in the dicom file --> keep all the uhid from raw_dicom_df

In [10]:
dicom_demo_avail_df = pd.merge(raw_dicom_df, demo_df, on=["UHID"], how="left")

n_participants = len(dicom_demo_avail_df)

print(f"Available participants after dicom+demo merge: {n_participants}")

dicom_demo_avail_df.head()

Available participants after dicom+demo merge: 91


Unnamed: 0,UHID,raw_dir_name,name,group,Name,D.O.A,Age,Gender (0_F 1_M),Education
0,PJ19001297,/home/nimhans/projects/data/PD_SERB/scratch/ra...,alamgirparvez,control,Alamgir Parvez,27.07.2019,56.0,1.0,12.0
1,MANOHAR,/home/nimhans/projects/data/PD_SERB/scratch/ra...,victor,control,,,,,
2,PJ21000356,/home/nimhans/projects/data/PD_SERB/scratch/ra...,khaleelpasha,control,Khaleel Basha,13.07.2021,47.0,1.0,8.0
3,PJ19001294,/home/nimhans/projects/data/PD_SERB/scratch/ra...,nikhilkumarsaxenacom,control,Nikhil Kumar Saxena,26.07.2019,60.0,1.0,16.0
4,PJ22000050,/home/nimhans/projects/data/PD_SERB/scratch/ra...,vijayakumari,control,,,,,


## Generate new participant_ids
### Need to do this only once. 
### Subsequently just add rows directly to the CSVs: `uhid_dicom_map_file` and `participants_csv`.

In [11]:
ds_prefix = "SERB"

n_participants_len = np.max([3,len(str(n_participants))])
participant_ids = np.arange(1, n_participants+1)
participant_ids_str = [ds_prefix + str(idx).zfill(n_participants_len) for idx in participant_ids]    

print(f"Creating {len(participant_ids)} new participant ids")

dicom_demo_avail_df["participant_id"] = participant_ids_str

dicom_demo_avail_df.head()

Creating 91 new participant ids


Unnamed: 0,UHID,raw_dir_name,name,group,Name,D.O.A,Age,Gender (0_F 1_M),Education,participant_id
0,PJ19001297,/home/nimhans/projects/data/PD_SERB/scratch/ra...,alamgirparvez,control,Alamgir Parvez,27.07.2019,56.0,1.0,12.0,SERB001
1,MANOHAR,/home/nimhans/projects/data/PD_SERB/scratch/ra...,victor,control,,,,,,SERB002
2,PJ21000356,/home/nimhans/projects/data/PD_SERB/scratch/ra...,khaleelpasha,control,Khaleel Basha,13.07.2021,47.0,1.0,8.0,SERB003
3,PJ19001294,/home/nimhans/projects/data/PD_SERB/scratch/ra...,nikhilkumarsaxenacom,control,Nikhil Kumar Saxena,26.07.2019,60.0,1.0,16.0,SERB004
4,PJ22000050,/home/nimhans/projects/data/PD_SERB/scratch/ra...,vijayakumari,control,,,,,,SERB005


### Generate (participant_id, uhid, raw_dicom_dir) mapping for available raw_dicom_dirs

In [12]:
uhid_dicom_map_df = dicom_demo_avail_df.copy()

print(f"Number participants with available MR data: {len(uhid_dicom_map_df)}")
uhid_dicom_map_df.head()

Number participants with available MR data: 91


Unnamed: 0,UHID,raw_dir_name,name,group,Name,D.O.A,Age,Gender (0_F 1_M),Education,participant_id
0,PJ19001297,/home/nimhans/projects/data/PD_SERB/scratch/ra...,alamgirparvez,control,Alamgir Parvez,27.07.2019,56.0,1.0,12.0,SERB001
1,MANOHAR,/home/nimhans/projects/data/PD_SERB/scratch/ra...,victor,control,,,,,,SERB002
2,PJ21000356,/home/nimhans/projects/data/PD_SERB/scratch/ra...,khaleelpasha,control,Khaleel Basha,13.07.2021,47.0,1.0,8.0,SERB003
3,PJ19001294,/home/nimhans/projects/data/PD_SERB/scratch/ra...,nikhilkumarsaxenacom,control,Nikhil Kumar Saxena,26.07.2019,60.0,1.0,16.0,SERB004
4,PJ22000050,/home/nimhans/projects/data/PD_SERB/scratch/ra...,vijayakumari,control,,,,,,SERB005


### Save uhid_dicom_map file 
- participants_id --> UHID mapping --> raw_dicom_dir names to be renamed with [organize_dicoms.py](./organize_dicoms.py)
- Note that UHID is still part of the dicom header, but that can be stripped when converting to Niftis

#### Check if uhid_dicom_map_file already exists (See notes on the top to avoid breaking map beteen demographics and renamed dicom_dirs)


In [13]:
overwrite_uhid_dicom_map_file = False

if Path(uhid_dicom_map_file).is_file():
    print("uhid_dicom_map_file already exists")

if overwrite_uhid_dicom_map_file: 
    print("Overwritting uhid_dicom_map_file")

    uhid_dicom_map_df.to_csv(uhid_dicom_map_file, index=None)
    print(f"Saving uhid_dicom_map here: {uhid_dicom_map_file}")

uhid_dicom_map_df.head()

Overwritting uhid_dicom_map_file
Saving uhid_dicom_map here: /home/nimhans/projects/data/PD_SERB/scratch/uhid_dicom_map.csv


Unnamed: 0,UHID,raw_dir_name,name,group,Name,D.O.A,Age,Gender (0_F 1_M),Education,participant_id
0,PJ19001297,/home/nimhans/projects/data/PD_SERB/scratch/ra...,alamgirparvez,control,Alamgir Parvez,27.07.2019,56.0,1.0,12.0,SERB001
1,MANOHAR,/home/nimhans/projects/data/PD_SERB/scratch/ra...,victor,control,,,,,,SERB002
2,PJ21000356,/home/nimhans/projects/data/PD_SERB/scratch/ra...,khaleelpasha,control,Khaleel Basha,13.07.2021,47.0,1.0,8.0,SERB003
3,PJ19001294,/home/nimhans/projects/data/PD_SERB/scratch/ra...,nikhilkumarsaxenacom,control,Nikhil Kumar Saxena,26.07.2019,60.0,1.0,16.0,SERB004
4,PJ22000050,/home/nimhans/projects/data/PD_SERB/scratch/ra...,vijayakumari,control,,,,,,SERB005


### Populate participants.csv for `mr_proc` based organization and processing

In [15]:
participant_df = pd.DataFrame(columns=["participant_id","age","sex","group"])

participant_df["participant_id"] = dicom_demo_avail_df["participant_id"]
participant_df["age"] = dicom_demo_avail_df["Age"]
participant_df["sex"] = dicom_demo_avail_df["Gender (0_F 1_M)"]
participant_df["sex"] = participant_df["sex"].replace({0:"F", 1:"M"})
participant_df["group"] = dicom_demo_avail_df["group"]

print(f"Number of participants: {len(participant_df)}")
participant_df.head()

Number of participants: 91


Unnamed: 0,participant_id,age,sex,group
0,SERB001,56.0,M,control
1,SERB002,,,control
2,SERB003,47.0,M,control
3,SERB004,60.0,M,control
4,SERB005,,,control


### Save participants.csv
#### Check if participants_csv already exists (See notes on the top to avoid breaking map beteen demographics and renamed dicom_dirs)


In [16]:
overwrite_participants_csv = False

if Path(participants_csv).is_file():
    print("participants_csv already exists")

if overwrite_participants_csv: 
    print("Overwritting uhid_dicom_map_file")
    participant_df.to_csv(participants_csv, index=None)
    print(f"Saving file here: {participants_csv}")

Overwritting uhid_dicom_map_file
Saving file here: /home/nimhans/projects/data/PD_SERB/tabular/demographics/participants.csv
