## Notebook to keep track of all the subjects

- Global manifest: `participants.csv`. This should be placed within `<mr_proc_dataset>/tabular/demographics`. This should consists following columns:
    - `participant_id`, `age`, `sex`, `group` generated by study coordinator Google sheets / database
    - `BIDS_id` column used by Heudiconv and other mr_proc pipelines that maps `participant_id` 

In [1]:
import pandas as pd
import numpy as np

### Paths

In [2]:
demographics_dir = "/home/nikhil/projects/QPN_processing/tabular/"

global_participants_csv = f"{demographics_dir}participants.csv"
bids_participants_csv = f"{demographics_dir}bids_participants.tsv"

updated_participants_csv = f"{demographics_dir}updated_participants.csv"

loris_imaging_participants_csv = f"{demographics_dir}LORIS_MNI_imaging_participants_6_Sept_2022.csv"
demographics_file = f"{demographics_dir}/BD_RPQ_UPDATE_Neuropsy_6_Sept_2022.xlsx"


### Read exisiting participants.csv and bids_participants.tsv

In [3]:
global_participants_df = pd.read_csv(global_participants_csv)
bids_df = pd.read_csv(bids_participants_csv,sep="\t")

n_total_participants = len(global_participants_df["participant_id"].dropna().unique())
print(f"number of total participants: {n_total_participants}")
global_participants_df.head()

number of total participants: 296


Unnamed: 0,participant_id,age,sex,group
0,MNI0056,79.0,M,Parkinson
1,MNI0058,57.0,M,Parkinson
2,MNI0059,63.0,M,Parkinson
3,MNI0068,69.0,M,Parkinson
4,MNI0079,50.0,F,Parkinson


In [4]:
PSCID_start = 4
PSCID_end = 11
bids_df["bids_id"] = bids_df["participant_id"].copy()
bids_df["participant_id"] = bids_df["bids_id"].str[PSCID_start:PSCID_end]
bids_df = bids_df[["participant_id","bids_id"]]
n_bids_participants = len(bids_df["bids_id"].dropna().unique())
print(f"number of BIDS participants: {n_bids_participants}")

bids_df.head()

number of BIDS participants: 163


Unnamed: 0,participant_id,bids_id
0,PD01369,sub-PD01369D713546
1,MNI0103,sub-MNI0103D369057
2,PD00215,sub-PD00215D795669
3,MNI0056,sub-MNI0056D864854
4,PD00296,sub-PD00296D884528


In [5]:
global_participants_df = pd.merge(global_participants_df,bids_df, on="participant_id", how="left")
n_total_participants = len(global_participants_df["participant_id"].dropna().unique())
n_bids_participants = len(global_participants_df["bids_id"].dropna().unique())

print(f"After updated merge: number of total participants: {n_total_participants} BIDS participants: {n_bids_participants}")

global_participants_df["group"] = global_participants_df["group"].str.strip()
global_participants_df.head(10)

After updated merge: number of total participants: 296 BIDS participants: 143


Unnamed: 0,participant_id,age,sex,group,bids_id
0,MNI0056,79.0,M,Parkinson,sub-MNI0056D864854
1,MNI0058,57.0,M,Parkinson,sub-MNI0058D197308
2,MNI0059,63.0,M,Parkinson,
3,MNI0068,69.0,M,Parkinson,sub-MNI0068D842090
4,MNI0079,50.0,F,Parkinson,sub-MNI0079D760662
5,MNI0081,67.0,F,Control,
6,MNI0103,82.0,M,Parkinson,sub-MNI0103D369057
7,MNI0109,45.0,F,Parkinson,sub-MNI0109D584299
8,MNI0110,74.0,M,Parkinson,sub-MNI0110D514228
9,MNI0119,71.0,M,Parkinson,


### Save update CSV

In [6]:
# global_participants_df.to_csv(updated_participants_csv,index=None)

### BIDS participants missing from global participant list

In [78]:
global_participants = global_participants_df["bids_id"].dropna().unique()
bids_participants = bids_df["bids_id"].dropna().unique()

participants_missing_in_global_list = list(set(bids_participants) - set(global_participants))
print(f"Number of missing participants: {len(participants_missing_in_global_list)}")

Number of missing participants: 20


# TODO update global participants.csv

### Check participants in copn.loris csv
- This is for sanity checks. Ideally this should come from QPN study coordinators: (e.g. Roozbeh)

In [12]:
# QPN/COPN LORIS
loris_imaging_subs_df = pd.read_csv(loris_imaging_participants_csv)
n_loris_imaging_participants = len(loris_imaging_subs_df["PSCID"].unique())
print(f"number of unique loris participants: {n_loris_imaging_participants}")
loris_imaging_subs_df.head()

number of unique loris participants: 180


Unnamed: 0,PSCID,DCCID,Age,Sex,Subproject,Diagnosis,Epidemio Questionnaire,Biospecimen,MRI,Genetic,...,UPDRS - Part III,Visit Label,Visit Count,Site,Project,Entity Type,Participant Status,Feedback,Latest Visit Status,External ID
0,MNI0056,864854,80,Male,Disease,,N,Y,Y,N,...,N,"InitialVisit,MRI01,SampleCollection01",3,Montreal Neurological Institute,COPN,Human,Active,0,Visit,
1,MNI0058,197308,58,Male,Disease,,N,Y,Y,N,...,N,"InitialVisit,MRI01,SampleCollection01",3,Montreal Neurological Institute,QPN,Human,Active,0,Visit,
2,MNI0068,842090,70,Male,Disease,,N,Y,Y,N,...,N,"InitialVisit,MRI01,SampleCollection01",3,Montreal Neurological Institute,QPN,Human,Active,0,Visit,
3,MNI0079,760662,50,Female,Disease,,N,Y,Y,N,...,N,"InitialVisit,MRI01,SampleCollection01",3,Montreal Neurological Institute,COPN,Human,Active,0,Visit,
4,MNI0103,369057,83,Male,Disease,,N,Y,Y,N,...,N,"InitialVisit,MRI01,SampleCollection01",3,Montreal Neurological Institute,COPN,Human,Active,0,Visit,


In [13]:
loris_imaging_subs_df = loris_imaging_subs_df.rename(columns={"PSCID":"participant_id","Age":"age","Sex":"sex","Diagnosis":"group"})
loris_imaging_subs_df["sex"] = loris_imaging_subs_df["sex"].replace({"Female":"F","Male":"M"})
loris_imaging_subs_df["group"] = loris_imaging_subs_df["group"].replace({np.nan:"Control"})

useful_cols = ["participant_id", "age","sex","group"]
tmp_df = pd.concat([global_participants_df[useful_cols],loris_imaging_subs_df[useful_cols]]).drop_duplicates()
len(tmp_df["participant_id"].unique())
tmp_df

Unnamed: 0,participant_id,age,sex,group
0,MNI0056,79.0,M,Parkinson
1,MNI0058,57.0,M,Parkinson
2,MNI0059,63.0,M,Parkinson
3,MNI0068,69.0,M,Parkinson
4,MNI0079,50.0,F,Parkinson
...,...,...,...,...
175,PD01751,82.0,F,Control
176,PD01753,72.0,M,Control
177,PD01755,64.0,F,Control
178,PD01756,76.0,F,Control


### Check participants in neuropsych

In [69]:
sheet_names = ["Parkinson patients", "Control"]
subject_id_list = []
dx_list = []

neuropsy_df_concat = pd.DataFrame()

for sheet_name in sheet_names:
    print(sheet_name)

    neuropsy_df = pd.read_excel(demographics_file,sheet_name=sheet_name, engine='openpyxl',header=1)
    neuropsy_df = neuropsy_df.dropna(how='all')

    # Get rid of "/T1" from subject ID (timepoint)
    neuropsy_df["Patient #"] = neuropsy_df["Patient #"].str.split("/",expand=True)[0]
    subject_ids = list(neuropsy_df["Patient #"].values)
    print(f"Number of participants: {len(subject_ids)}")

    subject_id_list +=subject_ids
    dx_list += list(np.tile(sheet_name,len(subject_ids)))

    neuropsy_df_concat = neuropsy_df_concat.append(neuropsy_df)

print(f"Total number of neuropsy subjects: {len(subject_id_list)}")
neuropsy_df_concat.head()

Parkinson patients
Number of participants: 292
Control
Number of participants: 53
Total number of neuropsy subjects: 345


Unnamed: 0,Patient #,Administered by,Parkinson Disease or Control subject,Sex (1=men; 2=women),Language,Date of assessment,Date of birth,Age at time of assessment,Date of apparition of first symptom,Date of diagnosis,...,"Stroop - D-Kefs, COLORS (condition 1): Time (sec) (Raw score)",BNT sans indices,"Subjective Complaint (yes/no) DO you have feel that you issues with your memory, judgement, concentration, planning, etc?",Comments,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80
0,PD00209,Sabrina,Parkinson,1.0,french,no record,21/05/1960,59.0,,2013,...,,,,,,,,,,
1,PD00119,Erika,Parkinson,1.0,english,2017-07-12 00:00:00,12/11/1951,66.0,,2008,...,,,,,,,,,,
2,PD00820,Erika,Parkinson,1.0,french,2017-08-01 00:00:00,20/08/1947,69.0,,2009,...,,,,,,,,,,
3,PD00262,Erika,Parkinson,2.0,999,2017-08-12 00:00:00,26/07/1947,71.0,2011.0,2011,...,,,,,,,,,,
4,PD00523,Erika,Parkinson,2.0,999,2017-12-14 00:00:00,01/10/1933,84.0,,999,...,,,,,,,,,,


In [70]:
save_csv = False

partcipants_df = neuropsy_df_concat[["Patient #","Parkinson Disease or Control subject","Sex (1=men; 2=women)","Age at time of assessment"]].copy()

partcipants_df = partcipants_df.rename(columns={"Patient #":"participant_id",
                                                "Parkinson Disease or Control subject":"group",
                                                "Sex (1=men; 2=women)":"sex",
                                                "Age at time of assessment":"age"})

partcipants_df = partcipants_df.dropna(how="all")

partcipants_df['participant_id'] = partcipants_df['participant_id'].str.strip()
partcipants_df["age"] = np.round(partcipants_df["age"],1)
partcipants_df["sex"] = partcipants_df["sex"].replace({1:"M",2:"F"})

partcipants_df = partcipants_df[["participant_id","age","sex","group"]]
partcipants_df = partcipants_df.sort_values(by=["participant_id","age"])

# Keep only unique participant ids (age refers to the baseline visit)
print(f"number of participants entries (includes duplicates from multiple visits): {len(partcipants_df)}")
partcipants_df = partcipants_df.drop_duplicates(subset=["participant_id"],keep="first")
print(f"number of unique participants: {len(partcipants_df)}")

if save_csv:
    partcipants_df.to_csv("../metadata/participants.csv", index=False)

partcipants_df.head()


number of participants entries (includes duplicates from multiple visits): 338
number of unique participants: 306


Unnamed: 0,participant_id,age,sex,group
174,MNI0056,79.0,M,Parkinson
183,MNI0058,57.0,M,Parkinson
175,MNI0059,63.0,M,Parkinson
182,MNI0068,69.0,M,Parkinson
227,MNI0079,50.0,F,Parkinson


In [71]:
partcipants_df = pd.merge(partcipants_df,bids_df, on="participant_id", how="left")
n_total_participants = len(partcipants_df["participant_id"].dropna().unique())
n_bids_participants = len(partcipants_df["bids_id"].dropna().unique())

print(f"After updated merge: number of total participants: {n_total_participants} BIDS participants: {n_bids_participants}")

partcipants_df.head()

After updated merge: number of total participants: 306 BIDS participants: 144


Unnamed: 0,participant_id,age,sex,group,bids_id
0,MNI0056,79.0,M,Parkinson,sub-MNI0056D864854
1,MNI0058,57.0,M,Parkinson,sub-MNI0058D197308
2,MNI0059,63.0,M,Parkinson,
3,MNI0068,69.0,M,Parkinson,sub-MNI0068D842090
4,MNI0079,50.0,F,Parkinson,sub-MNI0079D760662


### Check participants on BIC 
- BIC will have some duplicates due to failed acq


In [39]:
bic_data_qpn_dicom_subs_df = pd.read_csv(bic_data_qpn_dicom_subs_file, header=None)[0].str.split("_",expand=True)
cols = ["PSCID","DCCID","Vist Label","Site","First Acquisition","Unknown Col"]
bic_data_qpn_dicom_subs_df.columns = cols

bic_data_qpn_dicom_subs_df["DCCID"] = bic_data_qpn_dicom_subs_df["DCCID"].astype("int64")

bic_data_qpn_dicom_subs_df

Unnamed: 0,PSCID,DCCID,Vist Label,Site,First Acquisition,Unknown Col
0,MNI0056,864854,MRI01,MNI,20210818,151510608
1,MNI0058,197308,MRI01,MNI,20210818,105219098
2,MNI0068,842090,MRI01,MNI,20210827,150412426
3,MNI0103,369057,MRI01,MNI,20211116,132143505
4,MNI0109,584299,MRI01,MNI,20210924,135512466
...,...,...,...,...,...,...
199,PD75,20210730,134904606,,,
200,PD76,20211104,094045499,,,
201,PD77,20211202,110105862,,,
202,PD78,20211122,132024191,,,


In [9]:
# image_proc_visit_01_df = pd.merge(loris_imaging_subs_df[cols[:5]], bic_data_qpn_dicom_subs_df, 
# on=["PSCID","DCCID","Vist Label"], how="left")
# image_proc_visit_01_df["dicom_name"] = image_proc_visit_01_df["PSCID"] + "_" + image_proc_visit_01_df["DCCID"].astype(str) + "_" + \
#     image_proc_visit_01_df["Vist Label"] + "_MNI"

# image_proc_visit_01_df["bids_name"] = image_proc_visit_01_df["PSCID"] + "D" + image_proc_visit_01_df["DCCID"].astype(str) 

# image_proc_visit_01_df.head()