In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import pickle
from dateutil import relativedelta
import seaborn as sns

### API call for redcap

In [2]:
def api_call(url, query, logger=None):
    """ helper function to make API calls to RedCap
    """
    r = requests.post(url, data=query, verify=False)
    http_status = str(r.status_code)
    print(f'HTTP Status: {http_status}')

    if http_status == "200":
        query_results = r.json()
        query_df = pd.DataFrame(query_results)

    else:
        print(f"RedCap API request Failed with HTTP Status: {http_status}")
        query_df = None
        
    return query_df

def get_inventory_count(df, index_col, availability_indicators):
    """ helper function to count participants with recorded data in redcap
    """
    assess_cols = df.columns.drop(index_col)

    if availability_indicators == 'number':
        df = df.replace("", np.nan)
        df[assess_cols] = df[assess_cols].astype(np.float64)

    inventory = {}
    for col in assess_cols:        
        if availability_indicators == 'number':
            availability_count = df[~df[col].isna()][index_col].nunique()
        else:
            availability_count = df[df[col].isin(availability_indicators)][index_col].nunique()
        inventory[col] = availability_count
    return inventory

def get_available_data(config_json, DATASET_ROOT, var_name, preferred_var_source="primary"):
    """ Get data for given variables from available sources
        All return dataframes should have participant_id and visit_id as index
    """
    config_data = json.load(open(config_json))
    data_sources = config_data['data_sources']
    variable_info = config_data['variables'][var_name]
    variable_type = variable_info["type"]
    variable_sources = variable_info["sources"]

    if preferred_var_source == "primary":
        selected_var_source = variable_info['primary_source']
        selected_var_instrument = variable_info['primary_instrument']
    elif preferred_var_source == "secondary":
        selected_var_source = variable_info['secondary_source']
        selected_var_instrument = variable_info['secondary_instrument']
    else:
        print(f"Using preferred source {preferred_var_source} for variable {var_name}")
        preferred_var_data_source = preferred_var_source["data_source"]
        preferred_var_instrument = preferred_var_source["instrument"]

        if preferred_var_data_source not in variable_sources.keys():
            print(f"Preferred data source {preferred_var_data_source} not available for variable {var_name}")
            return None
        else:
            selected_var_source = preferred_var_data_source

        if preferred_var_instrument not in variable_sources[selected_var_source].keys():
            print(f"Preferred var instrument {preferred_var_instrument} not available for variable {var_name}")
            return None
        else:
            selected_var_instrument = preferred_var_instrument

    print(f"Using variable {var_name} from source {selected_var_source} and instrument {selected_var_instrument}")

    external_var_cols = variable_sources[selected_var_source][selected_var_instrument]

    # Get data from primary source
    var_file = data_sources[selected_var_source][selected_var_instrument]["path"]
    var_file_path = f"{DATASET_ROOT}/{var_file}"
    var_file_index = data_sources[selected_var_source][selected_var_instrument]["index_cols"]

    var_df = pd.read_csv(var_file_path)
    selected_var_cols = list(set(var_file_index + external_var_cols))
    var_df = var_df[selected_var_cols]
    
    if (variable_type == "date") & (len(external_var_cols) == 1):
        var_df[external_var_cols[0]] = pd.to_datetime(var_df[external_var_cols[0]], errors="coerce", dayfirst=False)

    if (len(external_var_cols) == 1):
        var_df = var_df.rename(columns={external_var_cols[0]:var_name})
    
    return var_df

def get_age_at_visit(df, date_col, age_col, dob_col="dob", rounding_digits=2, age_range=(0,100)):
    """ Get age at visit. Expects column name to be: var_date """
    
    df[age_col] = df[date_col] - df[dob_col]
    df[age_col] = np.round(df[age_col].dt.days / 365.25, rounding_digits)

    if (len(df[df[age_col] > 100]) | len(df[df[age_col] < 0])):
        print(f"Warning: Age values outside range {age_range} for variable {var}")

    return df

### Paths


In [3]:
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy release
current_release = "Oct_2024"

data_release_dir = f"{DATASET_ROOT}/releases/{current_release}/"
tabular_data_release_dir = f"{data_release_dir}/tabular/"

redcap_release_dir = f"{data_release_dir}tabular/redcap/chunked/"
# redcap_chunked_report_COPY = f"{redcap_release_dir}/1. COPN-QPNDataMoCAUPDRSNeur_DATA_LABELS_2024-06-19_0910_copy.xlsx"
# colleted_redcap_report_file = f"{redcap_release_dir}/redcap_chunked_report.csv"

redcap_legacy_updrs_file = f"{redcap_release_dir}/COPN-QPNMDSUPDRS_DATA_LABELS_2024-06-19_0945.xlsx"
redcap_legacy_moca_file = f"{redcap_release_dir}/COPN-QPNMoCA_DATA_LABELS_2024-06-19_0938.xlsx"

filtered_legacy_updrs_file = f"{redcap_release_dir}/legacy_updrs.csv"
collated_moca_file = f"{redcap_release_dir}/redcap_and_legacy_moca.csv"

demo_config_json = "../workflow/tabular/demographics.json"
pheno_config_json = "../workflow/tabular/pheno.json"

# Special cohort inclusion/exclusion criteria files
# These files are used to filter participants based on certain criteria - typically would go inside `demographics.csv` 
## Roche participant list
roche_participant_list_csv = f"{tabular_data_release_dir}/recruitment/roche_participants.csv"
retracted_participant_list_csv = f"{tabular_data_release_dir}/recruitment/retracted_participants.csv"

# Neuromelanin cohort
neuromelanin_participant_list_csv = f"{tabular_data_release_dir}/recruitment/MRI_NM_LORIS_map.csv"

# output files
demographics_file = f"{tabular_data_release_dir}/demographics.csv"
mri_session_date_file = f"{tabular_data_release_dir}/mri_info/mri_sessions.csv"
updrs_file = f"{tabular_data_release_dir}/assessments/updrs.csv"
moca_file = f"{tabular_data_release_dir}/assessments/moca.csv"
dx_file = f"{tabular_data_release_dir}/assessments/diagnosis.csv"
neuropsych_file = f"{tabular_data_release_dir}/assessments/neuropsych.csv"

### Standardized index names

In [4]:
baseline_event_name = "Baseline (Arm 1: C-OPN)"

## redcap event name variations
config_data = json.load(open(demo_config_json))
data_sources = config_data['data_sources']
redcap_data_sources = data_sources['redcap']

redcap_field_name_map = {}

for instrument in redcap_data_sources.keys():
    index_cols = redcap_data_sources[instrument]['index_cols']
    record_id = index_cols[0]
    event_name = index_cols[1]

    redcap_field_name_map[record_id] = "participant_id"
    redcap_field_name_map[event_name] = "redcap_event_name"
print(f"redcap_field_name_map: {redcap_field_name_map}")

# legacy participant_id variations in DOB and BD_RPQ
legacy_field_name_map = {}
legacy_field_name_map['Record ID'] = "participant_id"
legacy_field_name_map['Patient #'] = "participant_id"
legacy_field_name_map['Name of visit (V01, V02, V03)'] = "visit"
print(f"legacy_field_name_map: {legacy_field_name_map}")

redcap_field_name_map: {'Record ID:': 'participant_id', 'Event Name': 'redcap_event_name', 'record_id': 'participant_id', 'redcap_event_name': 'redcap_event_name'}
legacy_field_name_map: {'Record ID': 'participant_id', 'Patient #': 'participant_id', 'Name of visit (V01, V02, V03)': 'visit'}


### Update RedCAP reports through API 
(Not updating extended report since it has to come from Sarah)
- "global_records_query"
- "QPN MoCA-UPDRS-Neuropsy data_Sarah"

In [5]:
update_redcap_reports = False

redcap_report_list = ["MDSUPDRS-1_Base"] #["Demographic QPN", "global_records_query", "QPN MoCA-UPDRS-Neuropsy data_Sarah"]
if update_redcap_reports:
    redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"
    redcap_config = json.load(open(redcap_config_json))
    url = redcap_config["url"]
    
    for redcap_report in redcap_report_list:
        print(f"Getting data for RedCap report: {redcap_report}")
        records_query = redcap_config["queries"][redcap_report]
        query_df = api_call(url, records_query, logger=None)
        report_csv = f"{tabular_data_release_dir}/redcap/{redcap_report}.csv"
        query_df.to_csv(report_csv, index=False)
        print(f"Saved RedCap report to {report_csv}")



### Available participants

In [6]:
QPN_participants_df = get_available_data(demo_config_json,data_release_dir,"participant_id")
QPN_participants = QPN_participants_df["participant_id"].unique()
n_participants = len(QPN_participants)
session_counts = QPN_participants_df["participant_id"].value_counts()
print(f"Number of participants: {n_participants}")

### Retracted participants
print(f"Removing retracted participants from the dataset")
retracted_participants_df = pd.read_csv(retracted_participant_list_csv)
retracted_participants = retracted_participants_df["participant_id"].unique()
print(f"removing following {len(retracted_participants)} participants from the dataset: {retracted_participants}")
QPN_participants_df = QPN_participants_df[~QPN_participants_df["participant_id"].isin(retracted_participants)].copy()

QPN_participants = QPN_participants_df["participant_id"].unique()
n_participants = len(QPN_participants)
session_counts = QPN_participants_df["participant_id"].value_counts()
print(f"Number of participants: {n_participants}")

Using variable participant_id from source local and instrument manifest
Number of participants: 306
Removing retracted participants from the dataset
removing following 5 participants from the dataset: ['MNI0436' 'MNI0482' 'PD01100' 'MNI0369' 'MNI0607']
Number of participants: 301


### Collate chunked RedCap data
- The new generate report is formatted as mutli-tab excel spreadsheet based on redcap-event. 


In [7]:
regenerate_collated_report = False

if regenerate_collated_report:
    sheet_names = ["Baseline (without CHQ)","F-U 12months & MNI","F-U 18months & MNI", "F-U 24months & MNI",
                "F-U 12months & PD, UDM", "F-U 18months & PD, UDM", "F-U 24months & PD, UDM"]
    redcap_chunked_report_df = pd.DataFrame()
    for sheet_name in sheet_names:
        _df = pd.read_excel(redcap_chunked_report_COPY, sheet_name=sheet_name, engine='openpyxl')
        _df = _df[_df["Record ID:"].isin(QPN_participants)]  
        redcap_chunked_report_df = pd.concat([redcap_chunked_report_df, _df], axis=0)
        print(f"Sheet: {sheet_name} - Shape: {_df.shape}")
        print(f"redcap_chunked_report_df - Shape: {redcap_chunked_report_df.shape}")

    print(f"Saving collated redcap report to {redcap_release_dir}/redcap_chunked_report.csv")
    redcap_chunked_report_df.to_csv(collated_redcap_report_file, index=False)

else:
    print(f"Loading collated redcap report from {redcap_release_dir}/redcap_chunked_report.csv")

Loading collated redcap report from /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024/tabular/redcap/chunked//redcap_chunked_report.csv


### Collate and calculate legacy UPDRS data

In [8]:
regenerate_legacy_data = False

if regenerate_legacy_data:
    legacy_updrs_df = pd.read_excel(redcap_legacy_updrs_file, engine='openpyxl')

    all_updrs3_cols = legacy_updrs_df.columns[legacy_updrs_df.columns.str.startswith("Updrs_3")]
    all_legacy_cols = legacy_updrs_df.columns[legacy_updrs_df.columns.str.endswith(".1")]

    legacy_updrs3_cols = list(set(all_updrs3_cols) & set(all_legacy_cols))

    legacy_total_cols = ['Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL).1',
                        'Part II: Motor Aspects of Experiences of Daily Living (M-EDL).1',	
                        'Part III: Motor Examination.1',
                        'Part IV: Motor Complications.1']

    legacy_admin_cols = ['Record ID:',	'Event Name',
                        'Assessment completed:     Évaluation remplie:  .1',
                        'Assessment completed by:     Évaluation complétée par:.1',
                        'How was the MDS-UPDRS administered?   Comment le MDS-UPDRS a-t-il été administré?.1']


    legacy_filter_cols = legacy_admin_cols + legacy_total_cols + legacy_updrs3_cols

    legacy_updrs_filtered_df = legacy_updrs_df.loc[:, legacy_filter_cols]

    legacy_updrs_filtered_df = legacy_updrs_filtered_df.dropna(subset=legacy_updrs3_cols, how='all')

    # Filter out two subjects that have all UPDRS subscore (most likely not legacy instrument)
    legacy_updrs_filtered_df = legacy_updrs_filtered_df[legacy_updrs_filtered_df["Updrs_3_16_l value.1"].isna()]

    n_legacy_participants = legacy_updrs_filtered_df["Record ID:"].nunique()

    print(f"Number of participants with legacy UPDRS data: {n_legacy_participants}")

    print("Summing all UPDRS3 sub-scores")
    legacy_updrs_filtered_df["legacy_updrs3"] = legacy_updrs_filtered_df[legacy_updrs3_cols].sum(axis=1)
    legacy_updrs_filtered_df["Event Name"] = "pre-redcap-baseline-1 (legacy)"

    print(f"Saving filtered legacy UPDRS data to {filtered_legacy_updrs_file}")
    legacy_updrs_filtered_df.to_csv(filtered_legacy_updrs_file, index=False)

    legacy_updrs_filtered_df.head()

### Collate and calculate legacy MoCA data

In [9]:
regenerate_legacy_data = False

if regenerate_legacy_data:
    moca_df = pd.read_excel(redcap_legacy_moca_file, engine='openpyxl')

    first_legacy_cols = moca_df.columns[moca_df.columns.str.endswith(".1")]
    second_legacy_cols = moca_df.columns[moca_df.columns.str.endswith(".2")]

    index_cols = ['Record ID:',	'Event Name']
    first_legacy_moca_df = moca_df.loc[:, index_cols + list(first_legacy_cols)]
    second_legacy_moca_df = moca_df.loc[:, index_cols + list(second_legacy_cols)]

    n_first_legacy_participants = first_legacy_moca_df["Record ID:"].nunique()
    n_second_legacy_participants = second_legacy_moca_df["Record ID:"].nunique()

    print(f"Number of participants with first legacy MoCA data: {n_first_legacy_participants}")
    print(f"Number of participants with second legacy MoCA data: {n_second_legacy_participants}")

    # merge first and second legacy moca data
    moca_cols = first_legacy_cols.str.replace(".1","")
    first_legacy_cols_dict = dict(zip(first_legacy_cols, moca_cols))
    second_legacy_cols_dict = dict(zip(second_legacy_cols, moca_cols))

    first_legacy_moca_df = first_legacy_moca_df.rename(columns=first_legacy_cols_dict)
    first_legacy_moca_df["Event Name"] = "pre-redcap-baseline-1 (legacy)"

    second_legacy_moca_df = second_legacy_moca_df.rename(columns=second_legacy_cols_dict)
    second_legacy_moca_df["Event Name"] = "pre-redcap-baseline-2 (legacy)"

    legacy_moca_df = pd.concat([first_legacy_moca_df, second_legacy_moca_df], axis=0)

    na_check_cols = legacy_moca_df.columns[legacy_moca_df.columns.str.startswith("TOTAL")]

    legacy_moca_df = legacy_moca_df.dropna(subset=na_check_cols, how='all')

    n_legacy_participants = legacy_moca_df["Record ID:"].nunique()
    legacy_visit_counts = legacy_moca_df["Event Name"].value_counts()

    print(f"Number of participants with legacy MoCA data: {n_legacy_participants}")
    print(f"legacy_visit_counts MoCA data: {legacy_visit_counts}")


    # Merge legacy data with redcap visit data 
    print("-"*50)
    print("Merging redcap and legacy MoCA data")
    
    redcap_moca_df = moca_df.loc[:, index_cols + list(moca_cols)]
    n_redcap_participants = redcap_moca_df["Record ID:"].nunique()
    redcap_events = redcap_moca_df["Event Name"].unique()
    print(f"Number of participants with redcap MoCA data: {n_redcap_participants}")
    print(f"redcap_events MoCA data: {redcap_events}")

    redcap_and_legacy_moca_df = pd.concat([redcap_moca_df, legacy_moca_df], axis=0)
    n_redcap_participants = redcap_and_legacy_moca_df["Record ID:"].nunique()
    redcap_events = redcap_and_legacy_moca_df["Event Name"].unique()
    print(f"Number of participants with redcap and legacy MoCA data: {n_redcap_participants}")
    print(f"redcap_events MoCA data: {redcap_events}")

    print(f"Saving filtered legacy MoCA data to {collated_moca_file}")
    redcap_and_legacy_moca_df.to_csv(collated_moca_file, index=False)

    legacy_moca_df.head()

### Fetch demographic data

In [10]:
demo_vars = ["dob", "enrollment_group", "sex", "education"]
# vars_with_secondary_source = ["dob"]

config_json = demo_config_json
index_cols = ["participant_id", "redcap_event_name"]

demo_var_df = pd.DataFrame()
for var in demo_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()

    if demo_var_df.empty:
        demo_var_df = _df
    else:
        demo_var_df = pd.merge(demo_var_df, _df, on=index_cols, how="outer")   


# add only DoB from seconday source
var = "dob"
print(f"**Getting data from the secondary source for dob**")
legacy_dob_df = get_available_data(config_json,data_release_dir,var,preferred_var_source="secondary")
legacy_dob_df = legacy_dob_df.rename(columns=legacy_field_name_map)
legacy_dob_df = legacy_dob_df.rename(columns={var:var+"_secondary"})

participants_with_missing_value_in_primary = demo_var_df[(demo_var_df["redcap_event_name"]==baseline_event_name) & (demo_var_df[var].isna())]["participant_id"].unique()
legacy_dob_df = legacy_dob_df[legacy_dob_df["participant_id"].isin(participants_with_missing_value_in_primary)].copy()

demo_var_df = pd.merge(demo_var_df, legacy_dob_df, on=["participant_id"], how="left")
demo_var_df[var] = demo_var_df[var].fillna(demo_var_df["dob_secondary"])


demo_participants = demo_var_df["participant_id"].unique()
n_demo_participants = len(demo_participants)
print('-'*50)
print(f"Number of participants with demographics data: {n_demo_participants}")
print('-'*50)

demo_redcap_events = demo_var_df["redcap_event_name"].unique()
print(f"Demographics data available for events: {demo_redcap_events}")
print('-'*50)

# Only keep data for baseline event
print('-'*50)
print(f"Keeping data for event: {baseline_event_name} (i.e. static variables)")
print(f"All temporal data goes into assessment files")
print('-'*50)
demo_var_df = demo_var_df[demo_var_df["redcap_event_name"]==baseline_event_name].copy()

for var in demo_vars:
    n_unique = demo_var_df[var].nunique()
    n_missing = demo_var_df[var].isna().sum()
    print(f"Var: {var}, n_unique: {n_unique}, n_missing: {n_missing} (out of {n_demo_participants})")

demo_var_df.head()

Using variable dob from source redcap and instrument enrollment_report
Using variable enrollment_group from source redcap and instrument enrollment_report
Using variable sex from source redcap and instrument demographic_query
Using variable education from source redcap and instrument demographic_query
**Getting data from the secondary source for dob**
Using variable dob from source local and instrument legacy_DOB
--------------------------------------------------
Number of participants with demographics data: 296
--------------------------------------------------
Demographics data available for events: ['Baseline (Arm 1: C-OPN)' '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '12 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------
--------------------------------------------------
Keeping data for event: Baseline (Arm 1: C-OPN) (i.e. static variables)
All temporal data goes into assessment files
--------------------------------------------------
Var: 

Unnamed: 0,redcap_event_name,dob,participant_id,enrollment_group,sex,education,dob_secondary
0,Baseline (Arm 1: C-OPN),1963-07-27,MNI0028,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,18,NaT
1,Baseline (Arm 1: C-OPN),1942-05-21,MNI0056,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,6,NaT
2,Baseline (Arm 1: C-OPN),1964-03-14,MNI0058,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,11,NaT
3,Baseline (Arm 1: C-OPN),1952-05-08,MNI0068,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,12,NaT
4,Baseline (Arm 1: C-OPN),1971-11-25,MNI0079,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin,20+,NaT


### Tag / retract certain participants based on special criteria

In [12]:
### Roche tag 
roche_participants_df = pd.read_csv(roche_participant_list_csv)
roche_participants = roche_participants_df["participant_id"].unique()

print(f"Number of Roche participants: {len(roche_participants)}")

demo_var_df["recruitment_cohort"] = "QPN"
demo_var_df.loc[demo_var_df["participant_id"].isin(roche_participants),"recruitment_cohort"] = "Roche"

demo_var_df.head()

Number of Roche participants: 14


Unnamed: 0,redcap_event_name,dob,participant_id,enrollment_group,sex,education,dob_secondary,recruitment_cohort
0,Baseline (Arm 1: C-OPN),1963-07-27,MNI0028,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,18,NaT,QPN
1,Baseline (Arm 1: C-OPN),1942-05-21,MNI0056,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,6,NaT,QPN
2,Baseline (Arm 1: C-OPN),1964-03-14,MNI0058,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,11,NaT,QPN
3,Baseline (Arm 1: C-OPN),1952-05-08,MNI0068,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,12,NaT,QPN
4,Baseline (Arm 1: C-OPN),1971-11-25,MNI0079,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin,20+,NaT,QPN


### Save demographics data **without the DOB** 

In [13]:
demo_save_cols = ["participant_id", "redcap_event_name", "recruitment_cohort", "enrollment_group", "sex", "education"]
demo_var_without_dob_df = demo_var_df[demo_save_cols]
demo_var_without_dob_df.to_csv(demographics_file, index=False)
print(f"Saved demographics data to {demographics_file}")

Saved demographics data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//demographics.csv


### Find records with phenotypic data

In [14]:
pheno_vars = ["updrs_scores", "moca_scores", "updrs_date", "moca_date",
              "diagnosis_date", "diagnosis_confirmation"] #"legacy_updrs3_scores", "legacy_updrs3_date",

config_json = pheno_config_json
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.DataFrame()
for var in pheno_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if pheno_var_df.empty:
        pheno_var_df = _df
    else:
        pheno_var_df = pd.merge(pheno_var_df, _df, on=index_cols, how="outer")   

pheno_participants = pheno_var_df["participant_id"].unique()
n_pheno_participants = len(pheno_participants)
print('-'*50)
print(f"Number of participants with pheno data: {n_pheno_participants}")
print('-'*50)

pheno_redcap_events = pheno_var_df["redcap_event_name"].unique()
print(f"Pheno data available for events: {pheno_redcap_events}")
print('-'*50)

for var in pheno_var_df.columns:
    for redcap_event in pheno_redcap_events:
        if var not in index_cols:
            pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
            n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
            if pheno_var_event_df[var].nunique() > 0:    
                print(f"Var: {var}, Event: {redcap_event}")
                n_unique = pheno_var_event_df[var].nunique()
                n_missing = pheno_var_event_df[var].isna().sum()
                print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
    print('-'*50)

pheno_var_df.head()

Using variable updrs_scores from source redcap and instrument collated_updrs_report
Using variable moca_scores from source redcap and instrument collated_moca_report
Using variable updrs_date from source redcap and instrument collated_updrs_report


Using variable moca_date from source redcap and instrument collated_moca_report
Using variable diagnosis_date from source redcap and instrument Dx_confirm_dates_report
Using variable diagnosis_confirmation from source redcap and instrument Dx_confirm_notes_report
--------------------------------------------------
Number of participants with pheno data: 294
--------------------------------------------------
Pheno data available for events: ['Baseline (Arm 1: C-OPN)' '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '12 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------
--------------------------------------------------
Var: Part III: Motor Examination, Event: Baseline (Arm 1: C-OPN)
n_unique: 47, n_missing: 150 (out of 294)
Var: Part III: Motor Examination, Event: 18 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 13, n_missing: 71 (out of 84)
Var: Part III: Motor Examination, Event: 12 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 24, n_missing: 25 (

Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,Updrs_3_2 value,Updrs_3_3_neck value,Did the participant receive +1 extra point for 12 years or less of education? Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?,TOTAL SCORE (make sure to include extra point for 12 years or less of education): SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) :,updrs_date,moca_date,diagnosis_date,Parkinson's disease in opinion of treating neurologist / Maladie de Parkinson selon l'avis du neurologue traitant,Final impression / Impression finale,"Determined diagnosis: If score = 0, Parkinson's Disease (PD) If score = 1, Progressive Supranuclear Palsy (PSP) If score = 2, Multiple System Atrophy (MSA) If score = 3, Corticobasal Syndrome (CBS) If score = 4, Dementia with Lewy Bodies (DLB) If score = 5, Frontotemporal Dementia (FTD) If score = 6, Essential Tremor (ET) If score = 7, REM Sleep Behaviour Disorder (RBD)"
0,MNI0028,29.0,0.0,1.0,3.0,0.0,2.0,0.0,4.0,Baseline (Arm 1: C-OPN),...,1.0,2.0,No/Non,28.0,2023-10-04,2023-10-04,2019-06-01,Unsure / Incertain,Uncertain / Incertain,
1,MNI0056,58.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,2.0,3.0,Yes/Oui,21.0,2021-06-11,2021-06-11,2017-09-01,Yes / Oui,Meets exclusion criteria / Rencontre des critè...,0.0
2,MNI0058,26.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,Baseline (Arm 1: C-OPN),...,2.0,1.0,Yes/Oui,25.0,2021-07-23,2021-07-23,2020-05-22,Yes / Oui,Uncertain / Incertain,0.0
3,MNI0079,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,1.0,2.0,No/Non,26.0,2022-01-21,2021-12-22,2017-12-01,Yes / Oui,Meets criteria for Parkinson's disease / Répon...,0.0
4,MNI0079,,,,,,,,,18 Months Follow-Up/Suivi (Arm 1: C-OPN),...,,,,,NaT,NaT,NaT,,,


### Tag legacy participants for UPDRS

In [15]:
pheno_vars = ["legacy_updrs3_scores", "legacy_updrs3_date"]

config_json = pheno_config_json
index_cols = ["participant_id", "redcap_event_name"]
legacy_var_df = pd.DataFrame()
for var in pheno_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    # _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if legacy_var_df.empty:
        legacy_var_df = _df
    else:
        legacy_var_df = pd.merge(legacy_var_df, _df, on=index_cols, how="outer")   

n_legacy_participants = legacy_var_df["participant_id"].nunique()
print(f"Number of participants with legacy data: {n_legacy_participants}")

legacy_var_df["redcap_event_name"] = "legacy-updrs3"
legacy_var_df = legacy_var_df.rename(columns={"legacy_updrs3_date":"updrs_date"})

# Append phono_var_df with legacy updrs data
pheno_var_df = pd.concat([pheno_var_df, legacy_var_df], axis=0)

legacy_var_df.head()


Using variable legacy_updrs3_scores from source redcap and instrument legacy_updrs_report
Using variable legacy_updrs3_date from source redcap and instrument legacy_updrs_report
Number of participants with legacy data: 23


Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,Updrs_3_5_r value,Updrs_3_8_l value,Updrs_3_17_lipjaw value,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Updrs_3_3_lue value,Updrs_3_7_r value,Updrs_3_17_rle value,Updrs_3_2 value,Updrs_3_3_neck value,updrs_date
0,PD00020,26.0,1,2,1,1,1,0,0,legacy-updrs3,...,1,1,0,,1,0,1,1,2,2019-02-05
1,PD00596,30.0,1,2,2,1,2,0,0,legacy-updrs3,...,1,2,0,,2,0,0,0,2,2018-10-22
2,PD00620,45.0,2,3,4,1,0,0,0,legacy-updrs3,...,0,0,1,,4,0,2,1,4,2019-07-24
3,PD00668,20.0,0,1,3,1,0,0,0,legacy-updrs3,...,2,0,0,,0,0,0,0,4,2020-02-28
4,PD00757,42.0,2,3,0,1,2,0,0,legacy-updrs3,...,3,2,4,,0,0,1,2,0,2020-02-21


### Append legacy MoCA report

In [16]:
pheno_vars = ["legacy_moca_scores", "legacy_moca_date"]

config_json = pheno_config_json
index_cols = ["participant_id", "redcap_event_name"]
legacy_var_df = pd.DataFrame()
for var in pheno_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    # _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if legacy_var_df.empty:
        legacy_var_df = _df
    else:
        legacy_var_df = pd.merge(legacy_var_df, _df, on=index_cols, how="outer")   

n_legacy_participants = legacy_var_df["participant_id"].nunique()
print(f"Number of participants with legacy data: {n_legacy_participants}")

legacy_var_df["redcap_event_name"] = "legacy-moca"
legacy_var_df = legacy_var_df.rename(columns={"legacy_moca_date":"moca_date"})

# Append phono_var_df with legacy updrs data
pheno_var_df = pd.concat([pheno_var_df, legacy_var_df], axis=0)

legacy_var_df.head()



Using variable legacy_moca_scores from source redcap and instrument legacy_moca_report
Using variable legacy_moca_date from source redcap and instrument legacy_moca_report
Number of participants with legacy data: 17


Unnamed: 0,Did the participant receive +1 extra point for 12 years or less of education? Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?,TOTAL SCORE (make sure to include extra point for 12 years or less of education): SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) :,redcap_event_name,participant_id,moca_date
0,No/Non,23,legacy-moca,PD00020,2019-05-02
1,No/Non,26,legacy-moca,PD00048,2016-12-01
2,No/Non,28,legacy-moca,PD00215,2019-04-10
3,No/Non,29,legacy-moca,PD00622,2017-07-28
4,No/Non,29,legacy-moca,PD00653,2017-07-13


### Assign final Dx group label for analysis based on the following rules:

1. if (in enrollment_report) enrollment_group  == 'Healthy control/Contrôle', then group = 'control'
2. else (in the diagnosis report ) if determined_diagnosis == 0 & (final_impression == "Meets criteria for Parkinson's disease / Répond aux critères de la maladie de Parkinson") | final_impression == NA), then group = 'pd'

In [17]:
final_impression_col = "Final impression / Impression finale"	
determined_diagnosis_col = "Determined diagnosis:  If score = 0, Parkinson's Disease (PD)  If score = 1, Progressive Supranuclear Palsy (PSP)  If score = 2, Multiple System Atrophy (MSA)  If score = 3, Corticobasal Syndrome (CBS)  If score = 4, Dementia with Lewy Bodies (DLB)  If score = 5, Frontotemporal Dementia (FTD)  If score = 6, Essential Tremor (ET)  If score = 7, REM Sleep Behaviour Disorder (RBD)"
final_impression_notes_for_PD = ["Meets criteria for Parkinson's disease / Répond aux critères de la maladie de Parkinson", np.nan]

control_participants = demo_var_df[demo_var_df["enrollment_group"]=="Healthy control/Contrôle"]["participant_id"].unique()
PD_participants = pheno_var_df[(pheno_var_df[determined_diagnosis_col]==0) & (pheno_var_df[final_impression_col].isin(final_impression_notes_for_PD))]["participant_id"].unique()
unknown_dx_participants = set(pheno_var_df["participant_id"].unique()) - set(control_participants) - set(PD_participants)

redcap_events_for_dx_confirmation = [baseline_event_name, "legacy-updrs3", "legacy-moca"]
pheno_var_df["diagnosis_group_for_analysis"] = np.nan
pheno_var_df.loc[(pheno_var_df["participant_id"].isin(control_participants)) & (pheno_var_df["redcap_event_name"].isin(redcap_events_for_dx_confirmation)), "diagnosis_group_for_analysis"] = "control"
pheno_var_df.loc[(pheno_var_df["participant_id"].isin(PD_participants)) & (pheno_var_df["redcap_event_name"].isin(redcap_events_for_dx_confirmation)), "diagnosis_group_for_analysis"] = "PD"
pheno_var_df.loc[(pheno_var_df["participant_id"].isin(unknown_dx_participants)) & (pheno_var_df["redcap_event_name"].isin(redcap_events_for_dx_confirmation)), "diagnosis_group_for_analysis"] = "unknown"

print('-'*50)
print(f"Number of participants with diagnosis data: {len(pheno_var_df['participant_id'].unique())}")
print(f"Number of control participants: {len(control_participants)}")
print(f"Number of PD participants: {len(PD_participants)}")
print(f"Number of unknown diagnosis participants: {len(unknown_dx_participants)}")
print('-'*50)

pheno_var_df[index_cols + ["diagnosis_group_for_analysis"]].head()

--------------------------------------------------
Number of participants with diagnosis data: 298
Number of control participants: 69
Number of PD participants: 194
Number of unknown diagnosis participants: 35
--------------------------------------------------


  pheno_var_df.loc[(pheno_var_df["participant_id"].isin(control_participants)) & (pheno_var_df["redcap_event_name"].isin(redcap_events_for_dx_confirmation)), "diagnosis_group_for_analysis"] = "control"


Unnamed: 0,participant_id,redcap_event_name,diagnosis_group_for_analysis
0,MNI0028,Baseline (Arm 1: C-OPN),unknown
1,MNI0056,Baseline (Arm 1: C-OPN),unknown
2,MNI0058,Baseline (Arm 1: C-OPN),unknown
3,MNI0079,Baseline (Arm 1: C-OPN),PD
4,MNI0079,18 Months Follow-Up/Suivi (Arm 1: C-OPN),


### Neuropsych data
- Comes from either from Sarah's extended report or BD_RPQ_UPDATE_Neuropsy

In [18]:
neuropsych_vars = ["neuropsy_scores","neuropsy_date"]

config_data = json.load(open(config_json))
variable_info = config_data['variables'][neuropsych_vars[0]]
variable_sources = variable_info["sources"]
neuropsy_source = variable_info['primary_source']

print(f"Using neuropsych data source: {neuropsy_source}")
# local BD_RPQ data
if neuropsy_source == "local":
    index_cols = ["participant_id", "visit", "TimePoint (based on REDCap; baseline, 18m, 36m, etc.)", "Délai depuis baseline (mois)"]
    
# redcap data
if neuropsy_source == "redcap":
    index_cols = ["participant_id", "redcap_event_name"]

    
neuropsych_df = pd.DataFrame()
for var in neuropsych_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if neuropsych_df.empty:
        neuropsych_df = _df
    else:
        neuropsych_df = pd.merge(neuropsych_df, _df, on=index_cols, how="left")   

neuropsych_participants = neuropsych_df["participant_id"].unique()
n_neuropsych_participants = len(neuropsych_participants)
print('-'*50)
print(f"Number of participants with neuropysch data: {n_neuropsych_participants}")
print('-'*50)

neuropsych_cols = neuropsych_df.columns.drop(index_cols).tolist()
n_neuropsuch_cols = len(neuropsych_cols)
print(f"Neuropsych data available for variables: {n_neuropsuch_cols}")
print('-'*50)

# BD_RPQ data
if neuropsy_source == "local":
    neuropsych_visits = neuropsych_df["visit"].unique()

# REDCap data
if neuropsy_source == "redcap":
    neuropsych_visits = neuropsych_df["redcap_event_name"].unique()

print(f"neuropsych data available for events: {neuropsych_visits}")
print('-'*50)

neuropsych_df.head()

Using neuropsych data source: redcap
Using variable neuropsy_scores from source redcap and instrument sarah_extended_export


Using variable neuropsy_date from source redcap and instrument sarah_extended_export
--------------------------------------------------
Number of participants with neuropysch data: 291
--------------------------------------------------
Neuropsych data available for variables: 83
--------------------------------------------------
neuropsych data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------


Unnamed: 0,Total repetition errors,"Stroop GOLDEN, ink, uncorrected errors (raw score)",Main dominante (1 min),redcap_event_name,Digit span forward - longest correct serie (Raw score),Stroop - D-Kefs - Cond.2 WORDS: Time (sec) (Raw score),Stroop - D-Kefs - Cond. 3 - 1 (time),"STROOP GOLDEN : colors, self-corrected errors (raw scores)",Copy time (sec),Main non dominante (1 min),...,Trial 3 raw,Trail B Errors raw score,"Trial total 1,2,3 (Raw score)",Stroop - D-Kefs - Cond.2: Self-corrected errors (Raw score),Stroop - D-Kefs - Cond.3: Total errors (Automatic calculation),Immediate recall time (sec),Was the Purdue pegboard administered?,Stroop - D-Kefs - Cond. 3 - 1 (time) (Automatic Calculation),Was the Stroop Colour and Word (D-KEFS) test administered?,neuropsy_date
0,6.0,,25.0,Baseline (Arm 1: C-OPN),6.0,19.0,37.0,,125.0,17.0,...,10.0,0.0,25.0,0.0,0.0,185.0,"Yes, completed",37.0,"Yes, completed",2023-10-03
1,0.0,997.0,997.0,Baseline (Arm 1: C-OPN),5.0,28.0,,997.0,321.0,997.0,...,4.0,0.0,11.0,0.0,4.0,994.0,Missing Data,115.0,"Yes, completed",2021-07-30
2,0.0,997.0,997.0,Baseline (Arm 1: C-OPN),7.0,22.0,,997.0,190.0,997.0,...,10.0,0.0,26.0,1.0,7.0,994.0,Missing Data,44.0,"Yes, completed",2021-08-18
3,2.0,997.0,997.0,Baseline (Arm 1: C-OPN),6.0,27.0,,997.0,144.0,997.0,...,9.0,0.0,21.0,0.0,0.0,994.0,Missing Data,45.0,"Yes, completed",2021-08-18
4,2.0,997.0,25.0,Baseline (Arm 1: C-OPN),5.0,17.0,,997.0,151.0,22.0,...,11.0,0.0,31.0,0.0,3.0,994.0,"Yes, completed",26.0,"Yes, completed",2022-01-21


### Basic clean-up and data checks

In [19]:
# Fix dtypes
for series_name, series in neuropsych_df.items():
    if "score" in series_name:
        if series.dtype == 'object':
            print(f"recasting {series_name} to float by replacing , with .")
            neuropsych_df[series_name] = neuropsych_df[series_name].str.replace(",",".").astype(float)
            neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan
            
    # Replace >900 with NaNs
    if series.dtype == 'float':
        neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan

# assign redcap_event_name
visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]
event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
event_names = [f"{m} {event_str_suffix}" for m in visit_months]

if neuropsy_source == "local":
    neuropsych_df["redcap_event_name"] = pd.cut(neuropsych_df["Délai depuis baseline (mois)"], bins=month_bins, labels=event_names).astype(str)
    neuropsych_df.loc[neuropsych_df["TimePoint (based on REDCap; baseline, 18m, 36m, etc.)"]=="baseline", 
                      "redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

# Merge with pheno_var_df
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.merge(pheno_var_df, neuropsych_df, on=index_cols, how="left")  

pheno_var_df.head()

Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,Trial 3 raw,Trail B Errors raw score,"Trial total 1,2,3 (Raw score)",Stroop - D-Kefs - Cond.2: Self-corrected errors (Raw score),Stroop - D-Kefs - Cond.3: Total errors (Automatic calculation),Immediate recall time (sec),Was the Purdue pegboard administered?,Stroop - D-Kefs - Cond. 3 - 1 (time) (Automatic Calculation),Was the Stroop Colour and Word (D-KEFS) test administered?,neuropsy_date
0,MNI0028,29.0,0.0,1.0,3.0,0.0,2.0,0.0,4.0,Baseline (Arm 1: C-OPN),...,10.0,0.0,25.0,0.0,0.0,185.0,"Yes, completed",37.0,"Yes, completed",2023-10-03
1,MNI0056,58.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,4.0,0.0,11.0,0.0,4.0,,Missing Data,115.0,"Yes, completed",2021-07-30
2,MNI0058,26.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,Baseline (Arm 1: C-OPN),...,10.0,0.0,26.0,1.0,7.0,,Missing Data,44.0,"Yes, completed",2021-08-18
3,MNI0079,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,11.0,0.0,31.0,0.0,3.0,,"Yes, completed",26.0,"Yes, completed",2022-01-21
4,MNI0079,,,,,,,,,18 Months Follow-Up/Suivi (Arm 1: C-OPN),...,,,,,,,,,,NaT


### Add mri_acq date
- Needs to map to redcap_event_name

In [20]:
var = "MRI_date"
config_json = pheno_config_json
mri_date_df = get_available_data(config_json,data_release_dir,var)
mri_date_df["MRI_date"] = pd.to_datetime(mri_date_df["MRI_date"], errors="coerce", dayfirst=False)

n_mri_participants = mri_date_df["participant_id"].nunique()
print(f"Number of participants with MRI data: {n_mri_participants}")

n_sessions = mri_date_df["session"].nunique()
print(f"Number of MRI sessions: {n_sessions}")

### Retracted participants
print(f"Removing retracted participants from the dataset")
retracted_participants_df = pd.read_csv(retracted_participant_list_csv)
retracted_participants = retracted_participants_df["participant_id"].unique()
print(f"removing following {len(retracted_participants)} participants from the dataset: {retracted_participants}")
mri_date_df = mri_date_df[~mri_date_df["participant_id"].isin(retracted_participants)].copy()

participants_with_follow_ups = mri_date_df[mri_date_df["participant_id"].duplicated()]["participant_id"].unique()
n_participants_with_follow_ups = len(participants_with_follow_ups)
print(f"Number of participants with follow-up MRI: {n_participants_with_follow_ups}")

mri_ses01_date_df = mri_date_df[mri_date_df["session"]=="ses-01"].copy()
mri_ses01_date_df["redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

mri_ses02_date_df = mri_date_df[mri_date_df["session"]=="ses-02"].copy()
mri_ses02_participants = mri_ses02_date_df["participant_id"].unique()
print(f"Number of participants with ses-02 MRI: {len(mri_ses02_participants)}")

baseline_df = mri_ses01_date_df[mri_ses01_date_df["participant_id"].isin(mri_ses02_participants)].set_index("participant_id")
followup_df = mri_ses02_date_df.set_index("participant_id")

visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]

event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
event_names = [f"{m} {event_str_suffix}" for m in visit_months]

# --- Bin the months --- #
followup_df["months_since_baseline"] = followup_df["MRI_date"].dt.to_period('M').astype(int) - baseline_df["MRI_date"].dt.to_period('M').astype(int)
followup_df["months_since_baseline"] = followup_df["months_since_baseline"].replace({0:np.nan}) # Some visits get same acq_date from brodacasting merge. 

followup_df["redcap_event_name"] = pd.cut(followup_df["months_since_baseline"], bins=month_bins, labels=event_names)

mri_date_redcap_event_df = pd.concat([mri_ses01_date_df, followup_df.reset_index()], axis=0)
# mri_date_redcap_event_df = mri_date_redcap_event_df

mri_date_redcap_event_df.sort_values(["participant_id","session"]).head()

Using variable MRI_date from source local and instrument MRI_dates
Number of participants with MRI data: 305
Number of MRI sessions: 3
Removing retracted participants from the dataset
removing following 5 participants from the dataset: ['MNI0436' 'MNI0482' 'PD01100' 'MNI0369' 'MNI0607']
Number of participants with follow-up MRI: 67
Number of participants with ses-02 MRI: 67


Unnamed: 0,participant_id,session,MRI_date,redcap_event_name,months_since_baseline
0,MNI0028,ses-01,2023-10-04,Baseline (Arm 1: C-OPN),
1,MNI0056,ses-01,2021-08-18,Baseline (Arm 1: C-OPN),
2,MNI0058,ses-01,2021-08-18,Baseline (Arm 1: C-OPN),
3,MNI0068,ses-01,2021-08-27,Baseline (Arm 1: C-OPN),
4,MNI0079,ses-01,2021-12-22,Baseline (Arm 1: C-OPN),


#### Add MRI date to pheno data


In [21]:
mri_date_redcap_event_df[mri_date_redcap_event_df["MRI_date"].isna()]

Unnamed: 0,participant_id,session,MRI_date,redcap_event_name,months_since_baseline
32,PD01223,ses-02,NaT,,9.223372e+18


In [22]:
pheno_var_df = pd.merge(pheno_var_df, mri_date_redcap_event_df, on=index_cols, how="outer")  
var = "MRI_date"
for redcap_event in mri_date_redcap_event_df["redcap_event_name"].unique():    
    pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
    n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
    if pheno_var_event_df[var].nunique() > 0:    
        print(f"Var: {var}, Event: {redcap_event}")
        n_unique = pheno_var_event_df[var].nunique()
        n_missing = pheno_var_event_df[var].isna().sum()
        print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
pheno_var_df.head()

Var: MRI_date, Event: Baseline (Arm 1: C-OPN)
n_unique: 246, n_missing: 0 (out of 301)
Var: MRI_date, Event: 12 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 45, n_missing: 28 (out of 77)
Var: MRI_date, Event: 18 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 10, n_missing: 73 (out of 84)
Var: MRI_date, Event: 48 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 42 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 2, n_missing: 0 (out of 2)
Var: MRI_date, Event: 24 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 36 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 30 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)


Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,Stroop - D-Kefs - Cond.2: Self-corrected errors (Raw score),Stroop - D-Kefs - Cond.3: Total errors (Automatic calculation),Immediate recall time (sec),Was the Purdue pegboard administered?,Stroop - D-Kefs - Cond. 3 - 1 (time) (Automatic Calculation),Was the Stroop Colour and Word (D-KEFS) test administered?,neuropsy_date,session,MRI_date,months_since_baseline
0,MNI0028,29.0,0.0,1.0,3.0,0.0,2.0,0.0,4.0,Baseline (Arm 1: C-OPN),...,0.0,0.0,185.0,"Yes, completed",37.0,"Yes, completed",2023-10-03,ses-01,2023-10-04,
1,MNI0056,58.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,0.0,4.0,,Missing Data,115.0,"Yes, completed",2021-07-30,ses-01,2021-08-18,
2,MNI0058,26.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,Baseline (Arm 1: C-OPN),...,1.0,7.0,,Missing Data,44.0,"Yes, completed",2021-08-18,ses-01,2021-08-18,
3,MNI0079,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,0.0,3.0,,"Yes, completed",26.0,"Yes, completed",2022-01-21,ses-01,2021-12-22,
4,MNI0079,,,,,,,,,18 Months Follow-Up/Suivi (Arm 1: C-OPN),...,,,,,,,NaT,,NaT,


### Calculate age

In [23]:
demo_cols = ["participant_id", "dob", "enrollment_group", "sex"]
demo_var_df[demo_var_df["participant_id"]==participants_with_follow_ups[0]]
baseline_demo_df = demo_var_df[demo_var_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"][demo_cols].copy()

index_cols = ["participant_id"] # not using redcap_event_name to allow broadcast of demographics vars
tabular_df = pd.merge(pheno_var_df, baseline_demo_df, on=index_cols, how="left")
tabular_df[tabular_df["participant_id"]==participants_with_follow_ups[0]]

tabular_df.head()

Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,Was the Purdue pegboard administered?,Stroop - D-Kefs - Cond. 3 - 1 (time) (Automatic Calculation),Was the Stroop Colour and Word (D-KEFS) test administered?,neuropsy_date,session,MRI_date,months_since_baseline,dob,enrollment_group,sex
0,MNI0028,29.0,0.0,1.0,3.0,0.0,2.0,0.0,4.0,Baseline (Arm 1: C-OPN),...,"Yes, completed",37.0,"Yes, completed",2023-10-03,ses-01,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin
1,MNI0056,58.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,Missing Data,115.0,"Yes, completed",2021-07-30,ses-01,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin
2,MNI0058,26.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,Baseline (Arm 1: C-OPN),...,Missing Data,44.0,"Yes, completed",2021-08-18,ses-01,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin
3,MNI0079,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,"Yes, completed",26.0,"Yes, completed",2022-01-21,ses-01,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin
4,MNI0079,,,,,,,,,18 Months Follow-Up/Suivi (Arm 1: C-OPN),...,,,,NaT,,NaT,,1971-11-25,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin


In [24]:
date_cols = ["diagnosis_date", "updrs_date", "moca_date", "MRI_date", "neuropsy_date"]

date_age_cols_dict = {}
for col in date_cols:
    date_age_cols_dict[col] = f"{col.rsplit('_',1)[0]}_age"

age_cols = list(date_age_cols_dict.values())

for date_col, age_col in date_age_cols_dict.items():
    tabular_df = get_age_at_visit(tabular_df, date_col, age_col)

tabular_df.head()

Unnamed: 0,participant_id,Part III: Motor Examination,Updrs_3_15_l value,Updrs_3_12 value,Updrs_3_3_lle value,Updrs_3_10 value,Updrs_3_6_l value,Updrs_3_16_r value,Updrs_3_18 value,redcap_event_name,...,MRI_date,months_since_baseline,dob,enrollment_group,sex,diagnosis_age,updrs_age,moca_age,MRI_age,neuropsy_age
0,MNI0028,29.0,0.0,1.0,3.0,0.0,2.0,0.0,4.0,Baseline (Arm 1: C-OPN),...,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,55.85,60.19,60.19,60.19,60.19
1,MNI0056,58.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,75.28,79.06,79.06,79.24,79.19
2,MNI0058,26.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,Baseline (Arm 1: C-OPN),...,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,56.19,57.36,57.36,57.43,57.43
3,MNI0079,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Baseline (Arm 1: C-OPN),...,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin,46.02,50.16,50.08,50.08,50.16
4,MNI0079,,,,,,,,,18 Months Follow-Up/Suivi (Arm 1: C-OPN),...,NaT,,1971-11-25,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin,,,,,


### Save demo, mri_dates, and pheno (dx, updrs, moca, neuropsych) in separate files
- remove DoB and other date columns 
- add age columns

In [25]:
index_cols = ["participant_id", "redcap_event_name"]

mri_cols = ["session", "MRI_age"]

dx_cols = ['Hoehn and Yahr Stage: ',            
           "Parkinson's disease in opinion of treating neurologist / Maladie de Parkinson selon l'avis du neurologue traitant",
            "Final impression / Impression finale",	
            "Determined diagnosis:  If score = 0, Parkinson's Disease (PD)  If score = 1, Progressive Supranuclear Palsy (PSP)  If score = 2, Multiple System Atrophy (MSA)  If score = 3, Corticobasal Syndrome (CBS)  If score = 4, Dementia with Lewy Bodies (DLB)  If score = 5, Frontotemporal Dementia (FTD)  If score = 6, Essential Tremor (ET)  If score = 7, REM Sleep Behaviour Disorder (RBD)",
            "diagnosis_group_for_analysis",
            "diagnosis_age"
            ]

updrs_cols = ['Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL)',
              'Part II: Motor Aspects of Experiences of Daily Living (M-EDL)',
              "Updrs_3_1 value", "Updrs_3_2 value", "Updrs_3_3_neck value",
              "Updrs_3_3_rue value", "Updrs_3_3_lue value", "Updrs_3_3_rle value",
              "Updrs_3_3_lle value", "Updrs_3_4_r value", "Updrs_3_4_l value",
              "Updrs_3_5_r value", "Updrs_3_5_l value", "Updrs_3_6_r value",
              "Updrs_3_6_l value", "Updrs_3_7_r value", "Updrs_3_7_l value",
              "Updrs_3_8_r value", "Updrs_3_8_l value", "Updrs_3_9 value",
              "Updrs_3_10 value", "Updrs_3_11 value", "Updrs_3_12 value",
              "Updrs_3_13 value", "Updrs_3_14", "Updrs_3_15_r value",
              "Updrs_3_15_l value", "Updrs_3_16_r value", "Updrs_3_16_l value",
              "Updrs_3_17_rue value", "Updrs_3_17_lue value", "Updrs_3_17_rle value",
              "Updrs_3_17_lle value", "Updrs_3_17_lipjaw value", "Updrs_3_18 value",
              'Part III: Motor Examination', 'Part IV: Motor Complications', 
              'updrs_age',
              ]

# moca_cols = ["moca_scores", "moca_age"]
moca_cols = [
            "TOTAL SCORE (make sure to include extra point for 12 years or less of education):    SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) : ",
            "Did the participant receive +1 extra point for 12 years or less of education?    Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?",
            "moca_age"
            ]

neuropsych_cols = neuropsych_cols + ["neuropsy_age"]
if "neuropsy_date" in neuropsych_cols:
    neuropsych_cols.remove("neuropsy_date")

mri_df = tabular_df[index_cols + mri_cols]
dx_df = tabular_df[index_cols + dx_cols].copy()
updrs_df = tabular_df[index_cols + updrs_cols].copy()
moca_df = tabular_df[index_cols + moca_cols].copy()
neuropsych_df = tabular_df[index_cols + neuropsych_cols].copy()

# filter na rows
mri_df = mri_df.dropna(subset=mri_cols, how='all').sort_values(["redcap_event_name"], ascending=False).sort_values(["participant_id"])
dx_df = dx_df.dropna(subset=dx_cols, how='all').sort_values(["redcap_event_name"], ascending=False).sort_values(["participant_id"])
updrs_df = updrs_df.dropna(subset=updrs_cols, how='all').sort_values(["redcap_event_name"], ascending=False).sort_values(["participant_id"])
moca_df = moca_df.dropna(subset=moca_cols, how='all').sort_values(["redcap_event_name"], ascending=False).sort_values(["participant_id"])
neuropsych_df = neuropsych_df.dropna(subset=neuropsych_cols, how='all').sort_values(["redcap_event_name"], ascending=False).sort_values(["participant_id"])


# Save data to files
mri_df.to_csv(mri_session_date_file, index=False)
print(f"Saved MRI session data to {mri_session_date_file}")

dx_df.to_csv(dx_file, index=False)
print(f"Saved diagnosis data to {dx_file}")

updrs_df.to_csv(updrs_file, index=False)
print(f"Saved UPDRS data to {updrs_file}")

moca_df.to_csv(moca_file, index=False)
print(f"Saved MoCA data to {moca_file}")

neuropsych_df.to_csv(neuropsych_file, index=False)
print(f"Saved neuropsych data to {neuropsych_file}")

Saved MRI session data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//mri_info/mri_sessions.csv
Saved diagnosis data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//assessments/diagnosis.csv
Saved UPDRS data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//assessments/updrs.csv
Saved MoCA data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//assessments/moca.csv
Saved neuropsych data to /home/nikhil/projects/Parkinsons/qpn//releases/Oct_2024//tabular//assessments/neuropsych.csv


In [26]:
updrs_df[updrs_df["redcap_event_name"]=="legacy-updrs3"].head()

Unnamed: 0,participant_id,redcap_event_name,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part II: Motor Aspects of Experiences of Daily Living (M-EDL),Updrs_3_1 value,Updrs_3_2 value,Updrs_3_3_neck value,Updrs_3_3_rue value,Updrs_3_3_lue value,Updrs_3_3_rle value,...,Updrs_3_16_l value,Updrs_3_17_rue value,Updrs_3_17_lue value,Updrs_3_17_rle value,Updrs_3_17_lle value,Updrs_3_17_lipjaw value,Updrs_3_18 value,Part III: Motor Examination,Part IV: Motor Complications,updrs_age
434,PD00020,legacy-updrs3,,,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,26.0,,73.45
435,PD00596,legacy-updrs3,,,2.0,0.0,2.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,,
436,PD00620,legacy-updrs3,,,2.0,1.0,4.0,4.0,4.0,4.0,...,0.0,1.0,1.0,2.0,2.0,1.0,0.0,45.0,,48.52
437,PD00668,legacy-updrs3,,,1.0,0.0,4.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,,78.21
438,PD00757,legacy-updrs3,,,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,4.0,0.0,42.0,,78.72


In [27]:
moca_df[moca_df["redcap_event_name"]=="legacy-moca"].head()

Unnamed: 0,participant_id,redcap_event_name,TOTAL SCORE (make sure to include extra point for 12 years or less of education): SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) :,Did the participant receive +1 extra point for 12 years or less of education? Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?,moca_age
457,PD00020,legacy-moca,23.0,No/Non,73.69
458,PD00048,legacy-moca,26.0,No/Non,72.64
459,PD00215,legacy-moca,28.0,No/Non,47.89
460,PD00622,legacy-moca,29.0,No/Non,63.91
461,PD00653,legacy-moca,29.0,No/Non,74.53


### MRI dob checks

In [28]:
participants_to_check = ["PD01654"]#["MNI0426", "MNI0578", "MNI0586"]
demo_var_df[demo_var_df["participant_id"].isin(participants_to_check)]

Unnamed: 0,redcap_event_name,dob,participant_id,enrollment_group,sex,education,dob_secondary,recruitment_cohort
364,Baseline (Arm 1: C-OPN),1948-05-21,PD01654,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,17,NaT,QPN


In [29]:
mri_date_df[mri_date_df["participant_id"].isin(participants_to_check)]

Unnamed: 0,participant_id,session,MRI_date
317,PD01654,ses-01,2021-09-20


In [30]:
pheno_var_df[pheno_var_df["participant_id"].isin(participants_to_check)][["participant_id", "redcap_event_name", "moca_date"]]

Unnamed: 0,participant_id,redcap_event_name,moca_date
415,PD01654,Baseline (Arm 1: C-OPN),NaT
455,PD01654,legacy-updrs3,NaT


In [31]:
demo_var_df.head()

Unnamed: 0,redcap_event_name,dob,participant_id,enrollment_group,sex,education,dob_secondary,recruitment_cohort
0,Baseline (Arm 1: C-OPN),1963-07-27,MNI0028,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,18,NaT,QPN
1,Baseline (Arm 1: C-OPN),1942-05-21,MNI0056,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,6,NaT,QPN
2,Baseline (Arm 1: C-OPN),1964-03-14,MNI0058,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,11,NaT,QPN
3,Baseline (Arm 1: C-OPN),1952-05-08,MNI0068,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,12,NaT,QPN
4,Baseline (Arm 1: C-OPN),1971-11-25,MNI0079,PD (Parkinson's Disease)/(Maladie de Parkinson),Female/Féminin,20+,NaT,QPN


In [32]:
mri_dx_df = pd.merge(mri_df[["participant_id","session"]], dx_df[["participant_id","diagnosis_group_for_analysis"]], on=["participant_id"], how="left").drop_duplicates()
mri_dx_df = pd.merge(mri_dx_df, demo_var_df[["participant_id", "recruitment_cohort", "enrollment_group"]], on=["participant_id"], how="left").drop_duplicates()
mri_dx_df.groupby(["session", "recruitment_cohort", "enrollment_group", "diagnosis_group_for_analysis"]).size()

session  recruitment_cohort  enrollment_group                                     diagnosis_group_for_analysis
ses-01   QPN                 AP (Atypical Parkinsonism)/(Parkinsonisme Atypique)  PD                                2
                                                                                  unknown                           5
                             Healthy control/Contrôle                             control                          69
                             PD (Parkinson's Disease)/(Maladie de Parkinson)      PD                              192
                                                                                  unknown                          17
         Roche               AP (Atypical Parkinsonism)/(Parkinsonisme Atypique)  unknown                           8
                             PD (Parkinson's Disease)/(Maladie de Parkinson)      unknown                           1
ses-02   QPN                 AP (Atypical Parkinsonism)/(Parkin

In [33]:
mri_dx_df = pd.merge(mri_df[["participant_id","session"]], dx_df[["participant_id","diagnosis_group_for_analysis"]], on=["participant_id"], how="left").drop_duplicates()
mri_dx_df = pd.merge(mri_dx_df, demo_var_df[["participant_id", "recruitment_cohort", "enrollment_group"]], on=["participant_id"], how="left").drop_duplicates()
mri_dx_df.groupby(["session", "recruitment_cohort", "enrollment_group", "diagnosis_group_for_analysis"]).size()

session  recruitment_cohort  enrollment_group                                     diagnosis_group_for_analysis
ses-01   QPN                 AP (Atypical Parkinsonism)/(Parkinsonisme Atypique)  PD                                2
                                                                                  unknown                           5
                             Healthy control/Contrôle                             control                          69
                             PD (Parkinson's Disease)/(Maladie de Parkinson)      PD                              192
                                                                                  unknown                          17
         Roche               AP (Atypical Parkinsonism)/(Parkinsonisme Atypique)  unknown                           8
                             PD (Parkinson's Disease)/(Maladie de Parkinson)      unknown                           1
ses-02   QPN                 AP (Atypical Parkinsonism)/(Parkin

In [34]:
mri_with_missing_age = mri_df[mri_df["MRI_age"].isna()]["participant_id"].unique()
baseline_mri_with_missing_age = mri_df[(mri_df["redcap_event_name"]==baseline_event_name) & (mri_df["MRI_age"].isna())]["participant_id"].unique()
missing_dob_in_redcap = ["MNI0136", "PD00296"]
legacy_dob_participants = legacy_dob_df["participant_id"].unique()

print(f"Participants with missing MRI age: {mri_with_missing_age}")
print(f"Participants with missing MRI age in baseline: {baseline_mri_with_missing_age}")
print(f"Participants with missing DoB in redcap: {missing_dob_in_redcap}")
print(f"Participants with missing DoB in legacy: {legacy_dob_participants}")


Participants with missing MRI age: ['MNI0136' 'MNI0147' 'PD00296' 'PD01223' 'PD01253' 'PD01662' 'PD01686'
 'PD01687']
Participants with missing MRI age in baseline: ['MNI0136' 'MNI0147' 'PD00296' 'PD01253' 'PD01662' 'PD01686' 'PD01687']
Participants with missing DoB in redcap: ['MNI0136', 'PD00296']
Participants with missing DoB in legacy: ['PD00509' 'PD01758']


In [35]:
set(mri_with_missing_age) - set(roche_participants) - set(missing_dob_in_redcap) - set(legacy_dob_participants)

{'MNI0147', 'PD01223'}

In [36]:
demo_var_df[demo_var_df["participant_id"].isin(mri_with_missing_age)]

Unnamed: 0,redcap_event_name,dob,participant_id,enrollment_group,sex,education,dob_secondary,recruitment_cohort
262,Baseline (Arm 1: C-OPN),1955-05-23,PD01223,PD (Parkinson's Disease)/(Maladie de Parkinson),Male/Masculin,13.0,NaT,QPN
434,Baseline (Arm 1: C-OPN),NaT,MNI0136,,Male/Masculin,,NaT,QPN
435,Baseline (Arm 1: C-OPN),NaT,PD00296,,Male/Masculin,,NaT,QPN


### Mark NM participants

In [37]:
NM_participants_df = pd.read_csv(neuromelanin_participant_list_csv)
NM_participants_df = NM_participants_df.rename(columns={"PSCID":"participant_id"})
NM_participants = NM_participants_df["participant_id"].unique()
print(f"Number of Neuromelanin participants: {len(NM_participants)}")

NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI01", "session"] = "ses-01"
NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI02", "session"] = "ses-02"
NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI03", "session"] = "ses-03"

# session wise counts
session_counts = NM_participants_df["session"].value_counts()
print(f"session_counts: {session_counts}")

# compare with mri_df
mri_participants = mri_df["participant_id"].unique()
print(f"Number of participants with MRI data: {len(mri_participants)}")

# participants in NM cohort but not in MRI cohort
NM_participants_not_in_mri = set(NM_participants) - set(mri_participants)
print(f"Number of NM participants not in MRI cohort: {len(NM_participants_not_in_mri)}")

# participants in NM cohort and in MRI cohort
NM_participants_in_mri = set(NM_participants) & set(mri_participants)
print(f"Number of NM participants in MRI cohort: {len(NM_participants_in_mri)}")

# participants in MRI cohort but not in NM cohort
mri_participants_not_in_NM = set(mri_participants) - set(NM_participants)
print(f"Number of MRI participants not in NM cohort: {len(mri_participants_not_in_NM)}")

# save NM participants
NM_participants_df.to_csv(f"{tabular_data_release_dir}/recruitment/neuromelanin_participants_VM.csv", index=False)

NM_participants_df.head()


Number of Neuromelanin participants: 290
session_counts: session
ses-01    289
ses-02     55
ses-03      6
Name: count, dtype: int64
Number of participants with MRI data: 301
Number of NM participants not in MRI cohort: 1
Number of NM participants in MRI cohort: 289
Number of MRI participants not in NM cohort: 12


Unnamed: 0,participant_id,DCCID,Visit Label,session
0,MNI0028,152209,MRI01,ses-01
1,MNI0056,864854,MRI01,ses-01
2,MNI0058,197308,MRI01,ses-01
3,MNI0068,842090,MRI01,ses-01
4,MNI0079,760662,MRI01,ses-01
