In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import pickle
from dateutil import relativedelta
import seaborn as sns

### API call for redcap

In [2]:
def api_call(url, query, logger=None):
    """ helper function to make API calls to RedCap
    """
    r = requests.post(url, data=query, verify=False)
    http_status = str(r.status_code)
    print(f'HTTP Status: {http_status}')

    if http_status == "200":
        query_results = r.json()
        query_df = pd.DataFrame(query_results)

    else:
        print(f"RedCap API request Failed with HTTP Status: {http_status}")
        query_df = None
        
    return query_df

def get_inventory_count(df, index_col, availability_indicators):
    """ helper function to count participants with recorded data in redcap
    """
    assess_cols = df.columns.drop(index_col)

    if availability_indicators == 'number':
        df = df.replace("", np.nan)
        df[assess_cols] = df[assess_cols].astype(np.float64)

    inventory = {}
    for col in assess_cols:        
        if availability_indicators == 'number':
            availability_count = df[~df[col].isna()][index_col].nunique()
        else:
            availability_count = df[df[col].isin(availability_indicators)][index_col].nunique()
        inventory[col] = availability_count
    return inventory

def get_available_data(config_json, DATASET_ROOT, var_name, preferred_var_source="primary"):
    """ Get data for given variables from available sources
        All return dataframes should have participant_id and visit_id as index
    """
    config_data = json.load(open(config_json))
    data_sources = config_data['data_sources']
    variable_info = config_data['variables'][var_name]
    variable_type = variable_info["type"]
    variable_sources = variable_info["sources"]

    if preferred_var_source == "primary":
        selected_var_source = variable_info['primary_source']
        selected_var_instrument = variable_info['primary_instrument']
    elif preferred_var_source == "secondary":
        selected_var_source = variable_info['secondary_source']
        selected_var_instrument = variable_info['secondary_instrument']
    else:
        print(f"Using preferred source {preferred_var_source} for variable {var_name}")
        preferred_var_data_source = preferred_var_source["data_source"]
        preferred_var_instrument = preferred_var_source["instrument"]

        if preferred_var_data_source not in variable_sources.keys():
            print(f"Preferred data source {preferred_var_data_source} not available for variable {var_name}")
            return None
        else:
            selected_var_source = preferred_var_data_source

        if preferred_var_instrument not in variable_sources[selected_var_source].keys():
            print(f"Preferred var instrument {preferred_var_instrument} not available for variable {var_name}")
            return None
        else:
            selected_var_instrument = preferred_var_instrument

    print(f"Using variable {var_name} from source {selected_var_source} and instrument {selected_var_instrument}")

    external_var_cols = variable_sources[selected_var_source][selected_var_instrument]

    # Get data from primary source
    var_file = data_sources[selected_var_source][selected_var_instrument]["path"]
    var_file_path = f"{DATASET_ROOT}/{var_file}"
    var_file_index = data_sources[selected_var_source][selected_var_instrument]["index_cols"]

    var_df = pd.read_csv(var_file_path)
    selected_var_cols = list(set(var_file_index + external_var_cols))
    var_df = var_df[selected_var_cols]
    
    if (variable_type == "date") & (len(external_var_cols) == 1):
        var_df[external_var_cols[0]] = pd.to_datetime(var_df[external_var_cols[0]], errors="coerce", dayfirst=False)

    if (len(external_var_cols) == 1):
        var_df = var_df.rename(columns={external_var_cols[0]:var_name})
    
    return var_df

def get_age_at_visit(df, date_col, age_col, dob_col="dob", rounding_digits=2, age_range=(0,100)):
    """ Get age at visit. Expects column name to be: var_date """
    
    df[age_col] = df[date_col] - df[dob_col]
    df[age_col] = np.round(df[age_col].dt.days / 365.25, rounding_digits)

    if (len(df[df[age_col] > 100]) | len(df[df[age_col] < 0])):
        print(f"Warning: Age values outside range {age_range} for variable {var}")

    return df

### Paths


In [3]:
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy release
current_release = "Aug_2024"

data_release_dir = f"{DATASET_ROOT}/releases/{current_release}/"
tabular_data_release_dir = f"{data_release_dir}/tabular/"

redcap_release_dir = f"{data_release_dir}tabular/redcap/chunked/"
redcap_chunked_report_COPY = f"{redcap_release_dir}/1. COPN-QPNDataMoCAUPDRSNeur_DATA_LABELS_2024-06-19_0910_copy.xlsx"
colleted_redcap_report_file = f"{redcap_release_dir}/redcap_chunked_report.csv"

redcap_legacy_updrs_file = f"{redcap_release_dir}/COPN-QPNMDSUPDRS_DATA_LABELS_2024-06-19_0945.xlsx"
redcap_legacy_moca_file = f"{redcap_release_dir}/COPN-QPNMoCA_DATA_LABELS_2024-06-19_0938.xlsx"

filtered_legacy_updrs_file = f"{redcap_release_dir}/legacy_updrs.csv"
collated_moca_file = f"{redcap_release_dir}/redcap_and_legacy_moca.csv"

demo_config_json = "../workflow/tabular/demographics.json"
pheno_config_json = "../workflow/tabular/pheno.json"

# Special cohort inclusion/exclusion criteria files
# These files are used to filter participants based on certain criteria - typically would go inside `demographics.csv` 
## Roche participant list
roche_participant_list_csv = f"{tabular_data_release_dir}/recruitment/roche_participants.csv"
retracted_participant_list_csv = f"{tabular_data_release_dir}/recruitment/retracted_participants.csv"

# Neuromelanin cohort
neuromelanin_participant_list_csv = f"{tabular_data_release_dir}/recruitment/MRI_NM_LORIS_map.csv"

# output files
demographics_file = f"{tabular_data_release_dir}/demographics.csv"
mri_session_date_file = f"{tabular_data_release_dir}/mri_sessions.csv"
updrs_file = f"{tabular_data_release_dir}/assessments/updrs.csv"
moca_file = f"{tabular_data_release_dir}/assessments/moca.csv"
dx_file = f"{tabular_data_release_dir}/assessments/diagnosis.csv"
neuropsych_file = f"{tabular_data_release_dir}/assessments/neuropsych.csv"

### Standardized index names

In [4]:
baseline_event_name = "Baseline (Arm 1: C-OPN)"

## redcap event name variations
config_data = json.load(open(demo_config_json))
data_sources = config_data['data_sources']
redcap_data_sources = data_sources['redcap']

redcap_field_name_map = {}

for instrument in redcap_data_sources.keys():
    index_cols = redcap_data_sources[instrument]['index_cols']
    record_id = index_cols[0]
    event_name = index_cols[1]

    redcap_field_name_map[record_id] = "participant_id"
    redcap_field_name_map[event_name] = "redcap_event_name"
print(f"redcap_field_name_map: {redcap_field_name_map}")

# legacy participant_id variations in DOB and BD_RPQ
legacy_field_name_map = {}
legacy_field_name_map['Record ID'] = "participant_id"
legacy_field_name_map['Patient #'] = "participant_id"
legacy_field_name_map['Name of visit (V01, V02, V03)'] = "visit"
print(f"legacy_field_name_map: {legacy_field_name_map}")

redcap_field_name_map: {'Record ID:': 'participant_id', 'Event Name': 'redcap_event_name', 'record_id': 'participant_id', 'redcap_event_name': 'redcap_event_name'}
legacy_field_name_map: {'Record ID': 'participant_id', 'Patient #': 'participant_id', 'Name of visit (V01, V02, V03)': 'visit'}


### Update RedCAP reports through API 
(Not updating extended report since it has to come from Sarah)
- "global_records_query"
- "QPN MoCA-UPDRS-Neuropsy data_Sarah"

In [5]:
update_redcap_reports = False

redcap_report_list = ["global_records_query", "QPN MoCA-UPDRS-Neuropsy data_Sarah"]
if update_redcap_reports:
    redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"
    redcap_config = json.load(open(redcap_config_json))
    url = redcap_config["url"]
    
    for redcap_report in redcap_report_list:
        print(f"Getting data for RedCap report: {redcap_report}")
        records_query = redcap_config["queries"][redcap_report]
        query_df = api_call(url, records_query, logger=None)
        report_csv = f"{tabular_data_release_dir}/redcap/{redcap_report}.csv"
        query_df.to_csv(report_csv, index=False)
        print(f"Saved RedCap report to {report_csv}")



### Available participants

In [6]:
QPN_participants_df = get_available_data(demo_config_json,data_release_dir,"participant_id")
QPN_participants = QPN_participants_df["participant_id"].unique()
n_participants = len(QPN_participants)
session_counts = QPN_participants_df["participant_id"].value_counts()
print(f"Number of participants: {n_participants}")

### Retracted participants
print(f"Removing retracted participants from the dataset")
retracted_participants_df = pd.read_csv(retracted_participant_list_csv)
retracted_participants = retracted_participants_df["participant_id"].unique()
print(f"removing following {len(retracted_participants)} participants from the dataset: {retracted_participants}")
QPN_participants_df = QPN_participants_df[~QPN_participants_df["participant_id"].isin(retracted_participants)].copy()

QPN_participants = QPN_participants_df["participant_id"].unique()
n_participants = len(QPN_participants)
session_counts = QPN_participants_df["participant_id"].value_counts()
print(f"Number of participants: {n_participants}")

Using variable participant_id from source local and instrument manifest
Number of participants: 306
Removing retracted participants from the dataset
removing following 5 participants from the dataset: ['MNI0436' 'MNI0482' 'PD01100' 'MNI0369' 'MNI0607']
Number of participants: 301


### Collate chunked RedCap data
- The new generate report is formatted as mutli-tab excel spreadsheet based on redcap-event. 


In [7]:
regenerate_collated_report = False

if regenerate_collated_report:
    sheet_names = ["Baseline (without CHQ)","F-U 12months & MNI","F-U 18months & MNI", "F-U 24months & MNI",
                "F-U 12months & PD, UDM", "F-U 18months & PD, UDM", "F-U 24months & PD, UDM"]
    redcap_chunked_report_df = pd.DataFrame()
    for sheet_name in sheet_names:
        _df = pd.read_excel(redcap_chunked_report_COPY, sheet_name=sheet_name, engine='openpyxl')
        _df = _df[_df["Record ID:"].isin(QPN_participants)]  
        redcap_chunked_report_df = pd.concat([redcap_chunked_report_df, _df], axis=0)
        print(f"Sheet: {sheet_name} - Shape: {_df.shape}")
        print(f"redcap_chunked_report_df - Shape: {redcap_chunked_report_df.shape}")

    print(f"Saving collated redcap report to {redcap_release_dir}/redcap_chunked_report.csv")
    redcap_chunked_report_df.to_csv(collated_redcap_report_file, index=False)

else:
    print(f"Loading collated redcap report from {redcap_release_dir}/redcap_chunked_report.csv")

Loading collated redcap report from /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024/tabular/redcap/chunked//redcap_chunked_report.csv


### Collate and calculate legacy UPDRS data

In [8]:
regenerate_legacy_data = False

if regenerate_legacy_data:
    legacy_updrs_df = pd.read_excel(redcap_legacy_updrs_file, engine='openpyxl')

    all_updrs3_cols = legacy_updrs_df.columns[legacy_updrs_df.columns.str.startswith("Updrs_3")]
    all_legacy_cols = legacy_updrs_df.columns[legacy_updrs_df.columns.str.endswith(".1")]

    legacy_updrs3_cols = list(set(all_updrs3_cols) & set(all_legacy_cols))

    legacy_total_cols = ['Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL).1',
                        'Part II: Motor Aspects of Experiences of Daily Living (M-EDL).1',	
                        'Part III: Motor Examination.1',
                        'Part IV: Motor Complications.1']

    legacy_admin_cols = ['Record ID:',	'Event Name',
                        'Assessment completed:     Évaluation remplie:  .1',
                        'Assessment completed by:     Évaluation complétée par:.1',
                        'How was the MDS-UPDRS administered?   Comment le MDS-UPDRS a-t-il été administré?.1']


    legacy_filter_cols = legacy_admin_cols + legacy_total_cols + legacy_updrs3_cols

    legacy_updrs_filtered_df = legacy_updrs_df.loc[:, legacy_filter_cols]

    legacy_updrs_filtered_df = legacy_updrs_filtered_df.dropna(subset=legacy_updrs3_cols, how='all')

    # Filter out two subjects that have all UPDRS subscore (most likely not legacy instrument)
    legacy_updrs_filtered_df = legacy_updrs_filtered_df[legacy_updrs_filtered_df["Updrs_3_16_l value.1"].isna()]

    n_legacy_participants = legacy_updrs_filtered_df["Record ID:"].nunique()

    print(f"Number of participants with legacy UPDRS data: {n_legacy_participants}")

    print("Summing all UPDRS3 sub-scores")
    legacy_updrs_filtered_df["legacy_updrs3"] = legacy_updrs_filtered_df[legacy_updrs3_cols].sum(axis=1)
    legacy_updrs_filtered_df["Event Name"] = "pre-redcap-baseline-1 (legacy)"

    print(f"Saving filtered legacy UPDRS data to {filtered_legacy_updrs_file}")
    legacy_updrs_filtered_df.to_csv(filtered_legacy_updrs_file, index=False)

    legacy_updrs_filtered_df.head()

### Collate and calculate legacy MoCA data

In [9]:
regenerate_legacy_data = False

if regenerate_legacy_data:
    moca_df = pd.read_excel(redcap_legacy_moca_file, engine='openpyxl')

    first_legacy_cols = moca_df.columns[moca_df.columns.str.endswith(".1")]
    second_legacy_cols = moca_df.columns[moca_df.columns.str.endswith(".2")]

    index_cols = ['Record ID:',	'Event Name']
    first_legacy_moca_df = moca_df.loc[:, index_cols + list(first_legacy_cols)]
    second_legacy_moca_df = moca_df.loc[:, index_cols + list(second_legacy_cols)]

    n_first_legacy_participants = first_legacy_moca_df["Record ID:"].nunique()
    n_second_legacy_participants = second_legacy_moca_df["Record ID:"].nunique()

    print(f"Number of participants with first legacy MoCA data: {n_first_legacy_participants}")
    print(f"Number of participants with second legacy MoCA data: {n_second_legacy_participants}")

    # merge first and second legacy moca data
    moca_cols = first_legacy_cols.str.replace(".1","")
    first_legacy_cols_dict = dict(zip(first_legacy_cols, moca_cols))
    second_legacy_cols_dict = dict(zip(second_legacy_cols, moca_cols))

    first_legacy_moca_df = first_legacy_moca_df.rename(columns=first_legacy_cols_dict)
    first_legacy_moca_df["Event Name"] = "pre-redcap-baseline-1 (legacy)"

    second_legacy_moca_df = second_legacy_moca_df.rename(columns=second_legacy_cols_dict)
    second_legacy_moca_df["Event Name"] = "pre-redcap-baseline-2 (legacy)"

    legacy_moca_df = pd.concat([first_legacy_moca_df, second_legacy_moca_df], axis=0)

    na_check_cols = legacy_moca_df.columns[legacy_moca_df.columns.str.startswith("TOTAL")]

    legacy_moca_df = legacy_moca_df.dropna(subset=na_check_cols, how='all')

    n_legacy_participants = legacy_moca_df["Record ID:"].nunique()
    legacy_visit_counts = legacy_moca_df["Event Name"].value_counts()

    print(f"Number of participants with legacy MoCA data: {n_legacy_participants}")
    print(f"legacy_visit_counts MoCA data: {legacy_visit_counts}")


    # Merge legacy data with redcap visit data 
    print("-"*50)
    print("Merging redcap and legacy MoCA data")
    
    redcap_moca_df = moca_df.loc[:, index_cols + list(moca_cols)]
    n_redcap_participants = redcap_moca_df["Record ID:"].nunique()
    redcap_events = redcap_moca_df["Event Name"].unique()
    print(f"Number of participants with redcap MoCA data: {n_redcap_participants}")
    print(f"redcap_events MoCA data: {redcap_events}")

    redcap_and_legacy_moca_df = pd.concat([redcap_moca_df, legacy_moca_df], axis=0)
    n_redcap_participants = redcap_and_legacy_moca_df["Record ID:"].nunique()
    redcap_events = redcap_and_legacy_moca_df["Event Name"].unique()
    print(f"Number of participants with redcap and legacy MoCA data: {n_redcap_participants}")
    print(f"redcap_events MoCA data: {redcap_events}")

    print(f"Saving filtered legacy MoCA data to {collated_moca_file}")
    redcap_and_legacy_moca_df.to_csv(collated_moca_file, index=False)

    legacy_moca_df.head()

### Fetch demographic data

In [10]:
demo_vars = ["dob", "enrollment_group", "sex", "education"]
# vars_with_secondary_source = ["dob"]

config_json = demo_config_json
index_cols = ["participant_id", "redcap_event_name"]

demo_var_df = pd.DataFrame()
for var in demo_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()

    if demo_var_df.empty:
        demo_var_df = _df
    else:
        demo_var_df = pd.merge(demo_var_df, _df, on=index_cols, how="outer")   


# add only DoB from seconday source
var = "dob"
print(f"**Getting data from the secondary source for dob**")
legacy_dob_df = get_available_data(config_json,data_release_dir,var,preferred_var_source="secondary")
legacy_dob_df = legacy_dob_df.rename(columns=legacy_field_name_map)
legacy_dob_df = legacy_dob_df.rename(columns={var:var+"_secondary"})

participants_with_missing_value_in_primary = demo_var_df[(demo_var_df["redcap_event_name"]==baseline_event_name) & (demo_var_df[var].isna())]["participant_id"].unique()
legacy_dob_df = legacy_dob_df[legacy_dob_df["participant_id"].isin(participants_with_missing_value_in_primary)].copy()

demo_var_df = pd.merge(demo_var_df, legacy_dob_df, on=["participant_id"], how="left")
demo_var_df[var] = demo_var_df[var].fillna(demo_var_df["dob_secondary"])


demo_participants = demo_var_df["participant_id"].unique()
n_demo_participants = len(demo_participants)
print('-'*50)
print(f"Number of participants with demographics data: {n_demo_participants}")
print('-'*50)

demo_redcap_events = demo_var_df["redcap_event_name"].unique()
print(f"Demographics data available for events: {demo_redcap_events}")
print('-'*50)

# Only keep data for baseline event
print('-'*50)
print(f"Keeping data for event: {baseline_event_name} (i.e. static variables)")
print(f"All temporal data goes into assessment files")
print('-'*50)
demo_var_df = demo_var_df[demo_var_df["redcap_event_name"]==baseline_event_name].copy()

for var in demo_vars:
    n_unique = demo_var_df[var].nunique()
    n_missing = demo_var_df[var].isna().sum()
    print(f"Var: {var}, n_unique: {n_unique}, n_missing: {n_missing} (out of {n_demo_participants})")

demo_var_df.head()

Using variable dob from source redcap and instrument sarah_extended_export
Using variable enrollment_group from source redcap and instrument enrollment_report
Using variable sex from source redcap and instrument sarah_extended_export
Using variable education from source redcap and instrument sarah_extended_export
**Getting data from the secondary source for dob**
Using variable dob from source local and instrument legacy_DOB
--------------------------------------------------
Number of participants with demographics data: 296
--------------------------------------------------
Demographics data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '36 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '72 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '54 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '48 Months Follow-Up/Suivi (Arm 1: C-OPN)']
-------------------------------------------

Unnamed: 0,participant_id,redcap_event_name,dob,enrollment_group,sex,education,dob_secondary
0,MNI0028,Baseline (Arm 1: C-OPN),1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,,NaT
1,MNI0056,Baseline (Arm 1: C-OPN),1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,12.0,NaT
2,MNI0058,Baseline (Arm 1: C-OPN),1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,11.0,NaT
3,MNI0068,Baseline (Arm 1: C-OPN),1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,11.0,NaT
4,MNI0079,Baseline (Arm 1: C-OPN),1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,16.0,NaT


### Tag / retract certain participants based on special criteria

In [11]:
### Roche tag 
roche_participants_df = pd.read_csv(roche_participant_list_csv)
roche_participants = roche_participants_df["participant_id"].unique()

print(f"Number of Roche participants: {len(roche_participants)}")

demo_var_df["recruitment_cohort"] = "QPN"
demo_var_df.loc[demo_var_df["participant_id"].isin(roche_participants),"recruitment_cohort"] = "Roche"

demo_var_df.head()

Number of Roche participants: 14


Unnamed: 0,participant_id,redcap_event_name,dob,enrollment_group,sex,education,dob_secondary,recruitment_cohort
0,MNI0028,Baseline (Arm 1: C-OPN),1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,,NaT,QPN
1,MNI0056,Baseline (Arm 1: C-OPN),1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,12.0,NaT,QPN
2,MNI0058,Baseline (Arm 1: C-OPN),1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,11.0,NaT,QPN
3,MNI0068,Baseline (Arm 1: C-OPN),1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,11.0,NaT,QPN
4,MNI0079,Baseline (Arm 1: C-OPN),1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,16.0,NaT,QPN


### Save demographics data **without the DOB** 

In [12]:
demo_save_cols = ["participant_id", "redcap_event_name", "recruitment_cohort", "enrollment_group", "sex", "education"]
demo_var_without_dob_df = demo_var_df[demo_save_cols]
demo_var_without_dob_df.to_csv(demographics_file, index=False)
print(f"Saved demographics data to {demographics_file}")

Saved demographics data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//demographics.csv


### Find records with phenotypic data

In [16]:
pheno_vars = ["diagnosis", "updrs_scores", "moca_scores",
              "diagnosis_date", "diagnosis_confirmation", "updrs_date", "moca_date"] #"legacy_updrs3_scores", "legacy_updrs3_date",

config_json = pheno_config_json
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.DataFrame()
for var in pheno_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if pheno_var_df.empty:
        pheno_var_df = _df
    else:
        pheno_var_df = pd.merge(pheno_var_df, _df, on=index_cols, how="outer")   

pheno_participants = pheno_var_df["participant_id"].unique()
n_pheno_participants = len(pheno_participants)
print('-'*50)
print(f"Number of participants with pheno data: {n_pheno_participants}")
print('-'*50)

pheno_redcap_events = pheno_var_df["redcap_event_name"].unique()
print(f"Pheno data available for events: {pheno_redcap_events}")
print('-'*50)

for var in pheno_var_df.columns:
    for redcap_event in pheno_redcap_events:
        if var not in index_cols:
            pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
            n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
            if pheno_var_event_df[var].nunique() > 0:    
                print(f"Var: {var}, Event: {redcap_event}")
                n_unique = pheno_var_event_df[var].nunique()
                n_missing = pheno_var_event_df[var].isna().sum()
                print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
    print('-'*50)

pheno_var_df.head()

Using variable diagnosis from source redcap and instrument sarah_extended_export
Using variable updrs_scores from source redcap and instrument collated_updrs_report
Using variable moca_scores from source redcap and instrument collated_moca_report
Using variable diagnosis_date from source redcap and instrument sarah_extended_export
Using variable diagnosis_confirmation from source redcap and instrument Dx_confirm_report
Using variable updrs_date from source redcap and instrument collated_updrs_report
Using variable moca_date from source redcap and instrument collated_moca_report
--------------------------------------------------
Number of participants with pheno data: 294
--------------------------------------------------
Pheno data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '72 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '36 Months Follow-Up/Suivi (Arm 1

Unnamed: 0,diagnosis,participant_id,redcap_event_name,Updrs_3_16_r value,Updrs_3_15_r value,Updrs_3_3_rle value,Updrs_3_17_lle value,Updrs_3_3_lue value,Updrs_3_3_neck value,Updrs_3_3_rue value,...,Updrs_3_1 value,Updrs_3_6_r value,Updrs_3_14,TOTAL SCORE (make sure to include extra point for 12 years or less of education): SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) :,Did the participant receive +1 extra point for 12 years or less of education? Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?,diagnosis_date,Final impression / Impression finale,Parkinson's disease in opinion of treating neurologist / Maladie de Parkinson selon l'avis du neurologue traitant,updrs_date,moca_date
0,0.0,MNI0028,Baseline (Arm 1: C-OPN),0.0,0.0,3.0,0.0,2.0,2.0,1.0,...,1.0,1.0,1.0,28.0,No/Non,2020-09-13,Uncertain / Incertain,Unsure / Incertain,2023-10-04,2023-10-04
1,0.0,MNI0056,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,3.0,0.0,...,2.0,3.0,3.0,21.0,Yes/Oui,2017-09-01,Meets exclusion criteria / Rencontre des critè...,Yes / Oui,2021-06-11,2021-06-11
2,0.0,MNI0058,Baseline (Arm 1: C-OPN),1.0,1.0,2.0,0.0,1.0,1.0,2.0,...,0.0,3.0,0.0,25.0,Yes/Oui,2020-05-22,Uncertain / Incertain,Yes / Oui,2021-07-23,2021-07-23
3,0.0,MNI0068,Baseline (Arm 1: C-OPN),1.0,0.0,2.0,0.0,1.0,2.0,1.0,...,1.0,0.0,1.0,,,2014-01-01,,,2021-08-27,NaT
4,0.0,MNI0079,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,2.0,1.0,...,0.0,1.0,0.0,26.0,No/Non,2017-05-01,Meets criteria for Parkinson's disease / Répon...,Yes / Oui,2022-01-21,2021-12-22


### Neuropsych data
- Comes from either from Sarah's extended report or BD_RPQ_UPDATE_Neuropsy

In [17]:
neuropsych_vars = ["neuropsy_scores","neuropsy_date"]

config_data = json.load(open(config_json))
variable_info = config_data['variables'][neuropsych_vars[0]]
variable_sources = variable_info["sources"]
neuropsy_source = variable_info['primary_source']

print(f"Using neuropsych data source: {neuropsy_source}")
# local BD_RPQ data
if neuropsy_source == "local":
    index_cols = ["participant_id", "visit", "TimePoint (based on REDCap; baseline, 18m, 36m, etc.)", "Délai depuis baseline (mois)"]
    
# redcap data
if neuropsy_source == "redcap":
    index_cols = ["participant_id", "redcap_event_name"]

    
neuropsych_df = pd.DataFrame()
for var in neuropsych_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if neuropsych_df.empty:
        neuropsych_df = _df
    else:
        neuropsych_df = pd.merge(neuropsych_df, _df, on=index_cols, how="left")   

neuropsych_participants = neuropsych_df["participant_id"].unique()
n_neuropsych_participants = len(neuropsych_participants)
print('-'*50)
print(f"Number of participants with neuropysch data: {n_neuropsych_participants}")
print('-'*50)

neuropsych_cols = neuropsych_df.columns.drop(index_cols).tolist()
n_neuropsuch_cols = len(neuropsych_cols)
print(f"Neuropsych data available for variables: {n_neuropsuch_cols}")
print('-'*50)

# BD_RPQ data
if neuropsy_source == "local":
    neuropsych_visits = neuropsych_df["visit"].unique()

# REDCap data
if neuropsy_source == "redcap":
    neuropsych_visits = neuropsych_df["redcap_event_name"].unique()

print(f"neuropsych data available for events: {neuropsych_visits}")
print('-'*50)

neuropsych_df.head()

Using neuropsych data source: redcap
Using variable neuropsy_scores from source redcap and instrument sarah_extended_export
Using variable neuropsy_date from source redcap and instrument sarah_extended_export
--------------------------------------------------
Number of participants with neuropysch data: 291
--------------------------------------------------
Neuropsych data available for variables: 83
--------------------------------------------------
neuropsych data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------


Unnamed: 0,participant_id,Trial 3 raw,Was the Stroop Colour and Word test (Golden) administered?,Digit Span Forward - total correct (Raw score),Was the Letter Fluency test administered?,Was the Purdue pegboard administered?,"STROOP GOLDEN : words, self-corrected errors (raw score)","Repetitions total 1,2,3 (Raw score)",BNT sans indice (Raw score),Stroop - D-Kefs - Cond.3 - INK: Time (sec) (Raw score),...,Copy raw,Main non dominante (30 sec),Delayed recall time (sec),Letter Fluency S (Raw score),Stroop - D-Kefs - Cond.2: uncorrected errors (Raw score),Trial 1 raw,Stroop - D-Kefs - Cond.1: Total errors (Automatic Calculation),Copy time (sec),Deux mains (30 sec),neuropsy_date
0,MNI0028,10.0,Not administered,8.0,"Yes, completed","Yes, completed",,7.0,36.0,78.0,...,35,8.0,155.0,16.0,0.0,6.0,0.0,125.0,5.0,2023-10-03
1,MNI0056,4.0,Missing Data,8.0,"Yes, completed",Missing Data,997.0,2.0,49.0,166.0,...,28,997.0,994.0,4.0,0.0,3.0,0.0,321.0,997.0,2021-07-30
2,MNI0058,10.0,Missing Data,11.0,"Yes, completed",Missing Data,997.0,1.0,57.0,73.0,...,26,997.0,994.0,10.0,0.0,6.0,2.0,190.0,997.0,2021-08-18
3,MNI0068,9.0,Missing Data,9.0,"Yes, completed",Missing Data,997.0,2.0,50.0,91.0,...,215,997.0,994.0,9.0,0.0,5.0,2.0,144.0,997.0,2021-08-18
4,MNI0079,11.0,Missing Data,6.0,"Yes, completed","Yes, completed",997.0,2.0,53.0,50.0,...,30,9.0,994.0,15.0,0.0,8.0,1.0,151.0,10.0,2022-01-21


### Basic clean-up and data checks

In [18]:
# Fix dtypes
for series_name, series in neuropsych_df.items():
    if "score" in series_name:
        if series.dtype == 'object':
            print(f"recasting {series_name} to float by replacing , with .")
            neuropsych_df[series_name] = neuropsych_df[series_name].str.replace(",",".").astype(float)
            neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan
            
    # Replace >900 with NaNs
    if series.dtype == 'float':
        neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan

# assign redcap_event_name
visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]
event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
event_names = [f"{m} {event_str_suffix}" for m in visit_months]

if neuropsy_source == "local":
    neuropsych_df["redcap_event_name"] = pd.cut(neuropsych_df["Délai depuis baseline (mois)"], bins=month_bins, labels=event_names).astype(str)
    neuropsych_df.loc[neuropsych_df["TimePoint (based on REDCap; baseline, 18m, 36m, etc.)"]=="baseline", 
                      "redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

# Merge with pheno_var_df
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.merge(pheno_var_df, neuropsych_df, on=index_cols, how="left")  

pheno_var_df.head()

Unnamed: 0,diagnosis,participant_id,redcap_event_name,Updrs_3_16_r value,Updrs_3_15_r value,Updrs_3_3_rle value,Updrs_3_17_lle value,Updrs_3_3_lue value,Updrs_3_3_neck value,Updrs_3_3_rue value,...,Copy raw,Main non dominante (30 sec),Delayed recall time (sec),Letter Fluency S (Raw score),Stroop - D-Kefs - Cond.2: uncorrected errors (Raw score),Trial 1 raw,Stroop - D-Kefs - Cond.1: Total errors (Automatic Calculation),Copy time (sec),Deux mains (30 sec),neuropsy_date
0,0.0,MNI0028,Baseline (Arm 1: C-OPN),0.0,0.0,3.0,0.0,2.0,2.0,1.0,...,35,8.0,155.0,16.0,0.0,6.0,0.0,125.0,5.0,2023-10-03
1,0.0,MNI0056,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,3.0,0.0,...,28,,,4.0,0.0,3.0,0.0,321.0,,2021-07-30
2,0.0,MNI0058,Baseline (Arm 1: C-OPN),1.0,1.0,2.0,0.0,1.0,1.0,2.0,...,26,,,10.0,0.0,6.0,2.0,190.0,,2021-08-18
3,0.0,MNI0068,Baseline (Arm 1: C-OPN),1.0,0.0,2.0,0.0,1.0,2.0,1.0,...,215,,,9.0,0.0,5.0,2.0,144.0,,2021-08-18
4,0.0,MNI0079,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,2.0,1.0,...,30,9.0,,15.0,0.0,8.0,1.0,151.0,10.0,2022-01-21


### Add mri_acq date
- Needs to map to redcap_event_name

In [19]:
var = "MRI_date"
config_json = pheno_config_json
mri_date_df = get_available_data(config_json,data_release_dir,var)
mri_date_df["MRI_date"] = pd.to_datetime(mri_date_df["MRI_date"], errors="coerce", dayfirst=False)

n_mri_participants = mri_date_df["participant_id"].nunique()
print(f"Number of participants with MRI data: {n_mri_participants}")

n_sessions = mri_date_df["session"].nunique()
print(f"Number of MRI sessions: {n_sessions}")

### Retracted participants
print(f"Removing retracted participants from the dataset")
retracted_participants_df = pd.read_csv(retracted_participant_list_csv)
retracted_participants = retracted_participants_df["participant_id"].unique()
print(f"removing following {len(retracted_participants)} participants from the dataset: {retracted_participants}")
mri_date_df = mri_date_df[~mri_date_df["participant_id"].isin(retracted_participants)].copy()

participants_with_follow_ups = mri_date_df[mri_date_df["participant_id"].duplicated()]["participant_id"].unique()
n_participants_with_follow_ups = len(participants_with_follow_ups)
print(f"Number of participants with follow-up MRI: {n_participants_with_follow_ups}")

mri_ses01_date_df = mri_date_df[mri_date_df["session"]=="ses-01"].copy()
mri_ses01_date_df["redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

mri_ses02_date_df = mri_date_df[mri_date_df["session"]=="ses-02"].copy()
mri_ses02_participants = mri_ses02_date_df["participant_id"].unique()
print(f"Number of participants with ses-02 MRI: {len(mri_ses02_participants)}")

baseline_df = mri_ses01_date_df[mri_ses01_date_df["participant_id"].isin(mri_ses02_participants)].set_index("participant_id")
followup_df = mri_ses02_date_df.set_index("participant_id")

visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]

event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
event_names = [f"{m} {event_str_suffix}" for m in visit_months]

# --- Bin the months --- #
followup_df["months_since_baseline"] = followup_df["MRI_date"].dt.to_period('M').astype(int) - baseline_df["MRI_date"].dt.to_period('M').astype(int)
followup_df["months_since_baseline"] = followup_df["months_since_baseline"].replace({0:np.nan}) # Some visits get same acq_date from brodacasting merge. 

followup_df["redcap_event_name"] = pd.cut(followup_df["months_since_baseline"], bins=month_bins, labels=event_names)

mri_date_redcap_event_df = pd.concat([mri_ses01_date_df, followup_df.reset_index()], axis=0)
# mri_date_redcap_event_df = mri_date_redcap_event_df

mri_date_redcap_event_df.sort_values(["participant_id","session"]).head()

Using variable MRI_date from source local and instrument MRI_dates
Number of participants with MRI data: 305
Number of MRI sessions: 3
Removing retracted participants from the dataset
removing following 5 participants from the dataset: ['MNI0436' 'MNI0482' 'PD01100' 'MNI0369' 'MNI0607']
Number of participants with follow-up MRI: 67
Number of participants with ses-02 MRI: 67


Unnamed: 0,session,MRI_date,participant_id,redcap_event_name,months_since_baseline
0,ses-01,2023-10-04,MNI0028,Baseline (Arm 1: C-OPN),
1,ses-01,2021-08-18,MNI0056,Baseline (Arm 1: C-OPN),
2,ses-01,2021-08-18,MNI0058,Baseline (Arm 1: C-OPN),
3,ses-01,2021-08-27,MNI0068,Baseline (Arm 1: C-OPN),
4,ses-01,2021-12-22,MNI0079,Baseline (Arm 1: C-OPN),


#### Add MRI date to pheno data


In [20]:
pheno_var_df = pd.merge(pheno_var_df, mri_date_redcap_event_df, on=index_cols, how="outer")  
var = "MRI_date"
for redcap_event in mri_date_redcap_event_df["redcap_event_name"].unique():    
    pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
    n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
    if pheno_var_event_df[var].nunique() > 0:    
        print(f"Var: {var}, Event: {redcap_event}")
        n_unique = pheno_var_event_df[var].nunique()
        n_missing = pheno_var_event_df[var].isna().sum()
        print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
pheno_var_df.head()

Var: MRI_date, Event: Baseline (Arm 1: C-OPN)
n_unique: 246, n_missing: 0 (out of 301)
Var: MRI_date, Event: 12 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 45, n_missing: 244 (out of 293)
Var: MRI_date, Event: 18 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 10, n_missing: 280 (out of 291)
Var: MRI_date, Event: 48 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 42 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 2, n_missing: 0 (out of 2)
Var: MRI_date, Event: 24 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 290 (out of 291)
Var: MRI_date, Event: 36 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 11 (out of 12)
Var: MRI_date, Event: 30 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)


Unnamed: 0,diagnosis,participant_id,redcap_event_name,Updrs_3_16_r value,Updrs_3_15_r value,Updrs_3_3_rle value,Updrs_3_17_lle value,Updrs_3_3_lue value,Updrs_3_3_neck value,Updrs_3_3_rue value,...,Letter Fluency S (Raw score),Stroop - D-Kefs - Cond.2: uncorrected errors (Raw score),Trial 1 raw,Stroop - D-Kefs - Cond.1: Total errors (Automatic Calculation),Copy time (sec),Deux mains (30 sec),neuropsy_date,session,MRI_date,months_since_baseline
0,0.0,MNI0028,Baseline (Arm 1: C-OPN),0.0,0.0,3.0,0.0,2.0,2.0,1.0,...,16.0,0.0,6.0,0.0,125.0,5.0,2023-10-03,ses-01,2023-10-04,
1,0.0,MNI0056,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,3.0,0.0,...,4.0,0.0,3.0,0.0,321.0,,2021-07-30,ses-01,2021-08-18,
2,0.0,MNI0058,Baseline (Arm 1: C-OPN),1.0,1.0,2.0,0.0,1.0,1.0,2.0,...,10.0,0.0,6.0,2.0,190.0,,2021-08-18,ses-01,2021-08-18,
3,0.0,MNI0068,Baseline (Arm 1: C-OPN),1.0,0.0,2.0,0.0,1.0,2.0,1.0,...,9.0,0.0,5.0,2.0,144.0,,2021-08-18,ses-01,2021-08-27,
4,0.0,MNI0079,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,2.0,1.0,...,15.0,0.0,8.0,1.0,151.0,10.0,2022-01-21,ses-01,2021-12-22,


### Calculate age

In [21]:
demo_cols = ["participant_id", "dob", "enrollment_group", "sex"]
demo_var_df[demo_var_df["participant_id"]==participants_with_follow_ups[0]]
baseline_demo_df = demo_var_df[demo_var_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"][demo_cols].copy()

index_cols = ["participant_id"] # not using redcap_event_name to allow broadcast of demographics vars
tabular_df = pd.merge(pheno_var_df, baseline_demo_df, on=index_cols, how="left")
tabular_df[tabular_df["participant_id"]==participants_with_follow_ups[0]]

tabular_df.head()

Unnamed: 0,diagnosis,participant_id,redcap_event_name,Updrs_3_16_r value,Updrs_3_15_r value,Updrs_3_3_rle value,Updrs_3_17_lle value,Updrs_3_3_lue value,Updrs_3_3_neck value,Updrs_3_3_rue value,...,Stroop - D-Kefs - Cond.1: Total errors (Automatic Calculation),Copy time (sec),Deux mains (30 sec),neuropsy_date,session,MRI_date,months_since_baseline,dob,enrollment_group,sex
0,0.0,MNI0028,Baseline (Arm 1: C-OPN),0.0,0.0,3.0,0.0,2.0,2.0,1.0,...,0.0,125.0,5.0,2023-10-03,ses-01,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
1,0.0,MNI0056,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,3.0,0.0,...,0.0,321.0,,2021-07-30,ses-01,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
2,0.0,MNI0058,Baseline (Arm 1: C-OPN),1.0,1.0,2.0,0.0,1.0,1.0,2.0,...,2.0,190.0,,2021-08-18,ses-01,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
3,0.0,MNI0068,Baseline (Arm 1: C-OPN),1.0,0.0,2.0,0.0,1.0,2.0,1.0,...,2.0,144.0,,2021-08-18,ses-01,2021-08-27,,1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
4,0.0,MNI0079,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,2.0,1.0,...,1.0,151.0,10.0,2022-01-21,ses-01,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin


In [22]:
date_cols = ["diagnosis_date", "updrs_date", "moca_date", "MRI_date", "neuropsy_date"]

date_age_cols_dict = {}
for col in date_cols:
    date_age_cols_dict[col] = f"{col.rsplit('_',1)[0]}_age"

age_cols = list(date_age_cols_dict.values())

for date_col, age_col in date_age_cols_dict.items():
    tabular_df = get_age_at_visit(tabular_df, date_col, age_col)

tabular_df.head()

Unnamed: 0,diagnosis,participant_id,redcap_event_name,Updrs_3_16_r value,Updrs_3_15_r value,Updrs_3_3_rle value,Updrs_3_17_lle value,Updrs_3_3_lue value,Updrs_3_3_neck value,Updrs_3_3_rue value,...,MRI_date,months_since_baseline,dob,enrollment_group,sex,diagnosis_age,updrs_age,moca_age,MRI_age,neuropsy_age
0,0.0,MNI0028,Baseline (Arm 1: C-OPN),0.0,0.0,3.0,0.0,2.0,2.0,1.0,...,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,57.13,60.19,60.19,60.19,60.19
1,0.0,MNI0056,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,3.0,0.0,...,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,75.28,79.06,79.06,79.24,79.19
2,0.0,MNI0058,Baseline (Arm 1: C-OPN),1.0,1.0,2.0,0.0,1.0,1.0,2.0,...,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,56.19,57.36,57.36,57.43,57.43
3,0.0,MNI0068,Baseline (Arm 1: C-OPN),1.0,0.0,2.0,0.0,1.0,2.0,1.0,...,2021-08-27,,1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,61.65,69.3,,69.3,69.28
4,0.0,MNI0079,Baseline (Arm 1: C-OPN),0.0,0.0,1.0,0.0,1.0,2.0,1.0,...,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,45.43,50.16,50.08,50.08,50.16


### Save demo, mri_dates, and pheno (dx, updrs, moca, neuropsych) in separate files
- remove DoB and other date columns 
- add age columns

In [24]:
index_cols = ["participant_id", "redcap_event_name"]

mri_cols = ["session", "MRI_age"]

dx_cols = ['Hoehn and Yahr Stage: ',            
           "Parkinson's disease in opinion of treating neurologist / Maladie de Parkinson selon l'avis du neurologue traitant",
            "Final impression / Impression finale",	
            # "Determined diagnosis:  If score = 0, Parkinson's Disease (PD)  If score = 1, Progressive Supranuclear Palsy (PSP)  If score = 2, Multiple System Atrophy (MSA)  If score = 3, Corticobasal Syndrome (CBS)  If score = 4, Dementia with Lewy Bodies (DLB)  If score = 5, Frontotemporal Dementia (FTD)  If score = 6, Essential Tremor (ET)  If score = 7, REM Sleep Behaviour Disorder (RBD)",
            "diagnosis_age"
            ]

updrs_cols = ['Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL)',
              'Part II: Motor Aspects of Experiences of Daily Living (M-EDL)',
              "Updrs_3_1 value", "Updrs_3_2 value", "Updrs_3_3_neck value",
              "Updrs_3_3_rue value", "Updrs_3_3_lue value", "Updrs_3_3_rle value",
              "Updrs_3_3_lle value", "Updrs_3_4_r value", "Updrs_3_4_l value",
              "Updrs_3_5_r value", "Updrs_3_5_l value", "Updrs_3_6_r value",
              "Updrs_3_6_l value", "Updrs_3_7_r value", "Updrs_3_7_l value",
              "Updrs_3_8_r value", "Updrs_3_8_l value", "Updrs_3_9 value",
              "Updrs_3_10 value", "Updrs_3_11 value", "Updrs_3_12 value",
              "Updrs_3_13 value", "Updrs_3_14", "Updrs_3_15_r value",
              "Updrs_3_15_l value", "Updrs_3_16_r value", "Updrs_3_16_l value",
              "Updrs_3_17_rue value", "Updrs_3_17_lue value", "Updrs_3_17_rle value",
              "Updrs_3_17_lle value", "Updrs_3_17_lipjaw value", "Updrs_3_18 value",
              'Part III: Motor Examination', 'Part IV: Motor Complications', 
              'updrs_age',
              ]

# moca_cols = ["moca_scores", "moca_age"]
moca_cols = [
            "TOTAL SCORE (make sure to include extra point for 12 years or less of education):    SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) : ",
            "Did the participant receive +1 extra point for 12 years or less of education?    Le participant a-t-il reçu +1 point supplémentaire pour 12 ans ou moins d'études?",
            "moca_age"
            ]

neuropsych_cols = neuropsych_cols + ["neuropsy_age"]
if "neuropsy_date" in neuropsych_cols:
    neuropsych_cols.remove("neuropsy_date")

mri_df = tabular_df[index_cols + mri_cols]
dx_df = tabular_df[index_cols + dx_cols].copy()
updrs_df = tabular_df[index_cols + updrs_cols].copy()
moca_df = tabular_df[index_cols + moca_cols].copy()
neuropsych_df = tabular_df[index_cols + neuropsych_cols].copy()

# filter na rows
mri_df = mri_df.dropna(subset=mri_cols, how='all')
dx_df = dx_df.dropna(subset=dx_cols, how='all')
updrs_df = updrs_df.dropna(subset=updrs_cols, how='all')
moca_df = moca_df.dropna(subset=moca_cols, how='all')
neuropsych_df = neuropsych_df.dropna(subset=neuropsych_cols, how='all')


# Save data to files
mri_df.to_csv(mri_session_date_file, index=False)
print(f"Saved MRI session data to {mri_session_date_file}")

dx_df.to_csv(dx_file, index=False)
print(f"Saved diagnosis data to {dx_file}")

updrs_df.to_csv(updrs_file, index=False)
print(f"Saved UPDRS data to {updrs_file}")

moca_df.to_csv(moca_file, index=False)
print(f"Saved MoCA data to {moca_file}")

neuropsych_df.to_csv(neuropsych_file, index=False)
print(f"Saved neuropsych data to {neuropsych_file}")

Saved MRI session data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//mri_sessions.csv
Saved diagnosis data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//assessments/diagnosis.csv
Saved UPDRS data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//assessments/updrs.csv
Saved MoCA data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//assessments/moca.csv
Saved neuropsych data to /home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//assessments/neuropsych.csv


In [38]:
updrs_df[~updrs_df["Part III: Motor Examination"].isna()].groupby("redcap_event_name")["participant_id"].count()

redcap_event_name
12 Months Follow-Up/Suivi (Arm 1: C-OPN)     34
18 Months Follow-Up/Suivi (Arm 1: C-OPN)     15
Baseline (Arm 1: C-OPN)                     193
Name: participant_id, dtype: int64

In [49]:
updrs_df[updrs_df["participant_id"].isin(legacy_participants)]

Unnamed: 0,participant_id,redcap_event_name,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part II: Motor Aspects of Experiences of Daily Living (M-EDL),Updrs_3_1 value,Updrs_3_2 value,Updrs_3_3_neck value,Updrs_3_3_rue value,Updrs_3_3_lue value,Updrs_3_3_rle value,...,Updrs_3_16_l value,Updrs_3_17_rue value,Updrs_3_17_lue value,Updrs_3_17_rle value,Updrs_3_17_lle value,Updrs_3_17_lipjaw value,Updrs_3_18 value,Part III: Motor Examination,Part IV: Motor Complications,updrs_age
110,PD00119,Baseline (Arm 1: C-OPN),,,2.0,3.0,2.0,2.0,2.0,2.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,39.0,,66.76
113,PD00146,Baseline (Arm 1: C-OPN),,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,16.0,,64.27
120,PD00457,Baseline (Arm 1: C-OPN),,,1.0,0.0,2.0,2.0,1.0,3.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,31.0,,65.08
121,PD00458,Baseline (Arm 1: C-OPN),,,1.0,0.0,2.0,2.0,2.0,2.0,...,0.0,1.0,1.0,2.0,2.0,1.0,0.0,38.0,,66.56
122,PD00471,Baseline (Arm 1: C-OPN),,,3.0,2.0,4.0,0.0,0.0,2.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,51.0,,60.78
123,PD00472,Baseline (Arm 1: C-OPN),,,2.0,0.0,1.0,2.0,2.0,2.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,38.0,,73.55
124,PD00509,Baseline (Arm 1: C-OPN),,,2.0,1.0,2.0,0.0,0.0,1.0,...,0.0,2.0,2.0,0.0,0.0,0.0,0.0,36.0,,79.92
126,PD00576,Baseline (Arm 1: C-OPN),,,1.0,1.0,2.0,2.0,2.0,0.0,...,0.0,2.0,2.0,1.0,1.0,0.0,0.0,27.0,,66.04
129,PD00622,Baseline (Arm 1: C-OPN),,,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,1.0,0.0,0.0,22.0,,65.92
130,PD00647,Baseline (Arm 1: C-OPN),,,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,28.0,,72.21


In [39]:
prev_updrs_csv = "/home/nikhil/projects/Parkinsons/qpn//releases/Aug_2024//tabular//assessments/tmp/updrs.csv"
prev_updrs_df = pd.read_csv(prev_updrs_csv)
prev_updrs_df.groupby("redcap_event_name")["participant_id"].count()

redcap_event_name
12 Months Follow-Up/Suivi (Arm 1: C-OPN)     34
18 Months Follow-Up/Suivi (Arm 1: C-OPN)     14
24 Months Follow-Up/Suivi (Arm 1: C-OPN)      1
36 Months Follow-Up/Suivi (Arm 1: C-OPN)      2
48 Months Follow-Up/Suivi (Arm 1: C-OPN)      1
Baseline (Arm 1: C-OPN)                     159
Legacy                                       34
Name: participant_id, dtype: int64

In [48]:
legacy_participants = prev_updrs_df[prev_updrs_df["redcap_event_name"]=="Legacy"]["participant_id"].unique()

### MRI dob checks

In [25]:
mri_with_missing_age = mri_df[mri_df["MRI_age"].isna()]["participant_id"].unique()
missing_dob_in_redcap = ["MNI0136", "MNI0147", "MNI0436", "MNI0482", "MNI0605", "PD00296", "PD01182", "PD01258"]
legacy_dob_participants = legacy_dob_df["participant_id"].unique()

print(f"Participants with missing MRI age: {mri_with_missing_age}")
print(f"Participants with missing DoB in redcap: {missing_dob_in_redcap}")
print(f"Participants with missing DoB in legacy: {legacy_dob_participants}")


Participants with missing MRI age: ['MNI0605' 'PD01182' 'PD01258' 'MNI0136' 'MNI0147' 'PD00296' 'PD01253'
 'PD01662' 'PD01686' 'PD01687' 'PD01223']
Participants with missing DoB in redcap: ['MNI0136', 'MNI0147', 'MNI0436', 'MNI0482', 'MNI0605', 'PD00296', 'PD01182', 'PD01258']
Participants with missing DoB in legacy: ['MNI0602' 'PD00509' 'PD01758']


In [26]:
set(mri_with_missing_age) - set(roche_participants) - set(missing_dob_in_redcap) - set(legacy_dob_participants)

{'PD01223'}

In [27]:
demo_var_df[demo_var_df["participant_id"].isin(mri_with_missing_age)]

Unnamed: 0,participant_id,redcap_event_name,dob,enrollment_group,sex,education,dob_secondary,recruitment_cohort
182,PD01223,Baseline (Arm 1: C-OPN),1955-05-23,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,13.0,NaT,QPN
1164,MNI0136,Baseline (Arm 1: C-OPN),NaT,Healthy control/Contrôle,,,NaT,QPN
1166,MNI0605,Baseline (Arm 1: C-OPN),NaT,Healthy control/Contrôle,,,NaT,QPN
1168,PD00296,Baseline (Arm 1: C-OPN),NaT,,,,NaT,QPN
1175,PD01182,Baseline (Arm 1: C-OPN),NaT,Healthy control/Contrôle,,,NaT,QPN
1176,PD01258,Baseline (Arm 1: C-OPN),NaT,,,,NaT,QPN


In [28]:
mri_df.head()

Unnamed: 0,participant_id,redcap_event_name,session,MRI_age
0,MNI0028,Baseline (Arm 1: C-OPN),ses-01,60.19
1,MNI0056,Baseline (Arm 1: C-OPN),ses-01,79.24
2,MNI0058,Baseline (Arm 1: C-OPN),ses-01,57.43
3,MNI0068,Baseline (Arm 1: C-OPN),ses-01,69.3
4,MNI0079,Baseline (Arm 1: C-OPN),ses-01,50.08


### Mark NM participants

In [40]:
NM_participants_df = pd.read_csv(neuromelanin_participant_list_csv)
NM_participants_df = NM_participants_df.rename(columns={"PSCID":"participant_id"})
NM_participants = NM_participants_df["participant_id"].unique()
print(f"Number of Neuromelanin participants: {len(NM_participants)}")

NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI01", "session"] = "ses-01"
NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI02", "session"] = "ses-02"
NM_participants_df.loc[NM_participants_df["Visit Label"] == "MRI03", "session"] = "ses-03"

# session wise counts
session_counts = NM_participants_df["session"].value_counts()
print(f"session_counts: {session_counts}")

# compare with mri_df
mri_participants = mri_df["participant_id"].unique()
print(f"Number of participants with MRI data: {len(mri_participants)}")

# participants in NM cohort but not in MRI cohort
NM_participants_not_in_mri = set(NM_participants) - set(mri_participants)
print(f"Number of NM participants not in MRI cohort: {len(NM_participants_not_in_mri)}")

# participants in NM cohort and in MRI cohort
NM_participants_in_mri = set(NM_participants) & set(mri_participants)
print(f"Number of NM participants in MRI cohort: {len(NM_participants_in_mri)}")

# participants in MRI cohort but not in NM cohort
mri_participants_not_in_NM = set(mri_participants) - set(NM_participants)
print(f"Number of MRI participants not in NM cohort: {len(mri_participants_not_in_NM)}")

# save NM participants
NM_participants_df.to_csv(f"{tabular_data_release_dir}/recruitment/neuromelanin_participants_VM.csv", index=False)

NM_participants_df.head()


Number of Neuromelanin participants: 290
session_counts: session
ses-01    289
ses-02     55
ses-03      6
Name: count, dtype: int64
Number of participants with MRI data: 301
Number of NM participants not in MRI cohort: 1
Number of NM participants in MRI cohort: 289
Number of MRI participants not in NM cohort: 12


Unnamed: 0,participant_id,DCCID,Visit Label,session
0,MNI0028,152209,MRI01,ses-01
1,MNI0056,864854,MRI01,ses-01
2,MNI0058,197308,MRI01,ses-01
3,MNI0068,842090,MRI01,ses-01
4,MNI0079,760662,MRI01,ses-01


## Save merged tabular data
### TODO after finalizing format for the merged dataframe index 

In [None]:
# index_cols  = ["participant_id", "redcap_event_name"]
# save_cols = index_cols + dx_cols + updrs_cols + moca_cols + neuropsych_cols + age_cols
# for col in date_cols:
#     if col in save_cols:
#         print(f"removing {col}")
#         save_cols.remove(col)

# n_save_cols = len(save_cols)    

# print(f"n_save_cols: {n_save_cols}")

# tabular_df.to_csv(tabular_file, index=None)

In [46]:
demo_var_df[demo_var_df["enrollment_group"].isin(["Healthy control/Contrôle", "PD   (Parkinson's Disease)/Maladie de Parkinson"])].groupby(["redcap_event_name","enrollment_group","sex"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_id,dob,education,dob_secondary,recruitment_cohort
redcap_event_name,enrollment_group,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Baseline (Arm 1: C-OPN),Healthy control/Contrôle,Female/Féminin,44,44,34,0,44
Baseline (Arm 1: C-OPN),Healthy control/Contrôle,Male/Masculin,22,22,17,0,22
Baseline (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,68,68,51,1,68
Baseline (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,140,140,111,1,140


In [47]:
demo_var_df[demo_var_df["enrollment_group"].isin(["Healthy control/Contrôle", "PD   (Parkinson's Disease)/Maladie de Parkinson"])].groupby(["redcap_event_name","enrollment_group","sex"]).count().sum(axis=0)

participant_id        274
dob                   274
education             213
dob_secondary           2
recruitment_cohort    274
dtype: int64

In [44]:
tabular_df[tabular_df["enrollment_group"].isin(["Healthy control/Contrôle", "PD   (Parkinson's Disease)/Maladie de Parkinson"])].groupby(["redcap_event_name","enrollment_group","sex"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_id,diagnosis,Updrs_3_17_rue value,Hoehn and Yahr Stage:,Updrs_3_10 value,Updrs_3_16_r value,Updrs_3_17_lipjaw value,Updrs_3_13 value,Updrs_3_2 value,Updrs_3_9 value,...,neuropsy_date,session,MRI_date,months_since_baseline,dob,diagnosis_age,updrs_age,moca_age,MRI_age,neuropsy_age
redcap_event_name,enrollment_group,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
12 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Female/Féminin,44,0,0,0,0,0,0,0,0,0,...,0,9,9,9,44,0,0,5,9,0
12 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Male/Masculin,22,0,0,0,0,0,0,0,0,0,...,0,5,5,5,22,0,0,4,5,0
12 Months Follow-Up/Suivi (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,68,0,8,7,8,8,8,8,8,8,...,0,7,7,7,68,0,8,11,7,0
12 Months Follow-Up/Suivi (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,140,0,25,25,25,25,25,25,25,25,...,0,19,19,19,140,0,25,23,19,0
18 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Female/Féminin,44,0,0,0,0,0,0,0,0,0,...,13,3,3,3,44,0,0,0,3,13
18 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Male/Masculin,22,0,0,0,0,0,0,0,0,0,...,6,1,1,1,22,0,0,0,1,6
18 Months Follow-Up/Suivi (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,68,0,5,5,5,5,5,5,5,5,...,4,3,3,3,68,0,5,3,3,4
18 Months Follow-Up/Suivi (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,140,0,9,9,9,9,9,9,9,9,...,10,4,4,4,140,0,9,8,4,10
24 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Female/Féminin,44,0,0,0,0,0,0,0,0,0,...,0,0,0,0,44,0,0,1,0,0
24 Months Follow-Up/Suivi (Arm 1: C-OPN),Healthy control/Contrôle,Male/Masculin,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,22,0,0,0,0,0


## Identify missing data

### UPDRS legacy and redcap data

In [None]:
updrs3_2019_csv = f"{data_release_dir}/tabular/recruitment/updrs3_2019.csv"
mds_updrs_hackathon_csv = f"{data_release_dir}/tabular/recruitment/mds_updrs_hackathon.csv"

updrs3_2019_df = pd.read_csv(updrs3_2019_csv)
mds_updrs_hackathon_df = pd.read_csv(mds_updrs_hackathon_csv)

updrs3_2019_participants = updrs3_2019_df["Record ID:"].unique()
mds_updrs_hackathon_participants = mds_updrs_hackathon_df["Record ID:"].unique()

n_updrs3_2019_participants = len(updrs3_2019_participants)
n_mds_updrs_hackathon_participants = len(mds_updrs_hackathon_participants)

print(f"Number of participants in UPDRS3 2019: {n_updrs3_2019_participants}")
print(f"Number of participants in MDS UPDRS Hackathon: {n_mds_updrs_hackathon_participants}")

In [None]:
updrs_redcap_participants = pheno_var_df[pheno_var_df["Part III: Motor Examination"].notna()]["participant_id"].unique()
print(f"Number of participants with UPDRS data in redcap: {len(updrs_redcap_participants)}")

paper_df = tabular_df.copy()
paper_participants = paper_df["participant_id"].unique()
print(f"Number of participants in paper: {len(paper_participants)}")

updrs_missing_participants = set(paper_participants) - set(updrs_redcap_participants) - set(updrs3_2019_participants) - set(mds_updrs_hackathon_participants)
n_updrs_missing_participants = len(updrs_missing_participants)
print(f"Number of participants missing UPDRS data: {n_updrs_missing_participants}")

In [None]:
len(list(set(paper_participants) & set(mds_updrs_hackathon_participants)))

### Suivi updrs dates

In [None]:
current_recruit_manifest_xls = f"{data_release_dir}/tabular/recruitment/Suivi_RPQ.xlsx"

col_range = "A:BD"

col_rename_dict = {
    "subj_id":"participant_id",
    "IRM01\n(J-M-A)":"IRM01_date", "#IRM 1\n PD":"IRM01_PD", "#IRM 1\n CTRL":"IRM01_CTRL", 
    "# IRM 1\n RBD":"IRM01_RBD", "# IRM 1\nOTHER":"IRM01_OTHER",
    "IRM 2 \n(J-M-A)":"IRM02_date", "#IRM 2\n PD":"IRM02_PD", "#IRM 2\n CTRL":"IRM02_CTRL", 
    "# IRM 2\n RBD":"IRM02_RBD", "# IRM 2 OTHER":"IRM02_OTHER",
    "IRM 3\n(J-M-A)":"IRM03_date", "#IRM 3\n PD":"IRM03_PD", "#IRM 3\n CTRL":"IRM03_CTRL", 
    "# IRM 3\n RBD":"IRM03_RBD", "# IRM 3 OTHER":"IRM03_OTHER",
    "MDS-UPDRS_partie_III_1\n(J-M-A)":"MDS-UPDRS_III_v01_date", 
    "MDS-UPDRS_complet (J-M-A)": "MDS-UPDRS_complete_date"
    }

useful_cols = col_rename_dict.values()

suivi_df = pd.read_excel(current_recruit_manifest_xls,sheet_name="En cours", engine='openpyxl', usecols=col_range)
suivi_df = suivi_df.rename(columns=col_rename_dict)[useful_cols].copy()

# remove the row with tally
suivi_df = suivi_df.drop([0])

# remove rows without participant_id
suivi_df = suivi_df.dropna(axis=0, subset=["participant_id"])
suivi_df = suivi_df[~suivi_df["participant_id"].astype(str).isin(["0"])] 
suivi_df["participant_id"] = suivi_df["participant_id"].str.strip().astype(str)

# remove subjects without imaging data
suivi_df = suivi_df[(suivi_df["IRM01_PD"] == 1) | (suivi_df["IRM01_CTRL"] == 1) | 
                    (suivi_df["IRM01_RBD"] == 1) | (suivi_df["IRM01_OTHER"] == 1) |
                    (suivi_df["IRM02_PD"] == 1) | (suivi_df["IRM02_CTRL"] == 1) | 
                    (suivi_df["IRM02_RBD"] == 1) |(suivi_df["IRM02_OTHER"] == 1) |
                    (suivi_df["IRM03_PD"] == 1) | (suivi_df["IRM03_CTRL"] == 1) | 
                    (suivi_df["IRM03_RBD"] == 1) |(suivi_df["IRM03_OTHER"] == 1) ]


# fix participant_id formatting issues
# Some rows have Dx in participant_id and one participant with two IDs with "="
possible_delimiters = [" ", "(", "=", "\n"]
for delim in possible_delimiters:        
    suivi_df["participant_id"] = suivi_df["participant_id"].str.strip().str.split(pat=delim, n=1, expand=True)[0]

# nipoppy_participants_current
nipoppy_participants_current = suivi_df["participant_id"].dropna().unique()

suivi_df["MDS-UPDRS_III_v01_date"] = pd.to_datetime(suivi_df["MDS-UPDRS_III_v01_date"], errors="coerce")
suivi_df["MDS-UPDRS_complete_date"] = pd.to_datetime(suivi_df["MDS-UPDRS_complete_date"], errors="coerce")

suivi_df

In [None]:
suivi_updrs_participants = suivi_df[(suivi_df["MDS-UPDRS_III_v01_date"].notna() | (suivi_df["MDS-UPDRS_complete_date"].notna()))]["participant_id"]
n_suivi_updrs_participants = len(suivi_updrs_participants)
print(f"Number of participants with UPDRS data in suivi: {n_suivi_updrs_participants}")

In [None]:
len(list(set(suivi_updrs_participants) & set(paper_participants)))

In [None]:
len(list(set(suivi_updrs_participants) & set(updrs_missing_participants)))

In [None]:
missing_updrs_participants_df = pd.DataFrame(list(updrs_missing_participants), columns=["participant_id"])
missing_updrs_participants_df["Suivi_date_present"] = "no"
missing_updrs_participants_df.loc[missing_updrs_participants_df["participant_id"].isin(suivi_updrs_participants),"Suivi_date_present"] = "yes"
missing_updrs_participants_df = pd.merge(missing_updrs_participants_df, demo_var_df, on="participant_id", how="left")
missing_updrs_participants_df


In [None]:
missing_updrs_participants_df_PD = missing_updrs_participants_df[
                                                                # (missing_updrs_participants_df["Suivi_date_present"]=="yes") & 
                                                                (missing_updrs_participants_df["group"]=="PD") &
                                                                (missing_updrs_participants_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)")
                                                                ]
len(missing_updrs_participants_df_PD)


In [None]:
missing_updrs_participants_df_PD.reset_index().drop(columns=["index"])

In [None]:
missing_updrs_participants_df_PD.to_csv(f"{data_release_dir}/tabular/recruitment/missing_updrs_participants_after_hackathon.csv", index=False)