In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import pickle
from dateutil import relativedelta
import seaborn as sns

### API call for redcap

In [2]:
def api_call(url, query, logger=None):
    """ helper function to make API calls to RedCap
    """
    r = requests.post(url, data=query, verify=False)
    http_status = str(r.status_code)
    print(f'HTTP Status: {http_status}')

    if http_status == "200":
        query_results = r.json()
        query_df = pd.DataFrame(query_results)

    else:
        print(f"RedCap API request Failed with HTTP Status: {http_status}")
        query_df = None
        
    return query_df

def get_inventory_count(df, index_col, availability_indicators):
    """ helper function to count participants with recorded data in redcap
    """
    assess_cols = df.columns.drop(index_col)

    if availability_indicators == 'number':
        df = df.replace("", np.nan)
        df[assess_cols] = df[assess_cols].astype(np.float64)

    inventory = {}
    for col in assess_cols:        
        if availability_indicators == 'number':
            availability_count = df[~df[col].isna()][index_col].nunique()
        else:
            availability_count = df[df[col].isin(availability_indicators)][index_col].nunique()
        inventory[col] = availability_count
    return inventory

def get_available_data(config_json, DATASET_ROOT, var_name, preferred_var_source="primary"):
    """ Get data for given variables from available sources
        All return dataframes should have participant_id and visit_id as index
    """
    config_data = json.load(open(config_json))
    data_sources = config_data['data_sources']
    variable_info = config_data['variables'][var_name]
    variable_type = variable_info["type"]
    variable_sources = variable_info["sources"]

    if preferred_var_source == "primary":
        selected_var_source = variable_info['primary_source']
        selected_var_instrument = variable_info['primary_instrument']
    elif preferred_var_source == "secondary":
        selected_var_source = variable_info['secondary_source']
        selected_var_instrument = variable_info['secondary_instrument']
    else:
        print(f"Using preferred source {preferred_var_source} for variable {var_name}")
        preferred_var_data_source = preferred_var_source["data_source"]
        preferred_var_instrument = preferred_var_source["instrument"]

        if preferred_var_data_source not in variable_sources.keys():
            print(f"Preferred data source {preferred_var_data_source} not available for variable {var_name}")
            return None
        else:
            selected_var_source = preferred_var_data_source

        if preferred_var_instrument not in variable_sources[selected_var_source].keys():
            print(f"Preferred var instrument {preferred_var_instrument} not available for variable {var_name}")
            return None
        else:
            selected_var_instrument = preferred_var_instrument

    print(f"Using variable {var_name} from source {selected_var_source} and instrument {selected_var_instrument}")

    external_var_cols = variable_sources[selected_var_source][selected_var_instrument]

    # Get data from primary source
    var_file = data_sources[selected_var_source][selected_var_instrument]["path"]
    var_file_path = f"{DATASET_ROOT}/{var_file}"
    var_file_index = data_sources[selected_var_source][selected_var_instrument]["index_cols"]

    var_df = pd.read_csv(var_file_path)
    selected_var_cols = list(set(var_file_index + external_var_cols))
    var_df = var_df[selected_var_cols]
    
    if (variable_type == "date") & (len(external_var_cols) == 1):
        var_df[external_var_cols[0]] = pd.to_datetime(var_df[external_var_cols[0]], errors="coerce", dayfirst=False)

    if (len(external_var_cols) == 1):
        var_df = var_df.rename(columns={external_var_cols[0]:var_name})
    
    return var_df

def get_age_at_visit(df, date_col, age_col, dob_col="dob", rounding_digits=2, age_range=(0,100)):
    """ Get age at visit. Expects column name to be: var_date """
    
    df[age_col] = df[date_col] - df[dob_col]
    df[age_col] = np.round(df[age_col].dt.days / 365.25, rounding_digits)

    if (len(df[df[age_col] > 100]) | len(df[df[age_col] < 0])):
        print(f"Warning: Age values outside range {age_range} for variable {var}")

    return df

### Paths


In [3]:
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy release
current_release = "June_2024"

data_release_dir = f"{DATASET_ROOT}/releases/{current_release}/"
tabular_data_release_dir = f"{data_release_dir}/tabular/"

redcap_release_dir = f"{data_release_dir}tabular/redcap/chunked/"
redcap_chunked_report_COPY = f"{redcap_release_dir}/1. COPN-QPNDataMoCAUPDRSNeur_DATA_LABELS_2024-06-19_0910_copy.xlsx"

demo_config_json = "../workflow/tabular/demographics.json"
pheno_config_json = "../workflow/tabular/pheno.json"

# output files
demographics_file = f"{tabular_data_release_dir}/demographics.csv"
mri_session_date_file = f"{tabular_data_release_dir}/mri_sessions.csv"
updrs_file = f"{tabular_data_release_dir}/assessments/updrs.csv"
moca_file = f"{tabular_data_release_dir}/assessments/moca.csv"
dx_file = f"{tabular_data_release_dir}/assessments/diagnosis.csv"
neuropsych_file = f"{tabular_data_release_dir}/assessments/neuropsych.csv"

### Standardized index names

In [4]:
baseline_event_name = "Baseline (Arm 1: C-OPN)"

## redcap event name variations
config_data = json.load(open(demo_config_json))
data_sources = config_data['data_sources']
redcap_data_sources = data_sources['redcap']

redcap_field_name_map = {}

for instrument in redcap_data_sources.keys():
    index_cols = redcap_data_sources[instrument]['index_cols']
    record_id = index_cols[0]
    event_name = index_cols[1]

    redcap_field_name_map[record_id] = "participant_id"
    redcap_field_name_map[event_name] = "redcap_event_name"
print(f"redcap_field_name_map: {redcap_field_name_map}")

# legacy participant_id variations in DOB and BD_RPQ
legacy_field_name_map = {}
legacy_field_name_map['Record ID'] = "participant_id"
legacy_field_name_map['Patient #'] = "participant_id"
legacy_field_name_map['Name of visit (V01, V02, V03)'] = "visit"
print(f"legacy_field_name_map: {legacy_field_name_map}")

redcap_field_name_map: {'Record ID:': 'participant_id', 'Event Name': 'redcap_event_name', 'record_id': 'participant_id', 'redcap_event_name': 'redcap_event_name'}
legacy_field_name_map: {'Record ID': 'participant_id', 'Patient #': 'participant_id', 'Name of visit (V01, V02, V03)': 'visit'}


### Update RedCAP reports through API 
(Not updating extended report since it has to come from Sarah)
- "global_records_query"
- "QPN MoCA-UPDRS-Neuropsy data_Sarah"

In [5]:
update_redcap_reports = False

redcap_report_list = ["global_records_query", "QPN MoCA-UPDRS-Neuropsy data_Sarah"]
if update_redcap_reports:
    redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"
    redcap_config = json.load(open(redcap_config_json))
    url = redcap_config["url"]
    
    for redcap_report in redcap_report_list:
        print(f"Getting data for RedCap report: {redcap_report}")
        records_query = redcap_config["queries"][redcap_report]
        query_df = api_call(url, records_query, logger=None)
        report_csv = f"{release_dir}{current_release}/tabular/redcap/{redcap_report}.csv"
        query_df.to_csv(report_csv, index=False)
        print(f"Saved RedCap report to {report_csv}")



### Available participants

In [6]:
QPN_participants_df = get_available_data(demo_config_json,data_release_dir,"participant_id")
QPN_participants = QPN_participants_df["participant_id"].unique()
n_participants = len(QPN_participants)
print(f"Number of participants: {n_participants}")

Using variable participant_id from source local and instrument manifest
Number of participants: 306


### Collate chunked RedCap data
- The new generate report is formatted as mutli-tab excel spreadsheet based on redcap-event. 


In [7]:
regenerate_collated_report = False

if regenerate_collated_report:
    sheet_names = ["Baseline (without CHQ)","F-U 12months & MNI","F-U 18months & MNI", "F-U 24months & MNI",
                "F-U 12months & PD, UDM", "F-U 18months & PD, UDM", "F-U 24months & PD, UDM"]
    redcap_chunked_report_df = pd.DataFrame()
    for sheet_name in sheet_names:
        _df = pd.read_excel(redcap_chunked_report_COPY, sheet_name=sheet_name, engine='openpyxl')
        _df = _df[_df["Record ID:"].isin(QPN_participants)]  
        redcap_chunked_report_df = pd.concat([redcap_chunked_report_df, _df], axis=0)
        print(f"Sheet: {sheet_name} - Shape: {_df.shape}")
        print(f"redcap_chunked_report_df - Shape: {redcap_chunked_report_df.shape}")

    print(f"Saving collated redcap report to {redcap_release_dir}/redcap_chunked_report.csv")
    redcap_chunked_report_df.to_csv(f"{redcap_release_dir}/redcap_chunked_report.csv", index=False)

else:
    print(f"Loading collated redcap report from {redcap_release_dir}/redcap_chunked_report.csv")

Loading collated redcap report from /home/nikhil/projects/Parkinsons/qpn//releases/June_2024/tabular/redcap/chunked//redcap_chunked_report.csv


### Fetch demographic data

In [8]:
demo_vars = ["dob", "group", "sex", "education"]
# preferred_var_source = {"data_source":"local","instrument":"legacy_DOB"}
vars_with_secondary_source = ["dob"]

config_json = demo_config_json
index_cols = ["participant_id", "redcap_event_name"]

demo_var_df = pd.DataFrame()
for var in demo_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()

    if var in vars_with_secondary_source:
        print(f"**Getting data from the secondary source for variable {var}**")
        _df2 = get_available_data(config_json,data_release_dir,var,preferred_var_source="secondary")
        _df2 = _df2.rename(columns=legacy_field_name_map)
        _df2 = _df2.rename(columns={var:var+"_secondary"})
        _df2 = _df2[_df2["participant_id"].isin(QPN_participants)].copy()
        
        # Merge primary and secondary sources
        n_missing_in_primary = _df[_df["redcap_event_name"]==baseline_event_name][var].isna().sum()
        print(f"Missing data in primary source: {n_missing_in_primary}")

        if "redcap_event_name" in _df2.columns:
            _df = pd.merge(_df, _df2, on=["participant_id","redcap_event_name"], how="outer")
        else:
            _df = pd.merge(_df, _df2, on="participant_id", how="outer")
        _df[var] = _df[var].fillna(_df[var+"_secondary"])
        # _df = _df.drop(columns=[var+"_secondary"])

        n_missing_after_secondary_fill = _df[_df["redcap_event_name"]==baseline_event_name][var].isna().sum()
        print(f"Missing data after secondary source fill: {n_missing_after_secondary_fill}")

    if demo_var_df.empty:
        demo_var_df = _df
    else:
        demo_var_df = pd.merge(demo_var_df, _df, on=index_cols, how="outer")   

demo_participants = demo_var_df["participant_id"].unique()
n_demo_participants = len(demo_participants)
print('-'*50)
print(f"Number of participants with demographics data: {n_demo_participants}")
print('-'*50)

demo_redcap_events = demo_var_df["redcap_event_name"].unique()
print(f"Demographics data available for events: {demo_redcap_events}")
print('-'*50)

for var in demo_vars:
    n_unique = demo_var_df[demo_var_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"][var].nunique()
    n_missing = demo_var_df[demo_var_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"][var].isna().sum()
    print(f"Var: {var}, n_unique: {n_unique}, n_missing: {n_missing} (out of {n_demo_participants})")

demo_var_df.head()

Using variable dob from source redcap and instrument sarah_extended_export
**Getting data from the secondary source for variable dob**
Using variable dob from source local and instrument legacy_DOB
Missing data in primary source: 3
Missing data after secondary source fill: 1
Using variable group from source redcap and instrument sarah_extended_export
Using variable sex from source redcap and instrument sarah_extended_export
Using variable education from source redcap and instrument sarah_extended_export
--------------------------------------------------
Number of participants with demographics data: 299
--------------------------------------------------
Demographics data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)' nan]
--------------------------------------------------
Var: dob, n_unique: 287, n_missing: 1 (out of 299)
Var: group, n_unique: 3, n_missin

Unnamed: 0,dob,redcap_event_name,participant_id,dob_secondary,group,sex,education
0,1963-07-27,Baseline (Arm 1: C-OPN),MNI0028,NaT,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,
1,NaT,12 Months Follow-Up/Suivi (Arm 1: C-OPN),MNI0028,NaT,,,
2,NaT,18 Months Follow-Up/Suivi (Arm 1: C-OPN),MNI0028,NaT,,,
3,NaT,24 Months Follow-Up/Suivi (Arm 1: C-OPN),MNI0028,NaT,,,
4,1942-05-21,Baseline (Arm 1: C-OPN),MNI0056,NaT,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,12.0


### Save demographics data **without the DOB** 

In [9]:
demo_var_without_dob_df = demo_var_df.drop(columns=["dob", "dob_secondary"])
demo_var_without_dob_df.to_csv(demographics_file, index=False)
print(f"Saved demographics data to {demographics_file}")

Saved demographics data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//demographics.csv


### Find records with phenotypic data

In [10]:
pheno_vars = ["diagnosis", "updrs_scores", "moca_scores", "diagnosis_date", "updrs_date", "moca_date"]

config_json = pheno_config_json
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.DataFrame()
for var in pheno_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if pheno_var_df.empty:
        pheno_var_df = _df
    else:
        pheno_var_df = pd.merge(pheno_var_df, _df, on=index_cols, how="outer")   

pheno_participants = pheno_var_df["participant_id"].unique()
n_pheno_participants = len(pheno_participants)
print('-'*50)
print(f"Number of participants with pheno data: {n_pheno_participants}")
print('-'*50)

pheno_redcap_events = pheno_var_df["redcap_event_name"].unique()
print(f"Pheno data available for events: {pheno_redcap_events}")
print('-'*50)

for var in pheno_var_df.columns:
    for redcap_event in pheno_redcap_events:
        if var not in index_cols:
            pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
            n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
            if pheno_var_event_df[var].nunique() > 0:    
                print(f"Var: {var}, Event: {redcap_event}")
                n_unique = pheno_var_event_df[var].nunique()
                n_missing = pheno_var_event_df[var].isna().sum()
                print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
    print('-'*50)

pheno_var_df.head()

Using variable diagnosis from source redcap and instrument sarah_extended_export
Using variable updrs_scores from source redcap and instrument sarah_extended_export
Using variable moca_scores from source redcap and instrument sarah_extended_export
Using variable diagnosis_date from source redcap and instrument sarah_extended_export
Using variable updrs_date from source redcap and instrument sarah_extended_export
Using variable moca_date from source redcap and instrument sarah_extended_export
--------------------------------------------------
Number of participants with pheno data: 294
--------------------------------------------------
Pheno data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------
Var: diagnosis, Event: Baseline (Arm 1: C-OPN)
n_unique: 5, n_missing: 101 (out of 294)
--------------------------

Unnamed: 0,diagnosis,redcap_event_name,participant_id,Hoehn and Yahr Stage:,Part IV: Motor Complications,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part III: Motor Examination,Part II: Motor Aspects of Experiences of Daily Living (M-EDL),moca_scores,diagnosis_date,updrs_date,moca_date
0,0.0,Baseline (Arm 1: C-OPN),MNI0028,(2) Bilateral involvement without impairment o...,0.0,0.0,29.0,2.0,28.0,2020-09-13,2023-10-04,2023-10-04
1,0.0,Baseline (Arm 1: C-OPN),MNI0056,(3) Bilateral disease: mild to moderate disabi...,0.0,12.0,58.0,22.0,21.0,2017-09-01,2021-06-11,2021-06-11
2,0.0,Baseline (Arm 1: C-OPN),MNI0058,"(1) Unilateral involvement only, usually with ...",0.0,0.0,26.0,6.0,25.0,2020-05-22,2021-07-23,2021-07-23
3,0.0,Baseline (Arm 1: C-OPN),MNI0068,(2) Bilateral involvement without impairment o...,0.0,11.0,28.0,8.0,,2014-01-01,2021-08-27,NaT
4,0.0,Baseline (Arm 1: C-OPN),MNI0079,(2) Bilateral involvement without impairment o...,8.0,13.0,22.0,10.0,26.0,2017-05-01,2022-01-21,2021-12-22


### Neuropsych data
- Comes from either from Sarah's extended report or BD_RPQ_UPDATE_Neuropsy

In [11]:
neuropsych_vars = ["neuropsy_scores","neuropsy_date"]

config_data = json.load(open(config_json))
variable_info = config_data['variables'][neuropsych_vars[0]]
variable_sources = variable_info["sources"]
neuropsy_source = variable_info['primary_source']

print(f"Using neuropsych data source: {neuropsy_source}")
# local BD_RPQ data
if neuropsy_source == "local":
    index_cols = ["participant_id", "visit", "TimePoint (based on REDCap; baseline, 18m, 36m, etc.)", "Délai depuis baseline (mois)"]
    
# redcap data
if neuropsy_source == "redcap":
    index_cols = ["participant_id", "redcap_event_name"]

    
neuropsych_df = pd.DataFrame()
for var in neuropsych_vars:
    _df = get_available_data(config_json,data_release_dir,var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.rename(columns=legacy_field_name_map)
    _df = _df[_df["participant_id"].isin(QPN_participants)].copy()
    if neuropsych_df.empty:
        neuropsych_df = _df
    else:
        neuropsych_df = pd.merge(neuropsych_df, _df, on=index_cols, how="left")   

neuropsych_participants = neuropsych_df["participant_id"].unique()
n_neuropsych_participants = len(neuropsych_participants)
print('-'*50)
print(f"Number of participants with neuropysch data: {n_neuropsych_participants}")
print('-'*50)

neuropsych_cols = neuropsych_df.columns.drop(index_cols).tolist()
n_neuropsuch_cols = len(neuropsych_cols)
print(f"Neuropsych data available for variables: {n_neuropsuch_cols}")
print('-'*50)

# BD_RPQ data
if neuropsy_source == "local":
    neuropsych_visits = neuropsych_df["visit"].unique()

# REDCap data
if neuropsy_source == "redcap":
    neuropsych_visits = neuropsych_df["redcap_event_name"].unique()

print(f"neuropsych data available for events: {neuropsych_visits}")
print('-'*50)

neuropsych_df.head()

Using neuropsych data source: redcap
Using variable neuropsy_scores from source redcap and instrument sarah_extended_export
Using variable neuropsy_date from source redcap and instrument sarah_extended_export
--------------------------------------------------
Number of participants with neuropysch data: 294
--------------------------------------------------
Neuropsych data available for variables: 83
--------------------------------------------------
neuropsych data available for events: ['Baseline (Arm 1: C-OPN)' '12 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '18 Months Follow-Up/Suivi (Arm 1: C-OPN)'
 '24 Months Follow-Up/Suivi (Arm 1: C-OPN)']
--------------------------------------------------


Unnamed: 0,Immediate recall time (sec),Stroop - D-Kefs - Cond.2: uncorrected errors (Raw score),participant_id,Trail A raw score (time in sec.),Stroop - D-Kefs - Cond. 3 - 1 (time),Deux mains (30 sec),Total Repetition errors,Was the Hopkins Verbal Learning Test Revised administered?,Stroop - D-Kefs - Cond.3: Total errors (Automatic calculation),Copy raw,...,STROOP GOLDEN : COLORS Number of responses,Letter Fluency F (Raw score),Was the CDT test administered?,Digit Span Backward - total correct (Raw score),Was the Stroop Colour and Word (D-KEFS) test administered?,Digit span backward - longest correct serie (Raw score),"Trial total 1,2,3 (Raw score)","STROOP GOLDEN, words, uncorrected errors (raw score)",Trial 1 raw,neuropsy_date
0,185.0,0.0,MNI0028,40.0,37.0,5.0,0.0,"Yes, completed",0.0,35,...,,18.0,"Yes, completed",7.0,"Yes, completed",4.0,25.0,,6.0,2023-10-03
1,994.0,0.0,MNI0056,73.0,,997.0,1.0,"Yes, completed",4.0,28,...,997.0,9.0,"Yes, completed",6.0,"Yes, completed",3.0,11.0,997.0,3.0,2021-07-30
2,994.0,0.0,MNI0058,28.0,,997.0,0.0,"Yes, completed",7.0,26,...,997.0,9.0,"Yes, completed",7.0,"Yes, completed",5.0,26.0,997.0,6.0,2021-08-18
3,994.0,0.0,MNI0068,45.0,,997.0,0.0,"Yes, completed",0.0,215,...,997.0,9.0,"Yes, completed",10.0,"Yes, completed",5.0,21.0,997.0,5.0,2021-08-18
4,994.0,0.0,MNI0079,33.0,,10.0,0.0,"Yes, completed",3.0,30,...,997.0,16.0,"Yes, completed",7.0,"Yes, completed",4.0,31.0,997.0,8.0,2022-01-21


### Basic clean-up and data checks

In [12]:
# Fix dtypes
if neuropsy_source == "local":
    for series_name, series in neuropsych_df.items():
        if "score" in series_name:
            if series.dtype == 'object':
                print(f"recasting {series_name} to float by replacing , with .")
                neuropsych_df[series_name] = neuropsych_df[series_name].str.replace(",",".").astype(float)
                neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan
                
        # Replace >900 with NaNs
        if series.dtype == 'float':
            neuropsych_df.loc[neuropsych_df[series_name]>900, series_name] = np.nan

    # assign redcap_event_name
    visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
    month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]
    event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
    event_names = [f"{m} {event_str_suffix}" for m in visit_months]

    neuropsych_df["redcap_event_name"] = pd.cut(neuropsych_df["Délai depuis baseline (mois)"], bins=month_bins, labels=event_names).astype(str)
    neuropsych_df.loc[neuropsych_df["TimePoint (based on REDCap; baseline, 18m, 36m, etc.)"]=="baseline", 
                      "redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

# Merge with pheno_var_df
index_cols = ["participant_id", "redcap_event_name"]
pheno_var_df = pd.merge(pheno_var_df, neuropsych_df, on=index_cols, how="left")  

pheno_var_df.head()

Unnamed: 0,diagnosis,redcap_event_name,participant_id,Hoehn and Yahr Stage:,Part IV: Motor Complications,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part III: Motor Examination,Part II: Motor Aspects of Experiences of Daily Living (M-EDL),moca_scores,diagnosis_date,...,STROOP GOLDEN : COLORS Number of responses,Letter Fluency F (Raw score),Was the CDT test administered?,Digit Span Backward - total correct (Raw score),Was the Stroop Colour and Word (D-KEFS) test administered?,Digit span backward - longest correct serie (Raw score),"Trial total 1,2,3 (Raw score)","STROOP GOLDEN, words, uncorrected errors (raw score)",Trial 1 raw,neuropsy_date
0,0.0,Baseline (Arm 1: C-OPN),MNI0028,(2) Bilateral involvement without impairment o...,0.0,0.0,29.0,2.0,28.0,2020-09-13,...,,18.0,"Yes, completed",7.0,"Yes, completed",4.0,25.0,,6.0,2023-10-03
1,0.0,Baseline (Arm 1: C-OPN),MNI0056,(3) Bilateral disease: mild to moderate disabi...,0.0,12.0,58.0,22.0,21.0,2017-09-01,...,997.0,9.0,"Yes, completed",6.0,"Yes, completed",3.0,11.0,997.0,3.0,2021-07-30
2,0.0,Baseline (Arm 1: C-OPN),MNI0058,"(1) Unilateral involvement only, usually with ...",0.0,0.0,26.0,6.0,25.0,2020-05-22,...,997.0,9.0,"Yes, completed",7.0,"Yes, completed",5.0,26.0,997.0,6.0,2021-08-18
3,0.0,Baseline (Arm 1: C-OPN),MNI0068,(2) Bilateral involvement without impairment o...,0.0,11.0,28.0,8.0,,2014-01-01,...,997.0,9.0,"Yes, completed",10.0,"Yes, completed",5.0,21.0,997.0,5.0,2021-08-18
4,0.0,Baseline (Arm 1: C-OPN),MNI0079,(2) Bilateral involvement without impairment o...,8.0,13.0,22.0,10.0,26.0,2017-05-01,...,997.0,16.0,"Yes, completed",7.0,"Yes, completed",4.0,31.0,997.0,8.0,2022-01-21


### Add mri_acq date
- Needs to map to redcap_event_name

In [13]:
var = "MRI_date"
config_json = pheno_config_json
mri_date_df = get_available_data(config_json,data_release_dir,var)
mri_date_df["MRI_date"] = pd.to_datetime(mri_date_df["MRI_date"], errors="coerce", dayfirst=False)

n_mri_participants = mri_date_df["participant_id"].nunique()
print(f"Number of participants with MRI data: {n_mri_participants}")

n_sessions = mri_date_df["session"].nunique()
print(f"Number of MRI sessions: {n_sessions}")

participants_with_follow_ups = mri_date_df[mri_date_df["participant_id"].duplicated()]["participant_id"].unique()
n_participants_with_follow_ups = len(participants_with_follow_ups)
print(f"Number of participants with follow-up MRI: {n_participants_with_follow_ups}")

mri_ses01_date_df = mri_date_df[mri_date_df["session"]=="ses-01"].copy()
mri_ses01_date_df["redcap_event_name"] = "Baseline (Arm 1: C-OPN)"

mri_ses02_date_df = mri_date_df[mri_date_df["session"]=="ses-02"].copy()
mri_ses02_participants = mri_ses02_date_df["participant_id"].unique()
print(f"Number of participants with ses-02 MRI: {len(mri_ses02_participants)}")

baseline_df = mri_ses01_date_df[mri_ses01_date_df["participant_id"].isin(mri_ses02_participants)].set_index("participant_id")
followup_df = mri_ses02_date_df.set_index("participant_id")

visit_months = [12, 18, 24, 30, 36, 42, 48, 54]
month_bins = [9, 15, 21, 27, 33, 39, 45, 51, 57]

event_str_suffix = "Months Follow-Up/Suivi (Arm 1: C-OPN)"
event_names = [f"{m} {event_str_suffix}" for m in visit_months]

# --- Bin the months --- #
followup_df["months_since_baseline"] = followup_df["MRI_date"].dt.to_period('M').astype(int) - baseline_df["MRI_date"].dt.to_period('M').astype(int)
followup_df["months_since_baseline"] = followup_df["months_since_baseline"].replace({0:np.nan}) # Some visits get same acq_date from brodacasting merge. 

followup_df["redcap_event_name"] = pd.cut(followup_df["months_since_baseline"], bins=month_bins, labels=event_names)

mri_date_redcap_event_df = pd.concat([mri_ses01_date_df, followup_df.reset_index()], axis=0)
# mri_date_redcap_event_df = mri_date_redcap_event_df

mri_date_redcap_event_df.sort_values(["participant_id","session"]).head()

Using variable MRI_date from source local and instrument MRI_dates
Number of participants with MRI data: 298
Number of MRI sessions: 2
Number of participants with follow-up MRI: 51
Number of participants with ses-02 MRI: 51


Unnamed: 0,session,participant_id,MRI_date,redcap_event_name,months_since_baseline
0,ses-01,MNI0028,2023-10-04,Baseline (Arm 1: C-OPN),
1,ses-01,MNI0056,2021-08-18,Baseline (Arm 1: C-OPN),
2,ses-01,MNI0058,2021-08-18,Baseline (Arm 1: C-OPN),
3,ses-01,MNI0068,2021-08-27,Baseline (Arm 1: C-OPN),
4,ses-01,MNI0079,2021-12-22,Baseline (Arm 1: C-OPN),


#### Add MRI date to pheno data


In [14]:
pheno_var_df = pd.merge(pheno_var_df, mri_date_redcap_event_df, on=index_cols, how="right")  
var = "MRI_date"
for redcap_event in mri_date_redcap_event_df["redcap_event_name"].unique():    
    pheno_var_event_df = pheno_var_df[pheno_var_df["redcap_event_name"]==redcap_event].copy()
    n_pheno_var_event_participants = pheno_var_event_df["participant_id"].nunique()
    if pheno_var_event_df[var].nunique() > 0:    
        print(f"Var: {var}, Event: {redcap_event}")
        n_unique = pheno_var_event_df[var].nunique()
        n_missing = pheno_var_event_df[var].isna().sum()
        print(f"n_unique: {n_unique}, n_missing: {n_missing} (out of {n_pheno_var_event_participants})")
pheno_var_df.head()

Var: MRI_date, Event: Baseline (Arm 1: C-OPN)
n_unique: 234, n_missing: 0 (out of 298)
Var: MRI_date, Event: 12 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 27, n_missing: 0 (out of 30)
Var: MRI_date, Event: 18 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 8, n_missing: 0 (out of 9)
Var: MRI_date, Event: 48 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 42 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 2, n_missing: 0 (out of 2)
Var: MRI_date, Event: 36 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)
Var: MRI_date, Event: 30 Months Follow-Up/Suivi (Arm 1: C-OPN)
n_unique: 1, n_missing: 0 (out of 1)


Unnamed: 0,diagnosis,redcap_event_name,participant_id,Hoehn and Yahr Stage:,Part IV: Motor Complications,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part III: Motor Examination,Part II: Motor Aspects of Experiences of Daily Living (M-EDL),moca_scores,diagnosis_date,...,Digit Span Backward - total correct (Raw score),Was the Stroop Colour and Word (D-KEFS) test administered?,Digit span backward - longest correct serie (Raw score),"Trial total 1,2,3 (Raw score)","STROOP GOLDEN, words, uncorrected errors (raw score)",Trial 1 raw,neuropsy_date,session,MRI_date,months_since_baseline
0,0.0,Baseline (Arm 1: C-OPN),MNI0028,(2) Bilateral involvement without impairment o...,0.0,0.0,29.0,2.0,28.0,2020-09-13,...,7.0,"Yes, completed",4.0,25.0,,6.0,2023-10-03,ses-01,2023-10-04,
1,0.0,Baseline (Arm 1: C-OPN),MNI0056,(3) Bilateral disease: mild to moderate disabi...,0.0,12.0,58.0,22.0,21.0,2017-09-01,...,6.0,"Yes, completed",3.0,11.0,997.0,3.0,2021-07-30,ses-01,2021-08-18,
2,0.0,Baseline (Arm 1: C-OPN),MNI0058,"(1) Unilateral involvement only, usually with ...",0.0,0.0,26.0,6.0,25.0,2020-05-22,...,7.0,"Yes, completed",5.0,26.0,997.0,6.0,2021-08-18,ses-01,2021-08-18,
3,0.0,Baseline (Arm 1: C-OPN),MNI0068,(2) Bilateral involvement without impairment o...,0.0,11.0,28.0,8.0,,2014-01-01,...,10.0,"Yes, completed",5.0,21.0,997.0,5.0,2021-08-18,ses-01,2021-08-27,
4,0.0,Baseline (Arm 1: C-OPN),MNI0079,(2) Bilateral involvement without impairment o...,8.0,13.0,22.0,10.0,26.0,2017-05-01,...,7.0,"Yes, completed",4.0,31.0,997.0,8.0,2022-01-21,ses-01,2021-12-22,


### Calculate age

In [15]:
demo_cols = ["participant_id", "dob", "group", "sex"]
demo_var_df[demo_var_df["participant_id"]==participants_with_follow_ups[0]]
baseline_demo_df = demo_var_df[demo_var_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"][demo_cols].copy()

index_cols = ["participant_id"] # not using redcap_event_name to allow broadcast of demographics vars
tabular_df = pd.merge(pheno_var_df, baseline_demo_df, on=index_cols, how="left")
tabular_df[tabular_df["participant_id"]==participants_with_follow_ups[0]]

tabular_df.head()

Unnamed: 0,diagnosis,redcap_event_name,participant_id,Hoehn and Yahr Stage:,Part IV: Motor Complications,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part III: Motor Examination,Part II: Motor Aspects of Experiences of Daily Living (M-EDL),moca_scores,diagnosis_date,...,"Trial total 1,2,3 (Raw score)","STROOP GOLDEN, words, uncorrected errors (raw score)",Trial 1 raw,neuropsy_date,session,MRI_date,months_since_baseline,dob,group,sex
0,0.0,Baseline (Arm 1: C-OPN),MNI0028,(2) Bilateral involvement without impairment o...,0.0,0.0,29.0,2.0,28.0,2020-09-13,...,25.0,,6.0,2023-10-03,ses-01,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
1,0.0,Baseline (Arm 1: C-OPN),MNI0056,(3) Bilateral disease: mild to moderate disabi...,0.0,12.0,58.0,22.0,21.0,2017-09-01,...,11.0,997.0,3.0,2021-07-30,ses-01,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
2,0.0,Baseline (Arm 1: C-OPN),MNI0058,"(1) Unilateral involvement only, usually with ...",0.0,0.0,26.0,6.0,25.0,2020-05-22,...,26.0,997.0,6.0,2021-08-18,ses-01,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
3,0.0,Baseline (Arm 1: C-OPN),MNI0068,(2) Bilateral involvement without impairment o...,0.0,11.0,28.0,8.0,,2014-01-01,...,21.0,997.0,5.0,2021-08-18,ses-01,2021-08-27,,1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin
4,0.0,Baseline (Arm 1: C-OPN),MNI0079,(2) Bilateral involvement without impairment o...,8.0,13.0,22.0,10.0,26.0,2017-05-01,...,31.0,997.0,8.0,2022-01-21,ses-01,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin


In [16]:
date_cols = ["diagnosis_date", "updrs_date", "moca_date", "MRI_date", "neuropsy_date"]

date_age_cols_dict = {}
for col in date_cols:
    date_age_cols_dict[col] = f"{col.split('_')[0]}_age"

age_cols = list(date_age_cols_dict.values())

for date_col, age_col in date_age_cols_dict.items():
    tabular_df = get_age_at_visit(tabular_df, date_col, age_col)

tabular_df.head()

Unnamed: 0,diagnosis,redcap_event_name,participant_id,Hoehn and Yahr Stage:,Part IV: Motor Complications,Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL),Part III: Motor Examination,Part II: Motor Aspects of Experiences of Daily Living (M-EDL),moca_scores,diagnosis_date,...,MRI_date,months_since_baseline,dob,group,sex,diagnosis_age,updrs_age,moca_age,MRI_age,neuropsy_age
0,0.0,Baseline (Arm 1: C-OPN),MNI0028,(2) Bilateral involvement without impairment o...,0.0,0.0,29.0,2.0,28.0,2020-09-13,...,2023-10-04,,1963-07-27,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,57.13,60.19,60.19,60.19,60.19
1,0.0,Baseline (Arm 1: C-OPN),MNI0056,(3) Bilateral disease: mild to moderate disabi...,0.0,12.0,58.0,22.0,21.0,2017-09-01,...,2021-08-18,,1942-05-21,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,75.28,79.06,79.06,79.24,79.19
2,0.0,Baseline (Arm 1: C-OPN),MNI0058,"(1) Unilateral involvement only, usually with ...",0.0,0.0,26.0,6.0,25.0,2020-05-22,...,2021-08-18,,1964-03-14,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,56.19,57.36,57.36,57.43,57.43
3,0.0,Baseline (Arm 1: C-OPN),MNI0068,(2) Bilateral involvement without impairment o...,0.0,11.0,28.0,8.0,,2014-01-01,...,2021-08-27,,1952-05-08,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,61.65,69.3,,69.3,69.28
4,0.0,Baseline (Arm 1: C-OPN),MNI0079,(2) Bilateral involvement without impairment o...,8.0,13.0,22.0,10.0,26.0,2017-05-01,...,2021-12-22,,1971-11-25,PD (Parkinson's Disease)/Maladie de Parkinson,Female/Féminin,45.43,50.16,50.08,50.08,50.16


### Save demo, mri_dates, and pheno (dx, updrs, moca, neuropsych) in separate files
- remove DoB and other date columns 
- add age columns

In [17]:
index_cols = ["participant_id", "redcap_event_name"]

mri_cols = ["session", "MRI_age"]

dx_cols = ["diagnosis", 'Hoehn and Yahr Stage: ', "diagnosis_age"]

updrs_cols = ['Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL)',
              'Part II: Motor Aspects of Experiences of Daily Living (M-EDL)',
              'Part III: Motor Examination', 'Part IV: Motor Complications', 'updrs_age']

moca_cols = ["moca_scores", "moca_age"]

neuropsych_cols = neuropsych_cols + ["neuropsy_age"]
if "neuropsy_date" in neuropsych_cols:
    neuropsych_cols.remove("neuropsy_date")

mri_df = tabular_df[index_cols + mri_cols]
dx_df = tabular_df[index_cols + dx_cols].copy()
updrs_df = tabular_df[index_cols + updrs_cols].copy()
moca_df = tabular_df[index_cols + moca_cols].copy()
neuropsych_df = tabular_df[index_cols + neuropsych_cols].copy()

# Save data to files
mri_df.to_csv(mri_session_date_file, index=False)
print(f"Saved MRI session data to {mri_session_date_file}")

dx_df.to_csv(dx_file, index=False)
print(f"Saved diagnosis data to {dx_file}")

updrs_df.to_csv(updrs_file, index=False)
print(f"Saved UPDRS data to {updrs_file}")

moca_df.to_csv(moca_file, index=False)
print(f"Saved MoCA data to {moca_file}")

neuropsych_df.to_csv(neuropsych_file, index=False)
print(f"Saved neuropsych data to {neuropsych_file}")

Saved MRI session data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//mri_sessions.csv
Saved diagnosis data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//assessments/diagnosis.csv
Saved UPDRS data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//assessments/updrs.csv
Saved MoCA data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//assessments/moca.csv
Saved neuropsych data to /home/nikhil/projects/Parkinsons/qpn//releases/June_2024//tabular//assessments/neuropsych.csv


## Save merged tabular data
### TODO after finalizing format for the merged dataframe index 

In [18]:
# index_cols  = ["participant_id", "redcap_event_name"]
# save_cols = index_cols + dx_cols + updrs_cols + moca_cols + neuropsych_cols + age_cols
# for col in date_cols:
#     if col in save_cols:
#         print(f"removing {col}")
#         save_cols.remove(col)

# n_save_cols = len(save_cols)    

# print(f"n_save_cols: {n_save_cols}")

# tabular_df.to_csv(tabular_file, index=None)

## Identify missing data

### UPDRS legacy and redcap data

In [19]:
updrs3_2019_csv = f"{data_release_dir}/tabular/recruitment/updrs3_2019.csv"
mds_updrs_hackathon_csv = f"{data_release_dir}/tabular/recruitment/mds_updrs_hackathon.csv"

updrs3_2019_df = pd.read_csv(updrs3_2019_csv)
mds_updrs_hackathon_df = pd.read_csv(mds_updrs_hackathon_csv)

updrs3_2019_participants = updrs3_2019_df["Record ID:"].unique()
mds_updrs_hackathon_participants = mds_updrs_hackathon_df["Record ID:"].unique()

n_updrs3_2019_participants = len(updrs3_2019_participants)
n_mds_updrs_hackathon_participants = len(mds_updrs_hackathon_participants)

print(f"Number of participants in UPDRS3 2019: {n_updrs3_2019_participants}")
print(f"Number of participants in MDS UPDRS Hackathon: {n_mds_updrs_hackathon_participants}")

Number of participants in UPDRS3 2019: 50
Number of participants in MDS UPDRS Hackathon: 23


In [21]:
updrs_redcap_participants = pheno_var_df[pheno_var_df["Part III: Motor Examination"].notna()]["participant_id"].unique()
print(f"Number of participants with UPDRS data in redcap: {len(updrs_redcap_participants)}")

paper_df = tabular_df.copy()
paper_participants = paper_df["participant_id"].unique()
print(f"Number of participants in paper: {len(paper_participants)}")

updrs_missing_participants = set(paper_participants) - set(updrs_redcap_participants) - set(updrs3_2019_participants) - set(mds_updrs_hackathon_participants)
n_updrs_missing_participants = len(updrs_missing_participants)
print(f"Number of participants missing UPDRS data: {n_updrs_missing_participants}")

Number of participants with UPDRS data in redcap: 167
Number of participants in paper: 298
Number of participants missing UPDRS data: 107


In [22]:
len(list(set(paper_participants) & set(mds_updrs_hackathon_participants)))

18

### Suivi updrs dates

In [23]:
current_recruit_manifest_xls = f"{data_release_dir}/tabular/recruitment/Suivi_RPQ.xlsx"

col_range = "A:BD"

col_rename_dict = {
    "subj_id":"participant_id",
    "IRM01\n(J-M-A)":"IRM01_date", "#IRM 1\n PD":"IRM01_PD", "#IRM 1\n CTRL":"IRM01_CTRL", 
    "# IRM 1\n RBD":"IRM01_RBD", "# IRM 1\nOTHER":"IRM01_OTHER",
    "IRM 2 \n(J-M-A)":"IRM02_date", "#IRM 2\n PD":"IRM02_PD", "#IRM 2\n CTRL":"IRM02_CTRL", 
    "# IRM 2\n RBD":"IRM02_RBD", "# IRM 2 OTHER":"IRM02_OTHER",
    "IRM 3\n(J-M-A)":"IRM03_date", "#IRM 3\n PD":"IRM03_PD", "#IRM 3\n CTRL":"IRM03_CTRL", 
    "# IRM 3\n RBD":"IRM03_RBD", "# IRM 3 OTHER":"IRM03_OTHER",
    "MDS-UPDRS_partie_III_1\n(J-M-A)":"MDS-UPDRS_III_v01_date", 
    "MDS-UPDRS_complet (J-M-A)": "MDS-UPDRS_complete_date"
    }

useful_cols = col_rename_dict.values()

suivi_df = pd.read_excel(current_recruit_manifest_xls,sheet_name="En cours", engine='openpyxl', usecols=col_range)
suivi_df = suivi_df.rename(columns=col_rename_dict)[useful_cols].copy()

# remove the row with tally
suivi_df = suivi_df.drop([0])

# remove rows without participant_id
suivi_df = suivi_df.dropna(axis=0, subset=["participant_id"])
suivi_df = suivi_df[~suivi_df["participant_id"].astype(str).isin(["0"])] 
suivi_df["participant_id"] = suivi_df["participant_id"].str.strip().astype(str)

# remove subjects without imaging data
suivi_df = suivi_df[(suivi_df["IRM01_PD"] == 1) | (suivi_df["IRM01_CTRL"] == 1) | 
                    (suivi_df["IRM01_RBD"] == 1) | (suivi_df["IRM01_OTHER"] == 1) |
                    (suivi_df["IRM02_PD"] == 1) | (suivi_df["IRM02_CTRL"] == 1) | 
                    (suivi_df["IRM02_RBD"] == 1) |(suivi_df["IRM02_OTHER"] == 1) |
                    (suivi_df["IRM03_PD"] == 1) | (suivi_df["IRM03_CTRL"] == 1) | 
                    (suivi_df["IRM03_RBD"] == 1) |(suivi_df["IRM03_OTHER"] == 1) ]


# fix participant_id formatting issues
# Some rows have Dx in participant_id and one participant with two IDs with "="
possible_delimiters = [" ", "(", "=", "\n"]
for delim in possible_delimiters:        
    suivi_df["participant_id"] = suivi_df["participant_id"].str.strip().str.split(pat=delim, n=1, expand=True)[0]

# nipoppy_participants_current
nipoppy_participants_current = suivi_df["participant_id"].dropna().unique()

suivi_df["MDS-UPDRS_III_v01_date"] = pd.to_datetime(suivi_df["MDS-UPDRS_III_v01_date"], errors="coerce")
suivi_df["MDS-UPDRS_complete_date"] = pd.to_datetime(suivi_df["MDS-UPDRS_complete_date"], errors="coerce")

suivi_df

Unnamed: 0,participant_id,IRM01_date,IRM01_PD,IRM01_CTRL,IRM01_RBD,IRM01_OTHER,IRM02_date,IRM02_PD,IRM02_CTRL,IRM02_RBD,IRM02_OTHER,IRM03_date,IRM03_PD,IRM03_CTRL,IRM03_RBD,IRM03_OTHER,MDS-UPDRS_III_v01_date,MDS-UPDRS_complete_date
15,PD00016,26-07-2019,1.0,0.0,0.0,,0,,,,,0,,,,,NaT,NaT
19,PD00020,2018-05-12 00:00:00,1.0,0.0,0.0,,24-11-2022,1,0.0,0.0,,0,,,,,2019-05-02,2021-07-23
31,PD00032,24-07-2019,1.0,0.0,0.0,,0,,,,,0,,,,,NaT,NaT
47,PD00048,21-08-2019,1.0,0.0,0.0,,0,,,,,0,,,,,2019-08-21,NaT
117,PD00119,13-08-2018,1.0,0.0,0.0,,0,,,,,0,,,,,2018-08-15,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2229,MNI0565,2023-09-11 00:00:00,0.0,1.0,0.0,0.0,0,,,,,0,,,,,NaT,NaT
2231,MNI0602,2023-11-22 00:00:00,1.0,0.0,0.0,0.0,0,,,,,0,,,,,2023-11-22,2023-11-22
2234,MNI0605,15-12-2023,0.0,1.0,0.0,0.0,0,,,,,0,,,,,NaT,NaT
2236,MNI0607,00:00:00,1.0,0.0,0.0,0.0,0,,,,,0,,,,,2023-11-30,2023-11-30


In [24]:
suivi_updrs_participants = suivi_df[(suivi_df["MDS-UPDRS_III_v01_date"].notna() | (suivi_df["MDS-UPDRS_complete_date"].notna()))]["participant_id"]
n_suivi_updrs_participants = len(suivi_updrs_participants)
print(f"Number of participants with UPDRS data in suivi: {n_suivi_updrs_participants}")

Number of participants with UPDRS data in suivi: 214


In [25]:
len(list(set(suivi_updrs_participants) & set(paper_participants)))

211

In [26]:
len(list(set(suivi_updrs_participants) & set(updrs_missing_participants)))

22

In [28]:
missing_updrs_participants_df = pd.DataFrame(list(updrs_missing_participants), columns=["participant_id"])
missing_updrs_participants_df["Suivi_date_present"] = "no"
missing_updrs_participants_df.loc[missing_updrs_participants_df["participant_id"].isin(suivi_updrs_participants),"Suivi_date_present"] = "yes"
missing_updrs_participants_df = pd.merge(missing_updrs_participants_df, demo_var_df, on="participant_id", how="left")
missing_updrs_participants_df


Unnamed: 0,participant_id,Suivi_date_present,dob,redcap_event_name,dob_secondary,group,sex,education
0,MNI0352,no,1977-09-30,Baseline (Arm 1: C-OPN),NaT,Healthy control/Contrôle,Female/Féminin,
1,MNI0352,no,NaT,12 Months Follow-Up/Suivi (Arm 1: C-OPN),NaT,,,
2,MNI0352,no,NaT,18 Months Follow-Up/Suivi (Arm 1: C-OPN),NaT,,,
3,MNI0352,no,NaT,24 Months Follow-Up/Suivi (Arm 1: C-OPN),NaT,,,
4,PD01643,no,1963-08-31,Baseline (Arm 1: C-OPN),1963-08-31,PPS (Parkinson Plus Syndrome)/PPS (Syndrome Pa...,Male/Masculin,
...,...,...,...,...,...,...,...,...
390,PD01182,no,NaT,,NaT,,,
391,PD01160,yes,1957-01-13,Baseline (Arm 1: C-OPN),1957-01-13,PD (Parkinson's Disease)/Maladie de Parkinson,Male/Masculin,11.0
392,PD01160,yes,1957-01-13,24 Months Follow-Up/Suivi (Arm 1: C-OPN),1957-01-13,,,
393,PD01160,yes,1957-01-13,12 Months Follow-Up/Suivi (Arm 1: C-OPN),1957-01-13,,,


In [29]:
missing_updrs_participants_df_PD = missing_updrs_participants_df[
                                                                # (missing_updrs_participants_df["Suivi_date_present"]=="yes") & 
                                                                (missing_updrs_participants_df["group"]=="PD") &
                                                                (missing_updrs_participants_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)")
                                                                ]
len(missing_updrs_participants_df_PD)


0

In [30]:
missing_updrs_participants_df_PD.reset_index().drop(columns=["index"])

Unnamed: 0,participant_id,Suivi_date_present,dob,redcap_event_name,dob_secondary,group,sex,education


In [31]:
missing_updrs_participants_df_PD.to_csv(f"{data_release_dir}/tabular/recruitment/missing_updrs_participants_after_hackathon.csv", index=False)