In [2]:
import pandas as pd
import numpy as np
import requests
import json
import time
import pickle
import seaborn as sns

### Helper functions: REDCap API call, inventory counts, and config-driven data loading

In [131]:
def api_call(url, query, logger=None):
    """ helper function to make API calls to RedCap

    Sends `query` as form data in a POST request to `url`.

    Args:
        url: endpoint the request is posted to.
        query: payload passed to `requests.post(..., data=query)`.
        logger: optional logger-like object with `info` / `error` methods.
            When None (the default) status messages are printed instead.

    Returns:
        A DataFrame built from the JSON body when the HTTP status is 200,
        otherwise None (callers must check for None before using the result).
    """
    def _report(message, failed=False):
        # Route status messages to the logger when one is supplied; fall back
        # to print so the default behaviour is unchanged.
        if logger is None:
            print(message)
        elif failed:
            logger.error(message)
        else:
            logger.info(message)

    # NOTE(review): verify=False disables TLS certificate verification for this
    # request. Kept as-is to avoid breaking the current setup, but it should be
    # replaced by a CA bundle path if the server certificate can be validated.
    r = requests.post(url, data=query, verify=False)
    http_status = str(r.status_code)
    _report(f'HTTP Status: {http_status}')

    if http_status != "200":
        _report(f"RedCap API request Failed with HTTP Status: {http_status}", failed=True)
        return None

    query_results = r.json()
    return pd.DataFrame(query_results)

def get_inventory_count(df, index_col, availability_indicators):
    """ helper function to count participants with recorded data in redcap

    Args:
        df: DataFrame holding `index_col` plus one column per assessment.
        index_col: name of the column identifying a record/participant.
        availability_indicators: either the string 'number', meaning a cell
            counts as available when it is non-empty (empty strings are
            treated as missing and the columns are cast to float), or a
            list-like of values that mark a cell as available.

    Returns:
        dict mapping each assessment column to the number of unique
        `index_col` values having at least one available cell in that column.
        The input frame is not modified.
    """
    assess_cols = df.columns.drop(index_col)

    # Decide the mode once. The isinstance guard keeps list-like indicators
    # (e.g. a NumPy array) from being compared element-wise against 'number',
    # which would make the `if` below ambiguous.
    numeric_mode = isinstance(availability_indicators, str) and availability_indicators == 'number'

    if numeric_mode:
        # replace() returns a new frame, so the caller's data is untouched.
        df = df.replace("", np.nan)
        df[assess_cols] = df[assess_cols].astype(np.float64)

    inventory = {}
    for col in assess_cols:
        if numeric_mode:
            has_data = df[col].notna()
        else:
            has_data = df[col].isin(availability_indicators)
        inventory[col] = df.loc[has_data, index_col].nunique()
    return inventory

def get_available_data(config_json,DATASET_ROOT,var_name,preferred_var_source=None):
    """ Get data for a given variable from one of its configured sources

    The config JSON must provide `data_sources[source][instrument]` entries
    with `path` and `index_cols`, and `variables[var_name]` with `type`,
    `sources`, `primary_source` and `primary_instrument`.

    Args:
        config_json: path to the JSON config file.
        DATASET_ROOT: directory that the configured file `path` is relative to.
        var_name: key of the variable under `variables` in the config.
        preferred_var_source: optional dict with keys "data_source" and
            "instrument" overriding the variable's primary source.

    Returns:
        DataFrame with the instrument's index columns followed by the
        variable's column(s), in that order. When the variable maps to a
        single column it is renamed to `var_name`, and parsed as datetime
        (unparseable values become NaT) if the variable type is "date".
        Returns None when the preferred source or instrument is not
        configured for the variable.
    """
    # Context manager so the config file handle is closed deterministically.
    with open(config_json) as config_file:
        config_data = json.load(config_file)
    data_sources = config_data['data_sources']
    variable_info = config_data['variables'][var_name]
    variable_type = variable_info["type"]
    variable_sources = variable_info["sources"]

    if preferred_var_source:
        preferred_var_data_source = preferred_var_source["data_source"]
        preferred_var_instrument = preferred_var_source["instrument"]

        if preferred_var_data_source not in variable_sources:
            print(f"Preferred data source {preferred_var_data_source} not available for variable {var_name}")
            return None
        selected_var_source = preferred_var_data_source

        if preferred_var_instrument not in variable_sources[selected_var_source]:
            print(f"Preferred var instrument {preferred_var_instrument} not available for variable {var_name}")
            return None
        selected_var_instrument = preferred_var_instrument

    else:
        selected_var_source = variable_info['primary_source']
        selected_var_instrument = variable_info['primary_instrument']

    print(f"Using variable {var_name} from source {selected_var_source} and instrument {selected_var_instrument}")

    external_var_cols = variable_sources[selected_var_source][selected_var_instrument]

    # Locate the file and index columns of the selected source
    source_info = data_sources[selected_var_source][selected_var_instrument]
    var_file = source_info["path"]
    var_file_path = f"{DATASET_ROOT}/{var_file}"
    var_file_index = source_info["index_cols"]

    var_df = pd.read_csv(var_file_path)

    # De-duplicate while preserving order (index columns first). A set would
    # make the returned column order vary between runs.
    selected_var_cols = list(dict.fromkeys(var_file_index + external_var_cols))

    # Explicit copy: the column assignment below must not target a view of
    # the frame returned by read_csv.
    var_df = var_df[selected_var_cols].copy()

    if len(external_var_cols) == 1:
        source_col = external_var_cols[0]
        if variable_type == "date":
            var_df[source_col] = pd.to_datetime(var_df[source_col], errors="coerce", dayfirst=False)
        var_df = var_df.rename(columns={source_col: var_name})

    return var_df


### Paths


In [132]:
# Root directory of the dataset on the local machine.
# NOTE(review): absolute, user-specific path — TODO make this configurable.
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Release directory and release tag used to locate tabular data.
# NOTE(review): DATASET_ROOT already ends with "/", so the f-strings below
# produce doubled slashes (".../qpn//releases//Jan_2024/").
release_dir = f"{DATASET_ROOT}/releases/"
current_release = "Jan_2024"

tabular_data_release_dir = f"{release_dir}/{current_release}/"

# JSON configs passed to get_available_data() in later cells.
demo_config_json = "../workflow/tabular/demographics.json"
pheno_config_json = "../workflow/tabular/pheno.json"
# NOTE(review): later cells read `redcap_global_records_query_csv` and
# `current_nipoppy_manifest_csv`, which are not assigned anywhere in the
# visible notebook — TODO define them here so a fresh kernel can run all cells.


### standardized index names

In [133]:
# Standardized column names used across cells
participant_id = 'participant_id'
redcap_event_name = 'redcap_event_name'
visit = 'visit'
session = 'session'

## redcap event name variations
# Read the config inside a context manager so the file handle is closed.
with open(demo_config_json) as config_file:
    config_data = json.load(config_file)
data_sources = config_data['data_sources']
redcap_data_sources = data_sources['redcap']

# Map every instrument-specific (record id, event name) column pair onto the
# standardized names above; the first two `index_cols` entries are used.
redcap_field_name_map = {}

for instrument in redcap_data_sources.keys():
    index_cols = redcap_data_sources[instrument]['index_cols']
    record_id = index_cols[0]
    event_name = index_cols[1]

    redcap_field_name_map[record_id] = participant_id
    redcap_field_name_map[event_name] = redcap_event_name

redcap_field_name_map


{'Record ID:': 'participant_id',
 'Event Name': 'redcap_event_name',
 'record_id': 'participant_id',
 'redcap_event_name': 'redcap_event_name'}

### Available participants

In [134]:
# Participant list comes from the variable "participant_id" in the demographics config
QPN_participants_df = get_available_data(
    config_json=demo_config_json,
    DATASET_ROOT=tabular_data_release_dir,
    var_name="participant_id",
)
QPN_participants = QPN_participants_df[participant_id].unique()
n_participants = len(QPN_participants)
print(f"Number of participants: {n_participants}")

Using variable participant_id from source local and instrument manifest
Number of participants: 303


### Fetch demographic data

In [136]:
demo_vars = ["group", "sex", "dob"]
# preferred_var_source = {"data_source":"local","instrument":"legacy_DOB"}

# Each variable is loaded separately, restricted to the known participants,
# and outer-merged on (participant, event) into one demographics table.
merge_keys = [participant_id, redcap_event_name]

demo_var_df = pd.DataFrame()
for var in demo_vars:
    _df = get_available_data(demo_config_json, tabular_data_release_dir, var)
    _df = _df.rename(columns=redcap_field_name_map)
    _df = _df.loc[_df[participant_id].isin(QPN_participants)].copy()
    demo_var_df = (
        _df
        if demo_var_df.empty
        else demo_var_df.merge(_df, on=merge_keys, how="outer")
    )

demo_var_df.head()

Using variable group from source redcap and instrument COPN_MRI_export
Using variable sex from source redcap and instrument COPN_MRI_export
Using variable dob from source redcap and instrument COPN_MRI_export


Unnamed: 0,participant_id,group,redcap_event_name,sex,dob
0,MNI0028,PD (Parkinson's Disease)/Maladie de Parkinson,Baseline (Arm 1: C-OPN),Male/Masculin,1963-07-27
1,MNI0056,PD (Parkinson's Disease)/Maladie de Parkinson,Baseline (Arm 1: C-OPN),Male/Masculin,1942-05-21
2,MNI0058,PD (Parkinson's Disease)/Maladie de Parkinson,Baseline (Arm 1: C-OPN),Male/Masculin,1964-03-14
3,MNI0068,PD (Parkinson's Disease)/Maladie de Parkinson,Baseline (Arm 1: C-OPN),Male/Masculin,1952-05-08
4,MNI0079,PD (Parkinson's Disease)/Maladie de Parkinson,Baseline (Arm 1: C-OPN),Female/Féminin,1971-11-25


In [115]:
# Variable to fetch from the pheno config. Other keys tried here:
# "diagnosis", "MRI_date", "updrs_score", "moca_score", "diagnosis_date", "updrs_date", "moca_date"
var_name = "moca_date"

# preferred_var_source = {"data_source":"redcap","instrument":"global_query"}

var_df = get_available_data(
    config_json=pheno_config_json,
    DATASET_ROOT=tabular_data_release_dir,
    var_name=var_name,
)

var_df.head()

SyntaxError: invalid syntax (896292328.py, line 1)

In [52]:
# REDCap connection settings (URL and named queries) live in a JSON file
# under the dataset root rather than in the notebook.
redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"
# Context manager so the config file handle is closed after reading.
with open(redcap_config_json) as redcap_config_file:
    redcap_config = json.load(redcap_config_file)
url = redcap_config["url"]
global_records_query = redcap_config["queries"]["global_records_query"]

# Column identifying a record in the REDCap exports
index_col = "record_id"

In [53]:
# Export the "QPN MoCA-UPDRS-Neuropsy data_Sarah" report to the release folder.
# The report query was previously loaded but never sent, so the global-records
# export was being written under the report's filename.
sarah_query = redcap_config["queries"]["QPN MoCA-UPDRS-Neuropsy data_Sarah"]
sarah_report_csv = f"{release_dir}{current_release}/tabular/redcap/QPN MoCA-UPDRS-Neuropsy data_Sarah.csv"

sarah_report_df = api_call(url, sarah_query, logger=None)
if sarah_report_df is None:
    # api_call returns None on a non-200 response; fail with a clear message.
    raise RuntimeError("RedCap report export failed; nothing written to " + sarah_report_csv)
sarah_report_df.to_csv(sarah_report_csv, index=False)

# `query_df` keeps holding the global records export, which the cells below
# inspect (record counts, column searches).
query_df = api_call(url, global_records_query, logger=None)



HTTP Status: 200


In [144]:
# Summarize the export: distinct record ids and available columns
record_id_values = query_df["record_id"]
redcap_records = record_id_values.unique()
n_records = record_id_values.nunique()
redcap_cols = query_df.columns

print(f"Number of records: {n_records}")
print(f"Number of columns: {len(redcap_cols)}")

Number of records: 1394
Number of columns: 1111


In [70]:
# Columns whose name contains the substring
substr = "date"
res = [column_name for column_name in redcap_cols if substr in column_name]

# Non-null entries of moca_date_1 after dropping rows holding an empty string
has_moca_date = ~(query_df["moca_date_1"] == "")
n_dates = query_df.loc[has_moca_date, "moca_date_1"].count()
print(f"Number of records with MoCA dates: {n_dates}")
query_df[res].head()

Number of records with MoCA dates: 243


0
1
2
3
4


### Redcap API call

In [78]:
# Toggle: read the cached CSV export, or query the API and refresh the cache
load_local_redcap_records = True

if not load_local_redcap_records:
    query_df = api_call(url, global_records_query, logger=None)
    query_df.to_csv(redcap_global_records_query_csv, index=False)
else:
    query_df = pd.read_csv(redcap_global_records_query_csv)

record_id_values = query_df["record_id"]
redcap_records = record_id_values.unique()
n_records = record_id_values.nunique()
redcap_cols = query_df.columns

print(f"Number of records: {n_records}")
print(f"Number of columns: {len(redcap_cols)}")


## Get inventory of redcap records for various assessments
# record_dict collects, per label, the array of record ids having that data
record_dict = {"all_redcap_records": redcap_records}


Number of records: 1394
Number of columns: 1111


  query_df = pd.read_csv(redcap_global_records_query_csv)


In [79]:
# Columns whose name contains the substring
substr = "diagnosis"
res = [column_name for column_name in redcap_cols if substr in column_name]

# Non-null entries of moca_date_1 after dropping rows holding an empty string
has_moca_date = ~(query_df["moca_date_1"] == "")
n_dates = query_df.loc[has_moca_date, "moca_date_1"].count()
print(f"Number of records with MoCA dates: {n_dates}")
query_df[res].head()

Number of records with MoCA dates: 220


Unnamed: 0,diagnosis_determined,diagnosis_pd,diagnosis_pdplus,diagnosis_probability,diagnosis_confirmation,diagnosis_dementia,diagnosis_change_t2,diagnosis_determined_t2,diagnosis_change_chart_t2,diagnosis_t2,diagnosis_probability_2,diagnosis_pps_spec,diagnosis_confirmation_2,diagnosis_dementia_t2_t3
0,,,,,,,,,,,,,,
1,0.0,Yes/Oui,,,,No/Non,,,,,,,,
2,,,,,,,No/Non,,,,,,,
3,,,,,,,,,,,,,,
4,0.0,Yes/Oui,,Unknown/inconnu,General neurologist/neurologue général,No/Non,,,,,,,,


In [80]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]
n_records = _df["record_id"].nunique()
print(f"Number of records: {n_records}")

# Cell values that count as "data available"
availability_indicators = ["Checked", "Yes/Oui", "Uncertain/Incertain"]
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per assessment column, most populated first
qpn_redcap_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

qpn_redcap_df.head(10)

Number of records: 1394


Unnamed: 0,Assessment,Count
1,diagnosis_pd,627
8,diagnosis_change_chart_t2,17
6,diagnosis_change_t2,11
5,diagnosis_dementia,3
13,diagnosis_dementia_t2_t3,3
0,diagnosis_determined,0
2,diagnosis_pdplus,0
3,diagnosis_probability,0
4,diagnosis_confirmation,0
7,diagnosis_determined_t2,0


### Find QPN records

In [57]:
# Columns whose name contains the substring
substr = "qpn"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df.loc[:, res].head()

Unnamed: 0,bio_fluid_qpn___1,bio_fluid_qpn___2,bio_fluid_qpn___3,bio_fluid_qpn___4,bio_fluid_qpn___5,qpn_moca_completed___1,qpn_moca_completed___2,qpn_moca_completed___3,qpn_moca_completed___4,qpn_moca_completed___5,...,qpn_intensity_t2,qpn_intensity_2_t2,qpn_intensity_3_t2,qpn_mri_2_t2,qpn_poids_t2,qpn_symptoms_progress_t2,qpn_symptoms_remission_t2,qpn_other_symptoms_t2,qpn_symptoms_t2,qpn_mental_health_t2
0,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,...,,,,,,,,,,
1,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,...,,,,,,,,,,
2,,,,,,,,,,,...,1.0,1.0,1.0,No/Non,,,,,,
3,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,...,,,,,,,,,,
4,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,...,,,,,,,,,,


In [58]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]
n_records = _df["record_id"].nunique()
print(f"Number of records: {n_records}")

# Cell values that count as "data available"
availability_indicators = ["Checked", "Yes/Oui", "Uncertain/Incertain"]
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per assessment column, most populated first
qpn_redcap_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

qpn_redcap_df.head()


Number of records: 1394


Unnamed: 0,Assessment,Count
48,qpn_language___2,623
108,qpn_mri,566
88,qpn_symptoms_progress,565
47,qpn_language___1,478
97,qpn_other_symptoms___8,434


### Find records with MRI data

In [59]:
# Columns whose name contains the substring
substr = "mri"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df.loc[:, res].head()

Unnamed: 0,mri_consent,qpn_mri_completed___1,qpn_mri_completed___2,qpn_mri_completed___3,qpn_mri_completed___6,qpn_mri_completed___7,qpn_mri_completed___8,qpn_mri_completed___4,qpn_mri_completed___5,dicom_mri_1,dicom_mri_2,dicom_mri_3,dicom_mri_4,dicom_mri_5,dicom_mri_collins,dicom_mri_doyon,dicom_mri_hbhl,qpn_mri,qpn_mri_2,qpn_mri_2_t2
0,,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,,,,,,,,,,,
1,,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,,,,,,,,,Yes/Oui,No/Non,
2,,,,,,,,,,,,,,,,,,,,No/Non
3,,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,,,,,,,,,,,
4,,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,,,,,,,,,Yes/Oui,No/Non,


In [60]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]
n_records = _df["record_id"].nunique()
print(f"Number of records: {n_records}")

# Cell values that count as "data available"
availability_indicators = ["Checked", "Yes/Oui", "Uncertain/Incertain"]
qpn_mri_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per assessment column, most populated first
qpn_mri_inventory_df = (
    pd.DataFrame(list(qpn_mri_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# Record ids whose qpn_mri value is one of the availability indicators
has_qpn_mri = query_df["qpn_mri"].isin(availability_indicators)
qpn_mri_redcap_participants = query_df.loc[has_qpn_mri, "record_id"].unique()
print(f"Number of participants with MRI data: {len(qpn_mri_redcap_participants)}")

record_dict["qpn_mri"] = qpn_mri_redcap_participants

qpn_mri_inventory_df.head()

Number of records: 1394
Number of participants with MRI data: 566


Unnamed: 0,Assessment,Count
17,qpn_mri,566
18,qpn_mri_2,174
1,qpn_mri_completed___1,100
7,qpn_mri_completed___4,54
19,qpn_mri_2_t2,41


### Find records with demographics data


#### Age

In [61]:
# Columns containing `substr`, minus those also containing `ignore_substr`
substr = "age"
ignore_substr = "language"
res = [
    column_name
    for column_name in redcap_cols
    if substr in column_name and ignore_substr not in column_name
]
query_df.loc[:, res].head()

Unnamed: 0,age_blood_draw,study_visit_age,age_onset,age_onset_4,age_onset_2,age_onset_3
0,,,,,,
1,70.0,70.0,66.0,,59.0,
2,,,,,,
3,,,,,,
4,71.0,71.0,60.0,,57.0,


In [62]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# 'number' mode: a cell counts as available when it is non-empty
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "study_visit_age"
age_redcap_participants = _df.loc[_df[var_name].notna(), "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(age_redcap_participants)}")
record_dict["study_visit_age"] = age_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with study_visit_age data: 585


Unnamed: 0,Assessment,Count
1,study_visit_age,585
4,age_onset_2,535
2,age_onset,530
0,age_blood_draw,510
3,age_onset_4,0


#### Sex

In [63]:
# Columns whose name contains the substring
substr = "gender"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df.loc[:, res].head()

Unnamed: 0,gender
0,
1,Male/Masculin
2,
3,
4,Female/Féminin


In [64]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Values that count as a recorded gender
availability_indicators = ['Male/Masculin', 'Female/Féminin', 'Transgender/Transsexuel']
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "gender"
has_gender = _df[var_name].isin(availability_indicators)
gender_redcap_participants = _df.loc[has_gender, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(gender_redcap_participants)}")
record_dict["gender"] = gender_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with gender data: 962


Unnamed: 0,Assessment,Count
0,gender,962


#### Dx

In [65]:
# Enrolment group is used as the diagnosis column here
# ("diagnosis_determined" is an integer field, so it is not used)
res = ["enrolment_group_v2"]
query_df.loc[:, res].head()

Unnamed: 0,enrolment_group_v2
0,
1,PD (Parkinson's Disease)/Maladie de Parkinson
2,
3,
4,PD (Parkinson's Disease)/Maladie de Parkinson


In [66]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Group labels that count as a recorded diagnosis.
# NOTE(review): the PD label contains three spaces after "PD"; confirm it
# matches the raw values exactly, since rendered tables may collapse whitespace.
availability_indicators = [
    "PD   (Parkinson's Disease)/Maladie de Parkinson",
    'Healthy control/Contrôle',
    'PPS (Parkinson Plus Syndrome)/PPS (Syndrome Parkinson Plus)',
]

qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "enrolment_group_v2"
has_dx = _df[var_name].isin(availability_indicators)
dx_redcap_participants = _df.loc[has_dx, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(dx_redcap_participants)}")
record_dict["Dx"] = dx_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with enrolment_group_v2 data: 1323


Unnamed: 0,Assessment,Count
0,enrolment_group_v2,1323


### Find records with phenotypic data

#### Find records with updrs data

In [67]:
# Columns whose name contains the substring
substr = "updrs_score"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df.loc[:, res].head()

Unnamed: 0,updrs_score_part_1,updrs_score_part_2,updrs_score_part_3,updrs_score_part_4
0,,,,
1,11.0,8.0,36.0,0.0
2,7.0,12.0,41.0,0.0
3,,,,
4,6.0,11.0,46.0,7.0


In [68]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# 'number' mode: a cell counts as available when it is non-empty
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "updrs_score_part_3"
updrs_redcap_participants = _df.loc[_df[var_name].notna(), "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(updrs_redcap_participants)}")

record_dict["updrs_score_part_3"] = updrs_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with updrs_score_part_3 data: 426


Unnamed: 0,Assessment,Count
2,updrs_score_part_3,426
0,updrs_score_part_1,424
1,updrs_score_part_2,423
3,updrs_score_part_4,422


#### MoCA

In [69]:
# MoCA total columns inspected below
res = ["moca_result", "moca_calculation"]
query_df.loc[:, res].head()

Unnamed: 0,moca_result,moca_calculation
0,,
1,24.0,23.0
2,30.0,30.0
3,,
4,22.0,22.0


In [70]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# 'number' mode: a cell counts as available when it is non-empty
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "moca_result"
moca_redcap_participants = _df.loc[_df[var_name].notna(), "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(moca_redcap_participants)}")

record_dict["moca_result"] = moca_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with moca_result data: 528


Unnamed: 0,Assessment,Count
0,moca_result,528
1,moca_calculation,523


#### Neurocog

In [71]:
# Columns containing `substr`, minus those also containing `ignore_substr`
substr = "neurocog"
ignore_substr = "date"
res = [
    column_name
    for column_name in redcap_cols
    if substr in column_name and ignore_substr not in column_name
]
query_df.loc[:, res].head()

Unnamed: 0,qpn_neurocognitive___1,qpn_neurocognitive___2,qpn_neurocognitive___3,qpn_neurocognitive___4,qpn_neurocognitive___5
0,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked
1,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked
2,,,,,
3,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked
4,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked


In [72]:
# Restrict to the record id plus the columns selected in the previous cell
_df = query_df.loc[:, [index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Cell values that count as "data available"
availability_indicators = ['Checked', 'Yes/Oui', 'Uncertain/Incertain']
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = (
    pd.DataFrame(list(qpn_redcap_inventory.items()), columns=["Assessment", "Count"])
    .sort_values(by="Count", ascending=False)
)

# QPN overlap
var_name = "qpn_neurocognitive___1"
has_neurocog = _df[var_name].isin(availability_indicators)
neurocog_redcap_participants = _df.loc[has_neurocog, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(neurocog_redcap_participants)}")

record_dict["qpn_neurocognitive"] = neurocog_redcap_participants

inventory_df.head()

n_records:1394
Number of participants with qpn_neurocognitive___1 data: 0


Unnamed: 0,Assessment,Count
0,qpn_neurocognitive___1,0
1,qpn_neurocognitive___2,0
2,qpn_neurocognitive___3,0
3,qpn_neurocognitive___4,0
4,qpn_neurocognitive___5,0


### QPN manifest based on Suivi data

In [73]:
# Visits / sessions of interest
visit_list = ["MRI_v1"]
session_id_list = ["01"]
session_list = [f"ses-{idx}" for idx in session_id_list]

manifest_cols = ["participant_id", "visit", "session"]

# Load the manifest, keep rows for the selected visit and session, keep the
# manifest columns, and upper-case the participant ids.
current_nipoppy_manifest_df = pd.read_csv(current_nipoppy_manifest_csv)
keep_rows = (
    current_nipoppy_manifest_df["visit"].isin(visit_list)
    & current_nipoppy_manifest_df["session"].isin(session_list)
)
current_nipoppy_manifest_df = (
    current_nipoppy_manifest_df.loc[keep_rows, manifest_cols]
    .assign(participant_id=lambda frame: frame["participant_id"].str.upper())
)

nipoppy_participants = current_nipoppy_manifest_df["participant_id"].unique()
n_participants = len(nipoppy_participants)
print(f"n_participants: {n_participants}")

record_dict["nipoppy_participants"] = nipoppy_participants

current_nipoppy_manifest_df.head()

n_participants: 303


Unnamed: 0,participant_id,visit,session
0,PD00016,MRI_v1,ses-01
1,PD00020,MRI_v1,ses-01
2,PD00032,MRI_v1,ses-01
3,PD00048,MRI_v1,ses-01
4,PD00119,MRI_v1,ses-01


### Get overlap between redcap and nipoppy

In [74]:
# For every inventory entry, report its size and how many of its ids are
# also nipoppy participants.
nipoppy_participant_set = set(nipoppy_participants)
for k, v in record_dict.items():
    n_overlap = len(nipoppy_participant_set & set(v))
    print(f"{k}: {len(v)}")
    print(f"\tOverlap: {n_overlap}")

all_redcap_records: 1394
	Overlap: 291
qpn_mri: 566
	Overlap: 152
study_visit_age: 585
	Overlap: 176
gender: 962
	Overlap: 277
Dx: 1323
	Overlap: 286
updrs_score_part_3: 426
	Overlap: 159
moca_result: 528
	Overlap: 202
qpn_neurocognitive: 0
	Overlap: 0
nipoppy_participants: 303
	Overlap: 303


In [136]:
# Columns whose name contains the substring, shown next to the record id
substr = "moca"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df.loc[:, [index_col, *res]].head()

Unnamed: 0,record_id,qpn_moca_completed___1,qpn_moca_completed___2,qpn_moca_completed___3,qpn_moca_completed___4,qpn_moca_completed___5,moca_date_1,moca_date_2,moca_date_3,moca_date_4,...,moca_result_4,moca_result_5,moca_result_6,moca_result_7,moca_result_8,moca_result_9,moca_result,moca_calculation,moca_extra_point,moca_complete
0,CHQ0009,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,NaT,NaT,NaT,,...,,,,,,,,,,Incomplete
1,CHQ0011,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,NaT,NaT,NaT,,...,No points/Pas de point,4.0,3.0,2.0,3.0,6.0,24.0,23.0,Yes/Oui,Complete
2,CHQ0011,,,,,,NaT,NaT,NaT,,...,No points/Pas de point,6.0,3.0,2.0,5.0,6.0,30.0,30.0,No/Non,Complete
3,CHQ0012,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,NaT,NaT,NaT,,...,,,,,,,,,,Incomplete
4,CHQ0035,Unchecked,Unchecked,Unchecked,Unchecked,Unchecked,NaT,NaT,NaT,,...,No points/Pas de point,6.0,2.0,1.0,2.0,6.0,22.0,22.0,No/Non,Complete


In [126]:
date_cols = [
    "irm1_date",
    "part3_date_1",
    "full_mds_date_1", "full_mds_date_2", "full_mds_date_3",
    "moca_date_1", "moca_date_2", "moca_date_3",
]

# Parse each date column in place (unparseable values become NaT) and report
# how many distinct records have a date in it.
for col in date_cols:
    query_df[col] = pd.to_datetime(query_df[col], errors='coerce')
    n_available_dates = query_df.loc[query_df[col].notna(), index_col].nunique()
    print(f"{col}: {n_available_dates}")

irm1_date: 100
part3_date_1: 183
full_mds_date_1: 108
full_mds_date_2: 25
full_mds_date_3: 4
moca_date_1: 220
moca_date_2: 86
moca_date_3: 19


In [140]:
# Rows that have a third MoCA date, shown with the MRI date, all MoCA dates
# and the MoCA scores. The filter column is named explicitly instead of
# relying on `col`, a variable left over from the previous cell's for-loop
# (its last value was "moca_date_3").
moca_timeline_cols = [index_col, "irm1_date", "moca_date_1", "moca_date_2", "moca_date_3", "moca_result_5", "moca_result", "moca_calculation"]
query_df.loc[query_df["moca_date_3"].notna(), moca_timeline_cols]


Unnamed: 0,record_id,irm1_date,moca_date_1,moca_date_2,moca_date_3,moca_result_5,moca_result,moca_calculation
602,PD00020,2018-05-12,2016-08-10,2019-05-02,2021-09-07,6.0,22.0,22.0
646,PD00119,2018-08-13,2016-07-01,2017-12-07,2018-08-15,5.0,29.0,29.0
684,PD00208,NaT,2016-07-21,2017-05-01,2019-06-21,6.0,26.0,26.0
687,PD00215,2019-10-04,2019-10-04,2022-11-03,2023-07-13,6.0,29.0,29.0
740,PD00339,2021-10-07,2021-10-06,2023-04-26,2023-05-25,6.0,27.0,27.0
838,PD00596,NaT,2017-07-18,2018-10-22,2022-02-23,5.0,25.0,25.0
847,PD00629,NaT,2017-02-15,2017-07-20,2019-11-29,,,
881,PD00769,NaT,2017-02-15,2018-12-18,2022-08-05,6.0,26.0,26.0
893,PD00792,2019-06-20,2017-09-03,2017-05-04,2019-06-19,6.0,29.0,28.0
915,PD00834,NaT,2017-05-01,2019-07-08,2022-01-19,5.0,25.0,25.0


In [77]:
# Participants present both in the nipoppy manifest and in the UPDRS inventory
qpn_updrs_participants = set(nipoppy_participants) & set(updrs_redcap_participants)

updrs_view_cols = [index_col, "redcap_event_name", "full_mds_date_1", "updrs_score_part_3"]
updrs_df = query_df.loc[query_df[index_col].isin(qpn_updrs_participants), updrs_view_cols]
updrs_df

Unnamed: 0,record_id,redcap_event_name,full_mds_date_1,updrs_score_part_3
130,MNI0028,Baseline (Arm 1: C-OPN),NaT,29.0
136,MNI0056,Baseline (Arm 1: C-OPN),NaT,58.0
138,MNI0058,Baseline (Arm 1: C-OPN),NaT,26.0
143,MNI0068,Baseline (Arm 1: C-OPN),NaT,28.0
148,MNI0079,Baseline (Arm 1: C-OPN),NaT,22.0
...,...,...,...,...
1629,PD01756,Baseline (Arm 1: C-OPN),NaT,22.0
1630,PD01756,12 Months Follow-Up/Suivi (Arm 1: C-OPN),NaT,45.0
1631,PD01756,18 Months Follow-Up/Suivi (Arm 1: C-OPN),NaT,
1654,UDM0045,Baseline (Arm 1: C-OPN),NaT,16.0


In [78]:
# Number of participants found in both the nipoppy manifest and the UPDRS inventory
len(qpn_updrs_participants)

159

In [86]:
# Hand-picked participant ids to spot-check in the cells below
check_participants = ["MNI0369", "MNI0607", "MNI0436", "MNI0421","PD00215","MNI0342"]

In [91]:
# Which of the spot-check ids are nipoppy participants
set(nipoppy_participants) & set(check_participants)

{'MNI0342', 'MNI0369', 'MNI0421', 'MNI0436', 'MNI0607', 'PD00215'}

In [89]:
# Event and enrolment group of the spot-check ids in the REDCap export
is_checked_participant = query_df[index_col].isin(check_participants)
query_df.loc[is_checked_participant, [index_col, "redcap_event_name", "enrolment_group_v2"]]

Unnamed: 0,record_id,redcap_event_name,enrolment_group_v2
297,MNI0342,Baseline (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson
323,MNI0369,Baseline (Arm 1: C-OPN),PPS (Parkinson Plus Syndrome)/PPS (Syndrome Pa...
373,MNI0421,Baseline (Arm 1: C-OPN),Healthy control/Contrôle
555,MNI0607,Baseline (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson
687,PD00215,Baseline (Arm 1: C-OPN),PD (Parkinson's Disease)/Maladie de Parkinson
688,PD00215,18 Months Follow-Up/Suivi (Arm 1: C-OPN),
