In [None]:
import pandas as pd
import numpy as np
import requests
import json
import time
import pickle
import seaborn as sns

### Helper functions: REDCap API call and inventory counts

In [None]:
def api_call(url, query, logger=None, verify=False, timeout=None):
    """Send a query to the RedCap API and return the result as a DataFrame.

    Parameters
    ----------
    url : str
        Endpoint the request is POSTed to.
    query :
        Payload forwarded unchanged as the ``data`` argument of
        ``requests.post``.
    logger : optional
        Object exposing ``info`` and ``error`` methods (e.g. a
        ``logging.Logger``). When ``None`` the messages are printed instead.
    verify : bool or str, optional
        Forwarded to ``requests.post``. Defaults to ``False`` to keep the
        previous behaviour.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. ``None`` (the default) waits
        indefinitely, as before.

    Returns
    -------
    pandas.DataFrame or None
        The decoded JSON response wrapped in a DataFrame when the server
        answers with HTTP 200, otherwise ``None``.
    """
    # Route messages to the logger when one is supplied; fall back to print.
    log_info = logger.info if logger is not None else print
    log_error = logger.error if logger is not None else print

    # NOTE(security): verify=False turns off TLS certificate validation.
    # Pass verify=True (or a CA bundle path) whenever the server certificate
    # can be validated.
    r = requests.post(url, data=query, verify=verify, timeout=timeout)
    http_status = str(r.status_code)
    log_info(f'HTTP Status: {http_status}')

    if http_status != "200":
        log_error(f"RedCap API request Failed with HTTP Status: {http_status}")
        return None

    return pd.DataFrame(r.json())

def get_inventory_count(df, index_col, availability_indicators):
    """Count, per column, the distinct ``index_col`` values that have data.

    Every column of ``df`` other than ``index_col`` is inspected.

    * If ``availability_indicators`` equals the string ``'number'``, empty
      strings are turned into NaN, the inspected columns are cast to
      float64, and a row counts as available when its value is not NaN.
    * Otherwise a row counts as available when its value is one of
      ``availability_indicators``.

    Returns a dict mapping column name -> number of unique ``index_col``
    values among the available rows.
    """
    assessment_columns = df.columns.drop(index_col)
    numeric_mode = availability_indicators == 'number'

    if numeric_mode:
        # replace() returns a new frame, so the caller's frame is left as is.
        df = df.replace("", np.nan)
        df[assessment_columns] = df[assessment_columns].astype(np.float64)

    def _count_available(column):
        if numeric_mode:
            has_data = df[column].notna()
        else:
            has_data = df[column].isin(availability_indicators)
        return df[has_data][index_col].nunique()

    return {column: _count_available(column) for column in assessment_columns}

### Data paths

In [None]:
# Root directory of the dataset; every path below is derived from it.
# NOTE(review): this is an absolute, user-specific path and it ends with "/",
# so the f-strings below that add "/..." produce paths containing "//".
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy manifest and imaging codes, located under the selected release
release_dir = f"{DATASET_ROOT}/releases/"
current_release = "Jan_2024"
current_nipoppy_manifest_csv = f"{release_dir}{current_release}/tabular/manifest.csv"
legacy_qpn_imaging_codes_xlsx = f"{release_dir}{current_release}/tabular/assessments/QPN_Imaging_Codes.xlsx"

# Legacy DoB files (Roozbeh), stored outside the release directory
legacy_participants_DOB = f"{DATASET_ROOT}/tabular/legacy_freeze/QPN-DOB-90subjects.csv"
legacy_participants_DOB_codes = f"{DATASET_ROOT}/tabular/legacy_freeze/QPN-DOB-codes.csv"

# Redcap report (Sarah); the commented-out line is the earlier 2024-01-10 export
# redcap_report_csv = f"{release_dir}{current_release}/tabular/assessments/COPN-MRIDataReport2023110_DATA_LABELS_2024-01-10_1004.csv"
redcap_report_csv = f"{release_dir}{current_release}/tabular/assessments/COPN-MRIDataReport2023110_DATA_LABELS_2024-02-19_0851.csv"

# MRI dates (dicom header)
MRI_dates_csv = f"{DATASET_ROOT}/scratch/mri_dates_sanity_check.csv"
MRI_acq_data_csv = f"{release_dir}{current_release}/tabular/MRI_acqdata.csv"

# Sharp lab Neurocog dates (original and revised spreadsheets)
neurocog_date_xlsx = f"{release_dir}{current_release}/tabular/assessments/Sharp_QPN_List.xlsx"
revised_neurocog_date_xlsx = f"{release_dir}{current_release}/tabular/assessments/Sharp_QPN_List_revised.xlsx"

# Local copy of the Redcap global records query - avoids frequent API calls
redcap_global_records_query_csv = f"{release_dir}{current_release}/tabular/redcap_global_records_query.csv"

### Redcap API configs

In [None]:
# Location of the JSON file holding the RedCap endpoint and named queries.
redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"

# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open(redcap_config_json) as redcap_config_file:
    redcap_config = json.load(redcap_config_file)

url = redcap_config["url"]
global_records_query = redcap_config["queries"]["global_records_query"]

# Column that identifies a record in the query results.
index_col = "record_id"

In [None]:
# Named query for the MoCA / UPDRS / neuropsych report; loaded but not used in this cell.
sarah_query = redcap_config["queries"]["QPN MoCA-UPDRS-Neuropsy data_Sarah"]

# NOTE(review): this cell always queries the API, even though a later cell
# can load the same records from the local CSV cache.
query_df = api_call(url, global_records_query, logger=None)

# api_call returns None on a non-200 response; stop here with a clear message
# instead of failing later on a None subscript.
if query_df is None:
    raise RuntimeError("RedCap global records query failed; see the HTTP status reported above.")

In [None]:
# Summarise the query result: distinct record ids and available columns.
record_id_values = query_df["record_id"]
redcap_records = record_id_values.unique()
n_records = record_id_values.nunique()
redcap_cols = query_df.columns

print(f"Number of records: {n_records}")
print(f"Number of columns: {len(redcap_cols)}")

### Load REDCap records (local cache or API call)

In [None]:
# True: read the cached CSV copy of the query. False: query the API and refresh the cache.
load_local_redcap_records = True

if load_local_redcap_records:
    query_df = pd.read_csv(redcap_global_records_query_csv)

else:
    query_df = api_call(url, global_records_query, logger=None)
    # api_call returns None on a non-200 response; without this guard the
    # to_csv call below would fail with an unrelated AttributeError.
    if query_df is None:
        raise RuntimeError("RedCap global records query failed; local cache was not updated.")
    query_df.to_csv(redcap_global_records_query_csv, index=False)

redcap_records = query_df["record_id"].unique()
n_records = query_df["record_id"].nunique()
redcap_cols = query_df.columns

print(f"Number of records: {n_records}")
print(f"Number of columns: {len(redcap_cols)}")


## Get inventory of redcap records for various assessments
# record_dict maps a label to the array of record ids that have that data.
record_dict = {}
record_dict["all_redcap_records"] = redcap_records


### Find QPN records

In [None]:
# Select every column whose name contains "qpn" and preview the first rows.
substr = "qpn"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col, *res]]
n_records = _df["record_id"].nunique()
print(f"Number of records: {n_records}")

# Cell values that count as "data available" for a record.
availability_indicators = ["Checked", "Yes/Oui", "Uncertain/Incertain"]
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per column, largest count first.
qpn_redcap_df = pd.DataFrame(
    list(qpn_redcap_inventory.items()),
    columns=["Assessment", "Count"],
)
qpn_redcap_df = qpn_redcap_df.sort_values(by="Count", ascending=False)

qpn_redcap_df.head()


### Find records with MRI data

In [None]:
# Select every column whose name contains "mri" and preview the first rows.
substr = "mri"
res = list(filter(lambda column_name: substr in column_name, redcap_cols))
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col, *res]]
n_records = _df["record_id"].nunique()
print(f"Number of records: {n_records}")

# Cell values that count as "data available" for a record.
availability_indicators = ["Checked", "Yes/Oui", "Uncertain/Incertain"]
qpn_mri_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per column, largest count first.
qpn_mri_inventory_df = pd.DataFrame(
    list(qpn_mri_inventory.items()),
    columns=["Assessment", "Count"],
)
qpn_mri_inventory_df = qpn_mri_inventory_df.sort_values(by="Count", ascending=False)

# Records whose "qpn_mri" value is one of the availability indicators.
has_qpn_mri = query_df["qpn_mri"].isin(availability_indicators)
qpn_mri_redcap_participants = query_df.loc[has_qpn_mri, "record_id"].unique()
print(f"Number of participants with MRI data: {len(qpn_mri_redcap_participants)}")

record_dict["qpn_mri"] = qpn_mri_redcap_participants

qpn_mri_inventory_df.head()

### Find records with demographics data


#### Age

In [None]:
# Select columns whose name contains "age" but not "language", then preview.
substr = "age"
ignore_substr = "language"
res = [
    column_name
    for column_name in redcap_cols
    if substr in column_name and ignore_substr not in column_name
]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col] + res]

n_records = _df[index_col].nunique()
print(f"n_records:{n_records}")

# "number" makes get_inventory_count treat any non-empty numeric value as available.
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = pd.DataFrame(qpn_redcap_inventory.items(), columns=["Assessment", "Count"]).sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "study_visit_age"
# get_inventory_count cleans only its own copy, so `_df` may still hold ""
# for missing values (string-typed API results). Coerce to numeric first so
# empty strings are not counted as available and this list matches the inventory.
_has_value = pd.to_numeric(_df[var_name], errors="coerce").notna()
age_redcap_participants = _df.loc[_has_value, index_col].unique()

print(f"Number of participants with {var_name} data: {len(age_redcap_participants)}")
record_dict["study_visit_age"] = age_redcap_participants

inventory_df.head()

#### Sex

In [None]:
# Select every column whose name contains "gender" and preview the first rows.
substr = "gender"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Cell values that count as a recorded gender.
availability_indicators = [
    'Male/Masculin',
    'Female/Féminin',
    'Transgender/Transsexuel',
]
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per column, largest count first.
inventory_df = pd.DataFrame(
    list(qpn_redcap_inventory.items()),
    columns=["Assessment", "Count"],
)
inventory_df = inventory_df.sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "gender"
has_gender = _df[var_name].isin(availability_indicators)
gender_redcap_participants = _df.loc[has_gender, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(gender_redcap_participants)}")
record_dict["gender"] = gender_redcap_participants

inventory_df.head()

#### Dx

In [None]:
# Diagnosis column to inspect. "diagnosis_determined" is left out - this is integer.
res = [
    "enrolment_group_v2",
]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Enrolment-group labels that count as a recorded diagnosis.
availability_indicators = [
    "PD   (Parkinson's Disease)/Maladie de Parkinson",
    'Healthy control/Contrôle',
    'PPS (Parkinson Plus Syndrome)/PPS (Syndrome Parkinson Plus)',
]

qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per column, largest count first.
inventory_df = pd.DataFrame(
    list(qpn_redcap_inventory.items()),
    columns=["Assessment", "Count"],
)
inventory_df = inventory_df.sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "enrolment_group_v2"
has_dx = _df[var_name].isin(availability_indicators)
dx_redcap_participants = _df.loc[has_dx, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(dx_redcap_participants)}")
record_dict["Dx"] = dx_redcap_participants

inventory_df.head()

### Find records with phenotypic data

#### Find records with updrs data

In [None]:
# Select every column whose name contains "updrs_score" and preview the first rows.
substr = "updrs_score"
res = list(filter(lambda column_name: substr in column_name, redcap_cols))
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col] + res]

n_records = _df[index_col].nunique()
print(f"n_records:{n_records}")

# "number" makes get_inventory_count treat any non-empty numeric value as available.
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = pd.DataFrame(qpn_redcap_inventory.items(), columns=["Assessment", "Count"]).sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "updrs_score_part_3"
# get_inventory_count cleans only its own copy, so `_df` may still hold ""
# for missing values (string-typed API results). Coerce to numeric first so
# empty strings are not counted as available and this list matches the inventory.
_has_value = pd.to_numeric(_df[var_name], errors="coerce").notna()
updrs_redcap_participants = _df.loc[_has_value, index_col].unique()

print(f"Number of participants with {var_name} data: {len(updrs_redcap_participants)}")

record_dict["updrs_score_part_3"] = updrs_redcap_participants

inventory_df.head()

#### MoCA

In [None]:
# MoCA columns to inspect.
res = [
    "moca_result",
    "moca_calculation",
]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col] + res]

n_records = _df[index_col].nunique()
print(f"n_records:{n_records}")

# "number" makes get_inventory_count treat any non-empty numeric value as available.
availability_indicators = "number"
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

inventory_df = pd.DataFrame(qpn_redcap_inventory.items(), columns=["Assessment", "Count"]).sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "moca_result"
# get_inventory_count cleans only its own copy, so `_df` may still hold ""
# for missing values (string-typed API results). Coerce to numeric first so
# empty strings are not counted as available and this list matches the inventory.
_has_value = pd.to_numeric(_df[var_name], errors="coerce").notna()
moca_redcap_participants = _df.loc[_has_value, index_col].unique()

print(f"Number of participants with {var_name} data: {len(moca_redcap_participants)}")

record_dict["moca_result"] = moca_redcap_participants

inventory_df.head()

#### Neurocog

In [None]:
# Select columns whose name contains "neurocog" but not "date", then preview.
substr = "neurocog"
ignore_substr = "date"
res = [
    column_name
    for column_name in redcap_cols
    if substr in column_name and ignore_substr not in column_name
]
query_df[res].head()

In [None]:
# Keep the record identifier plus the columns selected in `res`.
_df = query_df[[index_col, *res]]

n_records = _df["record_id"].nunique()
print(f"n_records:{n_records}")

# Cell values that count as "data available" for a record.
availability_indicators = ['Checked', 'Yes/Oui', 'Uncertain/Incertain']
qpn_redcap_inventory = get_inventory_count(_df, index_col, availability_indicators)

# One row per column, largest count first.
inventory_df = pd.DataFrame(
    list(qpn_redcap_inventory.items()),
    columns=["Assessment", "Count"],
)
inventory_df = inventory_df.sort_values(by="Count", ascending=False)

# QPN overlap
var_name = "qpn_neurocognitive___1"
has_neurocog = _df[var_name].isin(availability_indicators)
neurocog_redcap_participants = _df.loc[has_neurocog, "record_id"].unique()

print(f"Number of participants with {var_name} data: {len(neurocog_redcap_participants)}")

record_dict["qpn_neurocognitive"] = neurocog_redcap_participants

inventory_df.head()

### QPN manifest based on Suivi data

In [None]:
# Visits and sessions to keep from the manifest.
visit_list = ["MRI_v1"]
session_id_list = ["01"]
session_list = [f"ses-{idx}" for idx in session_id_list]

manifest_cols = ["participant_id", "visit", "session"]

current_nipoppy_manifest_df = pd.read_csv(current_nipoppy_manifest_csv)

# Filter rows and columns in one .loc step and take an explicit copy, so the
# column assignment below writes to an independent frame rather than to the
# result of chained selections (which triggers SettingWithCopyWarning).
_in_scope = (
    current_nipoppy_manifest_df["visit"].isin(visit_list)
    & current_nipoppy_manifest_df["session"].isin(session_list)
)
current_nipoppy_manifest_df = current_nipoppy_manifest_df.loc[_in_scope, manifest_cols].copy()

# Upper-case the participant ids before collecting the distinct values.
current_nipoppy_manifest_df["participant_id"] = current_nipoppy_manifest_df["participant_id"].str.upper()
nipoppy_participants = current_nipoppy_manifest_df["participant_id"].unique()
n_participants = len(nipoppy_participants)
print(f"n_participants: {n_participants}")

record_dict["nipoppy_participants"] = nipoppy_participants

current_nipoppy_manifest_df.head()

### Get overlap between redcap and nipoppy

In [None]:
# For each entry of record_dict, print its size and how many of its ids
# also appear in the nipoppy participant list.
nipoppy_participant_set = set(nipoppy_participants)
for k, v in record_dict.items():
    print(f"{k}: {len(v)}")
    n_overlap = len(nipoppy_participant_set & set(v))
    print(f"\tOverlap: {n_overlap}")

In [None]:
# Select every column whose name contains "date" and preview the first rows.
substr = "date"
res = [column_name for column_name in redcap_cols if substr in column_name]
query_df[res].head()

In [None]:
# Date columns to parse in place.
date_cols = [
    "full_mds_date_1",
    "full_mds_date_2",
    "full_mds_date_3",
    "moca_date_1",
    "moca_date_2",
    "moca_date_3",
]

for col in date_cols:
    # Values that cannot be parsed become NaT and are not counted below.
    query_df[col] = pd.to_datetime(query_df[col], errors='coerce')
    n_available_dates = query_df.loc[query_df[col].notna(), index_col].nunique()
    print(f"{col}: {n_available_dates}")

In [None]:
# Ids present both in the nipoppy manifest and among records with UPDRS data.
qpn_updrs_participants = set(nipoppy_participants) & set(updrs_redcap_participants)

# Rows for those ids, limited to the event name, first MDS date and part 3 score.
updrs_columns = [index_col, "redcap_event_name", "full_mds_date_1", "updrs_score_part_3"]
updrs_df = query_df.loc[query_df[index_col].isin(qpn_updrs_participants), updrs_columns]
updrs_df

In [None]:
# Number of ids in qpn_updrs_participants (displayed as the cell output).
len(qpn_updrs_participants)