### Notebook to prototype REDCap API calls

In [None]:
import pandas as pd
import numpy as np
import requests
import json
import nipoppy.workflow.logger as my_logger
import seaborn as sns
import nipoppy.workflow.utils as utils
from itertools import product

In [None]:
def api_call(url, query, logger):
    """POST a query to the REDCap API and return the JSON reply as a DataFrame.

    Args:
        url: REDCap API endpoint.
        query: Form payload forwarded to ``requests.post`` as ``data``.
        logger: Logger used to report the HTTP status of the request.

    Returns:
        pandas.DataFrame built from the decoded JSON response body.

    Raises:
        RuntimeError: If the server replies with an HTTP status other than 200.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm it is still required.
    r = requests.post(url, data=query, verify=False)
    http_status = str(r.status_code)
    logger.info(f'HTTP Status: {http_status}')

    if http_status != "200":
        message = f"RedCap API request Failed with HTTP Status: {http_status}"
        logger.error(message)
        # Previously the function fell through to `return query_df` with the
        # name unbound (UnboundLocalError); fail with an explicit error instead.
        raise RuntimeError(message)

    return pd.DataFrame(r.json())

In [None]:
# Root of the dataset on the local filesystem
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Release being processed
release_dir = f"{DATASET_ROOT}/releases/"
current_release = "Jan_2024"

# Tabular inputs of the current release
_tabular_dir = f"{release_dir}{current_release}/tabular"
_assessments_dir = f"{_tabular_dir}/assessments"
current_nipoppy_manifest_csv = f"{_tabular_dir}/manifest.csv"
legacy_qpn_imaging_codes_xlsx = f"{_assessments_dir}/QPN_Imaging_Codes.xlsx"
redcap_report_csv = f"{_assessments_dir}/COPN-MRIDataReport2023110_DATA_LABELS_2024-01-10_1004.csv"

# Logging
log_dir = f"{DATASET_ROOT}/scratch/logs/"
log_file = f"{log_dir}/bids_conv.log"
logger = my_logger.get_logger(log_file)


### Manifest

In [None]:
# Visits and sessions to keep from the manifest
visit_list = ["MRI_v1"]
session_id_list = ["01"]
session_list = [f"ses-{idx}" for idx in session_id_list]

manifest_cols = ["participant_id", "visit", "session"]

current_nipoppy_manifest_df = pd.read_csv(current_nipoppy_manifest_csv)

# Keep only rows matching both the requested visits and sessions
_keep_rows = (
    current_nipoppy_manifest_df["visit"].isin(visit_list)
    & current_nipoppy_manifest_df["session"].isin(session_list)
)
# Take an explicit copy so the column assignment below modifies an independent
# frame rather than a slice of the full manifest (avoids SettingWithCopyWarning).
current_nipoppy_manifest_df = current_nipoppy_manifest_df.loc[_keep_rows, manifest_cols].copy()

# Participant IDs are compared in upper case across all data sources
current_nipoppy_manifest_df["participant_id"] = current_nipoppy_manifest_df["participant_id"].str.upper()

nipoppy_participants = current_nipoppy_manifest_df["participant_id"].unique()
n_participants = len(nipoppy_participants)
logger.info(f"n_participants: {n_participants}")
current_nipoppy_manifest_df.head()

### Legacy demographics and visit dates
- Using this while REDCap is getting updated

In [None]:
# Sheet and columns to read from the legacy imaging-codes workbook
sheet_name = "recruit_manifest"
usecols = ["participant_id", "group", "sex", "dob", "visit_01"]
legacy_col_dict = {"group": "legacy_group", "sex":"legacy_sex", "dob":"legacy_dob"}

legacy_recruit_df = pd.read_excel(
    legacy_qpn_imaging_codes_xlsx,
    sheet_name=sheet_name,
    engine='openpyxl',
    usecols=usecols,
)

# Single rename pass: visit date column plus the "legacy_" prefixed demographics
legacy_recruit_df = legacy_recruit_df.rename(columns={"visit_01": "date_MRI_v1", **legacy_col_dict})
legacy_recruit_df["participant_id"] = legacy_recruit_df["participant_id"].str.upper()

legacy_participants = legacy_recruit_df["participant_id"].unique()
n_legacy_participants = len(legacy_participants)
logger.info(f"n_legacy_participants: {n_legacy_participants}")

# Parse dates day-first; unparseable entries become NaT
for _date_col in ("legacy_dob", "date_MRI_v1"):
    legacy_recruit_df[_date_col] = pd.to_datetime(legacy_recruit_df[_date_col], errors="coerce", dayfirst=True)

# Age at the MRI v1 visit, in years rounded to one decimal
_days_since_birth = (legacy_recruit_df["date_MRI_v1"] - legacy_recruit_df["legacy_dob"]).dt.days
legacy_recruit_df["age_MRI_v1"] = np.round(_days_since_birth / 365.25, 1)
legacy_recruit_df.head()


In [None]:
# Count the non-null entries of every column within each legacy group
legacy_recruit_df.groupby("legacy_group").count()

### Redcap config
QPN specific reports:
['QPN participants', 'External QPN (June 2021)', 'Demographic QPN', 'QPN-Clinical questionnaire', 'QPN sex', 'Diagnosis QPN', 'MoCA-MDS-UPDRS part 3', 'Victoria - Weston Project', 'MotorAndNon-Motor', 'MoCA']

In [None]:
# REDCap API configuration: endpoint URL plus one query payload per report
redcap_config_json = f"{DATASET_ROOT}/proc/.redcap.json"
# Use a context manager so the file handle is closed once the JSON is parsed
with open(redcap_config_json) as _config_file:
    redcap_config = json.load(_config_file)
url = redcap_config["url"]
redcap_reports = list(redcap_config["queries"].keys())
n_redcap_reports = len(redcap_reports)
logger.info(f"redcap_reports ({n_redcap_reports}): {redcap_reports}")

In [None]:
# Run every configured REDCap report and keep the resulting frames by label
query_dict = {}
redcap_participants = []
for query_label in redcap_reports:
    query = redcap_config["queries"][query_label]

    # run query
    logger.info(f"Running query {query_label}...")
    query_df = api_call(url, query, logger=logger)
    query_df["record_id"] = query_df["record_id"].str.upper()

    # get the list of participants returned by this query
    _participants = query_df["record_id"].unique()
    redcap_participants.extend(_participants)
    # Report the count for this query only; the accumulated list still holds
    # duplicates across queries until it is de-duplicated after the loop.
    n_participants = len(_participants)

    # get the list of redcap events
    redcap_events = query_df["redcap_event_name"].unique()
    n_events = len(redcap_events)

    logger.info(f"Fetched {n_participants} participants and {n_events} event_ids: {redcap_events}")

    query_dict[query_label] = query_df.copy()

# Unique participants across all reports
redcap_participants = list(set(redcap_participants))

### Redcap report
This comes from Sarah with DoB

In [None]:
redcap_report_df = pd.read_csv(redcap_report_csv)

# Report column label -> short column name used in the rest of the notebook.
# NOTE(review): "Neuropsycholgical" is presumably spelled this way in the
# exported CSV header; confirm before correcting the key.
demo_col_dict = {"Record ID:": "participant_id", "Event Name": "redcap_event_name", 
                 "Enrolment Group": "group", "Date of Birth":"dob", "1. Gender (technically, it's sex)": "sex", 
                 "Date of MoCA administration": "moca_date","Assessment completed date":"updrs_date",
                 "Neuropsycholgical Test Date:": "neuropsy_date"}

# Keep only the mapped columns and rename them
redcap_report_df = redcap_report_df[list(demo_col_dict.keys())].rename(columns=demo_col_dict)

redcap_report_df["participant_id"] = redcap_report_df["participant_id"].str.upper()
redcap_report_df["sex"] = redcap_report_df["sex"].replace({"Male/Masculin":"M", "Female/Féminin":"F"})

redcap_report_participants = redcap_report_df["participant_id"].unique()
n_participants = redcap_report_df["participant_id"].nunique()
# Count the distinct events; `.unique()` returned the array of event names,
# which was then printed under the "Number of events" label.
n_events = redcap_report_df["redcap_event_name"].nunique()

print(f"Number of participants: {n_participants}")
print(f"Number of events: {n_events}")

redcap_report_df.head()


#### Calculate ages from dates in the redcap report

In [None]:
# Parse every date column (dayfirst=False); unparseable entries become NaT
for _date_col in ("dob", "moca_date", "updrs_date", "neuropsy_date"):
    redcap_report_df[_date_col] = pd.to_datetime(redcap_report_df[_date_col], errors="coerce", dayfirst=False)

# Age at each assessment = (assessment date - date of birth) in years, 1 decimal
_assessment_age_cols = {
    "moca_date": "age_moca_v1",
    "updrs_date": "age_updrs_v1",
    "neuropsy_date": "age_neuropsy_v1",
}
for _date_col, _age_col in _assessment_age_cols.items():
    _elapsed = redcap_report_df[_date_col] - redcap_report_df["dob"]
    redcap_report_df[_age_col] = np.round(_elapsed.dt.days / 365.25, 1)


redcap_report_df.head()

### Participant tallies

In [None]:
# Participant IDs were already upper-cased when each source was loaded.

print(f"Number of nipoppy participants: {len(nipoppy_participants)}")
print(f"Number of legacy participants: {len(legacy_participants)}")
print(f"Number of redcap participants: {len(redcap_participants)}")
print(f"Number of redcap report participants: {len(redcap_report_participants)}")


# One set per data source
a = set(nipoppy_participants)
b = set(legacy_participants)
c = set(redcap_participants)
d = set(redcap_report_participants)

# nipoppy participants also present in each of the other sources
nipoppy_legacy_common_participants = a & b
nipoppy_redcap_common_participants = a & c
nipoppy_redcap_report_common_participants = a & d
n_nipoppy_legacy_common_participants = len(nipoppy_legacy_common_participants)
n_nipoppy_redcap_common_participants = len(nipoppy_redcap_common_participants)
n_nipoppy_redcap_report_common_participants = len(nipoppy_redcap_report_common_participants)

# nipoppy participants absent from the legacy spreadsheets / redcap / redcap report
nipoppy_not_in_legacy_participants = a - b
nipoppy_not_in_redcap_participants = a - c
nipoppy_not_in_redcap_report_participants = a - d
n_nipoppy_not_in_legacy_participants = len(nipoppy_not_in_legacy_participants)
n_nipoppy_not_in_redcap_participants = len(nipoppy_not_in_redcap_participants)
n_nipoppy_not_in_redcap_report_participants = len(nipoppy_not_in_redcap_report_participants)

# nipoppy participants found in none of the three other sources
missing_nipoppy_participants = a.difference(b, c, d)
n_missing_nipoppy_participants = len(missing_nipoppy_participants)

# nipoppy-redcap participants not in the report yet
new_redcap_participants = nipoppy_redcap_common_participants - d
n_new_redcap_participants = len(new_redcap_participants)

print(f"nipoppy-legacy common participants: {n_nipoppy_legacy_common_participants}")
print(f"nipoppy-redcap common participants: {n_nipoppy_redcap_common_participants}")
print(f"nipoppy-recdap_report common participants: {n_nipoppy_redcap_report_common_participants}")

print(f"nipoppy-redcap participants not in the report yet (n={n_new_redcap_participants}): {new_redcap_participants}")
print(f"missing_nipoppy_participants (n={n_missing_nipoppy_participants}): {missing_nipoppy_participants}")

### Demographics, Dx, and summary clinical scores
- `Demographic QPN`
    - "study_visit_age", "gender", "yrs_education"
        - Note: study visit age will be different for different assessments
- `Diagnosis QPN` does not have Dx for all participants
- `Victoria - Weston Project` (Legend for Determined diagnosis)
    - If score = 0, Parkinson's Disease (PD)  
    - If score = 1, Progressive Supranuclear Palsy (PSP)  
    - If score = 2, Multiple System Atrophy (MSA) 
    - If score = 3, Corticobasal Syndrome (CBS)  
    - If score = 4, Dementia with Lewy Bodies (DLB)  
    - If score = 5, Frontotemporal Dementia (FTD)  
    - If score = 6, Essential Tremor (ET)  
    - If score = 7, REM Sleep Behaviour Disorder (RBD)
- `MotorAndNon-Motor` 
    - summary clinical scores (updrs, moca)
    - verify moca column with extra point i.e. `moca_extra_point`


### Aggregate useful redcap data

In [None]:
# Columns that identify one row of every REDCap report
index_columns = ["record_id", "redcap_event_name"]

demo_cols = ["study_visit_age", "gender", "yrs_education"]
demo_df = query_dict['Demographic QPN'][index_columns + demo_cols].copy()

dx_cols = ["diagnosis_determined", "duration_disease"]
# Copy the slice (as done for demo_df and score_df) so the relabelling below
# edits an independent frame instead of a view of the cached query result.
dx_df = query_dict['Victoria - Weston Project'][index_columns + dx_cols].copy()

# Diagnosis code -> label
diagnosis_determined_label_map = {
    "0" : "PD",
    "1" : "PSP",
    "2" : "MSA",
    "3" : "CBS", 
    "4" : "DLB",
    "5" : "FTD", 
    "6" : "ET",
    "7" : "RBD"
}
dx_df["diagnosis_determined"] = dx_df["diagnosis_determined"].astype(str).replace(diagnosis_determined_label_map)

updrs_cols = ["mds_updrs_h_y", "updrs_score_part_1", "updrs_score_part_2", "updrs_score_part_3", "updrs_score_part_4"]
moca_cols = ['moca_result'] 
moca_subscore_cols = ['moca_result_2', 'moca_result_3', 'moca_result_4','moca_result_5', 'moca_result_6', 
                      'moca_result_7', 'moca_result_8','moca_result_9']

score_cols = updrs_cols + moca_cols
score_df = query_dict["Victoria - Weston Project"][index_columns + score_cols].copy()

# Inner-join demographics, diagnosis and scores on participant + event
redcap_df = pd.merge(demo_df, dx_df, on=index_columns)
redcap_df = pd.merge(redcap_df, score_df, on=index_columns)

redcap_df["gender"] = redcap_df["gender"].replace({"Male/Masculin":"M", "Female/Féminin":"F"})

n_redcap_common_participants = len(redcap_df["record_id"].unique())
logger.info(f"Found {n_redcap_common_participants} recdap-nipoppy common participants")

logger.info(f"redcap events: {redcap_df['redcap_event_name'].unique()}")

redcap_df.head()

### Merge MRI, legacy and redcap tabular data

In [None]:
# merge redcap query df
# Restrict the REDCap query data to nipoppy participants at the baseline event
_is_nipoppy_record = redcap_df["record_id"].str.upper().isin(nipoppy_participants)
_is_baseline_event = redcap_df["redcap_event_name"].isin(["Baseline (Arm 1: C-OPN)"])
nipoppy_redcap_df = redcap_df[_is_nipoppy_record & _is_baseline_event].copy()

n_nipoppy_redcap_participants = len(nipoppy_redcap_df["record_id"].unique())
print(f"n_nipoppy_redcap_participants: {n_nipoppy_redcap_participants}")

# Blank strings become NaN, the ID column is renamed, then the legacy
# demographics are attached with a left join on participant_id
nipoppy_redcap_df = (
    nipoppy_redcap_df
    .replace("", np.nan)
    .rename(columns={"record_id": "participant_id"})
    .merge(legacy_recruit_df, on="participant_id", how="left")
)

n_nipoppy_redcap_participants = len(nipoppy_redcap_df["participant_id"].unique())
print(f"n_nipoppy_redcap_participants: {n_nipoppy_redcap_participants}")

redcap_events = nipoppy_redcap_df["redcap_event_name"].unique()
print(f"redcap_events: {redcap_events}")

nipoppy_redcap_df.head()

In [None]:
# Preview the first rows of the REDCap report table (notebook display only)
redcap_report_df.head()

### Merge redcap_report_df with nipoppy_redcap df

In [None]:
# Restrict the REDCap report to nipoppy participants at the baseline event
_report_is_nipoppy = redcap_report_df["participant_id"].str.upper().isin(nipoppy_participants)
_report_is_baseline = redcap_report_df["redcap_event_name"].isin(["Baseline (Arm 1: C-OPN)"])
nipoppy_redcap_report_df = redcap_report_df[_report_is_nipoppy & _report_is_baseline].copy()

n_nipoppy_redcap_report_participants = len(nipoppy_redcap_report_df["participant_id"].unique())
print(f"n_nipoppy_redcap_participants: {n_nipoppy_redcap_report_participants}")

nipoppy_redcap_report_df = nipoppy_redcap_report_df.replace("", np.nan)

# Identifier, group/sex and age columns carried over from the report
_report_keep_cols = [
    "participant_id", "redcap_event_name", "group", "sex",
    "age_moca_v1", "age_updrs_v1", "age_neuropsy_v1",
]
nipoppy_redcap_report_df_filtered = nipoppy_redcap_report_df[_report_keep_cols].copy()

# Left join keeps every row of the REDCap query data
nipoppy_redcap_filtered_df = nipoppy_redcap_df.merge(
    nipoppy_redcap_report_df_filtered,
    on=["participant_id","redcap_event_name"],
    how="left",
)

n_participants = nipoppy_redcap_filtered_df["participant_id"].nunique()
print(f"n_nipoppy_redcap_report_merged_participants: {n_participants}")

nipoppy_redcap_filtered_df.head()

### Generate bagel(s)
- neuro-bagel
- dash-bagel (currently this is a melted version of neuro-bagel)

**Note**: QPN has different `visit` names for MRI, UPDRS, MoCA, Neuropsy etc.

Using redcap events as evidence of multiple visits. However we are NOT assuming that two assessments (e.g. UPDRS and MoCA) are co-acquired in the same redcap event. This will be inferred using dates once available. 

In [None]:
redcap_index_cols = ["participant_id", "redcap_event_name"]
# REDCap event label -> visit suffix
redcap_event_visit_id_dict = {"Baseline (Arm 1: C-OPN)": "v1"}

# Column groups that make up the screening table
demo_cols = ['group', 'study_visit_age', 'sex', 'yrs_education']
dx_cols = ['diagnosis_determined', 'duration_disease']
age_cols = ["age_MRI_v1","age_moca_v1", "age_updrs_v1", "age_neuropsy_v1"]
screen_cols = [*redcap_index_cols, *demo_cols, *dx_cols, *age_cols]

# "diagnosis_determined" is intentionally left un-renamed
screen_cols_rename_dict = {
    "study_visit_age": "age_at_screening",
    "yrs_education": "years_education_at_screening",
    "duration_disease": "duration_disease_at_screening",
    "group": "group_at_screening"
}

screen_df = nipoppy_redcap_filtered_df.loc[:, screen_cols].rename(columns=screen_cols_rename_dict)

# ------------------------------------------------------------------------------------ #
# Append nipoppy participants that have no tabular data in any source yet.
# Only their participant_id is populated; every other column is left empty.
# ------------------------------------------------------------------------------------ #
nipoppy_participants_without_pheno_data_df = pd.DataFrame()
nipoppy_participants_without_pheno_data_df["participant_id"] = list(missing_nipoppy_participants)
print(f"appending {len(nipoppy_participants_without_pheno_data_df)} participants without pheno data")
screen_df = pd.concat([screen_df, nipoppy_participants_without_pheno_data_df], axis=0)

print(f"screen_df participants: {screen_df['participant_id'].nunique()}")
# ------------------------------------------------------------------------------------ #

screen_df.head()

### Pivot the screen_df to conform to tabular demographics CSV schema

In [None]:
# Columns kept as identifiers; the remaining (age) columns are unpivoted into rows
id_vars = [
    "participant_id", "redcap_event_name", "group_at_screening", "age_at_screening", "sex",
    "years_education_at_screening", "duration_disease_at_screening", "diagnosis_determined",
]
demographic_df = pd.melt(screen_df, id_vars=id_vars, var_name="visit_id", value_name="age")

# Turn the former age-column names into visit identifiers
_age_col_to_visit_id = {
    "age_MRI_v1": "MRI_v1",
    "age_updrs_v1": "UPDRS_v1",
    "age_moca_v1": "MOCA_v1",
    "age_neuropsy_v1": "NEUROPSY_v1",
}
demographic_df["visit_id"] = demographic_df["visit_id"].replace(_age_col_to_visit_id)

demographic_df = demographic_df.sort_values(by=["participant_id", "visit_id"])
demographic_df.head()

In [None]:
# UPDRS scores, tagged with a visit_id built from the REDCap event name
visit_prefix = "UPDRS_"
updrs_df = nipoppy_redcap_filtered_df.loc[:, redcap_index_cols + updrs_cols].copy()
_updrs_visit_suffix = updrs_df["redcap_event_name"].map(redcap_event_visit_id_dict)
updrs_df["visit_id"] = visit_prefix + _updrs_visit_suffix

updrs_df.head()

In [None]:
# MoCA scores, tagged with a visit_id built from the REDCap event name
visit_prefix = "MOCA_"
moca_df = nipoppy_redcap_filtered_df.loc[:, redcap_index_cols + moca_cols].copy()
_moca_visit_suffix = moca_df["redcap_event_name"].map(redcap_event_visit_id_dict)
moca_df["visit_id"] = visit_prefix + _moca_visit_suffix

moca_df.head()

In [None]:
# Final column order of the assessments table
reordered_cols = ["participant_id","redcap_event_name","visit_id","medication_status"] + updrs_cols + moca_cols

# Stack UPDRS and MoCA rows, add an empty medication_status placeholder
# (to be filled in later), reorder the columns and sort
pheno_df = (
    pd.concat([updrs_df, moca_df])
    .assign(medication_status=np.nan)
    [reordered_cols]
    .sort_values(by=["participant_id", "visit_id"])
)

pheno_df.head()

In [None]:
# Spot-check the assessment rows of a single participant
pheno_df[pheno_df["participant_id"]=="MNI0056"]

### Generate availability 
Status options: 1) "Validated" 2) "Missing" 3) "Not collected" 

In [None]:
# Attach assessment scores to the demographics rows (left join keeps every visit row)
bagel_df = demographic_df.merge(pheno_df, on=["participant_id","redcap_event_name","visit_id"], how="left")

bagel_participants = bagel_df["participant_id"].unique()
logger.info(f"n_bagel_participants: {len(bagel_participants)}")

# One boolean "<score>_status" column per score: True where a value is present
_score_cols = updrs_cols + moca_cols
bagel_status_cols = [f"{col}_status" for col in _score_cols]
for col, _status_col in zip(_score_cols, bagel_status_cols):
    bagel_df[_status_col] = bagel_df[col].notna()
    n_available_participants = bagel_df[_status_col].sum()
    logger.info(f"{col}, n_available_participants: {n_available_participants}")

logger.info(f"bagel df shape: {bagel_df.shape}")
bagel_df.head()

### Status only bagel (no clinical info for public digest/dashboard)

In [None]:
# Demographic columns plus the availability flags only (no clinical scores)
dash_bagel_df_cols = [*demographic_df.columns, *bagel_status_cols]
dash_bagel_df = bagel_df.loc[:, dash_bagel_df_cols].copy()

In [None]:
# Spot-check the status-only rows of a single participant
dash_bagel_df[dash_bagel_df["participant_id"]=="MNI0056"]

### dash bagel (melt)


In [None]:
# Dashboard variables
DASH_INDEX_COLUMNS = ["participant_id", "visit_id"]
DASH_NAME_COL = "assessment_name"
DASH_VAL_COL = "assessment_score"

dash_df = dash_bagel_df.melt(id_vars=DASH_INDEX_COLUMNS, var_name=DASH_NAME_COL, value_name=DASH_VAL_COL)
dash_df = dash_df.rename(columns={"visit_id": "session"})


### save phenotypic data
- Saves `demographics.csv` --> data collected at screening i.e. age, sex, group, education etc. 
- Saves `assessments.csv` i.e. collated data from clinical assessments i.e. UPDRS, MoCA

In [None]:
# Set to True to write the CSV files
save_pheno = False

_tabular_root = f"{release_dir}{current_release}/tabular"
demograph_csv_path = f"{_tabular_root}/demographics/demographics.csv"
assessment_csv_path = f"{_tabular_root}/assessments/assessments.csv"

print(f"demographic participants: {demographic_df['participant_id'].nunique()}")
print(f"pheno participants: {pheno_df['participant_id'].nunique()}")
if save_pheno:
    print(f"Saving data to {demograph_csv_path} and {assessment_csv_path}")
    _pheno_outputs = ((demographic_df, demograph_csv_path), (pheno_df, assessment_csv_path))
    for _table, _csv_path in _pheno_outputs:
        _table.to_csv(_csv_path, index=False)
    

### save bagels

In [None]:
# Set to True to write the bagel files
save_bagels = False

_bagel_dir = f"{release_dir}{current_release}/tabular/bagels"
bagel_csv_path = f"{_bagel_dir}/bagel.csv"
bagel_tsv_path = f"{_bagel_dir}/bagel.tsv"
dash_csv_path = f"{_bagel_dir}/dash_bagel.csv"
public_digest_csv_path = "../digest/qpn_tabular_availability_digest.csv"

if save_bagels:
    # Full bagel as CSV and TSV
    bagel_df.to_csv(bagel_csv_path, index=False)
    bagel_df.to_csv(bagel_tsv_path, sep="\t", index=False)
    # Melted dashboard table goes to the release folder and the public digest
    for _dash_path in (dash_csv_path, public_digest_csv_path):
        dash_df.to_csv(_dash_path, index=False)
    print(f"Bagel saved to {dash_csv_path}")