## Notebook to report descriptive statistics from demographic and assessment data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

### Paths


In [None]:
# Root directory of the dataset on the local machine.
# Note: it already ends with "/", so the f-strings below produce paths with a doubled "//".
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy release
current_release = "June_2024"

# Release directory, its tabular sub-directory, and the manifest stored at the release root
data_release_dir = f"{DATASET_ROOT}/releases/{current_release}/"
tabular_data_release_dir = f"{data_release_dir}/tabular/"
manifest_file = f"{data_release_dir}/manifest.csv"

# tabular files
# Demographics and MRI session tables sit directly in the tabular directory;
# the assessment tables sit in its "assessments" sub-directory.
demographics_file = f"{tabular_data_release_dir}/demographics.csv"
mri_session_date_file = f"{tabular_data_release_dir}/mri_sessions.csv"
updrs_file = f"{tabular_data_release_dir}/assessments/updrs.csv"
moca_file = f"{tabular_data_release_dir}/assessments/moca.csv"
dx_file = f"{tabular_data_release_dir}/assessments/diagnosis.csv"
neuropsych_file = f"{tabular_data_release_dir}/assessments/neuropsych.csv"

### Defs

In [None]:
def subset_and_replace_df(df, filters_dict, rename_dict):
    """Subset rows and recode column values, returning a new dataframe.

    Args:
        df: Input dataframe. It is never modified.
        filters_dict: Maps a column name to the list of values to keep in
            that column. Filters are applied one after another, so a row
            survives only if it passes every filter.
        rename_dict: Maps a column name to a ``{old_value: new_value}``
            mapping that is handed to ``Series.replace`` for that column.

    Returns:
        A filtered copy of ``df`` with the requested values replaced.
    """
    subset_df = df
    for col, allowed_values in filters_dict.items():
        subset_df = subset_df[subset_df[col].isin(allowed_values)]

    # Always detach from the input before assigning columns: with an empty
    # filters_dict no filtering happens, and writing into the frame would
    # otherwise change the caller's dataframe in place.
    subset_df = subset_df.copy()

    for col, value_map in rename_dict.items():
        subset_df[col] = subset_df[col].replace(value_map)

    return subset_df
    


def get_group_table_stats(df, cat_cols, score_cols, groupby_col="redcap_event_name"):
    """Build a summary table with one row per value of ``groupby_col``.

    The function does not stratify by diagnosis group: pass a dataframe that
    has already been restricted to the group of interest.

    Args:
        df: Dataframe holding ``groupby_col`` plus every column listed in
            ``cat_cols`` and ``score_cols``.
        cat_cols: Categorical columns. For each one, a column per category
            value is added holding the number of rows with that value.
        score_cols: Score columns. For each one, a text column is added of
            the form ``"non-null-count: N mean (std) [min, max]"`` with the
            statistics rounded to one decimal.
        groupby_col: Column whose values define the rows of the table.

    Returns:
        A dataframe with ``groupby_col``, a ``count`` column with the number
        of rows per group, and the columns described above.
    """
    n_cat_cols = len(cat_cols)
    n_score_cols = len(score_cols)
    print(f"Counting {n_cat_cols} and averaging {n_score_cols}")

    # Name both columns explicitly: the default names produced by
    # value_counts().reset_index() differ between pandas versions, and the
    # merges below need the key column to be called `groupby_col`.
    table_df = (
        df[groupby_col]
        .value_counts()
        .rename_axis(groupby_col)
        .reset_index(name="count")
    )

    grouped = df.groupby([groupby_col])

    print("Starting cat cols")
    for col in cat_cols:
        # One column per category value, one row per group
        cat_count_df = grouped[col].value_counts().unstack().reset_index()
        table_df = pd.merge(table_df, cat_count_df, on=groupby_col, how="left")

    print("Starting score cols")
    for col in score_cols:
        scores = grouped[col]
        n_valid = scores.count().astype(str)
        mean_str = scores.mean().round(1).astype(str)
        std_str = scores.std().round(1).astype(str)
        min_str = scores.min().round(1).astype(str)
        max_str = scores.max().round(1).astype(str)
        # Element-wise string concatenation keeps the Series name (`col`),
        # which becomes the column name after reset_index().
        summary = (
            "non-null-count: " + n_valid + " " + mean_str
            + " (" + std_str + ")" + " [" + min_str + ", " + max_str + "]"
        )
        table_df = pd.merge(table_df, summary.reset_index(), on=groupby_col, how="left")

    return table_df

### Load data

In [None]:
# Read the manifest and each tabular CSV of the release into its own dataframe
manifest_df = pd.read_csv(manifest_file)
demo_df = pd.read_csv(demographics_file)
mri_df = pd.read_csv(mri_session_date_file)
updrs_df = pd.read_csv(updrs_file)
moca_df = pd.read_csv(moca_file)
dx_df = pd.read_csv(dx_file)
neuropsy_df = pd.read_csv(neuropsych_file)


### QPN paper tables

In [None]:
# paper subset filters
# Values to keep for each filtered column (consumed by subset_and_replace_df)
cohort_inclusion_list = ["QPN"]
group_inclusion_list = ["Healthy control/Contrôle", "PD   (Parkinson's Disease)/Maladie de Parkinson"]
visits_inclusion_list = ["Baseline (Arm 1: C-OPN)", "pre-redcap-baseline-1 (legacy)", "pre-redcap-baseline-2 (legacy)",
                         "12 Months Follow-Up/Suivi (Arm 1: C-OPN)","18 Months Follow-Up/Suivi (Arm 1: C-OPN)"]

# column name -> list of allowed values
participant_inclusion_criteria = {
    "redcap_event_name" : visits_inclusion_list,
    "recruitment_cohort": cohort_inclusion_list,
    "group": group_inclusion_list
    }

# Recode maps from the verbose bilingual labels to short labels.
# np.nan (lowercase) is used because the np.NaN alias no longer exists in NumPy 2.0.
QPN_groups = {"Healthy control/Contrôle": "control", "PD   (Parkinson's Disease)/Maladie de Parkinson": "PD", np.nan:"Unknown"}
QPN_sexes = {"Female/Féminin": "Female", "Male/Masculin":"Male"}

# column name -> {old value: new value}
col_val_replacement_criteria = {
    "group": QPN_groups,
    "sex": QPN_sexes
}

### Manifest

In [None]:
# Attach demographics to every manifest row; the left join keeps manifest rows
# that have no demographics entry.
paper_df = pd.merge(manifest_df, demo_df, on="participant_id", how="left")
# Restrict to the QPN recruitment cohort before counting participants
paper_df = paper_df[paper_df["recruitment_cohort"]=="QPN"]

n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Filter and replace values
paper_df = subset_and_replace_df(paper_df, participant_inclusion_criteria, col_val_replacement_criteria)
n_paper_participants = paper_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

# Number of remaining rows per value of the "session" column
session_counts = paper_df["session"].value_counts()
print(f"session_counts: {session_counts}")

paper_df.head()

#### Demo table
- Add MRI age column

In [None]:
# add mri_age column
# Left join on participant and visit: every demographics row is kept and gets
# MRI_age where mri_df has a matching participant/visit row.
paper_df = pd.merge(demo_df, mri_df[["participant_id", "redcap_event_name", "MRI_age"]], on=["participant_id", "redcap_event_name"], how="left")

n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Filter and replace values
paper_df = subset_and_replace_df(paper_df, participant_inclusion_criteria, col_val_replacement_criteria)
n_paper_participants = paper_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

# Visits that remain after filtering
redcap_events = paper_df["redcap_event_name"].unique()
print(f"redcap events: {redcap_events}")

paper_df.head()

In [None]:
# counts
cat_cols = ["sex"]
score_cols = ["MRI_age"]

# One summary table per recoded group label
for dx_group in QPN_groups.values():
    print(f"*** group: {dx_group} ***")
    dx_group_df = paper_df[paper_df["group"]==dx_group].copy()
    if dx_group_df.empty:
        # QPN_groups also contains the "Unknown" label for missing groups,
        # which has no rows once the group inclusion filter has been applied.
        print("no rows for this group, skipping")
        print("-"*10)
        continue
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print("-"*10)
    print(table_df)
    print("-"*10)


#### UPDRS table

In [None]:
# Right join keeps every UPDRS row. The visit column is dropped from the
# demographics first, so the only "redcap_event_name" in the result is the
# one coming from updrs_df.
paper_df = pd.merge(demo_df.drop(columns=["redcap_event_name"]), updrs_df, on=["participant_id"], how="right")
# Add the diagnosis table columns for the same participant and visit
paper_df = pd.merge(paper_df, dx_df, on=["participant_id","redcap_event_name"], how="left")

n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Filter and replace values
paper_df = subset_and_replace_df(paper_df, participant_inclusion_criteria, col_val_replacement_criteria)
# Rows with a non-missing Part III score, counted per visit (a Series, not a single number)
n_paper_mds_updrs_participants = paper_df[~paper_df["Part III: Motor Examination"].isna()]["redcap_event_name"].value_counts()
# Unique participants with a non-missing legacy UPDRS-3 score
n_paper_legacy_updrs_participants = paper_df[~paper_df["legacy_updrs3_scores"].isna()]["participant_id"].nunique()
print(f"Number of mds-updrs participants after event and group filter: {n_paper_mds_updrs_participants}")
print(f"Number of legacy-updrs participants after event and group filter: {n_paper_legacy_updrs_participants}")

# find common participants between 12 and 18 month visits
participants_12m = paper_df[paper_df["redcap_event_name"] == "12 Months Follow-Up/Suivi (Arm 1: C-OPN)"]["participant_id"]
participants_18m = paper_df[paper_df["redcap_event_name"] == "18 Months Follow-Up/Suivi (Arm 1: C-OPN)"]["participant_id"]
# union: seen at either visit; intersection: seen at both
participants_union = set(participants_12m) | set(participants_18m)
participants_intersection = set(participants_12m) & set(participants_18m)

print(f"participants_union: {len(participants_union)}")
print(f"participants_intersection: {len(participants_intersection)}")

paper_df.head()

### Plot pheno data

In [None]:
from enum import Enum
class my_colors(Enum):
    """Hex colours used for the two diagnosis groups in the plots."""
    CONTROL = "#8d99ae"
    PD = "#e63946"
    
# Two-colour palette; note the order is PD first, then control
color_list = [my_colors.PD.value, my_colors.CONTROL.value,]
palette = sns.color_palette(palette=color_list) #sns.husl_palette()

# Nine-step colour ramp; [::-2] walks it backwards taking every second entry,
# which leaves five colours starting from the last one.
monocrome_hot = ["#370617","#6a040f","#9d0208","#d00000","#dc2f02","#e85d04","#f48c06","#faa307","#ffba08"]
monochrome_hot_palette = sns.color_palette(palette=monocrome_hot[::-2]) #sns.husl_palette()

# Preview the five-colour palette
sns.palplot(monochrome_hot_palette)


In [None]:
# Long UPDRS column names -> short names used for plotting and tables
updrs_cols_dict = {
    "Hoehn and Yahr Stage: ":'H_Y_stage',
    'Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL)':'Part I',
    'Part II: Motor Aspects of Experiences of Daily Living (M-EDL)':'Part II',
    'Part III: Motor Examination':'Part III',
    'Part IV: Motor Complications':'Part IV',
}

# Verbose stage descriptions -> stage digit as a string; missing values -> "N/A"
H_Y_stage_dict = dict(zip(
    [
    '(2) Bilateral involvement without impairment of balance',
    '(3) Bilateral disease: mild to moderate disability with impaired postural reflexes; physically independent; needs assistance to recover from pull test',
    '(1) Unilateral involvement only, usually with minimal or no functional disability',
    '(0) Asymptomatic',
    '(4) Severely disabling disease; still able to walk or stand unassisted',
    np.nan], 
    
    ['2', '3', '1', '0', '4', "N/A"]
))

paper_df = paper_df.rename(columns=updrs_cols_dict)
paper_df["H_Y_stage"] = paper_df["H_Y_stage"].replace(H_Y_stage_dict)
# Drop rows without a stage. This rebinds paper_df itself, so every later cell
# that reads paper_df sees only the rows that have a stage.
paper_df = paper_df[paper_df["H_Y_stage"]!="N/A"]

# Melt for plotting
# Long format: one row per participant/visit/UPDRS part, score in "Score"
plot_df = pd.melt(paper_df, id_vars=["participant_id","redcap_event_name","group","H_Y_stage"], 
                  value_vars=['Part I', 'Part II', 'Part III', 'Part IV'], 
                  var_name="UPDRS", value_name="Score")

# Plot the redcap baseline visit only
plot_df = plot_df[plot_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# Fixed x-axis order; stage values not listed here are not given a position
stage_order = ['0', '1', '2', '3', '4']

sns.set_theme(font_scale=2)
with sns.axes_style("whitegrid"):
    # One box-plot panel per UPDRS part, two panels per row, independent y axes
    g = sns.catplot(y="Score",x="H_Y_stage", order=stage_order,
                    col="UPDRS", col_wrap=2, 
                    kind="box", palette=monochrome_hot_palette, 
                    data=plot_df, aspect=2, height=5, sharey=False)
    # g.set_xlabels("")
    # g.set_xticklabels("")
    # g.set_xlabels("")
    # g.set_xticklabels("")

In [None]:
# Rows per diagnosis group in the melted plotting frame (one row per participant/visit/UPDRS part)
plot_df["group"].value_counts()

In [None]:
# Count H&Y stages and summarise the four UPDRS part scores
cat_cols = ["H_Y_stage"]
score_cols = ["Part I", "Part II", "Part III", "Part IV"]

# paper_df = paper_df[paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# One table per diagnosis group; rows of each table are the visits
for dx_group in ["PD", "control"]:
    print(f"*** group: {dx_group} ***")
    dx_group_df = paper_df[paper_df["group"]==dx_group].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print("-"*10)
    print(table_df)
    print("-"*10)


### Legacy score stats

In [None]:
# Summary statistics of the legacy UPDRS-3 scores in the unfiltered UPDRS table
updrs_df["legacy_updrs3_scores"].describe()

In [None]:
# Count H&Y stages and summarise the legacy UPDRS-3 score
cat_cols = ["H_Y_stage"]
score_cols = ["legacy_updrs3_scores"]

# paper_df = paper_df[paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# NOTE(review): paper_df is read as the earlier cells left it. The plotting cell
# removed rows whose H_Y_stage is "N/A", so legacy scores on rows without a
# stage are not part of these statistics - confirm this is intended.
for dx_group in ["PD", "control"]:
    print(f"*** group: {dx_group} ***")
    dx_group_df = paper_df[paper_df["group"]==dx_group].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print("-"*10)
    print(table_df)
    print("-"*10)


### MoCA score

In [None]:
# Right join keeps every MoCA row. The visit column is dropped from the
# demographics first, so the only "redcap_event_name" in the result is the
# one coming from moca_df.
paper_df = pd.merge(demo_df.drop(columns=["redcap_event_name"]), moca_df, on=["participant_id"], how="right")
# Add the diagnosis table columns for the same participant and visit
paper_df = pd.merge(paper_df, dx_df, on=["participant_id","redcap_event_name"], how="left")

n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Filter and replace values
paper_df = subset_and_replace_df(paper_df, participant_inclusion_criteria, col_val_replacement_criteria)
n_paper_participants = paper_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

# Number of remaining rows per visit
redcap_event_counts = paper_df["redcap_event_name"].value_counts()
print(f"redcap_event_counts: {redcap_event_counts}")

# find common participants between redcap baseline and legacy baseline
# (only "pre-redcap-baseline-1 (legacy)" is used as the legacy baseline here)
participants_redcap_baseline = paper_df[paper_df["redcap_event_name"] == "Baseline (Arm 1: C-OPN)"]["participant_id"]
participants_legacy_baseline = paper_df[paper_df["redcap_event_name"] == "pre-redcap-baseline-1 (legacy)"]["participant_id"]
participants_union = set(participants_redcap_baseline) | set(participants_legacy_baseline)
participants_intersection = set(participants_redcap_baseline) & set(participants_legacy_baseline)

print("-"*50)
print("Common participants between redcap baseline and legacy baseline")
print(f"participants_union: {len(participants_union)}")
print(f"participants_intersection: {len(participants_intersection)}")
print("-"*50)

# find common participants between 12 and 18 month visits
# (participants_union / participants_intersection are reused and overwritten here)
participants_12m = paper_df[paper_df["redcap_event_name"] == "12 Months Follow-Up/Suivi (Arm 1: C-OPN)"]["participant_id"]
participants_18m = paper_df[paper_df["redcap_event_name"] == "18 Months Follow-Up/Suivi (Arm 1: C-OPN)"]["participant_id"]
participants_union = set(participants_12m) | set(participants_18m)
participants_intersection = set(participants_12m) & set(participants_18m)

print("Common participants between 12 and 18 month visits")
print(f"participants_union: {len(participants_union)}")
print(f"participants_intersection: {len(participants_intersection)}")
print("-"*50)

# find participants with at least 1 follow-up visit including legacy baseline
# A participant present at both the legacy baseline and the redcap baseline is
# counted alongside anyone with a 12- or 18-month visit.
participants_legacy_followup = set(participants_redcap_baseline) & set(participants_legacy_baseline)
participants_followup = participants_legacy_followup | set(participants_12m) | set(participants_18m)
print("Participants with at least 1 follow-up visit including legacy baseline")
print(f"participants_followup: {len(participants_followup)}")
print("-"*50)

### Plot moca

In [None]:
# Shorten the verbose bilingual MoCA total-score column name
paper_df = paper_df.rename(columns={"TOTAL SCORE (make sure to include extra point for 12 years or less of education):    SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) : " :"moca_total"})
# Plot the redcap baseline visit only
plot_df = paper_df[paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# `palette` is a plain list built as [PD colour, control colour], and list
# colours are handed out by position. Pin the x-axis order to the same
# sequence so each group always gets its own colour, whatever order the
# groups happen to appear in the data.
group_order = ["PD", "control"]

sns.set_theme(font_scale=1.5)
with sns.axes_style("whitegrid"):
    g = sns.catplot(y="moca_total",x="group", order=group_order,
                    kind="box", palette=palette,
                    data=plot_df, aspect=1.5, height=5, sharey=False)

In [None]:
# Count rows per group label and summarise the MoCA total score
cat_cols = ["group"]
score_cols = ["moca_total"]

# paper_df = paper_df[paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# One table per diagnosis group; rows of each table are the visits
for dx_group in ["PD", "control"]:
    print(f"*** group: {dx_group} ***")
    dx_group_df = paper_df[paper_df["group"]==dx_group].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print("-"*10)
    print(table_df)
    print("-"*10)


### Neuropsy scores

In [None]:
# Right join keeps every neuropsych row. The visit column is dropped from the
# demographics first, so the only "redcap_event_name" in the result is the
# one coming from neuropsy_df.
paper_df = pd.merge(demo_df.drop(columns=["redcap_event_name"]), neuropsy_df, on=["participant_id"], how="right")
n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Filter and replace values
paper_df = subset_and_replace_df(paper_df, participant_inclusion_criteria, col_val_replacement_criteria)
n_paper_participants = paper_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

redcap_events = paper_df["redcap_event_name"].unique()
print(f"redcap events: {redcap_events}")

print("-"*50)
print("Scores with lowest attrition (sorted)")
# Columns whose name contains "score" or "aw" (the latter catches "Raw"/"raw").
# Many names match both patterns; dict.fromkeys drops the repeats while keeping
# the original order, so no column is selected (and listed) twice.
score_named_cols = list(paper_df.columns[paper_df.columns.str.contains("score")])
aw_named_cols = list(paper_df.columns[paper_df.columns.str.contains("aw")])
raw_score_cols = list(dict.fromkeys(score_named_cols + aw_named_cols))
raw_scores_paper_df = paper_df[["participant_id", "redcap_event_name"] + raw_score_cols].copy()
# Missing-value count per column at the redcap baseline, 20 most complete columns first
raw_scores_paper_df[raw_scores_paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"].isna().sum().sort_values(ascending=True)[:20]

In [None]:
# Same missing-value count at the redcap baseline, now showing the 20 columns with the most missing values
raw_scores_paper_df[raw_scores_paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"].isna().sum().sort_values(ascending=False)[:20]

In [None]:
index_cols = ["participant_id", "redcap_event_name","group"]
cat_cols = []

# Which score list to use: "redcap" or "local"
neuropsy_source = "redcap"

# The column names below are copied verbatim, including trailing spaces inside
# some of them; they must match the dataframe's column names exactly.
if neuropsy_source == "redcap":
    # redcap data
    score_cols = [
        "Repetitions total 1,2,3 (Raw score)",
        "Digit Span Forward - total correct (Raw score) ",
        "Digit span forward - longest correct serie (Raw score)",
        "Digit Span Backward - total correct (Raw score)",
        "Digit span backward - longest correct serie (Raw score)  ",
        "Command Clock raw (max 10)",
        "Copy Clock raw (max 10)",
        "STROOP GOLDEN : words, self-corrected errors (raw score)",
        "STROOP GOLDEN, words, uncorrected errors (raw score)",
        "STROOP GOLDEN : colors, self-corrected errors (raw scores)",
        "Stroop GOLDEN : ink, self-corrected errors (raw score)",
        "Stroop GOLDEN, ink, uncorrected errors (raw score)",
        "Letter Fluency Total (Raw score)",
        "Trail A raw score (time in sec.)",
        "Trial total 1,2,3 (Raw score)",
        "Trial 4 delayed (Raw score)",
        "BNT sans indice (Raw score)",
        "Letter Fluency F (Raw score)",
        "Semantic Fluency Actions (Raw score)",
        "Letter Fluency S (Raw score)",
        "Semantic Fluency Animals (Raw score)",
        "Letter Fluency A (Raw score)",
        "Trail B raw score (time in sec.)",
        "Trail B Errors raw score",
        "Brixton raw score",
    ]
elif neuropsy_source == "local":
    # BD_RPQ data
    score_cols = [
        "HVLT Trial total 1,2,3 (Raw score)",
        "Clock Command (Raw score)",
        "Clock Copy (Raw score)",
        "Semantic Fluency Total (Raw score)",
        "RCFT Copy (Raw score)",
        "Letter Fluency Total (Raw score)",
        "STROOP GOLDEN : words, self-corrected errors (raw score)",
        "STROOP GOLDEN : colors, self-corrected errors (raw scores)",
        "Stroop GOLDEN : ink, self-corrected errors (raw score)",
    ]
else:
    # Fail loudly instead of silently reusing score_cols left over from an earlier cell
    raise ValueError(f"Unknown neuropsy_source: {neuropsy_source!r} (expected 'redcap' or 'local')")

# Keep only the index columns and the selected scores
neuropsy_vars = index_cols + cat_cols + score_cols
paper_df = paper_df[neuropsy_vars]
n_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_participants}")

# NOTE(review): this shows the unfiltered neuropsych table, not the paper_df
# subset built above - confirm that is the frame meant to be previewed.
neuropsy_df.head()

In [None]:
# One table per diagnosis group across all retained visits (the baseline-only variant is kept commented out below)
for dx_group in ["PD", "control"]:
    print(f"*** group: {dx_group} ***")
    # dx_group_df = paper_df[(paper_df["group"]==dx_group) & (paper_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)")].copy()
    dx_group_df = paper_df[(paper_df["group"]==dx_group) ].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print("-"*10)
    print(table_df)
    print("-"*10)


In [None]:
# Display the most recently built table_df; after the loop over ["PD", "control"] above, that is the "control" table
table_df