## Notebook to report descriptive statistics from demographic and assessment data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

### Paths


In [106]:
# Absolute location of the QPN dataset on the local machine.
DATASET_ROOT = "/home/nikhil/projects/Parkinsons/qpn/"

# Current nipoppy release
current_release = "qpn-nipoppy-R1"

# Directory variables keep their trailing "/" (as before). The trailing slash
# of DATASET_ROOT is stripped before joining, and children are appended without
# an extra "/", so no derived path contains an accidental "//".
data_release_dir = f"{DATASET_ROOT.rstrip('/')}/releases/{current_release}/"
tabular_data_release_dir = f"{data_release_dir}tabular/"
manifest_file = f"{data_release_dir}manifest.csv"

# tabular files
demographics_file = f"{tabular_data_release_dir}demographics.csv"
mri_session_date_file = f"{tabular_data_release_dir}mri_info/mri_sessions.csv"
updrs_file = f"{tabular_data_release_dir}assessments/updrs.csv"
hy_file = f"{tabular_data_release_dir}assessments/hy.csv"
moca_file = f"{tabular_data_release_dir}assessments/moca.csv"
dx_file = f"{tabular_data_release_dir}assessments/diagnosis.csv"
neuropsych_file = f"{tabular_data_release_dir}assessments/neuropsych.csv"

### Defs

In [199]:
def subset_and_replace_df(df, filters_dict, rename_dict):
    """Subset rows and replace column values in a dataframe.

    Args:
        df: Input dataframe. It is never modified; a new dataframe is returned.
        filters_dict: Mapping of column name -> list of allowed values. A row
            is kept only if it passes every filter. An empty mapping keeps all
            rows.
        rename_dict: Mapping of column name -> replacement spec passed to
            ``Series.replace`` (e.g. a ``{old_value: new_value}`` dict).

    Returns:
        A new dataframe with the row filters applied and the values replaced.

    Raises:
        KeyError: If a column named in either mapping is missing from ``df``.
    """
    subset_df = df
    for col, allowed_values in filters_dict.items():
        subset_df = subset_df[subset_df[col].isin(allowed_values)]

    # Copy exactly once and unconditionally. Without this, an empty
    # filters_dict would leave `subset_df` pointing at the caller's dataframe
    # and the assignments below would silently modify it.
    subset_df = subset_df.copy()

    for col, replacements in rename_dict.items():
        subset_df[col] = subset_df[col].replace(replacements)

    return subset_df
    


def get_group_table_stats(df, cat_cols, score_cols, groupby_col="redcap_event_name"):
    """Build a summary table with one row per value of ``groupby_col``.

    The table starts with the row count of every group. For each categorical
    column it then gains one count column per category, and for each score
    column one text column formatted as ``(N=<non-null count>) <mean> (<std>)
    [<min>, <max>]``. Rows with a missing ``groupby_col`` value are not counted.
    No stratification beyond ``groupby_col`` is applied.

    Args:
        df: Dataframe holding ``groupby_col`` and every listed column.
        cat_cols: Columns whose category counts are reported per group.
        score_cols: Numeric columns summarised per group.
        groupby_col: Column that defines the groups (the table rows).

    Returns:
        Dataframe with ``groupby_col``, a ``count`` column, the category count
        columns and the score summary columns (named after the score column).
    """
    n_cat_cols = len(cat_cols)
    n_score_cols = len(score_cols)
    print(f"Counting {n_cat_cols} and averaging {n_score_cols}")

    # Name the axis and the counts explicitly. A bare
    # value_counts().reset_index() yields different column names on pandas 1.x
    # and 2.x; on 1.x the `groupby_col` column would hold the counts and the
    # merges below would join on the wrong values.
    table_df = df[groupby_col].value_counts().rename_axis(groupby_col).reset_index(name="count")

    # One GroupBy object serves every column instead of re-grouping per statistic.
    grouped = df.groupby([groupby_col])

    print("Starting cat cols")
    for col in cat_cols:
        # rows: groups, columns: category values, cells: counts
        cat_count_df = grouped[col].value_counts().unstack().reset_index()
        table_df = pd.merge(table_df, cat_count_df, on=groupby_col, how="left")

    print("Starting score cols")
    for col in score_cols:
        col_groups = grouped[col]
        n_str = col_groups.count().astype(str)
        mean_str = col_groups.mean().round(1).astype(str)
        std_str = col_groups.std().round(1).astype(str)
        min_str = col_groups.min().round(2).astype(str)
        max_str = col_groups.max().round(2).astype(str)

        # The resulting Series keeps the name `col`, which becomes the column
        # name after reset_index().
        score_summary = (
            "(N=" + n_str + ") " + mean_str + " (" + std_str + ") ["
            + min_str + ", " + max_str + "]"
        )
        table_df = pd.merge(table_df, score_summary.reset_index(), on=groupby_col, how="left")

    return table_df

### Load data

In [158]:
# Read every tabular source of the release into its own dataframe.
manifest_df, demo_df, mri_df = (
    pd.read_csv(csv_path)
    for csv_path in (manifest_file, demographics_file, mri_session_date_file)
)
hy_df, updrs_df, moca_df = (
    pd.read_csv(csv_path) for csv_path in (hy_file, updrs_file, moca_file)
)
dx_df = pd.read_csv(dx_file)
neuropsy_df = pd.read_csv(neuropsych_file)

# Only the participant, the visit and the analysis diagnosis are used downstream.
dx_df = dx_df.loc[:, ['participant_id', 'redcap_event_name', 'diagnosis_group_for_analysis']].copy()


### QPN paper tables

In [159]:
# Row filters that define the paper subset: cohort, diagnosis, visit and session
cohort_inclusion_list = ["QPN"]
# group_inclusion_list = ["Healthy control/Contrôle", "PD   (Parkinson's Disease)/Maladie de Parkinson"]
dx_inclusion_list = ["PD", "control"]
visits_inclusion_list = [
    "Baseline (Arm 1: C-OPN)",
    "legacy-updrs3",
    "legacy-moca",
]
session_inclusion_list = ["ses-01"]

# column name -> allowed values (consumed by subset_and_replace_df)
participant_inclusion_criteria = dict(
    redcap_event_name=visits_inclusion_list,
    recruitment_cohort=cohort_inclusion_list,
    diagnosis_group_for_analysis=dx_inclusion_list,
    session=session_inclusion_list,
)

# Bilingual raw labels -> short English labels
# QPN_groups = {"Healthy control/Contrôle": "control", "PD   (Parkinson's Disease)/Maladie de Parkinson": "PD", np.NaN:"Unknown"}
QPN_sexes = {
    "Female/Féminin": "Female",
    "Male/Masculin": "Male",
}

# column name -> value replacement mapping (consumed by subset_and_replace_df)
# ("enrollment_group": QPN_groups is currently disabled)
col_val_replacement_criteria = dict(sex=QPN_sexes)

### Manifest

In [None]:
# Left-join demographics (on participant) and diagnosis (on participant + visit)
# onto the manifest, so every manifest row is kept.
paper_df = manifest_df.merge(demo_df, how="left", on="participant_id")
paper_df = paper_df.merge(dx_df, how="left", on=["participant_id", "redcap_event_name"])

n_tabular_participants = paper_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Apply the paper inclusion filters and shorten the raw labels
paper_df = subset_and_replace_df(
    paper_df,
    participant_inclusion_criteria,
    col_val_replacement_criteria,
)
n_paper_participants = paper_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

# Number of rows per imaging session that survived the filters
session_counts = paper_df["session"].value_counts()
print(f"session_counts: {session_counts}")

paper_df.head()

#### Demo table
- Add MRI age column

In [None]:
# Attach the age at MRI to each (participant, visit) row of the paper subset
mri_merge_keys = ["participant_id", "redcap_event_name"]
mri_age_df = mri_df[mri_merge_keys + ["MRI_age"]]
paper_mri_df = paper_df.merge(mri_age_df, how="left", on=mri_merge_keys)

n_tabular_participants = paper_mri_df["participant_id"].nunique()
print(f"Number of participants: {n_tabular_participants}")

# Re-apply the inclusion filters and label replacements to the merged table
paper_mri_df = subset_and_replace_df(
    paper_mri_df,
    participant_inclusion_criteria,
    col_val_replacement_criteria,
)
n_paper_participants = paper_mri_df["participant_id"].nunique()
print(f"Number of participants after event and group filter: {n_paper_participants}")

# Visits that remain after filtering
redcap_events = paper_mri_df["redcap_event_name"].unique()
print(f"redcap events: {redcap_events}")

paper_mri_df.head()

In [None]:
# Demographics per diagnosis group: sex counts and MRI-age summary
cat_cols, score_cols = ["sex"], ["MRI_age"]
separator = "-" * 10

for dx in dx_inclusion_list:
    print(f"*** dx: {dx} ***")
    is_dx = paper_mri_df["diagnosis_group_for_analysis"] == dx
    dx_group_df = paper_mri_df.loc[is_dx].copy()
    # default grouping column (redcap_event_name) -> one table row per visit
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols)
    print(separator, table_df, separator, sep="\n")


#### H&Y scores

In [None]:
# Verbose Hoehn & Yahr labels -> stage number (kept as a string).
# Missing entries are mapped to "N/A" so they can be dropped right below.
H_Y_stage_dict = {
    '(2) Bilateral involvement without impairment of balance': '2',
    '(3) Bilateral disease: mild to moderate disability with impaired postural reflexes; physically independent; needs assistance to recover from pull test': '3',
    '(1) Unilateral involvement only, usually with minimal or no functional disability': '1',
    '(0) Asymptomatic': '0',
    '(4) Severely disabling disease; still able to walk or stand unassisted': '4',
    np.nan: "N/A",
}

hy_df["H_Y_stage"] = hy_df['Hoehn and Yahr Stage: '].replace(H_Y_stage_dict)
# .copy() makes hy_df a standalone frame, so re-running this cell does not
# assign the H_Y_stage column into a slice of the previous frame.
hy_df = hy_df[hy_df["H_Y_stage"] != "N/A"].copy()

cat_cols = ["H_Y_stage"]
score_cols = []

hy_participants = hy_df["participant_id"].nunique()
hy_event_counts = hy_df["redcap_event_name"].value_counts()

# Labels now say "hy" (they previously said "updrs", a copy-paste leftover).
print(f"hy_participants: {hy_participants}")
print(f"hy_event_counts: {hy_event_counts}")

# dx_df carries a redcap_event_name column, so it can hold several rows per
# participant. Once that column is dropped, identical rows are collapsed so the
# merge on participant_id alone does not multiply the H&Y rows.
# NOTE(review): participants whose diagnosis differs between visits still
# yield one row per distinct diagnosis - confirm this is intended.
dx_per_participant = dx_df.drop(columns=["redcap_event_name"]).drop_duplicates()
paper_hy_df = pd.merge(dx_per_participant, hy_df, on=["participant_id"], how="right")


# Same group list as the other tables (dx_inclusion_list is ["PD", "control"])
for dx_group in dx_inclusion_list:
    print(f"*** group: {dx_group} ***")
    dx_group_df = paper_hy_df[paper_hy_df["diagnosis_group_for_analysis"]==dx_group].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols, groupby_col="diagnosis_group_for_analysis")
    print("-"*10)
    print(table_df)
    print("-"*10)


#### UPDRS table

In [None]:
# Overview of the raw UPDRS table: unique participants and rows per visit
updrs_participants = updrs_df["participant_id"].nunique()
updrs_event_counts = updrs_df["redcap_event_name"].value_counts()

print(f"updrs_participants: {updrs_participants}")
print(f"updrs_event_counts: {updrs_event_counts}")

# dx_df carries a redcap_event_name column, so it can hold several rows per
# participant. Once that column is dropped, identical rows are collapsed so the
# merge on participant_id alone does not multiply the UPDRS rows.
# NOTE(review): participants whose diagnosis differs between visits still
# yield one row per distinct diagnosis - confirm this is intended.
dx_per_participant = dx_df.drop(columns=["redcap_event_name"]).drop_duplicates()
paper_updrs_df = pd.merge(dx_per_participant, updrs_df, on=["participant_id"], how="right")

cat_cols = []
score_cols = ["Part III: Motor Examination"]

# One summary table per diagnosis group
for dx in dx_inclusion_list:
    print(f"*** dx: {dx} ***")
    dx_group_df = paper_updrs_df[paper_updrs_df["diagnosis_group_for_analysis"]==dx].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols, groupby_col="diagnosis_group_for_analysis")
    print("-"*10)
    print(table_df)
    print("-"*10)


paper_updrs_df.head()

### MoCA score

In [None]:
# Overview of the raw MoCA table: unique participants and rows per visit
moca_participants = moca_df["participant_id"].nunique()
moca_event_counts = moca_df["redcap_event_name"].value_counts()

print(f"moca_participants: {moca_participants}")
print(f"moca_event_counts: {moca_event_counts}")

# Raw (bilingual) name of the MoCA total-score column; renamed to "moca_total" below
moca_score_col = "TOTAL SCORE (make sure to include extra point for 12 years or less of education):    SCORE TOTAL (assurez-vous d'inclure un point supplémentaire pour 12 ans ou moins d'éducation) : "

cog_subtype_col = "cog_subgroup"

# dx_df carries a redcap_event_name column, so it can hold several rows per
# participant. Once that column is dropped, identical rows are collapsed so the
# merge on participant_id alone does not multiply the MoCA rows.
# NOTE(review): participants whose diagnosis differs between visits still
# yield one row per distinct diagnosis - confirm this is intended.
dx_per_participant = dx_df.drop(columns=["redcap_event_name"]).drop_duplicates()
paper_moca_df = pd.merge(dx_per_participant, moca_df, on=["participant_id"], how="right")

# Cognitive subgroup from the total score: >= 26 "CN", 21-25 "MCI", < 21 "Dementia".
# Rows with a missing score match none of the masks and keep a missing subgroup.
paper_moca_df.loc[paper_moca_df[moca_score_col] >= 26, cog_subtype_col] = "CN"
paper_moca_df.loc[(paper_moca_df[moca_score_col] >= 21) & (paper_moca_df[moca_score_col] < 26), cog_subtype_col] = "MCI"
paper_moca_df.loc[paper_moca_df[moca_score_col] < 21, cog_subtype_col] = "Dementia"

paper_moca_df = paper_moca_df.rename(columns={moca_score_col: "moca_total"}).copy()

cat_cols = []
score_cols = ["moca_total"]

# Per diagnosis group, one table row per cognitive subgroup
for dx in dx_inclusion_list:
    print(f"*** dx: {dx} ***")
    dx_group_df = paper_moca_df[paper_moca_df["diagnosis_group_for_analysis"]==dx].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols, groupby_col=cog_subtype_col)
    print("-"*10)
    print(table_df)
    print("-"*10)


# Preview the MoCA table (this line previously displayed paper_updrs_df).
paper_moca_df.head()

### Neuropsy scores

In [None]:
# Keep only the baseline visit of the neuropsych battery
neuropsy_df = neuropsy_df[neuropsy_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

neuropsy_participants = neuropsy_df["participant_id"].nunique()
neuropsy_event_counts = neuropsy_df["redcap_event_name"].value_counts()


print(f"neuropsy_participants: {neuropsy_participants}")
print(f"neuropsy_event_counts: {neuropsy_event_counts}")

# dx_df carries a redcap_event_name column, so it can hold several rows per
# participant. Once that column is dropped, identical rows are collapsed so the
# merge on participant_id alone does not multiply the neuropsych rows.
# NOTE(review): participants whose diagnosis differs between visits still
# yield one row per distinct diagnosis - confirm this is intended.
dx_per_participant = dx_df.drop(columns=["redcap_event_name"]).drop_duplicates()
paper_neuropsy_df = pd.merge(dx_per_participant, neuropsy_df, on=["participant_id"], how="right")
# Restrict to the diagnosis groups reported in the paper
paper_neuropsy_df = paper_neuropsy_df[paper_neuropsy_df["diagnosis_group_for_analysis"].isin(dx_inclusion_list)].copy()

neuropsy_dx_counts = paper_neuropsy_df["diagnosis_group_for_analysis"].value_counts()
print(f"neuropsy_dx_counts: {neuropsy_dx_counts}")

paper_neuropsy_df.head()

In [None]:
import json

# Phenotype config that lists the neuropsych columns of the REDCap export
neuropsy_json_file = "/home/nikhil/projects/neuroinformatics_tools/qpn_workflows/configs/pheno.json"
# Context manager so the file handle is closed after parsing
with open(neuropsy_json_file, encoding="utf-8") as json_fp:
    neuropsy_json = json.load(json_fp)
neuropsy_columns = neuropsy_json["variables"]["neuropsy_scores"]["sources"]["redcap"]["sarah_extended_export"]

# A column counts as a score column when its lower-cased name contains any of
# these substrings; the set guarantees each column is listed once.
possible_score_col_strings = ["raw","score","sec","time"]
neuropsy_instruments = {
    col
    for col in neuropsy_columns
    if any(col_str in col.lower() for col_str in possible_score_col_strings)
}

n_neuropsy_instruments = len(neuropsy_instruments)
print(f"n_neuropsy_instruments: {n_neuropsy_instruments}")


In [None]:
cat_cols = []
# neuropsy_instruments is a set, whose iteration order is not stable between
# runs; sorting gives the exported table a reproducible column order.
score_cols = sorted(neuropsy_instruments)

# Collect the per-group tables and concatenate once at the end, rather than
# growing a dataframe seeded with an empty DataFrame inside the loop.
group_tables = []
for dx_group in dx_inclusion_list:
    print(f"*** dx_group: {dx_group} ***")
    dx_group_df = paper_neuropsy_df[(paper_neuropsy_df["diagnosis_group_for_analysis"]==dx_group) ].copy()
    table_df = get_group_table_stats(dx_group_df, cat_cols, score_cols, groupby_col="diagnosis_group_for_analysis")
    table_df["dx_group"] = dx_group
    print("-"*10)
    print(table_df)
    print("-"*10)

    group_tables.append(table_df)

# pd.concat raises on an empty list, so fall back to an empty table
table_df_all_groups = pd.concat(group_tables, axis=0) if group_tables else pd.DataFrame()

# Transposed export: one row per instrument, one column per diagnosis group
table_df_all_groups.T.to_csv("./neuropsy_paper_table.csv", index=True, sep="\t")



### Plot pheno data

In [None]:
from enum import Enum


class my_colors(Enum):
    """Hex colours used for the two diagnosis groups."""
    CONTROL = "#8d99ae"
    PD = "#e63946"


# Two-colour palette, PD first and control second
color_list = [group_color.value for group_color in (my_colors.PD, my_colors.CONTROL)]
palette = sns.color_palette(palette=color_list)  # alternative: sns.husl_palette()

# Nine-step dark-to-light "hot" ramp
monocrome_hot = [
    "#370617", "#6a040f", "#9d0208",
    "#d00000", "#dc2f02", "#e85d04",
    "#f48c06", "#faa307", "#ffba08",
]
# Take every second colour starting from the last entry, then drop the first pick
every_second_reversed = monocrome_hot[::-2]
monochrome_hot_palette = sns.color_palette(palette=every_second_reversed)[1:]

sns.palplot(monochrome_hot_palette)


#### UPDRS

In [None]:
# Box plots of the four UPDRS part scores, split by Hoehn & Yahr stage.

# Long UPDRS column names -> short names used for plotting
updrs_cols_dict = {
    "Hoehn and Yahr Stage: ":'H_Y_stage',
    'Part I: Non-Motor Aspects of Experiences of Daily Living (nM-EDL)':'Part I',
    'Part II: Motor Aspects of Experiences of Daily Living (M-EDL)':'Part II',
    'Part III: Motor Examination':'Part III',
    'Part IV: Motor Complications':'Part IV',
}

# Verbose H&Y labels -> stage number (as a string); missing -> "N/A".
# NOTE(review): within this cell the mapping is only referenced by the
# commented-out lines below; the H&Y table cell defines the same mapping.
H_Y_stage_dict = dict(zip(
    [
    '(2) Bilateral involvement without impairment of balance',
    '(3) Bilateral disease: mild to moderate disability with impaired postural reflexes; physically independent; needs assistance to recover from pull test',
    '(1) Unilateral involvement only, usually with minimal or no functional disability',
    '(0) Asymptomatic',
    '(4) Severely disabling disease; still able to walk or stand unassisted',
    np.nan], 
    
    ['2', '3', '1', '0', '4', "N/A"]
))

# Shorten the UPDRS column names, then attach the H&Y rows for the same
# participant and visit.
# NOTE(review): the melt below needs a single "H_Y_stage" column after this
# merge. If updrs_df also has a "Hoehn and Yahr Stage: " column, the rename
# and hy_df's own H_Y_stage would collide and get _x/_y suffixes - confirm.
paper_updrs_df = paper_updrs_df.rename(columns=updrs_cols_dict)
plot_df = pd.merge(paper_updrs_df, hy_df, on=["participant_id","redcap_event_name"], how="left")
# paper_df["H_Y_stage"] = paper_df["H_Y_stage"].replace(H_Y_stage_dict)
# paper_df = paper_df[paper_df["H_Y_stage"]!="N/A"]

# Melt for plotting: one row per (participant, visit, UPDRS part) with its score
plot_df = pd.melt(plot_df, id_vars=["participant_id","redcap_event_name","diagnosis_group_for_analysis","H_Y_stage"], 
                  value_vars=['Part I', 'Part II', 'Part III', 'Part IV'], 
                  var_name="UPDRS", value_name="Score")

# plot_df = plot_df[plot_df["redcap_event_name"]=="Baseline (Arm 1: C-OPN)"]

# x-axis categories, in order; stage '0' is not listed here
stage_order = ['1', '2', '3', '4']

sns.set_theme(font_scale=2)
with sns.axes_style("whitegrid"):
    # one panel per UPDRS part (two per row), each with its own y-axis range
    g = sns.catplot(y="Score",x="H_Y_stage", order=stage_order,
                    col="UPDRS", col_wrap=2, 
                    kind="box", palette=monochrome_hot_palette, 
                    data=plot_df, aspect=2, height=5, sharey=False)
    # g.set_xlabels("")
    # g.set_xticklabels("")

#### MoCA

In [None]:
# Preview the first rows of the MoCA table before plotting
paper_moca_df.head()

In [None]:
# Box plots of the MoCA total per cognitive subgroup, one panel per diagnosis group.
# rename() returns a new dataframe, so paper_moca_df itself is left untouched.
plot_df = paper_moca_df.rename(columns={"diagnosis_group_for_analysis": "Dx"})

# Order of the cognitive subgroups on the x-axis
hue_order = ["CN", "MCI", "Dementia"]

sns.set_theme(font_scale=2)
with sns.axes_style("whitegrid"):
    g = sns.catplot(
        data=plot_df,
        kind="box",
        x=cog_subtype_col,
        y="moca_total",
        col="Dx",
        order=hue_order,
        palette=monochrome_hot_palette,
        height=5,
        aspect=2,
        sharey=True,
    )