In [1]:
from pathlib import Path
import pandas as pd
from pingouin import ancova
import sys

# Extend the import search path two directories up (relative to the working
# directory) — presumably so the project-local `lib` package below resolves;
# TODO confirm the notebook is always launched from its own folder.
sys.path.append("../../")
from lib.general import get_stage_list
from lib.r_interface import tukey, tukey_multiple_dvs

Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.
Trying to import in ABI mode.


### Input

In [2]:
# Input locations: each relative path is resolved to an absolute path
# (relative to the current working directory) in a single pass.
path_demographics: Path
path_lipidomics: Path
path_lipidomics_dict: Path
path_demographics, path_lipidomics, path_lipidomics_dict = (
    Path(relative_path).resolve()
    for relative_path in (
        "../../../data/processed/adni/demographics_biomarkers.csv",
        "../../../data/processed/adni/lipidomics_total.csv",
        "../../../data/processed/adni/lipidomics_dict.csv",
    )
)

In [3]:
# Load the demographics/biomarker table and the lipid dictionary unchanged
demographics: pd.DataFrame = pd.read_csv(path_demographics)
lipidomics_dict = pd.read_csv(path_lipidomics_dict)

# Lipidomics: drop rows with any missing value, then drop exact duplicate rows
lipidomics: pd.DataFrame = (
    pd.read_csv(path_lipidomics)
    .dropna()
    .drop_duplicates()
)

# Attach the lipid measurements to the demographics rows via RID,
# keeping only the RIDs present in both tables
df: pd.DataFrame = demographics.join(
    lipidomics.set_index("RID"),
    on="RID",
    how="inner",
)

In [4]:
# Stage labels passed to the Tukey post-hoc helpers further down.
# NOTE(review): the meaning of the argument `2` is defined by
# lib.general.get_stage_list and is not visible here — TODO confirm.
stage_list = get_stage_list(2)

### ANCOVA

In [5]:
# Cast every boolean column of the merged table to integer
# (presumably so they can serve as numeric covariates in the ANCOVA — TODO confirm)
bool_cols: list[str] = list(df.select_dtypes(include=[bool]).columns)
df = df.astype({bool_col: int for bool_col in bool_cols})

In [6]:
# For each lipid class, run an ANCOVA of the class level on `stage`, adjusting
# for the covariates below, and collect the classes whose stage effect is
# significant at `alpha`.
# NOTE(review): the p-values are uncorrected ("p-unc") and one test is run per
# lipid class — consider whether a multiple-comparison correction is needed.
alpha: float = 0.05
ancova_covariates: list[str] = ["age", "sex", "bmi", "cog", "apoe4"]
lipid_differ_between_stages: list[str] = []
for lipid_class in lipidomics_dict["lipid_class"].unique():
    df_ancova: pd.DataFrame = ancova(
        data=df,
        dv=lipid_class,
        between="stage",
        covar=ancova_covariates,
        effsize="np2",
    )
    # Select the stage effect by its Source label rather than by row position,
    # so a covariate's p-value can never be picked up by accident; a missing
    # "stage" row fails loudly (IndexError) instead of passing silently.
    stage_p_value: float = df_ancova.loc[df_ancova["Source"] == "stage", "p-unc"].iloc[0]
    if stage_p_value < alpha:
        lipid_differ_between_stages.append(lipid_class)

In [7]:
# Display the lipid classes whose ANCOVA stage effect was significant
lipid_differ_between_stages

['Sphingosine_1_phosphate',
 'Ceramide_1_Phosphate',
 'Monohexosylceramide',
 'Dihexosylceramide',
 'GM1_ganglioside',
 'Sulfatide']

### Tukey HSD post hoc

In [8]:
# Tukey's HSD for strem2_log10 plus every lipid class flagged by the ANCOVA step
tukey_dvs: list[str] = ["strem2_log10", *lipid_differ_between_stages]
tukey_result: pd.DataFrame = tukey_multiple_dvs(df, tukey_dvs, stage_list)
# Show the table with every p-value that is not below 0.05 masked as NaN
tukey_result.where(tukey_result < 0.05)

R callback write-console: Loading required package: MASS
  
R callback write-console: 
Attaching package: 'TH.data'

  
R callback write-console: The following object is masked from 'package:MASS':

    geyser

  
R callback write-console: 
Attaching package: 'TH.data'

  
R callback write-console: The following object is masked from 'package:MASS':

    geyser

  


Unnamed: 0,"(CSF-/PET-, CSF+/PET-)","(CSF-/PET-, CSF-/PET+)","(CSF-/PET-, CSF+/PET+)","(CSF+/PET-, CSF-/PET+)","(CSF+/PET-, CSF+/PET+)","(CSF-/PET+, CSF+/PET+)"
strem2_log10,0.004183,,,,,
Sphingosine_1_phosphate,,,0.031611,,,
Ceramide_1_Phosphate,,,,,,
Monohexosylceramide,,,0.025335,,,
Dihexosylceramide,,,0.008511,,0.00867,
GM1_ganglioside,0.04772,,,0.001581,0.003822,
Sulfatide,,,0.035636,,,


In [9]:
# ANCOVA of strem2_log10 on stage, adjusted for the covariates listed below.
# NOTE(review): unlike the lipid-class ANCOVA, "bmi" is not among the
# covariates here — confirm this is intentional.
strem2_covariates: list[str] = ["age", "sex", "cog", "apoe4"]
ancova(data=df, dv="strem2_log10", between="stage", covar=strem2_covariates, effsize="np2")

Unnamed: 0,Source,SS,DF,F,p-unc,np2
0,stage,0.700578,3,4.43189,0.004355,0.026004
1,age,0.714106,1,13.552397,0.000257,0.026493
2,sex,0.045603,1,0.865464,0.352665,0.001735
3,cog,0.025848,1,0.490548,0.48401,0.000984
4,apoe4,0.059457,1,1.128373,0.288637,0.002261
5,Residual,26.240732,498,,,


In [10]:
# Tukey's post-hoc test for strem2_log10 across the stages in `stage_list`,
# using the project helper lib.r_interface.tukey; the displayed table has one
# row per stage pair with coef, lower, upper, std_err and p_adj columns.
tukey(df, "strem2_log10", stage_list)

Unnamed: 0,coef,lower,upper,std_err,p_adj
"(CSF-/PET-, CSF+/PET-)",0.184211,0.04489,0.323532,0.054753,0.00374
"(CSF-/PET-, CSF-/PET+)",0.036573,-0.061851,0.134996,0.038681,0.767844
"(CSF-/PET-, CSF+/PET+)",0.06608,-0.001939,0.1341,0.026732,0.059712
"(CSF+/PET-, CSF-/PET+)",-0.147638,-0.308353,0.013077,0.063161,0.083757
"(CSF+/PET-, CSF+/PET+)",-0.118131,-0.255962,0.019701,0.054168,0.120142
"(CSF-/PET+, CSF+/PET+)",0.029508,-0.074971,0.133987,0.04106,0.882496
