In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# I/O locations for the lipidomics table: the raw CSV under data/original/adni
# and the processed CSV under data/processed/adni.
# Both are written relative to the working directory and made absolute by .resolve().
input_path: Path = Path(
    "../../../data/original/adni/ADMCLIPIDOMICSMEIKLELABLONG_13Jul2023.csv"
).resolve()
output_path: Path = Path("../../../data/processed/adni/lipidomics.csv").resolve()

In [3]:
# Define columns to drop.
# These are removed from the raw table as soon as it is read. VISCODE2 is not
# listed because it is still needed to select the baseline rows. UBIQUINONE is
# dropped here and is likewise removed from the lipid dictionary.
drop_columns: list[str] = [
    "VISCODE",
    "VID",
    "EXAMDATE",
    "COHORT",
    "SAMPLE.ID",
    "update_stamp",
    "UBIQUINONE",
]

### Lipidomics dictionary

In [4]:
# Define dictionary I/O paths: the Excel workbook the lipid dictionary is read
# from and the CSV the processed dictionary is written to.
# Both are written relative to the working directory and made absolute by .resolve().
input_path_dict: Path = Path(
    "../../../data/original/adni/Lipid_Models_Final.xlsx"
).resolve()
output_path_dict: Path = Path(
    "../../../data/processed/adni/lipidomics_dict.csv"
).resolve()

In [5]:
# Load the "term" and "Set" columns from the first sheet of the workbook,
# then give them the names used throughout the notebook.
lipid_dict: pd.DataFrame = pd.read_excel(
    input_path_dict, sheet_name=0, usecols=["term", "Set"]
)
lipid_dict = lipid_dict.rename(columns={"term": "lipid", "Set": "lipid_class"})

In [6]:
# Remove UBIQUINONE (it is also dropped from the measurement table via drop_columns).
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not write into a slice of the original
# frame (which would raise SettingWithCopyWarning).
lipid_dict = lipid_dict.loc[lipid_dict["lipid"] != "UBIQUINONE"].copy()

In [7]:
# Derive a label column for each name column by replacing '.', '-', or ' ' with '_'
for source_col, label_col in (
    ("lipid", "lipid_label"),
    ("lipid_class", "lipid_class_label"),
):
    lipid_dict[label_col] = lipid_dict[source_col].str.replace(
        pat=r"\.|\-| ", repl="_", regex=True
    )

In [8]:
# Add units: every row of the dictionary gets the same unit string
lipid_dict["unit"] = "nM"

### Lipidomics

In [9]:
# Load the raw table and discard the columns listed in drop_columns
df: pd.DataFrame = pd.read_csv(input_path)
df = df.drop(columns=drop_columns)
# Remove rows with any missing value, then exact duplicate rows,
# and finally let pandas pick the best-fitting dtypes
df = df.dropna().drop_duplicates()
df = df.convert_dtypes()

In [10]:
# Keep only rows whose VISCODE2 is "bl" (baseline); the column is not needed afterwards
is_baseline = df["VISCODE2"].isin(["bl"])
df = df.loc[is_baseline].drop(columns=["VISCODE2"])
# Order the rows by RID and use RID as the index
df = df.sort_values(by="RID", ascending=True).set_index("RID")

In [11]:
# Switch the lipid columns to their sanitized labels, then write the log10 values
label_by_lipid = dict(zip(lipid_dict["lipid"], lipid_dict["lipid_label"]))
df = df.rename(columns=label_by_lipid)
np.log10(df).to_csv(output_path, index=True)

### Total plasma lipidomics

In [12]:
# Start from an empty frame that shares df's index; the per-class totals are added as columns
lipidomics_total: pd.DataFrame = pd.DataFrame(data=None, index=df.index)

In [13]:
# For every lipid class label, add up the columns of all lipids in that class (row-wise)
class_labels = lipid_dict["lipid_class_label"]
for class_label in class_labels.unique():
    members = lipid_dict.loc[class_labels == class_label, "lipid_label"].tolist()
    lipidomics_total[class_label] = df[members].sum(axis=1)

In [14]:
# Log10 transform total lipidomics data and output.
# The columns of lipidomics_total are created from "lipid_class_label", so they
# already carry the sanitized labels; renaming them from "lipid_class" to
# "lipid_class_label" would change nothing and is therefore omitted.
# The output file sits next to output_path, with "lipidomics" in the file name
# replaced by "lipidomics_total".
output_total = output_path.with_name(
    output_path.name.replace("lipidomics", "lipidomics_total")
)
np.log10(lipidomics_total).to_csv(output_total, index=True)

### Output dictionary

In [15]:
# Output dictionary.
# The exported table keeps only the sanitized label columns (renamed to "lipid"
# and "lipid_class") plus the remaining columns. It is built as a new frame
# rather than by mutating lipid_dict in place: an in-place drop + rename would
# make a second run of this cell drop the freshly renamed columns and silently
# write an incomplete dictionary, and would break re-running earlier cells that
# still need the original and label columns.
lipid_dict_out: pd.DataFrame = lipid_dict.drop(
    columns=["lipid", "lipid_class"]
).rename(columns={"lipid_label": "lipid", "lipid_class_label": "lipid_class"})
lipid_dict_out.to_csv(output_path_dict, index=False)