In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# File locations: the source CSV that is read, and the two CSVs that are
# written. Each relative path is resolved to an absolute one (relative paths
# are interpreted against the current working directory).
path_input: Path
path_output_lipoprotein: Path
path_output_dict: Path
path_input, path_output_lipoprotein, path_output_dict = (
    Path(relative_path).resolve()
    for relative_path in (
        "../../../data/original/adni/ADNINIGHTINGALELONG_05_24_21_27Jul2023.csv",
        "../../../data/processed/adni/lipoprotein.csv",
        "../../../data/processed/adni/lipoprotein_dict.csv",
    )
)

### Lipoprotein

In [3]:
# Read data: only the columns at positions 0, 2 and 3 plus positions 29-278
# are loaded (presumably RID, VISCODE2, EXAMDATE and the measurements —
# confirm against the file header). EXAMDATE is parsed as a date.
lipo: pd.DataFrame = pd.read_csv(
    path_input,
    usecols=[0, 2, 3, *range(29, 279)],
    parse_dates=["EXAMDATE"],
    low_memory=False,
)
# Remove rows with any missing value, then exact duplicate rows, and finally
# let pandas pick the best NA-aware dtype for every column.
lipo = lipo.dropna().drop_duplicates().convert_dtypes()

In [4]:
# Baseline only: retain rows whose VISCODE2 is "bl" and renumber the index
lipo: pd.DataFrame = (
    lipo.loc[lambda visits: visits["VISCODE2"].isin(["bl"])]
    .reset_index(drop=True)
)

In [5]:
# List the string-typed columns, i.e. the ones holding "TAG" entries.
# The first string column is skipped (presumably the VISCODE2 visit code —
# confirm).
lipo.select_dtypes(include="string").columns[1:]

Index(['GLN', 'PYRUVATE', 'GLYCEROL', 'BOHBUTYRATE', 'CREATININE'], dtype='object')

In [6]:
# Remove every row in which any string-typed column equals "TAG", then
# renumber the index.
# There are 36 such rows
lipo: pd.DataFrame = lipo[
    ~lipo.select_dtypes(include="string").eq("TAG").any(axis="columns")
].reset_index(drop=True)

### Lipoprotein dictionary

In [7]:
# Allowed values for each component of a lipoprotein label.
# The list order is meaningful: it becomes the category order used when the
# dictionary is sorted.
sizes: list[str] = [
    "XS",
    "S",
    "M",
    "L",
    "XL",
    "XXL",
]
densities: list[str] = [
    "VLDL",
    "LDL",
    "IDL",
    "HDL",
]
suffices: list[str] = [
    "C",
    "CE",
    "FC",
    "L",
    "P",
    "PL",
    "TG",
]

In [8]:
# Start the lipoprotein dictionary: one row per column name of `lipo`,
# stored in a single "label" column
df: pd.DataFrame = pd.DataFrame({"label": lipo.columns})

In [9]:
# Working copy of the label. Labels containing "IDL" get an "M_" prefix so
# they have the same number of "_"-separated parts as the other labels.
df["temp"] = df["label"]
# The right-hand side is aligned on the index, so only the selected rows change
df.loc[df["temp"].str.contains("IDL"), "temp"] = "M_" + df["temp"]

In [10]:
# Break the normalised label on "_" into at most four pieces (three splits)
# and store them, in order, as size / density / suffix / pct.
# Labels with fewer pieces leave the trailing columns empty.
df[["size", "density", "suffix", "pct"]] = (
    df["temp"].str.split("_", n=3, expand=True)
)

In [11]:
# Turn each label component into an ordered categorical built from the
# predefined lists; any value not in its list becomes missing.
df["size"] = df["size"].astype(
    pd.CategoricalDtype(categories=sizes, ordered=True)
)
df["density"] = df["density"].astype(
    pd.CategoricalDtype(categories=densities, ordered=True)
)
df["suffix"] = df["suffix"].astype(
    pd.CategoricalDtype(categories=suffices, ordered=True)
)
# Boolean flag: True only where the fourth label component is exactly "PCT"
df["pct"] = df["pct"].eq("PCT").astype(bool)

In [12]:
# Units: "mM" by default, "%" for percentage rows, and "M" for every row
# whose suffix is "P" (applied last, so it takes precedence)
df["unit"] = "mM"
df.loc[df["pct"], "unit"] = "%"
df.loc[df["suffix"] == "P", "unit"] = "M"

In [13]:
# Finalise the dictionary:
#   1. drop rows with a missing value in any column, i.e. labels whose size,
#      density or suffix did not match one of the known categories;
#   2. remove the temporary helper column;
#   3. sort by pct, density, size and suffix (category order) and renumber.
# The column drop is chained with errors="ignore" instead of being done in
# place, so re-running this cell after "temp" is already gone does not raise
# a KeyError.
df: pd.DataFrame = (
    df.dropna()
    .drop(columns=["temp"], errors="ignore")
    .sort_values(by=["pct", "density", "size", "suffix"], ascending=True)
    .reset_index(drop=True)
)

### Output

In [14]:
# Make sure the destination folders exist: to_csv does not create missing
# directories and would fail otherwise
path_output_lipoprotein.parent.mkdir(parents=True, exist_ok=True)
path_output_dict.parent.mkdir(parents=True, exist_ok=True)

# Save as CSV: the measurement table keeps RID plus the columns listed in the
# dictionary (in dictionary order); the dictionary is written alongside it
lipo[["RID"] + df["label"].tolist()].to_csv(path_output_lipoprotein, index=False)
df.to_csv(path_output_dict, index=False)