In [1]:
from pathlib import Path
import pandas as pd

### Input

In [2]:
# File locations used by this notebook. They are written relative to the
# working directory and converted to absolute paths once, here.
path_input_demographics: Path
path_input_bmi: Path
path_output: Path
path_input_demographics, path_input_bmi, path_output = (
    Path(relative).resolve()
    for relative in (
        "../../../data/original/adni/ADNIMERGE_14Jul2023.csv",  # demographics (input)
        "../../../data/processed/adni/bmi.csv",  # BMI table (input)
        "../../../data/processed/adni/demographics_tau.csv",  # result (output)
    )
)

In [3]:
# Columns read from the demographics file; everything else is skipped on load.
usecols_: list[str] = (
    ["RID", "VISCODE"]  # join key and visit code
    + ["DX_bl", "AGE", "PTGENDER", "APOE4"]  # diagnosis and covariates
    + ["PTAU_bl", "TAU_bl"]  # biomarker values
)

In [4]:
# Load the demographics table, restricted to the selected columns.
df_demographics: pd.DataFrame = pd.read_csv(
    path_input_demographics, usecols=usecols_, low_memory=False
)
# Keep only complete, unique rows, then let pandas infer its nullable dtypes.
df_demographics = df_demographics.dropna().drop_duplicates().convert_dtypes()

# Load the BMI table as-is (all columns), with the same dtype inference.
df_bmi: pd.DataFrame = pd.read_csv(path_input_bmi)
df_bmi = df_bmi.convert_dtypes()

### Processing

In [5]:
# Combine both tables on "RID". The inner join keeps only the RIDs that
# appear in the demographics table and in the BMI table.
df: pd.DataFrame = df_demographics.join(
    other=df_bmi.set_index(keys="RID"), on="RID", how="inner"
)
# Discard the index carried over from the join in favour of a fresh one.
df = df.reset_index(drop=True)

In [6]:
# Include only participants with baseline visit data (rows whose VISCODE is "bl").
# The explicit copy matters: the following cells add columns to `df`, and
# assigning into the result of a boolean-indexed selection can raise pandas'
# SettingWithCopyWarning. Boolean row selection on a DataFrame always yields a
# DataFrame, hence the plain annotation.
df: pd.DataFrame = df.loc[df["VISCODE"].isin(["bl"])].copy()

In [7]:
# Encode the diagnosis label as an integer:
#   0 = "CN" or "SMC", 1 = "EMCI" or "LMCI", 2 = "AD".
# A label missing from this table maps to a missing value, which makes the
# integer cast below fail instead of passing through silently.
cog_codes: dict[str, int] = {"CN": 0, "SMC": 0, "EMCI": 1, "LMCI": 1, "AD": 2}
df["cog"] = df["DX_bl"].map(cog_codes).astype(int)

In [8]:
# Boolean covariates:
#   sex   -> True where PTGENDER is "Male"
#   apoe4 -> True where APOE4 is greater than zero
df["sex"] = df["PTGENDER"].eq("Male").astype(bool)
df["apoe4"] = df["APOE4"].gt(0).astype(bool)

In [9]:
# Convert the biomarker columns to float. Entries carrying a '<' or '>' prefix
# are censored at the boundary: the sign is stripped and the bare number kept.
for biomarker, raw_column in (("ptau", "PTAU_bl"), ("ttau", "TAU_bl")):
    df[biomarker] = (
        df[raw_column]
        .astype(str)
        .str.replace(pat="[><]", repl="", regex=True)
        .astype(float)
    )

In [10]:
# Drop rows that still contain a missing value, then order the table by RID
# (ascending) and renumber the index from zero.
df: pd.DataFrame = df.dropna().sort_values(by="RID", ignore_index=True)

### Export

In [11]:
# Columns written to the output file, in output order.
cols_exp: list[str] = ["RID", "AGE", "sex", "bmi", "cog", "apoe4", "ptau", "ttau"]
# Exported header names: identical to the source names except "AGE" -> "age".
names_exp: list[str] = ["age" if col == "AGE" else col for col in cols_exp]

In [12]:
# Keep only the export columns (in that order) and apply the export names.
df_exp: pd.DataFrame = df.loc[:, cols_exp].rename(
    columns={old: new for old, new in zip(cols_exp, names_exp)}
)

In [13]:
# Write the final table to disk; the row index is not part of the output.
df_exp.to_csv(path_or_buf=path_output, index=False)