In [None]:
import sys
from pathlib import Path
import pandas as pd

sys.path.append("../../")
from lib.stats import demographic_characteristics

### Input

In [None]:
# Input paths: SAS datasets read from the ARIC_NP server (Z: drive).
# Each location is made absolute with .resolve().
# Visit 5 folder
path_derive54: Path = Path("Z:/DATA_NP/Visits/Visit 5/derive54_np.sas7bdat").resolve()
# Visit 1 folder
path_derive13: Path = Path("Z:/DATA_NP/Visits/Visit 1/derive13_np.sas7bdat").resolve()
# Visit 5 folder
path_ncs51: Path = Path("Z:/DATA_NP/Visits/Visit 5/derive_ncs51_np.sas7bdat").resolve()

In [None]:
# Define input paths for project-local files (these are read, not written).
# The relative locations are resolved against the current working directory.
path_population: Path
path_lipoprotein: Path
path_dictionary: Path
path_population, path_lipoprotein, path_dictionary = (
    Path(relative_location).resolve()
    for relative_location in (
        # sample-selection spreadsheet; file name spelling is kept verbatim
        "../../../data/original/aric/sample_selection/all_eleigible_samples_AS2021_25v3.xlsx",
        # lipoprotein measurements
        "../../../data/original/aric/lipoproteins_6_29_23.csv",
        # variable-name dictionary
        "../../../data/processed/aric/dictionary.csv",
    )
)

In [None]:
# Define output paths (files written by the final cell of this notebook).
# The relative locations are resolved against the current working directory.
path_output_lipoprotein_list: Path
path_output_pilot: Path
path_output_demographic_characteristics: Path
(
    path_output_lipoprotein_list,
    path_output_pilot,
    path_output_demographic_characteristics,
) = (
    Path(relative_location).resolve()
    for relative_location in (
        "../../../data/processed/aric/lipoprotein_list.csv",
        "../../../data/processed/aric/pilot.csv",
        "../../../assets/tables/aric/demographic_characteristics.csv",
    )
)

In [None]:
# Read files
# Every table is passed through convert_dtypes() right after loading.

# The three SAS datasets are read with identical options, in the same order
# as their path variables.
derive54: pd.DataFrame
derive13: pd.DataFrame
ncs51: pd.DataFrame
derive54, derive13, ncs51 = (
    pd.read_sas(sas_path, format="sas7bdat", encoding="latin-1").convert_dtypes()
    for sas_path in (path_derive54, path_derive13, path_ncs51)
)

# Sample-selection spreadsheet: first sheet, first row as header. The ID column
# is renamed from "subjectid" to "SubjectID".
population: pd.DataFrame = pd.read_excel(path_population, sheet_name=0, header=0)
population = population.rename(columns={"subjectid": "SubjectID"}).convert_dtypes()

lipoprotein: pd.DataFrame = pd.read_csv(path_lipoprotein).convert_dtypes()

# Variable dictionary: rows with no source-column name in any of the three
# datasets are dropped.
dictionary: pd.DataFrame = pd.read_csv(path_dictionary)
dictionary = dictionary.dropna(
    subset=["derive54_np", "derive13_np", "derive_ncs51_np"], how="all"
).convert_dtypes()

### Define inclusion

In [None]:
# Inclusion criteria
# Each flag is stored as a plain bool column so it can be used directly as a
# row mask when the per-criterion ID sets are built.
# A missing flag counts as "criterion not met": comparing a nullable column
# with 1 yields <NA> for missing entries, and casting <NA> to bool raises, so
# <NA> is replaced by False before the cast.

# 1. have fasting unthawed plasma samples at V5
population["criteria1"] = population["criteria1"].eq(1).fillna(False).astype(bool)

# 2. were adjudicated CN or MCI at V5
# Codes outside the listed categories become missing and therefore fail isin().
population["COGDIAG51"] = pd.Categorical(
    population["COGDIAG51"], categories=["N", "M", "D", "U"], ordered=True
)
population["cn_or_mci"] = (population["COGDIAG51"].isin(["N", "M"])).astype(bool)

# 3. have brain MRI scans at V5
# A non-missing ADSIGREGVOL51 value is used as the indicator.
ncs51["have_mri"] = ncs51["ADSIGREGVOL51"].notna()

# 4. have amyloid PET data from ARIC-PET study
# NOTE: this fourth criterion is read from the spreadsheet column "criteria3".
population["criteria3"] = population["criteria3"].eq(1).fillna(False).astype(bool)

In [7]:
# Build one set of subject IDs per inclusion criterion (c1..c4 follow the
# numbering of the criteria list). c3 comes from the ncs51 table, whose ID
# column is spelled "SUBJECTID"; the others come from the population sheet.
c1: set[str] = set(population.loc[population["criteria1"], "SubjectID"])
c2: set[str] = set(population.loc[population["cn_or_mci"], "SubjectID"])
c3: set[str] = set(ncs51.loc[ncs51["have_mri"], "SUBJECTID"])
c4: set[str] = set(population.loc[population["criteria3"], "SubjectID"])

In [8]:
# Derive the eligible / included / excluded ID sets.
# eligible: subjects meeting all four criteria
id_eligible: set[str] = c1 & c2 & c3 & c4
# included: every ID present in the lipoprotein file
id_included: set[str] = set(lipoprotein["ID"])
# excluded: eligible subjects without a row in the lipoprotein file
id_excluded: set[str] = id_eligible - id_included

### Processing

In [None]:
# Build one lookup per source dataset: project variable name -> column name in
# that dataset. Variables with no column name for a dataset are left out of
# that dataset's lookup.
dict_derive54: dict[str, str] = (
    dictionary.set_index("variable_name")["derive54_np"].dropna().to_dict()
)
dict_derive13: dict[str, str] = (
    dictionary.set_index("variable_name")["derive13_np"].dropna().to_dict()
)
dict_ncs51: dict[str, str] = (
    dictionary.set_index("variable_name")["derive_ncs51_np"].dropna().to_dict()
)

In [None]:
# Keep only eligible subjects and the dictionary-listed columns from derive54,
# then attach the dictionary-listed columns of derive13 and ncs51.
# Column selectors are materialised as lists before indexing.
df_eligible: pd.DataFrame = derive54.loc[
    derive54["SUBJECTID"].isin(id_eligible), list(dict_derive54.values())
]
df_eligible = df_eligible.reset_index(drop=True)

# Inner joins on SUBJECTID: a subject missing from either table is dropped.
df: pd.DataFrame = df_eligible.join(
    derive13.set_index("SUBJECTID")[list(dict_derive13.values())],
    on="SUBJECTID",
    how="inner",
)
df = df.join(
    ncs51.set_index("SUBJECTID")[list(dict_ncs51.values())],
    on="SUBJECTID",
    how="inner",
)

In [11]:
# Tag each row with its stage. Labels and ID sets are paired positionally:
# "included" <- id_included, "excluded" <- id_excluded.
stage_list: list[str] = ["included", "excluded"]
for stage_label, stage_ids in zip(stage_list, (id_included, id_excluded)):
    df.loc[df["SUBJECTID"].isin(stage_ids), "stage"] = stage_label

In [12]:
# For cholesterols and triglycerides, convert from SI unit mM to mg/dL.
# Only dictionary rows that carry a conversion factor are processed. The
# converted column is named after the source column with "SIU" removed.
rows_with_factor = dictionary.dropna(subset=["conversion_factor"])
for si_column, factor in zip(
    rows_with_factor["derive54_np"], rows_with_factor["conversion_factor"]
):
    converted_column = si_column.replace("SIU", "")
    df[converted_column] = (df[si_column] / factor).astype(float)

In [None]:
# Sanitize categorical variables
# Each listed column is recast as an ordered categorical with a fixed set of
# levels. Values outside the listed levels become missing.
# NOTE(review): the COGDIAG51 level order here ("N", "U", "M", "D") differs
# from the order used on the population sheet ("N", "M", "D", "U") - confirm
# which ordering is intended.
category_levels: dict[str, list] = {
    "stage": stage_list,
    "CURSMK52": [0, 1],
    "GENDER51": ["F", "M"],
    "RACEGRP51": ["A", "B", "I", "W"],
    "CENTER": ["F", "J", "M", "W"],
    "DIABTS54": [0, 1],
    "HYPERT55": [0, 1],
    "CHOLMDCODE53": [0, 1],
    "HYPTMDCODE52": [0, 1],
    "STATINCODE52": [0, 1],
    "PRVCHD51": [0, 1],
    "COGDIAG51": ["N", "U", "M", "D"],
    "ELEVEL02": [1, 2, 3],
    "PREVDEFPOSSHF51": [0, 1],
    "PRVSTR51": [0, 1],
}
for column_name, levels in category_levels.items():
    df[column_name] = pd.Categorical(df[column_name], categories=levels, ordered=True)

In [14]:
# Calculate statistics
# Dtypes are re-inferred first, then the table is built by the project helper
# from the frame and the ordered list of stage labels.
df = df.convert_dtypes()
df_stats: pd.DataFrame = demographic_characteristics(df, stage_list)

### Join lipoprotein data

In [None]:
# Define the list of lipoproteins
# Taken by position: columns 4 through 128 of the lipoprotein file (at most
# 125 names). NOTE(review): this assumes a fixed column layout - confirm
# against the file.
lipoprotein_list: list[str] = list(lipoprotein.columns[4:129])
df_lipoprotein_list: pd.DataFrame = pd.DataFrame({"lipoprotein": lipoprotein_list})

In [None]:
# Join lipoprotein data
# Inner join of df.SUBJECTID against lipoprotein.ID: only subjects with a row
# in the lipoprotein file remain.
df = df.join(
    lipoprotein.set_index("ID").loc[:, lipoprotein_list],
    on="SUBJECTID",
    how="inner",
)

### Output

In [17]:
# Output
# Write each result as CSV. Only the statistics table keeps its index.
for frame, destination, keep_index in (
    (df_lipoprotein_list, path_output_lipoprotein_list, False),
    (df, path_output_pilot, False),
    (df_stats, path_output_demographic_characteristics, True),
):
    frame.to_csv(destination, index=keep_index)