In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
import plotly.express as px
import sys

sys.path.append("../../")
from lib.stats import cluster_corr_df, remove_diagonal

### Input

In [2]:
# Input CSV locations. Each one is written relative to the notebook's working
# directory and resolved to an absolute Path up front.
path_input_demographics, path_input_dict, path_input_proteomics, path_input_davidson = (
    Path(relative_csv).resolve()
    for relative_csv in (
        "../../../data/processed/adni/demographics_tau.csv",
        "../../../data/processed/adni/somascan_dict.csv",
        "../../../data/processed/adni/somascan.csv",
        "../../../data/processed/other/hdl_proteome_davidson.csv",
    )
)

In [3]:
# Load the four input tables. convert_dtypes() switches every column to the
# best-fitting pandas nullable dtype right after reading.
df_demographics, df_dict, df_proteomics, df_davidson = (
    pd.read_csv(csv_path).convert_dtypes()
    for csv_path in (
        path_input_demographics,
        path_input_dict,
        path_input_proteomics,
        path_input_davidson,
    )
)

### Processing

In [4]:
# Attach the proteomics measurements to the demographics table, matching on
# RID. The inner join keeps only participants present in both tables; the row
# index is renumbered afterwards.
proteomics_by_rid = df_proteomics.set_index("RID")
df: pd.DataFrame = df_demographics.join(proteomics_by_rid, on="RID", how="inner")
df = df.reset_index(drop=True)

In [5]:
# Restrict the SomaScan dictionary to entries whose uniprot_id also appears in
# the Davidson HDL proteome list (inner join), carrying over the Davidson
# columns. Note that this re-binds `df_dict` to the joined table.
davidson_by_uniprot = df_davidson.set_index("uniprot_id")
df_dict: pd.DataFrame = df_dict.join(
    davidson_by_uniprot, on="uniprot_id", how="inner"
)
df_dict = df_dict.reset_index(drop=True)

In [6]:
# Keep only the demographic columns plus the protein columns whose label is
# listed in the (Davidson-filtered) dictionary.
# The explicit .copy() makes `df` an independent frame: later cells assign
# columns into it (log10 of ptau/ttau, imputed protein values), and without the
# copy pandas flags those writes with SettingWithCopyWarning because `df` would
# still be marked as derived from the wider joined table.
df: pd.DataFrame = df[
    df_demographics.columns.tolist() + df_dict["label"].tolist()
].copy()

In [7]:
# Put both tau measurements on a log10 scale. The raw columns are overwritten,
# so executing this cell twice on the same `df` would apply the log twice.
for tau_column in ("ptau", "ttau"):
    df[tau_column] = np.log10(df[tau_column])

### PCA

In [8]:
# Build the matrix used for PCA: drop every demographic column except the RID
# key, index by RID, then flip the table so that
# rows = proteins and columns = observations (one per RID).
demographic_only = [col for col in df_demographics.columns if col != "RID"]
data_pca: pd.DataFrame = df.drop(columns=demographic_only).set_index("RID").T

In [9]:
# Z-score each protein (row) across its observations. Values are cast to plain
# float first, and NaNs are left out of the mean / standard deviation.
protein_values = data_pca.astype(float)
data_standardized: pd.DataFrame | np.ndarray = zscore(
    protein_values, nan_policy="omit", axis=1
)
# Outlier masking (z-score above 3 -> NaN) is currently disabled:
# data_pca[data_standardized > 3] = np.nan

In [10]:
# Fill the remaining gaps with a mean. SimpleImputer works column by column and
# the matrix here is laid out as (proteins, observations), so each missing value
# is replaced by the mean standardized value of that observation across proteins.
# NOTE(review): confirm that per-observation (rather than per-protein) imputation
# is the intended behaviour.
imputer: SimpleImputer = SimpleImputer(strategy="mean")
data_imputed: np.ndarray = imputer.fit(data_standardized).transform(data_standardized)
# Write the standardized + imputed values back into the protein columns of `df`
# (transposed back to observations x proteins).
df[df_dict["label"]] = data_imputed.T

### Determine the number of PCs and cluster proteins

In [11]:
# Disambiguate repeated gene symbols: the n-th repeat of a symbol (n = 0 for its
# first occurrence) gets n asterisks appended, e.g. X, X*, X**.
occurrence_rank = df_dict.groupby("entrez_gene_symbol").cumcount()
asterisk_suffix = occurrence_rank.apply(lambda rank: "*" * (rank)).astype(str)
df_dict["entrez_gene_symbol"] = df_dict["entrez_gene_symbol"] + asterisk_suffix

# Lookup from target label (the protein column name) to its now-unique gene symbol
map_label: dict[str, str] = {
    label: symbol
    for label, symbol in zip(df_dict["label"], df_dict["entrez_gene_symbol"])
}

### Dashboard

In [12]:
# Observations-by-proteins view of the raw (not standardized, not imputed)
# protein matrix, used for the correlation heatmap.
# Cast to plain float first, as the z-score cell already does: after
# convert_dtypes() and a transpose the frame can hold nullable or object
# columns, which DataFrame.corr() may silently drop or fail to convert.
subset: pd.DataFrame = data_pca.astype(float).transpose().copy()

In [13]:
# Pairwise Pearson correlation between proteins, passed through the project
# helper cluster_corr_df (presumably reorders rows/columns so correlated proteins
# sit together - confirm in lib.stats).
df_plot = cluster_corr_df(subset.corr())

# remove_diagonal is a project helper (presumably blanks the self-correlations -
# confirm in lib.stats). Axis ticks show gene symbols instead of raw labels.
heatmap_values = remove_diagonal(df_plot)
gene_symbols_x = df_plot.columns.map(map_label)
gene_symbols_y = df_plot.index.map(map_label)

fig = px.imshow(
    heatmap_values,
    x=gene_symbols_x,
    y=gene_symbols_y,
    labels={"y": "protein", "x": "protein", "color": "Pearson_r"},
    color_continuous_scale=px.colors.diverging.RdBu_r,
    zmin=-1,
    zmax=1,
    aspect="auto",
)

# Fixed figure size, black text and a white plot background; no title text.
# The result is assigned back to `fig` so the cell itself produces no output.
fig = fig.update_layout(
    width=900,
    height=700,
    font={"color": "black", "size": 12},
    title={"text": None, "font": {"color": "black", "size": 22}},
    plot_bgcolor="rgba(255, 255, 255, 1)",
)

In [14]:
# Render the protein-protein correlation heatmap configured in the previous cell.
fig.show()