In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Raw ADNI inputs. The relative locations are resolved to absolute paths
# against the current working directory.
path_input_proteomics: Path = Path(
    "../../../data/original/adni/CruchagaLab_CSF_SOMAscan7k_Protein_matrix_postQC_20230620.csv"
).resolve()
path_input_dict: Path = Path(
    "../../../data/original/adni/ADNI_Cruchaga_lab_CSF_SOMAscan7k_analyte_information_20_06_2023.csv"
).resolve()

# Processed outputs written by this notebook.
path_output_proteomics: Path = Path(
    "../../../data/processed/adni/somascan.csv"
).resolve()
path_output_dict: Path = Path(
    "../../../data/processed/adni/somascan_dict.csv"
).resolve()

### Proteomics dictionary

In [3]:
# Columns read from the analyte-information file, each paired with the name
# it is given in the processed dictionary.
_dict_column_map: dict[str, str] = {
    "Analytes": "label",
    "TargetFullName": "target_full_name",
    "Target": "target",
    "UniProt": "uniprot_id",
    "EntrezGeneSymbol": "entrez_gene_symbol",
}
# Source column names (passed to read_csv) and their replacements, in the
# same order.
usecols_dict: list[str] = list(_dict_column_map.keys())
colnames_dict: list[str] = list(_dict_column_map.values())

In [4]:
# Input: read only the selected columns of the analyte-information file
proteomics_dict: pd.DataFrame = pd.read_csv(path_input_dict, usecols=usecols_dict)
# Discard rows with any missing value, then duplicated rows, and finally
# let pandas pick the best-fitting dtypes
proteomics_dict = proteomics_dict.dropna()
proteomics_dict = proteomics_dict.drop_duplicates()
proteomics_dict = proteomics_dict.convert_dtypes()

In [5]:
# Rename the source columns to their processed names (pairs are matched by position)
proteomics_dict.rename(
    columns={source: target for source, target in zip(usecols_dict, colnames_dict)},
    inplace=True,
)

In [6]:
# Replace dots with underscores
# Keep an independent copy of the original labels: they are needed later to
# select and rename the columns of the proteomics matrix, and must not be
# affected by the overwrite of the "label" column below.
label_old: pd.Series = proteomics_dict["label"].copy()
# regex=False makes "." a literal dot. The default of `regex` differs between
# pandas versions, and as a regular expression "." would match any character.
proteomics_dict["label"] = proteomics_dict["label"].str.replace(".", "_", regex=False)

In [7]:
# Output
# Create the destination directory first so that to_csv does not fail on a
# fresh checkout where the processed-data folder does not exist yet.
path_output_dict.parent.mkdir(parents=True, exist_ok=True)
proteomics_dict.to_csv(path_output_dict, index=False)

### CSF proteomics SomaScan

In [8]:
# Columns to read from the protein matrix: the two identifier columns
# followed by every analyte label kept in the dictionary (original spelling)
usecols_proteomics: list[str] = ["RID", "VISCODE2", *label_old.tolist()]

In [9]:
# Read only the selected columns of the protein matrix
proteomics: pd.DataFrame = pd.read_csv(path_input_proteomics, usecols=usecols_proteomics)
# Remove duplicated rows, then let pandas pick the best-fitting dtypes
proteomics = proteomics.drop_duplicates()
proteomics = proteomics.convert_dtypes()

In [10]:
# Rename the analyte columns from their original labels to the new labels
# stored in the dictionary (pairs are matched by position)
proteomics.rename(
    columns={old: new for old, new in zip(label_old, proteomics_dict["label"])},
    inplace=True,
)

In [11]:
# Keep only baseline data, and sort by RID
# Written as one chain in which every step returns a new frame: calling
# drop/sort_values/reset_index with inplace=True on the result of a .loc
# filter can trigger pandas' SettingWithCopyWarning.
proteomics: pd.DataFrame = (
    proteomics.loc[proteomics["VISCODE2"].isin(["bl"])]  # rows whose visit code is "bl"
    .drop(columns=["VISCODE2"])  # no longer needed once filtered
    .sort_values(by="RID", ascending=True)
    .reset_index(drop=True)  # discard the pre-filter row index
)

In [12]:
# Output
# Create the destination directory first so that to_csv does not fail on a
# fresh checkout where the processed-data folder does not exist yet.
path_output_proteomics.parent.mkdir(parents=True, exist_ok=True)
proteomics.to_csv(path_output_proteomics, index=False)