In [13]:
# Imports + paths

from pathlib import Path
import pandas as pd

# The notebook's working directory is taken as the project root; the
# processed-data folder is resolved relative to it.
PROJECT = Path.cwd()
PROCESSED = PROJECT.joinpath("data", "processed")

print("Project:", PROJECT)
print("Processed folder:", PROCESSED)

# List the CSV files currently in the processed folder, sorted by path.
processed_csv_names = []
for csv_path in sorted(PROCESSED.glob("*.csv")):
    processed_csv_names.append(csv_path.name)
print("Processed files:", processed_csv_names)



Project: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery
Processed folder: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery/data/processed
Processed files: ['clinical_aligned.csv', 'proteomics_aligned.csv', 'transcriptomics_aligned.csv']


In [14]:
# Load A1 outputs


def _load_aligned(filename):
    """Read one CSV from PROCESSED, using its first column as the index."""
    return pd.read_csv(PROCESSED / filename, index_col=0)


clinical_a = _load_aligned("clinical_aligned.csv")
proteomics_a = _load_aligned("proteomics_aligned.csv")
transcriptomics_a = _load_aligned("transcriptomics_aligned.csv")

# Report (rows, columns) for each loaded table.
for _label, _table in (
    ("clinical_a:", clinical_a),
    ("proteomics_a:", proteomics_a),
    ("transcriptomics_a:", transcriptomics_a),
):
    print(_label, _table.shape)


clinical_a: (95, 124)
proteomics_a: (95, 11949)
transcriptomics_a: (95, 59286)


In [15]:
# Convert proteomics to numeric
#
# Each column is passed through pd.to_numeric; with errors="coerce" any value
# that cannot be parsed as a number becomes NaN instead of raising.

prot = proteomics_a.apply(lambda column: pd.to_numeric(column, errors="coerce"))

# Count NaN cells across the whole matrix.
n_missing_total = int(prot.isna().to_numpy().sum())

print("prot shape:", prot.shape)
print("total missing values:", n_missing_total)


prot shape: (95, 11949)
total missing values: 204000


In [16]:
# Drop proteins missing in >40% samples
#
# Columns are proteins: a column is kept when at most 40% of its rows are NaN.

missing_per_protein = prot.isna().mean(axis=0)  # fraction of NaN rows per column
keep_proteins = missing_per_protein.le(0.40)    # boolean mask over columns

# .copy() makes prot_qc an independent frame rather than a view of prot.
prot_qc = prot.loc[:, keep_proteins].copy()

for _label, _frame in (("proteins before:", prot), ("proteins after :", prot_qc)):
    print(_label, _frame.shape[1])


proteins before: 11949
proteins after : 9491


In [17]:
# Impute remaining missing values (median per protein)
#
# The column-wise median is computed once, then used as the fill value for
# the NaN entries of the matching column.

protein_medians = prot_qc.median(axis=0)
prot_imp = prot_qc.fillna(value=protein_medians)

n_missing_after = int(prot_imp.isna().to_numpy().sum())
print("missing after imputation:", n_missing_after)


missing after imputation: 0


In [19]:
# Save A2 outputs
#
# Only the proteomics matrix was changed in this stage (numeric coercion,
# missingness filter, median imputation), so it is the only file written.
#
# clinical_a and transcriptomics_a were loaded from clinical_aligned.csv and
# transcriptomics_aligned.csv in this same folder and are not modified by any
# cell visible in this notebook. Writing them back to the same paths would
# overwrite the upstream A1 outputs with a pandas read/write round trip, which
# is not guaranteed to reproduce the original file byte-for-byte and could
# leave a truncated input behind if the write were interrupted. The files
# already on disk are therefore left untouched for the next stage.

prot_imp.to_csv(PROCESSED / "proteomics_qc_imputed.csv")

print("Saved:")
print(" - proteomics_qc_imputed.csv")
print("Unchanged from A1 (not rewritten):")
print(" - clinical_aligned.csv")
print(" - transcriptomics_aligned.csv")



Saved:
 - proteomics_qc_imputed.csv
 - clinical_aligned.csv
 - transcriptomics_aligned.csv
