In [1]:
# Imports and project directories
#
# PROJECT is the kernel's current working directory; PROCESSED is the
# "data/processed" folder underneath it.

from pathlib import Path
import pandas as pd

PROJECT = Path.cwd()
PROCESSED = PROJECT.joinpath("data", "processed")

# One line per directory, written as "<label> <path>".
print(
    *(f"{tag} {folder}" for tag, folder in (("Project:", PROJECT), ("Processed:", PROCESSED))),
    sep="\n",
)



Project: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery
Processed: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery/data/processed


In [2]:
# Read the proteomics and RNA matrices
#
# Both CSVs come from the processed-data folder and use their first column
# as the row index.

prot, rna = (
    pd.read_csv(PROCESSED.joinpath(filename), index_col=0)
    for filename in ("proteomics_qc_imputed.csv", "rna_filtered_imputed.csv")
)

# Report (rows, columns) of each table, one per line.
print(
    *(f"{tag} {table.shape}" for tag, table in (("prot:", prot), ("rna :", rna))),
    sep="\n",
)


prot: (95, 9491)
rna : (95, 54055)


In [3]:
# Align samples
#
# Restrict both matrices to the row labels they share, in the same order, so
# that the two tables can be joined column-wise afterwards.

# Duplicate IDs would make `.loc[common]` return extra rows and break the
# one-row-per-sample pairing between the two tables, so fail early.
if not (prot.index.is_unique and rna.index.is_unique):
    raise ValueError("duplicate sample IDs in the proteomics or RNA index")

common = prot.index.intersection(rna.index)

# An empty overlap would otherwise only surface later as an opaque
# "0 samples" error from the scaler.
if common.empty:
    raise ValueError("no shared sample IDs between the proteomics and RNA matrices")

prot = prot.loc[common].copy()
rna  = rna.loc[common].copy()

print("samples used:", len(common))


samples used: 95


In [4]:
# PCA on concatenated (scaled) features
#
# Proteomics and RNA columns are joined side by side, every feature is
# standardised, and the samples are projected onto the leading principal
# components. The scores end up in `pcs` (rows = samples, columns = PC1..PCn).

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Single place to change the number of components; the column labels below
# are derived from the result instead of repeating this number.
N_COMPONENTS = 20

X = pd.concat([prot, rna], axis=1)

# NOTE(review): features are standardised individually, so the block with
# more columns (RNA, going by the shapes printed at load time) presumably
# contributes most of the total variance — confirm this weighting is intended.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=N_COMPONENTS, random_state=1)
PCs = pca.fit_transform(X_scaled)

# Label the columns from the array that was actually returned, so the labels
# can never disagree with the number of components.
pcs = pd.DataFrame(
    PCs,
    index=X.index,
    columns=[f"PC{i}" for i in range(1, PCs.shape[1] + 1)],
)
print("pcs:", pcs.shape)





pcs: (95, 20)


In [5]:
# Baseline subtypes: k-means (k=3) on the PC scores
#
# Each sample gets a cluster id from k-means, which is then turned into a
# 1-based name ("C1", "C2", ...) and stored in `subtypes`.

from sklearn.cluster import KMeans

k = 3
km = KMeans(n_clusters=k, n_init=25, random_state=1)
km.fit(pcs)
labels = km.labels_

# 0-based cluster ids -> "C<id + 1>" names, keeping the sample index of `pcs`.
subtypes = pd.DataFrame(
    {"subtype": [f"C{cluster_id + 1}" for cluster_id in labels]},
    index=pcs.index,
)

# Cluster sizes.
print(subtypes["subtype"].value_counts())




subtype
C1    44
C2    36
C3    15
Name: count, dtype: int64


In [6]:
# Persist the PC scores and the subtype assignments
#
# Both tables are written as CSV files into the processed-data folder.

pcs.to_csv(PROCESSED.joinpath("pca_20pcs.csv"))
subtypes.to_csv(PROCESSED.joinpath("subtype_labels.csv"))

# Summary of what was written, one item per line.
print("Saved:", " - pca_20pcs.csv", " - subtype_labels.csv", sep="\n")


Saved:
 - pca_20pcs.csv
 - subtype_labels.csv
