In [1]:
# Imports + paths

from pathlib import Path
import pandas as pd

# All paths are anchored on the current working directory of the kernel.
PROJECT = Path.cwd()
# Processed tables live under <PROJECT>/data/processed.
PROCESSED = PROJECT.joinpath("data", "processed")

# Echo both locations so a wrong working directory is visible immediately.
for _label, _location in (("Project:", PROJECT), ("Processed folder:", PROCESSED)):
    print(_label, _location)


Project: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery
Processed folder: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery/data/processed


In [2]:
# Read the two aligned tables: RNA (produced by A1) and clinical.
# index_col=0 turns the first CSV column into the row index of each frame.
clinical_a = pd.read_csv(PROCESSED.joinpath("clinical_aligned.csv"), index_col=0)
rna_a = pd.read_csv(PROCESSED.joinpath("transcriptomics_aligned.csv"), index_col=0)

# Print (rows, columns) of each frame as a quick check on what was loaded.
for _label, _frame in (("clinical_a:", clinical_a), ("rna_a:", rna_a)):
    print(_label, _frame.shape)


clinical_a: (95, 124)
rna_a: (95, 59286)


In [3]:
# Force every column of the RNA table to a numeric dtype. With
# errors="coerce", entries that cannot be parsed become NaN instead of raising.
rna = rna_a.apply(lambda column: pd.to_numeric(column, errors="coerce"))

# Total NaN count over the whole matrix (includes anything coerced above).
n_missing = int(rna.isna().sum().sum())
print("total missing values:", n_missing)

# Overall value range: per-column extreme first, then the extreme of those.
lowest = float(rna.min().min())
highest = float(rna.max().max())
print("min/max:", lowest, highest)


total missing values: 0
min/max: 0.0 22.48


In [4]:
# Remove near-constant genes
#
# Columns (genes) of `rna` whose values barely change across the rows are
# dropped; the surviving columns are copied into `rna_f`.

# Variance cut-off for "near-constant": a gene is kept only when its variance
# is strictly greater than this value. Tune it here, not inside the filter.
MIN_GENE_VARIANCE = 1e-6

# One variance per column, computed down the rows with NaNs skipped.
gene_var = rna.var(axis=0, skipna=True)

# Boolean mask over columns. A NaN variance compares False against the
# threshold, so genes with an undefined variance are dropped as well.
keep_genes = gene_var > MIN_GENE_VARIANCE

# .copy() makes rna_f an independent frame rather than a slice of `rna`.
rna_f = rna.loc[:, keep_genes].copy()

print("genes before:", rna.shape[1])
print("genes after :", rna_f.shape[1])



genes before: 59286
genes after : 54055


In [5]:
# Fill any remaining NaN with the median of its own gene (column).
# The per-column medians are computed first, then handed to fillna, which
# matches them to columns by label.
gene_medians = rna_f.median(axis=0)
rna_imp = rna_f.fillna(value=gene_medians)

# Report how many NaNs are still left after the fill.
n_still_missing = int(rna_imp.isna().sum().sum())
print("missing after imputation:", n_still_missing)



missing after imputation: 0


In [None]:
# Write the A3 result (filtered + imputed RNA table) into the processed
# folder as CSV, row index included, then confirm on stdout.
rna_imp.to_csv(PROCESSED.joinpath("rna_filtered_imputed.csv"))
print("Saved: rna_filtered_imputed.csv")

