In [16]:
# Imports + paths

from pathlib import Path
import pandas as pd

# Anchor every data location to the current working directory of the kernel.
PROJECT = Path.cwd()
RAW = PROJECT.joinpath("data", "raw")
PROCESSED = PROJECT.joinpath("data", "processed")

# Echo the resolved locations and the CSV files found directly under RAW
# (sorted by path) so a wrong working directory is obvious immediately.
print(f"Project: {PROJECT}")
print(f"RAW folder: {RAW}")
print("RAW files:", [csv_file.name for csv_file in sorted(RAW.glob("*.csv"))])


Project: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery
RAW folder: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery/data/raw
RAW files: ['clinical.csv', 'proteomics.csv', 'transcriptomics.csv']


In [13]:
# Load raw tables
%%capture

import pandas as pd

clinical_path = RAW / "clinical.csv"
prot_path = RAW / "proteomics.csv"
rna_path = RAW / "transcriptomics.csv"

clinical = pd.read_csv(clinical_path, index_col=0)
proteomics = pd.read_csv(prot_path, index_col=0)
transcriptomics = pd.read_csv(rna_path, index_col=0)


In [4]:
# Report the (rows, columns) shape of each raw table, one per line.
print(
    f"clinical: {clinical.shape}",
    f"proteomics: {proteomics.shape}",
    f"transcriptomics: {transcriptomics.shape}",
    sep="\n",
)

# Last expression of the cell: rich preview of the first three clinical rows.
clinical.head(3)

clinical: (103, 124)
proteomics: (115, 11949)
transcriptomics: (121, 59286)


Unnamed: 0_level_0,tumor_code,discovery_study,type_of_analyzed_samples,confirmatory_study,type_of_analyzed_samples.1,age,sex,race,ethnicity,ethnicity_race_ancestry_identified,...,additional_treatment_pharmaceutical_therapy_for_new_tumor,additional_treatment_immuno_for_new_tumor,number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional,number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis,"Recurrence-free survival, days","Recurrence-free survival from collection, days","Recurrence status (1, yes; 0, no)","Overall survival, days","Overall survival from collection, days","Survival status (1, dead; 0, alive)"
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,UCEC,Yes,Tumor_and_Normal,,,64,Female,White,Not Hispanic or Latino,White,...,,,,,,,0,737.0,737.0,0.0
C3L-00008,UCEC,Yes,Tumor,,,58,Female,White,Not Hispanic or Latino,White,...,,,,,,,0,898.0,898.0,0.0
C3L-00032,UCEC,Yes,Tumor,,,50,Female,White,Not Hispanic or Latino,White,...,,,,,,,0,1710.0,1710.0,0.0


In [14]:
## Align the same samples across all three tables
#
# Produces `common` (sorted list of sample IDs present in every table) and the
# row-aligned copies `clinical_a`, `proteomics_a`, `transcriptomics_a`.

# Convert index values to strings (avoids subtle mismatches)
clinical.index = clinical.index.astype(str)
proteomics.index = proteomics.index.astype(str)
transcriptomics.index = transcriptomics.index.astype(str)

# Guard: `.loc[list_of_labels]` returns *every* row carrying a duplicated label,
# so repeated sample IDs would silently give the aligned tables different row
# counts. Fail loudly instead.
_non_unique = [
    label
    for label, frame in (
        ("clinical", clinical),
        ("proteomics", proteomics),
        ("transcriptomics", transcriptomics),
    )
    if not frame.index.is_unique
]
if _non_unique:
    raise ValueError(f"Duplicate sample IDs found in: {', '.join(_non_unique)}")

# Samples present in all three tables, sorted for a deterministic row order.
common = sorted(set(clinical.index) & set(proteomics.index) & set(transcriptomics.index))

print("Sample counts:")
print(" clinical:", clinical.shape[0])
print(" proteomics:", proteomics.shape[0])
print(" transcriptomics:", transcriptomics.shape[0])
print(" common:", len(common))

# An empty intersection would otherwise flow through as three empty tables.
if not common:
    raise ValueError("No sample IDs are shared by all three tables; check the index columns.")

# Subset and force the same order
clinical_a = clinical.loc[common].copy()
proteomics_a = proteomics.loc[common].copy()
transcriptomics_a = transcriptomics.loc[common].copy()

# Invariant: all aligned tables carry exactly the same samples in the same order.
assert clinical_a.index.equals(proteomics_a.index), "clinical/proteomics rows are misaligned"
assert clinical_a.index.equals(transcriptomics_a.index), "clinical/transcriptomics rows are misaligned"

print("\nAligned shapes:")
print(" clinical_a:", clinical_a.shape)
print(" proteomics_a:", proteomics_a.shape)
print(" transcriptomics_a:", transcriptomics_a.shape)

Sample counts:
 clinical: 103
 proteomics: 115
 transcriptomics: 121
 common: 95

Aligned shapes:
 clinical_a: (95, 124)
 proteomics_a: (95, 11949)
 transcriptomics_a: (95, 59286)


In [17]:
# Save A1 outputs to data/processed/

# Create the output folder first: `to_csv` does not create missing parent
# directories, so on a fresh checkout the writes below would otherwise fail.
# `exist_ok=True` keeps the cell re-runnable.
PROCESSED.mkdir(parents=True, exist_ok=True)

# Each aligned table is written with its index as the first CSV column.
clinical_a.to_csv(PROCESSED / "clinical_aligned.csv")
proteomics_a.to_csv(PROCESSED / "proteomics_aligned.csv")
transcriptomics_a.to_csv(PROCESSED / "transcriptomics_aligned.csv")

print("Saved A1 files to:", PROCESSED)
print(" - clinical_aligned.csv")
print(" - proteomics_aligned.csv")
print(" - transcriptomics_aligned.csv")




Saved A1 files to: /mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery/data/processed
 - clinical_aligned.csv
 - proteomics_aligned.csv
 - transcriptomics_aligned.csv
