In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
import logging
from sklearn.preprocessing import LabelEncoder

# Configure logging for the notebook. Records are rendered as
# "<timestamp> <LEVEL>: <message>".
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s")
# getLogger() without a name returns the root logger; its level is set
# explicitly so that INFO-level messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

### Constants

In [2]:
# Directory that processed tables are written to (relative path).
data_dir = "data"
# Directory that raw input files (e.g. clinical.tsv) are read from (relative path).
raw_data_dir = "raw_data"

### Preprocess clinical data

In [3]:
# Read the raw, tab-separated clinical table from the raw-data directory.
clinical_tsv_path = f"{raw_data_dir}/clinical.tsv"
df = pd.read_csv(clinical_tsv_path, sep="\t")
# Uncomment to list the available columns:
# df.columns.to_list()

In [4]:
# Subset of clinical fields carried forward; the list order is the column
# order of the resulting frame.
selected_cols = [
    "case_submitter_id",
    "age_at_index", "days_to_death", "days_to_last_follow_up",
    "morphology", "ethnicity", "gender", "race", "vital_status",
    # "year_of_birth", "year_of_death",
    "ajcc_pathologic_m", "ajcc_pathologic_n", "ajcc_pathologic_stage",
    "primary_diagnosis", "treatment_or_therapy", "treatment_type",
]

# Keep only those columns, then turn the "'--" placeholder (note the leading
# apostrophe) into NaN so it is treated as a missing value.
df = df.loc[:, selected_cols].replace("'--", np.nan)

logger.info(df.shape)
df

2025-01-13 17:37:25,137 INFO: (1107, 15)


Unnamed: 0,case_submitter_id,age_at_index,days_to_death,days_to_last_follow_up,morphology,ethnicity,gender,race,vital_status,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_stage,primary_diagnosis,treatment_or_therapy,treatment_type
0,TCGA-62-A471,64,,1246.0,8140/3,not hispanic or latino,male,white,Alive,M0,N1,Stage IIB,"Adenocarcinoma, NOS",yes,"Pharmaceutical Therapy, NOS"
1,TCGA-62-A471,64,,1246.0,8140/3,not hispanic or latino,male,white,Alive,M0,N1,Stage IIB,"Adenocarcinoma, NOS",no,"Radiation Therapy, NOS"
2,TCGA-67-3773,84,,427.0,8140/3,not hispanic or latino,female,white,Alive,M0,N0,Stage IB,"Adenocarcinoma, NOS",not reported,"Radiation Therapy, NOS"
3,TCGA-67-3773,84,,427.0,8140/3,not hispanic or latino,female,white,Alive,M0,N0,Stage IB,"Adenocarcinoma, NOS",not reported,"Pharmaceutical Therapy, NOS"
4,TCGA-17-Z038,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,TCGA-55-7570,60,,824.0,8140/3,not hispanic or latino,male,black or african american,Alive,MX,N0,Stage IA,"Adenocarcinoma, NOS",no,"Pharmaceutical Therapy, NOS"
1103,TCGA-78-7146,71,173,,8255/3,not reported,female,white,Dead,M0,N2,Stage IIIA,Adenocarcinoma with mixed subtypes,no,"Radiation Therapy, NOS"
1104,TCGA-78-7146,71,173,,8255/3,not reported,female,white,Dead,M0,N2,Stage IIIA,Adenocarcinoma with mixed subtypes,no,"Pharmaceutical Therapy, NOS"
1105,TCGA-44-3398,77,,1163.0,8140/3,not hispanic or latino,female,white,Alive,M0,N0,Stage IA,"Adenocarcinoma, NOS",no,"Pharmaceutical Therapy, NOS"


In [5]:
# Per-row 0/1 flags: a flag is 1 only when the row's treatment_type matches
# the given therapy AND its treatment_or_therapy value is "yes".
therapy_is_yes = df["treatment_or_therapy"].eq("yes")
df["pharmaceutical_treatment"] = (
    df["treatment_type"].eq("Pharmaceutical Therapy, NOS") & therapy_is_yes
).astype(int)
df["radiation_treatment"] = (
    df["treatment_type"].eq("Radiation Therapy, NOS") & therapy_is_yes
).astype(int)

In [6]:
# Collapse the per-treatment rows into one row per case.
# Columns aggregated with "first" take the first non-null value within each
# case (pandas' groupby `first` skips missing values by default); the two
# treatment flags use "max" so a case gets 1 if any of its rows has 1.
# The list order below is the column order of the merged frame.
first_value_cols = [
    "age_at_index",
    "days_to_death",
    "days_to_last_follow_up",
    "ethnicity",
    "gender",
    "race",
    "morphology",
    "vital_status",
    "ajcc_pathologic_m",
    "ajcc_pathologic_n",
    "ajcc_pathologic_stage",
    "primary_diagnosis",
]
agg_spec = {col: "first" for col in first_value_cols}
agg_spec.update({"pharmaceutical_treatment": "max", "radiation_treatment": "max"})

df_merged = df.groupby("case_submitter_id").agg(agg_spec).reset_index()

# Derive a single 'treatment_type' label from the two flags.
# Radiation takes precedence: a case with both flags set is labelled
# 'Radiation Therapy, NOS'. Cases with neither get the literal string 'None'.
# NOTE(review): the string 'None' is, as far as I know, among the default NA
# markers of pandas.read_csv in pandas >= 2.0, so it may read back as NaN from
# the saved file unless keep_default_na=False is used -- confirm downstream intent.
df_merged["treatment_type"] = np.select(
    [df_merged["radiation_treatment"] == 1, df_merged["pharmaceutical_treatment"] == 1],
    ["Radiation Therapy, NOS", "Pharmaceutical Therapy, NOS"],
    default="None",
)

# Keep only cases that have a vital status AND at least one of
# 'days_to_death' / 'days_to_last_follow_up' (rows missing both are dropped).
# .copy() makes the filtered frame independent, so the column assignments
# below never act on a slice of the unfiltered frame.
has_vital_status = df_merged["vital_status"].notna()
has_any_time = df_merged["days_to_death"].notna() | df_merged["days_to_last_follow_up"].notna()
df_merged = df_merged.loc[has_vital_status & has_any_time].copy()

# Convert vital_status to a binary event indicator (1 = Dead, 0 = Alive)
df_merged["event"] = (df_merged["vital_status"] == "Dead").astype(int)

# 'days_to_event': time of death when the case is an event and a death time
# is recorded, otherwise the last follow-up time. The source columns hold
# strings/NaN, hence the cast to float.
use_death_time = df_merged["days_to_death"].notna() & (df_merged["event"] == 1)
df_merged["days_to_event"] = np.where(
    use_death_time,
    df_merged["days_to_death"],
    df_merged["days_to_last_follow_up"],
).astype(float)

# Keep strictly positive event times (this also drops NaN times), then
# remove the helper flag columns that were folded into 'treatment_type'.
df_merged = df_merged.loc[df_merged["days_to_event"] > 0].drop(
    columns=["pharmaceutical_treatment", "radiation_treatment"]
)

logger.info(f"Total samples: {df_merged.shape[0]}")
logger.info(f"Total clincal features: {df_merged.shape[1]}")

df_merged

2025-01-13 17:37:25,169 INFO: Total samples: 509
2025-01-13 17:37:25,170 INFO: Total clincal features: 16


Unnamed: 0,case_submitter_id,age_at_index,days_to_death,days_to_last_follow_up,ethnicity,gender,race,morphology,vital_status,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_stage,primary_diagnosis,treatment_type,event,days_to_event
1,TCGA-05-4245,81,,730.0,not reported,male,not reported,8140/3,Alive,M0,N2,Stage IIIA,"Adenocarcinoma, NOS",,0,730.0
2,TCGA-05-4249,67,,1523.0,not reported,male,not reported,8140/3,Alive,M0,N0,Stage IB,"Adenocarcinoma, NOS",,0,1523.0
3,TCGA-05-4250,79,121,,not reported,female,not reported,8140/3,Dead,M0,N1,Stage IIIA,"Adenocarcinoma, NOS",,1,121.0
4,TCGA-05-4382,68,,607.0,not reported,male,not reported,8255/3,Alive,M0,N0,Stage IB,Adenocarcinoma with mixed subtypes,"Radiation Therapy, NOS",0,607.0
5,TCGA-05-4384,66,,426.0,not reported,male,not reported,8255/3,Alive,M0,N2,Stage IIIA,Adenocarcinoma with mixed subtypes,"Radiation Therapy, NOS",0,426.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,TCGA-NJ-A55O,56,,13.0,not hispanic or latino,female,white,8480/3,Alive,M0,N1,Stage IIA,Mucinous adenocarcinoma,,0,13.0
581,TCGA-NJ-A55R,67,,603.0,not hispanic or latino,male,white,8230/3,Alive,MX,N0,Stage IA,"Solid carcinoma, NOS",,0,603.0
582,TCGA-NJ-A7XG,49,,617.0,not hispanic or latino,male,black or african american,8140/3,Alive,M0,N1,Stage IIIA,"Adenocarcinoma, NOS","Pharmaceutical Therapy, NOS",0,617.0
583,TCGA-O1-A52J,74,1798,,not hispanic or latino,female,white,8140/3,Dead,MX,N0,Stage IA,"Adenocarcinoma, NOS",,1,1798.0


In [7]:
# Write the merged clinical table as a tab-separated file in the output directory.
# index=False is not passed, so the DataFrame index is written as the first,
# unnamed column of the file.
# NOTE(review): the file name is spelled "clincal.tsv" (not "clinical.tsv");
# any code that reads this file must use the same spelling -- confirm before renaming.
df_merged.to_csv(f"{data_dir}/clincal.tsv", sep="\t")

### Preprocess CNV data