In [3]:
import pandas as pd 
import matplotlib.pyplot as plt
import torch
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# Read the clinical table; the path is relative to the notebook's working directory.
clinical_csv = 'Small dataset.csv'
data = pd.read_csv(clinical_csv)


In [5]:
# Impute missing ages with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call operates on an intermediate
# Series and does not update `data` when pandas Copy-on-Write is active.
data['age'] = data['age'].fillna(data['age'].mean())

In [6]:
# Impute missing T stage with the constant 2.0.
# NOTE(review): presumably the most frequent T stage in this table - confirm.
# Assigned back explicitly; a chained fillna(inplace=True) on the selected
# column does not update `data` when pandas Copy-on-Write is active.
data['clinical.T.Stage'] = data['clinical.T.Stage'].fillna(2.0)

In [7]:
# Impute missing histology with the label 'nos'.
# NOTE(review): presumably "not otherwise specified" - confirm against the data dictionary.
# Assigned back explicitly; a chained fillna(inplace=True) on the selected
# column does not update `data` when pandas Copy-on-Write is active.
data['Histology'] = data['Histology'].fillna('nos')

In [8]:
# Impute missing overall stage with 'IIIb'.
# NOTE(review): presumably the most frequent stage in this table - confirm.
# Assigned back explicitly; a chained fillna(inplace=True) on the selected
# column does not update `data` when pandas Copy-on-Write is active.
data['Overall.Stage'] = data['Overall.Stage'].fillna('IIIb')

## Get Tabular Data

In [9]:
# Encode the overall stage as an integer rank (I -> 1 ... IV -> 5).
# Labels missing from the mapping come out of .map() as NaN.
stage_mapping = dict(zip(("I", "II", "IIIa", "IIIb", "IV"), range(1, 6)))

# Work on a copy so the imputed `data` frame stays available for later sections.
df = data.copy()
df["Overall.Stage"] = df["Overall.Stage"].map(stage_mapping)


In [10]:
# The outcome columns are not input features; remove them from the tabular frame.
df = df.drop(columns=['Survival.time', 'deadstatus.event'])

In [11]:
def f(x):
    """Return the second dash-separated field of ``x``.

    For an ID such as 'LUNG1-001' this is '001'. Raises IndexError when
    ``x`` contains no dash.
    """
    pieces = x.split('-')
    return pieces[1]
# Short patient key (e.g. '001') derived element-wise from the full PatientID.
df['patient'] = df['PatientID'].map(f)

In [12]:
# Show the frame to check the newly added 'patient' column.
df

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Study Date,patient
0,LUNG1-001,78.7515,2.0,3,0,4,large cell,male,20140101,001
1,LUNG1-002,83.8001,2.0,0,0,1,squamous cell carcinoma,male,20080123,002
2,LUNG1-003,68.1807,2.0,3,0,4,large cell,male,20050105,003
3,LUNG1-004,70.8802,2.0,1,0,2,squamous cell carcinoma,male,20140101,004
4,LUNG1-005,80.4819,4.0,2,0,4,squamous cell carcinoma,male,20071008,005
...,...,...,...,...,...,...,...,...,...,...
415,LUNG1-418,53.6712,2.0,0,0,1,adenocarcinoma,male,20080921,418
416,LUNG1-419,66.5096,4.0,1,0,4,squamous cell carcinoma,male,20080508,419
417,LUNG1-420,73.3808,2.0,1,0,2,squamous cell carcinoma,male,20051124,420
418,LUNG1-421,61.7041,2.0,2,0,3,squamous cell carcinoma,female,20100531,421


In [13]:
# Discard rows that still contain a missing value, then remove the raw
# PatientID column (the short 'patient' key is kept).
df = df.dropna().drop(columns=['PatientID'])

In [14]:
# Use the short patient key as the row index.
df = df.set_index('patient')


In [15]:
# Show the frame again, now indexed by 'patient'.
df

Unnamed: 0_level_0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Study Date
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
001,78.7515,2.0,3,0,4,large cell,male,20140101
002,83.8001,2.0,0,0,1,squamous cell carcinoma,male,20080123
003,68.1807,2.0,3,0,4,large cell,male,20050105
004,70.8802,2.0,1,0,2,squamous cell carcinoma,male,20140101
005,80.4819,4.0,2,0,4,squamous cell carcinoma,male,20071008
...,...,...,...,...,...,...,...,...
418,53.6712,2.0,0,0,1,adenocarcinoma,male,20080921
419,66.5096,4.0,1,0,4,squamous cell carcinoma,male,20080508
420,73.3808,2.0,1,0,2,squamous cell carcinoma,male,20051124
421,61.7041,2.0,2,0,3,squamous cell carcinoma,female,20100531


In [16]:
# List the column names available to the preprocessing step.
df.columns

Index(['age', 'clinical.T.Stage', 'Clinical.N.Stage', 'Clinical.M.Stage',
       'Overall.Stage', 'Histology', 'gender', 'Study Date'],
      dtype='object')

In [17]:

# Column groups, by the kind of preprocessing each one receives.
ordinal_features = ["clinical.T.Stage", "Clinical.N.Stage", "Clinical.M.Stage", "Overall.Stage"]
categorical_features = ["Histology", "gender"]
numerical_features = ["age"]

# Re-code each staging column as 0..k-1, following the sorted order of the
# values observed in that column.
ordinal_transformer = OrdinalEncoder()

# One-hot encode the nominal columns, dropping the first level of each.
categorical_transformer = OneHotEncoder(drop="first")

# Standardize numerical features.
numerical_transformer = StandardScaler()

# Combine the transformers. Columns not named in any group are dropped
# (ColumnTransformer's default remainder).
# sparse_threshold=0 forces a dense ndarray: OneHotEncoder emits sparse output,
# and the stacked result could otherwise be a sparse matrix, whose rows cannot
# be passed to torch.tensor in the saving loop.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
    ],
    sparse_threshold=0,
)
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit and transform in one pass; X has one row per row of df, in the same order.
# NOTE(review): the scaler and encoders are fitted on every patient in df,
# before any train/test separation - confirm this is intended.
X = pipeline.fit_transform(df)


In [18]:

# Patient keys per split, taken from the text before the first '_' of every
# file name in the *_time label directories.
# NOTE(review): those directories are written by the "Get Event & Time Data"
# cells further down, so on a fresh checkout this cell only sees files left by
# an earlier run.
train_ids = [name.partition('_')[0] for name in os.listdir('data/train_time')]
test_ids = [name.partition('_')[0] for name in os.listdir('data/test_time')]

In [19]:

# Write one feature tensor per patient into the directory of its split.
# Sets give constant-time membership checks instead of scanning the id lists
# once per patient.
train_id_set = set(train_ids)
test_id_set = set(test_ids)

for patient, feature in zip(df.index, X):
    # A patient listed in both splits goes to train only; train is checked first.
    if patient in train_id_set:
        out_dir = 'data/train_tabular'
    elif patient in test_id_set:
        out_dir = 'data/test_tabular'
    else:
        # Patient belongs to neither split: nothing is saved.
        continue
    torch.save(torch.tensor(feature), f'{out_dir}/{patient}.pt')
        

## Get Event & Time Data

In [20]:
# Start again from the imputed clinical table; this copy keeps the outcome
# columns ('Survival.time', 'deadstatus.event').
df = data.copy()

# f() is already defined in the tabular-data section; it is reused here rather
# than defined a second time under the same name.
df['patient'] = df['PatientID'].apply(f)

In [21]:

# Patient key of every PNG in data/train: the text before the first '_' of the
# file name, one entry per file, sorted once.
train_patients = sorted(
    i.split('_')[0] for i in os.listdir('data/train') if i.endswith('.png')
)
unique_train_patients = set(train_patients)

# Build the row mask once and reuse it for both label tables.
train_mask = df['patient'].isin(unique_train_patients)
train_time = df.loc[train_mask, ['Survival.time', 'patient']]
train_event = df.loc[train_mask, ['deadstatus.event', 'patient']]

In [22]:

# Patient key of every PNG in data/test: the text before the first '_' of the
# file name, one entry per file, sorted once.
test_patients = sorted(
    i.split('_')[0] for i in os.listdir('data/test') if i.endswith('.png')
)
unique_test_patients = set(test_patients)

# Build the row mask once and reuse it for both label tables.
test_mask = df['patient'].isin(unique_test_patients)
test_time = df.loc[test_mask, ['Survival.time', 'patient']]
test_event = df.loc[test_mask, ['deadstatus.event', 'patient']]


In [23]:

# One .npy file per training patient holding that patient's survival time.
for row_label in train_time.index:
    patient_id = train_time.loc[row_label, 'patient']
    survival_time = train_time.loc[row_label, 'Survival.time']
    np.save(f'data/train_time/{patient_id}_time.npy', survival_time)

In [24]:
# One .npy file per training patient holding that patient's event flag.
for row_label in train_event.index:
    patient_id = train_event.loc[row_label, 'patient']
    event_flag = train_event.loc[row_label, 'deadstatus.event']
    np.save(f'data/train_event/{patient_id}_event.npy', event_flag)

In [25]:

# One .npy file per test patient holding that patient's survival time.
for row_label in test_time.index:
    patient_id = test_time.loc[row_label, 'patient']
    survival_time = test_time.loc[row_label, 'Survival.time']
    np.save(f'data/test_time/{patient_id}_time.npy', survival_time)

In [26]:
# One .npy file per test patient holding that patient's event flag.
for row_label in test_event.index:
    patient_id = test_event.loc[row_label, 'patient']
    event_flag = test_event.loc[row_label, 'deadstatus.event']
    np.save(f'data/test_event/{patient_id}_event.npy', event_flag)

In [27]:
# Sanity check: number of files in each label directory, in the order
# (train time, train event, test time, test event).
label_dirs = ('data/train_time', 'data/train_event', 'data/test_time', 'data/test_event')
tuple(len(os.listdir(path)) for path in label_dirs)

(326, 326, 94, 94)

## Get Volume Data

In [28]:
def get_slices_per_patient(patient_id,mode):
    """Return the PNG slice paths of one patient, in sorted order.

    Parameters
    ----------
    patient_id : str
        Patient key; a file belongs to the patient when the text before its
        first '_' equals this key (the same rule get_patients uses to derive
        keys, so '001' no longer also matches a file such as '0011_1.png').
    mode : str
        Sub-directory of ``data/`` to scan, e.g. 'train' or 'test'.

    Returns
    -------
    list of str
        Paths of the form ``data/{mode}/{filename}``, sorted by file name.
        os.listdir returns entries in arbitrary order, so sorting keeps the
        slice order of the resulting volume reproducible. The sort is
        lexicographic, so '..._10.png' comes before '..._2.png'.
    """
    filenames = sorted(
        name
        for name in os.listdir(f'data/{mode}')
        if name.endswith('.png') and name.split('_')[0] == patient_id
    )
    return [f'data/{mode}/{name}' for name in filenames]
def load_slices_per_subject(slices):
    """Stack one subject's slice images into a single tensor.

    Parameters
    ----------
    slices : iterable of str
        Image paths readable by ``plt.imread``; entry ``k`` fills position ``k``.
        Assumes each image is a single-channel 256x256 array - TODO confirm.

    Returns
    -------
    torch.Tensor
        float64 tensor of shape (1, 5, 256, 256). Positions with no image
        stay zero; a sixth path raises IndexError.
    """
    volume = np.zeros((5, 256, 256, 1))
    for position, image_path in enumerate(slices):
        pixels = plt.imread(image_path)
        # Append a trailing channel axis so the image fits the (256, 256, 1) slot.
        volume[position] = pixels[..., np.newaxis]
    # Move the channel axis to the front.
    return torch.tensor(volume).permute(3, 0, 1, 2)
def get_patients(mode):
    """Return the distinct patient keys that have PNG files in ``data/{mode}``.

    A key is the text before the first '_' of a file name. The result is a
    list without duplicates, in no particular order.
    """
    patient_ids = {
        filename.split('_')[0]
        for filename in os.listdir(f'data/{mode}')
        if filename.endswith('.png')
    }
    return list(patient_ids)
def get_slices(mode):
    """Map every patient key found in ``data/{mode}`` to its list of slice paths."""
    return {
        patient_id: get_slices_per_patient(patient_id, mode)
        for patient_id in get_patients(mode)
    }

In [29]:
# Build one volume tensor per test patient and store it under data/test_volume.
slices = get_slices('test')
for key in slices:
    value = slices[key]
    volume = load_slices_per_subject(value)
    torch.save(volume, f'data/test_volume/{key}.pt')