# Imports

In [1]:
import numpy as np
import pandas as pd

# Open data

In [2]:
file_path = './data/synthetic_data_lung_cancer.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,SUBJECT_ID,DEFINITION_ID,TIME
0,1,drug_217,0.004807
1,1,condition_1922,0.008643
2,1,condition_785,0.027792
3,1,drug_49,0.032515
4,1,measurement_132,0.056765


In [3]:
# Using a smaller subset for performance issues
subset_fraction = 0.3
df = df.sample(frac=subset_fraction, random_state=42)

# Data cleaning

In [4]:
# Handle Missing Values
df.isnull().sum()  # Check for missing values
df = df.dropna()    # Drop rows with missing values

# Identify and Remove Outliers (assuming 'TIME' is a numerical column)
Q1 = df['TIME'].quantile(0.25)
Q3 = df['TIME'].quantile(0.75)
IQR = Q3 - Q1

# Define upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
df = df[(df['TIME'] >= lower_bound) & (df['TIME'] <= upper_bound)]

# Display basic statistics after data cleaning
print("\nSummary Statistics after Data Cleaning:")
print(df.describe())


Summary Statistics after Data Cleaning:
          SUBJECT_ID           TIME
count  168268.000000  168268.000000
mean      478.890936       4.148546
std       273.734290       2.368932
min         1.000000       0.000002
25%       242.000000       2.156218
50%       471.000000       4.200333
75%       701.000000       6.154354
max       984.000000      12.023025


# Feature engineering

In [5]:
# # Scale the 'TIME' column
# scaler = MinMaxScaler()
# df[['TIME']] = scaler.fit_transform(df[['TIME']])

# df.head()

In [6]:
# Build the target feature ('DIED')
df['DIED'] = 0

# Identify rows where the "death" state is present
death_rows = df[df['DEFINITION_ID'].str.lower() == 'death']

# Iterate through each death row and update DIED column for corresponding rows
for _, death_row in death_rows.iterrows():
    patient_id = death_row['SUBJECT_ID']
    death_time = death_row['TIME']
    
    if death_time < 5:
        # Update DIED to 1 for rows with the same PATIENT_ID and TIME within 1 year
        df.loc[(df['SUBJECT_ID'] == patient_id), 'DIED'] = 1

# Delete the rows indicating death as that is what we are trying to predict
df = df[df['DEFINITION_ID'].str.lower() != 'death']

df.head()

Unnamed: 0,SUBJECT_ID,DEFINITION_ID,TIME,DIED
146286,251,drug_167,4.085872,0
522409,898,measurement_562,7.724991,0
314621,518,measurement_1113,7.005983,0
397520,658,measurement_1257,6.831718,0
377685,635,measurement_966,0.894524,0


In [7]:
# Keep time but proceed to bining and merging with 'DEFINITION_ID' features
# Allow us to retain more information about the timeframe in which the procedure happened
df['TIME'] = np.floor(df['TIME']).astype(int)
df['DEFINITION_ID'] = df['DEFINITION_ID'] + '_' + df['TIME'].astype(str)

# Drop 'TIME' to focus on 'DEFINITION_ID'
df= df.drop(['TIME'], axis=1)

df.head()

Unnamed: 0,SUBJECT_ID,DEFINITION_ID,DIED
146286,251,drug_167_4,0
522409,898,measurement_562_7,0
314621,518,measurement_1113_7,0
397520,658,measurement_1257_6,0
377685,635,measurement_966_0,0


In [8]:

# Proceed to one-hot encode and group together all interventions happening on one patient
# One-hot encode the 'DEFINITION_ID' feature
df = pd.get_dummies(df, columns=['DEFINITION_ID'], prefix='DEF')
# Group by 'SUBJECT_ID' and perform a logical OR on 'DIED'
df = df.groupby('SUBJECT_ID').max()

df.head()

Unnamed: 0_level_0,DIED,DEF_condition_1000_0,DEF_condition_1000_1,DEF_condition_1000_2,DEF_condition_1000_3,DEF_condition_1000_4,DEF_condition_1000_5,DEF_condition_1000_7,DEF_condition_1000_8,DEF_condition_1001_0,...,DEF_procedure_99_4,DEF_procedure_99_5,DEF_procedure_99_6,DEF_procedure_99_7,DEF_procedure_9_0,DEF_procedure_9_1,DEF_procedure_9_2,DEF_procedure_9_3,DEF_procedure_9_5,DEF_procedure_9_6
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Separate features and target variable
X = df.drop(['DIED'], axis=1)
y = df['DIED']