In [1]:
# Import libraries
# Numpy library to statistically analyse data
import numpy as np
# Pandas library to make data frames
import pandas as pd
# Matplotlib and seaborn library to visualize data as charts 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Render charts inline, directly below the cell that produces them
%matplotlib inline

In [None]:
# Data source: https://www.kaggle.com/datasets/sadiaanzum/patient-survival-prediction-dataset?select=Dataset.csv
# Read the CSV file into a DataFrame (the file is expected next to this notebook).
patient_survival_data = pd.read_csv(filepath_or_buffer="survival_dataset.csv")

In [None]:
# Visualize first 5 rows of the dataset
patient_survival_data.head()

In [None]:
# Get detils about the dataset
patient_survival_data.info()

In [None]:
# Check the availability of null values in the dataset
patient_survival_data.isnull()

In [None]:
# Graphically view the availability of null values in the dataset
sns.heatmap(patient_survival_data.isnull(), yticklabels=False, cbar=False, cmap="viridis")

In [None]:
# Identify number of null values in the dataset (each column)
patient_survival_data.isnull().sum()

In [None]:
# Build the cleaned frame by dropping every row that contains a null value.
# The explicit .copy() makes it an independent frame, so the column
# assignments in later cells operate on their own data and never on a
# slice tied to patient_survival_data.
updated_patient_survival_data = patient_survival_data.dropna().copy()

In [None]:
# Check the availability of null data after removing the null data
updated_patient_survival_data.isnull().sum()

In [None]:
# Graphically view the availability of null values in the dataset
sns.heatmap(updated_patient_survival_data.isnull(), yticklabels=False, cbar=False, cmap="viridis")

In [None]:
# Save the updated file with a new name where the null values are removed
updated_patient_survival_data.to_csv('updated_survival_dataset.csv',index=False)

In [None]:
# View the dataset
updated_patient_survival_data

In [None]:
# Identify the relationship between data
sns.countplot(x='hospital_death', data=updated_patient_survival_data)

In [None]:
# sns.pairplot(updated_patient_survival_data, hue='hospital_death', height=3.0);

In [None]:
# Preprocess Data

In [None]:
# Turn off pandas' chained-assignment warning (the pandas default is 'warn')
# for the column rewrites in this and the following cells.
pd.options.mode.chained_assignment = None
# Short codes for the ICU admit sources; both "Other ..." sources share 'O'.
ICUAdmitMapping = {
    'Accident & Emergency': 'AcEm',
    'Floor': 'Fl',
    'Operating Room / Recovery': 'OpRe',
    'Other Hospital': 'O',
    'Other ICU': 'O',
}

# Replace each label with its short code. Any label that is not a key of the
# mapping becomes NaN.
updated_patient_survival_data['icu_admit_source'] = (
    updated_patient_survival_data['icu_admit_source'].map(ICUAdmitMapping)
)

In [None]:
# Short codes for the ICU stay types.
ICUStayTypeMapping = {
    'admit': 'Ad',
    'readmit': 'Ra',
    'transfer': 'Tr',
}

# Replace each label with its short code. Any label that is not a key of the
# mapping becomes NaN.
updated_patient_survival_data['icu_stay_type'] = (
    updated_patient_survival_data['icu_stay_type'].map(ICUStayTypeMapping)
)

In [None]:
ICUTypeMapping = {'CTICU': 'CTICU', 'CCU-CTICU': 'CCU', 'Med-Surg ICU': 'Other'}

updated_patient_survival_data['icu_type'] = updated_patient_survival_data['icu_type'].map(ICUTypeMapping)

In [None]:
# View table with updated changes
updated_patient_survival_data

In [None]:
# Short codes for the apache_3j_bodysystem labels.
DiagnosisAIIIMapping = {
    'Cardiovascular': 'Car',
    'Gastrointestinal': 'Gas',
    'Genitourinary': 'GeU',
    'Gynecological': 'GYn',
    'Hematological': 'Hea',
    'Metabolic': 'Met',
    'Musculoskeletal/Skin': 'MuS',
    'Neurological': 'Neu',
    'Respiratory': 'Res',
    'Sepsis': 'Sep',
    'Trauma': 'Tra',
}

# Replace each label with its short code. Any label that is not a key of the
# mapping becomes NaN.
updated_patient_survival_data['apache_3j_bodysystem'] = (
    updated_patient_survival_data['apache_3j_bodysystem'].map(DiagnosisAIIIMapping)
)

In [None]:
DiagnosisAIIMapping = {'Cardiovascular': 'C', 'Gastrointestinal': 'GA', 'Hematological': 'H', 'Metabolic': 'M', 'Neurologic': 'N', 'Renal/Genitourinary': 'RG', 
                       'Respiratory': 'R', 'Trauma': 'T', 'Undefined diagnoses': 'U'}

updated_patient_survival_data['apache_2_bodysystem'] = updated_patient_survival_data['apache_2_bodysystem'].map(DiagnosisAIIMapping)

In [None]:
# View table with updated changes
updated_patient_survival_data

In [None]:
# Assign dummy variables
pd.get_dummies(updated_patient_survival_data['gender'], drop_first=True)

In [None]:
patient_gender = pd.get_dummies(updated_patient_survival_data['gender'], drop_first=True)

In [None]:
patient_gender

In [None]:
ICUadmit_source = pd.get_dummies(updated_patient_survival_data['icu_admit_source'], drop_first=True)

In [None]:
ICUadmit_source

patient_icu_stay_type = pd.get_dummies(updated_patient_survival_data['icu_stay_type'], drop_first=True)
# pd.get_dummies(updated_patient_survival_data['icu_stay_type'], drop_first=True)

In [None]:
patient_icu_stay_type

# patient_ICU_Admit = pd.get_dummies(updated_patient_survival_data['icu_admit_type'], drop_first=True)
# patient_ICU_Admit

patient_icu_type = pd.get_dummies(updated_patient_survival_data['icu_type'], drop_first=True)

In [None]:
patient_icu_type

In [None]:
diagnosis_GIII = pd.get_dummies(updated_patient_survival_data['apache_3j_bodysystem'], drop_first=True)

In [None]:
diagnosis_GIII

In [None]:
diagnosis_GII = pd.get_dummies(updated_patient_survival_data['apache_2_bodysystem'], drop_first=True)

In [None]:
diagnosis_GII

In [None]:
# concatanate dummy variable to dataset
updated_patient_survival_data = pd.concat([updated_patient_survival_data, diagnosis_GII, diagnosis_GIII, patient_gender, patient_icu_stay_type, patient_icu_type], axis=1)

In [None]:
updated_patient_survival_data

# View column names of the updated csv files
# updated_patient_survival_data.head()
# View only the column names in the updated csv files
for column_name in updated_patient_survival_data.columns:
    print(column_name)

In [None]:
# As dummy variables are made for the variables, delete other variables with might be exist as duplicates
updated_patient_survival_data.drop(['encounter_id', 'patient_id', 'hospital_id', 'ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem'], axis=1, inplace=True)

# View only the column names in the updated csv files
for updated_column_name in updated_patient_survival_data.columns:
    print(updated_column_name)

In [None]:
# View as a table
updated_patient_survival_data

In [None]:
# Encode data
# Import library from sklearn to encode data
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder instance that is re-fitted for each column it is applied to
label_enc = LabelEncoder()
# Work on a deep copy so the pre-encoding frame stays untouched
survival_data = updated_patient_survival_data.copy(deep=True)

In [None]:
# Encode code individually
for data in updated_patient_survival_data.columns:
    survival_data[data] = label_enc.fit_transform(updated_patient_survival_data[data])

In [None]:
survival_data

In [None]:
# Split dataset

In [None]:
# Separate the predictors from the target.
# X => the independent variables: every column except hospital_death
X = survival_data.drop(columns='hospital_death')
# Y => the dependent variable: the hospital_death column
Y = survival_data.loc[:, 'hospital_death']

In [None]:
# Split dataset for test and train which is done by importing a package called model_selection from sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Report the shape of each part of the split. Every line carries its own
# label (previously all four were labelled "Training Feature Shape").
print(x_train.shape, '----> Training Feature Shape')
print(x_test.shape, '----> Testing Feature Shape')
print(y_train.shape, '----> Training Target Shape')
print(y_test.shape, '----> Testing Target Shape')