In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted')
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 7, 4

import random
from utils.helpers import load_data, get_features, plot_random_patient_recordings

from tqdm.auto import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics import Accuracy
from sklearn.metrics import classification_report, confusion_matrix

# Single source of truth for every seed used in this notebook.
RANDOM_SEED = 42

# Use the constant everywhere (the Lightning call previously hard-coded 42),
# so changing RANDOM_SEED cannot leave one library on a different seed.
pl.seed_everything(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Importing Data

In [2]:
# Folders holding the two training sets (A and B) that are read together.
directories = [
    './physionet.org/files/challenge-2019/1.0.0/training/training_setA',
    './physionet.org/files/challenge-2019/1.0.0/training/training_setB',
]

# `dataset` is used below as a DataFrame and `record_lengths` as the row count
# of each record; both come from the project helper `load_data`.
dataset, record_lengths = load_data(
    directories=directories,
    target_label='SepsisLabel',
)
# Alternative call kept for reference (separate X/y outputs):
# X_train, y_train, record_lengths = load_data(directories=directories, target_label='SepsisLabel', x_train_y_train=True)

In [3]:
# Report the (rows, columns) of the loaded frame, then preview its first rows.
print(dataset.shape)
dataset.head()

# Initial Analysis

## Target Distribution

In [4]:
def plot_target_classes(data=None, lengths=None):
    """Plot how many records ever reach SepsisLabel == 1.0 versus never.

    Rows are consumed in consecutive runs: the first ``lengths[0]`` rows form
    the first record, the next ``lengths[1]`` rows the second, and so on. A
    record is counted as 'Sepsis' if any row in its run has SepsisLabel 1.0,
    otherwise as 'Non-Sepsis'. Each bar is annotated with its share of all
    records.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a 'SepsisLabel' column. Defaults to the notebook-level
        ``dataset`` so the original zero-argument call keeps working.
    lengths : iterable of int, optional
        Number of rows of each record, in row order. Defaults to the
        notebook-level ``record_lengths``.
    """
    if data is None:
        data = dataset
    if lengths is None:
        lengths = record_lengths

    labels = data['SepsisLabel']
    start_length, ones, zeros = 0, 0, 0
    for length in lengths:
        end_length = start_length + length
        # Record boundaries are row positions, so slice positionally with .iloc
        # instead of relying on how [] interprets an integer slice.
        if labels.iloc[start_length:end_length].isin([1.0]).any():
            ones += 1
        else:
            zeros += 1
        start_length = end_length

    plot_df = pd.DataFrame({'Category': ['Sepsis', 'Non-Sepsis'], 'Count': [ones, zeros]})
    ax = sns.barplot(x='Category', y='Count', data=plot_df)
    plt.title('Counts of 1.0s and 0.0s in SepsisLabel')

    total = ones + zeros
    # With no records there is nothing to annotate (and no valid percentage).
    if total:
        for p in ax.patches:
            percentage = '{:.3f}%'.format(100 * p.get_height() / total)
            ax.annotate(percentage, (p.get_x() + p.get_width() / 2., p.get_height()), fontsize='x-small',
                        ha='center', va='center', xytext=(0, 5), textcoords='offset points')
    plt.show()

In [5]:
# Draw the Sepsis / Non-Sepsis record counts using the notebook-level data.
plot_target_classes()

- **Imbalanced Dataset**

## Distribution of recordings

In [6]:
def plot_recordings(record_lengths):
    """Show a histogram with a KDE overlay of the values in ``record_lengths``."""
    # histplot returns the Axes it drew on; label that Axes directly.
    ax = sns.histplot(record_lengths, kde=True)
    ax.set_title('Distribution of Record Lengths')
    ax.set_xlabel('Record Length')
    ax.set_ylabel('Frequency')
    plt.show()


plot_recordings(record_lengths)

- **259 Recordings: p000009**<br>
- **337 Recordings: p016581, p003658**
- **336 Recordings: p018823**

## Missing Values

In [7]:
def missing_percentage(dataset):
    """Bar-plot the percentage of missing values in every column of ``dataset``.

    Dashed red reference lines are drawn at 90%, 95% and 99%, each with a small
    text label placed just to the right of the last bar.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose per-column share of null values is plotted.
    """
    missing_percent = dataset.isnull().mean() * 100
    missing_df = pd.DataFrame({
        'Features': missing_percent.index,
        'Percentage': missing_percent.values
    })

    plt.figure(figsize=(10, 4))
    sns.barplot(x='Features', y='Percentage', data=missing_df)
    plt.title('Percentage of Missing Values')
    plt.xticks(rotation=90)

    # Bars sit at x = 0 .. n-1, so n + 0.5 lands just right of the last bar.
    # This was hard-coded as 42.5, which is only correct for a 42-column frame.
    label_x = len(missing_df) + 0.5
    # (line height, label height): labels sit slightly below their line.
    for threshold, label_y in ((90, 88), (95, 93), (99, 98)):
        plt.axhline(threshold, color='r', linestyle='--')
        plt.text(label_x, label_y, f'{threshold}%', color='r', ha='center', fontdict={'size': 8})

    plt.show()


missing_percentage(dataset)

- **Most of the features have above 90% missing values**

In [8]:
# Column names of the raw data, grouped by the kind of measurement.
vital_signs = [
    'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
]
laboratory_values = [
    'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
    'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
    'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
    'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen',
    'Platelets',
]
demographics = [
    'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS',
]

# Sum the sizes of the three groups for a quick sanity check.
print(f"Total number of features: {sum(map(len, (vital_signs, laboratory_values, demographics)))}")

- **Among vital signs features, EtCO2 has >95% of missing values.**<br>
- **All laboratory features have missing values >90%.**<br>
- **Among demographics features, Unit1 and Unit2 have around 40% missing values.**

### Case 1: Removing features having missing values >90%

In [9]:
# Case 1: work only with the feature groups returned by get_features(case=1).
# Manual steps previously noted in this cell (presumably what the helper
# does -- confirm): drop 'EtCO2' from vital_signs, empty laboratory_values,
# keep demographics as they are.
vital_signs, laboratory_values, demographics = get_features(case=1)

INPUT_FEATURES = vital_signs + laboratory_values + demographics
OUTPUT_FEATURE = ['SepsisLabel']

# Keep the model inputs, the target and the patient key; drop all other columns.
selected_columns = INPUT_FEATURES + OUTPUT_FEATURE + ['PatientID']
dataset = dataset[selected_columns]

In [10]:
# Confirm the reduced column count after feature selection and preview the rows.
print(dataset.shape)
dataset.head()

In [11]:
# Project helper from utils.helpers. Judging by its name and arguments it plots
# the 'Temp' recordings of 2 random patients with forward-fill imputation and
# returns the chosen patient ids -- confirm against the helper.
selected_patient_ids = plot_random_patient_recordings(dataset, feature='Temp', num_plots=2, fill_method='ffill')

- **Filling NULL values with a forward-filling strategy**<br>
- **By dropping null values in Unit1 and Unit2, we are losing the data belonging to 2 patients (IDs 1 and 2)**

# Pipeline

In [1]:
from models.lgbm_classifier import SepsisPipeline

import numpy as np

In [2]:
# Pipeline object from models.lgbm_classifier; constructed with its defaults.
sepsis_pipeline = SepsisPipeline()

# `dataset` is grouped by 'PatientID' in later cells; `record_lengths` is not
# used again in the visible cells.
dataset, record_lengths = sepsis_pipeline.load_data()
# Feature-name groups for case 1; they are concatenated with `+` further down.
vital_signs, laboratory_values, demographics = sepsis_pipeline.get_features(case=1)

In [3]:
# Peek at the first per-patient groups: key, container type and (rows, columns).
# The loop variable is `patient_id` rather than `id`, so the builtin id() is not
# shadowed for the rest of the kernel session.
for patient_id, patient_data in dataset.groupby('PatientID'):
    print(patient_id, type(patient_data), patient_data.shape)

    # Stop once the group keyed 3 has been printed.
    if patient_id == 3:
        break

In [4]:
# Share of patients used for training; the remainder is left for testing.
train_percentage = 0.80
test_percentage = 1 - train_percentage

# No seed is set in the visible cells of this section, so the global
# np.random.choice gave a different split on every fresh kernel. A dedicated
# seeded generator makes the split reproducible and independent of other
# random calls.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Split by patient (not by row) so all rows of a patient fall on one side.
patient_ids = dataset['PatientID'].unique()
train_ids = split_rng.choice(patient_ids, int(len(patient_ids) * train_percentage), replace=False)

In [5]:
# Per-column mean and std computed from the training patients only; these are
# the statistics the windowing cell uses to standardize every patient.
is_train_row = dataset['PatientID'].isin(train_ids)
train_set_mean_std = dataset[is_train_row].describe().loc[['mean', 'std']]

In [6]:
# Turn each patient's recording into fixed-length sliding windows.
WINDOW_LENGTH = 10      # consecutive rows that form one sample
PREDICTION_LENGTH = 1   # rows right after the window that decide its label
MAX_PATIENTS = 1        # the cell used to stop after the first patient; set to None to process everyone

feature_columns = list(vital_signs + laboratory_values + demographics)
# Training-set statistics, looked up once instead of once per feature per patient.
feature_mean = train_set_mean_std.loc['mean', feature_columns]
feature_std = train_set_mean_std.loc['std', feature_columns]

window_list = []
# The loop variable was misspelled `patitent_data`, so the body silently worked
# on whatever `patient_data` an earlier cell had left in the kernel.
for patient_count, (patient_id, patient_data) in enumerate(dataset.groupby('PatientID'), start=1):
    patient_data = patient_data.reset_index(drop=True)

    # Impute missing values within the patient: backward fill, then forward fill.
    # NOTE(review): the backward fill copies later measurements into earlier
    # rows -- confirm this look-ahead is acceptable for the model.
    patient_data[feature_columns] = patient_data[feature_columns].bfill().ffill()

    # Standardize with the training-set statistics (aligned by column name).
    patient_data[feature_columns] = (patient_data[feature_columns] - feature_mean) / feature_std

    # Slide one row at a time; a window is valid only if PREDICTION_LENGTH rows
    # follow it. The range is empty for recordings that are too short.
    last_start = len(patient_data) - WINDOW_LENGTH - PREDICTION_LENGTH
    for start in range(last_start + 1):
        stop = start + WINDOW_LENGTH
        X_continous = patient_data.iloc[start:stop][feature_columns]
        future_rows = patient_data.iloc[stop:stop + PREDICTION_LENGTH]
        # Label is 1 if any of the predicted rows carries a sepsis label.
        temp_label = int(any(future_rows['SepsisLabel']))

        # NOTE(review): the original cell ended at an unfinished `temp_data[]`
        # and never filled window_list; this record layout is a proposal.
        window_list.append({
            'patient_id': patient_id,
            'features': X_continous,
            'label': temp_label,
        })

    if MAX_PATIENTS is not None and patient_count >= MAX_PATIENTS:
        break

In [10]:
import os


def project_root() -> str:
    """Returns path to root directory of a project.

    The root is taken to be the parent ('..') of the directory containing this
    file. Inside a notebook kernel ``__file__`` is not defined, so the current
    working directory is used as that directory instead of raising NameError.
    """
    try:
        anchor_file = __file__
    except NameError:
        # Notebook / interactive session: fall back to the working directory.
        return os.path.join(os.path.realpath(os.getcwd()), '..')
    return os.path.join(_file_directory_path(anchor_file), '..')


def _file_directory_path(file_path_name):
    """Returns the symlink-resolved directory that contains ``file_path_name``."""
    return os.path.dirname(os.path.realpath(file_path_name))

In [13]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Using device:', device)
# print()
# 
# if device.type == 'cuda':
#     print(torch.cuda.get_device_name(0))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# To monitor GPU usage every 2 seconds
# watch -n 2 nvidia-smi