## General Processing

Flow:
1. General filter: exclude pediatric patients (age < 18)
2. Outlier detection and removal
3. Train/test splitting
4. Missing values: imputation




### Load master dataset

In [None]:
import pandas as pd
import os
from helpers import *
from sklearn.impute import SimpleImputer
# Folder containing the processed dataset files; the train/test CSVs written
# at the end of this notebook go to the same folder.
path = 'C:/Users/XFE/Documents/mimic4ed-benchmark/data_processed/'
# Read the master dataset that every later cell works on.
df_master = pd.read_csv(filepath_or_buffer=os.path.join(path, 'master_dataset.csv'))

In [None]:
# Let pandas render up to 100 columns so wide frames are not truncated.
pd.options.display.max_columns = 100
# Preview the first five rows of the master dataset.
df_master.head(n=5)

In [None]:
# TODO: revise time range + function name

### 1. General filter - Age

In [None]:
# Keep adult rows only (age >= 18). Rows with a missing age are dropped as
# well, because a NaN comparison evaluates to False.
# .copy() detaches the filtered frame from the one it was sliced from, so
# later column assignments on df_master do not raise pandas'
# SettingWithCopyWarning. The selected data is unchanged.
df_master = df_master[df_master['age'] >= 18].copy()

### 2. Outlier Detection 

In [None]:
# Per-vital bounds, taken from mimic-extract (as noted by the original author).
# Each vital maps to four thresholds: outlier_low, valid_low, valid_high and
# outlier_high.
# NOTE(review): how the four thresholds are applied is decided by the helpers
# that receive this dict (display_outliers_count / remove_outliers), which are
# not visible here; confirm there.
vitals_valid_range = dict(
    temperature=dict(outlier_low=14.2, valid_low=26, valid_high=45, outlier_high=47),
    heartrate=dict(outlier_low=0, valid_low=0, valid_high=350, outlier_high=390),
    resprate=dict(outlier_low=0, valid_low=0, valid_high=300, outlier_high=330),
    o2sat=dict(outlier_low=0, valid_low=0, valid_high=100, outlier_high=150),
    sbp=dict(outlier_low=0, valid_low=0, valid_high=375, outlier_high=375),
    dbp=dict(outlier_low=0, valid_low=0, valid_high=375, outlier_high=375),
    pain=dict(outlier_low=0, valid_low=0, valid_high=10, outlier_high=10),
    acuity=dict(outlier_low=1, valid_low=1, valid_high=5, outlier_high=5),
)

In [None]:
# Rebind df_master to whatever the helper returns.
# NOTE(review): presumably converts the temperature columns to Celsius so they
# are comparable with the 'temperature' bounds in vitals_valid_range; the
# helper lives in helpers.py and is not visible here. Confirm.
df_master = convert_temp_to_celcius(df_master)

In [None]:
# Pass the dataset and the per-vital bounds to the helper.
# NOTE(review): presumably reports how many values fall outside the bounds in
# vitals_valid_range; the counting rule is defined in helpers.py (not visible
# here). Confirm.
display_outliers_count(df_master, vitals_valid_range)

In [None]:
# Rebind df_master to the helper's result for the same dataset and bounds.
# NOTE(review): whether out-of-range values are dropped, clipped or set to NaN
# is decided in helpers.remove_outliers (not visible here). Confirm before
# relying on the row count or on the missing-value statistics computed later.
df_master = remove_outliers(df_master, vitals_valid_range)

### 3. Dataset split (train: 0.8, test: 0.2; fixed random seed for reproducibility)

In [None]:
# Draw 80% of the rows at random for training; random_state=10 is the seed
# that makes the draw repeatable.
# NOTE(review): the split is row-wise. If the dataset holds several rows per
# patient, the same patient can end up in both sets. Confirm this is intended.
df_train = df_master.sample(frac=0.8, random_state=10)
# The test set is every row whose index label was not drawn for training.
df_test = df_master.drop(index=df_train.index)

In [None]:
# Preview the first five rows of the training split.
df_train.head(n=5)

### 4. Missing Value imputation 

In [None]:
# Missing-value summary for the training split: one row with the number of
# missing entries per column, one row with that number relative to the split.
df_missing_stats = df_train.isnull().sum().to_frame().T
# The counts come from df_train, so they are divided by the number of training
# rows. Dividing by len(df_master) would understate the share of missing
# values. The stored value is a fraction in [0, 1], not multiplied by 100.
df_missing_stats.loc[1] = df_missing_stats.loc[0] / len(df_train)
df_missing_stats.index = ['no. of missing values', 'percentage of missing values']
df_missing_stats

In [None]:
# Select the columns whose second underscore-separated token is one of the
# vitals in vitals_valid_range (e.g. '<prefix>_<vital>' or
# '<prefix>_<vital>_<suffix>'). Names without an underscore are skipped.
vitals_cols = [
    name
    for name, tokens in ((c, c.split('_')) for c in df_master.columns)
    if len(tokens) >= 2 and tokens[1] in vitals_valid_range
]
vitals_cols

In [None]:
# Mean imputation for the vitals columns. The column means are learned from the
# training split only and then reused for the test split, so no test-set
# statistics leak into the fill values.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train[vitals_cols])
df_train[vitals_cols] = imputer.transform(df_train[vitals_cols])
df_test[vitals_cols] = imputer.transform(df_test[vitals_cols])

### Output the train and test data

In [None]:
# Write both splits next to the master dataset; index=False keeps the row
# index out of the CSV files.
df_train.to_csv(path_or_buf=os.path.join(path, 'train.csv'), index=False)
df_test.to_csv(path_or_buf=os.path.join(path, 'test.csv'), index=False)