# Data Preprocessing

In [None]:
# Important libraries
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import csv
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

# File paths
# Every location is relative to the notebook's working directory; the processed
# data folder resolves to "../../processed_data".
fp_notebooks_folder = "./"
fp_code_folder = "../"
fp_processed_folder = os.path.join(fp_code_folder, "../processed_data")
# Inputs: per-record CSV files and the fields.csv metadata table are read from here.
fp_converted_folder = os.path.join(fp_processed_folder, "converted_to_csv")
# Outputs written by this notebook.
fp_downsampled_folder = os.path.join(fp_processed_folder, "downsampled")
fp_downsampled_concatenated_file = os.path.join(fp_downsampled_folder, "concatenated.csv")
# NOTE: "continous" (sic) is the actual file name used on disk; do not correct the
# spelling here without also renaming the files and every other reader of them.
fp_downsampled_continuous_file = os.path.join(fp_downsampled_folder, "continous.csv")
fp_downsampled_continuous_ae_file = os.path.join(fp_downsampled_folder, "continous_ae.csv")
fp_downsampled_dropna_file = os.path.join(fp_downsampled_folder, "dropna.csv")
fp_downsampled_scaler_file = os.path.join(fp_downsampled_folder, "scaler.pkl")
fp_fields_file = os.path.join(fp_converted_folder, "fields.csv")

In [None]:
# Get fields file to analyse
# Quote handling is disabled (QUOTE_NONE), so '/' is declared as the escape
# character; the first CSV column becomes the DataFrame index.
fields_df = pd.read_csv(fp_fields_file, quoting=csv.QUOTE_NONE, escapechar='/', index_col=0)
fields_df

In [None]:
def add_feat_names(fields_df):
    """Build the set of "<signal> (<unit>)" feature names for every row.

    Args:
        fields_df: DataFrame whose "units" and "sig_name" columns each hold the
            string representation of a Python list (e.g. "['mmHg', 'bpm']").

    Returns:
        A list with one set of feature-name strings per row of ``fields_df``,
        in row order.

    Raises:
        ValueError, SyntaxError: if a cell is not a plain Python literal.
    """
    # Local import so the function is self-contained within its notebook cell.
    import ast

    feat_names = []
    for units_repr, sig_repr in zip(fields_df["units"], fields_df["sig_name"]):
        # literal_eval only parses literals, so text coming from the CSV can
        # never execute arbitrary code (which eval() would allow).
        units_list = ast.literal_eval(units_repr)
        sig_list = ast.literal_eval(sig_repr)
        # Signals and units are paired by position.
        feat_names.append(
            {sig + " (" + unit + ")" for sig, unit in zip(sig_list, units_list)}
        )
    return feat_names

# Store each record's feature-name set in a new "feat_cols" column, then display the table.
fields_df["feat_cols"] = add_feat_names(fields_df)
fields_df

## Select Feature Set

In [None]:
# Check what are the most common subset of features
def most_comment_subset_of_features(fields_df):
    """Count how often each feature set occurs, exactly and as a subset.

    Args:
        fields_df: DataFrame with a "feat_cols" column holding one set (or
            frozenset) of feature names per row.

    Returns:
        A tuple ``(feat_set_counts, feat_set_inc_df)``:

        * ``feat_set_counts`` has columns "feat_set", "count" (rows with exactly
          that feature set) and "num_features", sorted by "num_features"
          ascending.
        * ``feat_set_inc_df`` has columns "feat_set" and "inc_count" (rows whose
          feature set contains "feat_set"), sorted by "inc_count" descending.

        Feature sets are returned as frozensets.
    """
    # Plain sets are unhashable, so value_counts() cannot count them; frozensets
    # are hashable and still support issubset(), len() and iteration.
    feat_set_series = pd.Series(
        [frozenset(feat_set) for feat_set in fields_df["feat_cols"]], dtype=object
    )
    feat_set_counts = feat_set_series.value_counts().reset_index()
    feat_set_counts.columns = ["feat_set", "count"]
    feat_set_counts["num_features"] = [len(feat_set) for feat_set in feat_set_counts["feat_set"]]
    feat_set_counts = feat_set_counts.sort_values("num_features", ascending=True)

    # Pull the columns out once instead of indexing the DataFrame in the inner loop.
    ordered_sets = feat_set_counts["feat_set"].tolist()
    ordered_counts = feat_set_counts["count"].tolist()

    feat_set_inc_list = []
    for i, potential_subset in enumerate(ordered_sets):
        # Rows are sorted by size and the sets are unique, so a superset of the
        # current set can only sit at position i or later.
        total_count = sum(
            count
            for potential_superset, count in zip(ordered_sets[i:], ordered_counts[i:])
            if potential_subset.issubset(potential_superset)
        )
        feat_set_inc_list.append({"feat_set": potential_subset, "inc_count": total_count})

    # Explicit columns keep the sort below valid even when there are no rows.
    feat_set_inc_df = pd.DataFrame(feat_set_inc_list, columns=["feat_set", "inc_count"])

    return feat_set_counts, feat_set_inc_df.sort_values("inc_count", ascending=False)
# feat_set_stats[0]: exact counts per feature set; feat_set_stats[1]: inclusive (subset) counts.
feat_set_stats = most_comment_subset_of_features(fields_df)
display(feat_set_stats[0])
display(feat_set_stats[1])

In [None]:
# Pick the feature set in the third row (position 2) of the inclusive-count table.
# The column is selected by label: an integer key on a label-indexed row relies on
# the deprecated positional fallback of Series.__getitem__.
feat_set = feat_set_stats[1]["feat_set"].iloc[2]
print(feat_set)
# Check above count
def check_feat_set_count(fields_df, cur_feat_set):
    """Return how many rows of ``fields_df`` have features covering ``cur_feat_set``.

    A row is counted when ``cur_feat_set`` is a subset of the set stored in its
    "feat_cols" cell.
    """
    matches = 0
    for row_features in fields_df["feat_cols"]:
        matches += 1 if cur_feat_set.issubset(row_features) else 0
    return matches
# Cross-check: recount directly from fields_df the rows whose features contain feat_set.
print(check_feat_set_count(fields_df, cur_feat_set=feat_set))

In [None]:
# Shortlist patients with feat_set
def shortlist_patients_with_feat_set(fields_df, cur_feat_set):
    """List the "record" values of rows whose features cover ``cur_feat_set``.

    Rows are visited in order; a row qualifies when ``cur_feat_set`` is a subset
    of the set in its "feat_cols" cell.
    """
    record_column = fields_df["record"]
    return [
        record_column.iloc[position]
        for position, row_features in enumerate(fields_df["feat_cols"])
        if cur_feat_set.issubset(row_features)
    ]
# Records whose features include every feature in feat_set.
shortlisted_records = shortlist_patients_with_feat_set(fields_df, cur_feat_set=feat_set)
print(len(shortlisted_records), "Records:", shortlisted_records)

In [None]:
# Keep only the metadata rows belonging to the shortlisted records.
fields_df_shortlisted = fields_df[fields_df["record"].isin(shortlisted_records)]
fields_df_shortlisted

In [None]:
# Preview the first rows of the shortlisted metadata.
fields_df_shortlisted.head()

## Split Records into Train, Valid, Test Records

In [None]:
# split records by train, valid, test
def recordwise_train_valid_test_split(df, valid_prop, test_prop, seed):
    """Randomly partition the unique records of ``df`` into train/valid/test.

    Args:
        df: DataFrame with a "record" column.
        valid_prop: Fraction of the unique records assigned to validation.
        test_prop: Fraction of the unique records assigned to test.
        seed: Seed passed to NumPy's global random state before shuffling.

    Returns:
        ``(train_records, valid_records, test_records)`` as arrays of record
        values; training receives whatever is left after the rounded
        validation and test counts.
    """
    unique_records = df["record"].unique()
    total = len(unique_records)

    # Seed the global NumPy RNG, then shuffle positions rather than the records.
    np.random.seed(seed=seed)
    order = np.arange(total)
    np.random.shuffle(order)
    shuffled = unique_records[order]

    n_valid = round(valid_prop * total)
    n_test = round(test_prop * total)
    n_train = total - n_valid - n_test

    # Consecutive slices of the shuffled records: train, then valid, then test.
    valid_start = n_train
    test_start = valid_start + n_valid
    test_stop = test_start + n_test
    train_records = shuffled[:valid_start]
    valid_records = shuffled[valid_start:test_start]
    test_records = shuffled[test_start:test_stop]
    return train_records, valid_records, test_records

# 10% of the records go to validation, 20% to test and the rest to training (fixed seed).
train_records, valid_records, test_records = recordwise_train_valid_test_split(
    fields_df_shortlisted, valid_prop=0.1, test_prop=0.2, seed=2023)

In [None]:
# Report the size and members of each split.
print(len(train_records), "Train Records:", train_records)
print(len(valid_records), "Valid Records:", valid_records)
print(len(test_records), "Test Records:", test_records)

In [None]:
# Preview the shortlisted metadata again before loading the signal data.
fields_df_shortlisted.head()

## Load Data

In [None]:
def load_data(fields_df_shortlisted, feat_set):
    """Load and stack the selected feature columns of every shortlisted record.

    For each value in ``fields_df_shortlisted["record"]`` the file
    ``<fp_converted_folder>/<record>.csv`` is read (first column as index) and
    reduced to the columns in ``feat_set``.

    Args:
        fields_df_shortlisted: DataFrame with a "record" column of file stems.
        feat_set: Iterable of column names to keep from each record file.

    Returns:
        One DataFrame containing all records stacked in order. Its first
        column, "record_index", holds each row's index value from its source
        file; a "record" column identifies the source record.
    """
    feat_cols = list(feat_set)
    df_list = []
    for record in tqdm(fields_df_shortlisted["record"].tolist()):
        fp_csv = os.path.join(fp_converted_folder, record + ".csv")
        record_df = pd.read_csv(fp_csv, index_col=0)[feat_cols]
        # assign() returns a new frame, so the record label is never written
        # into a column subset of another frame (avoids SettingWithCopyWarning).
        df_list.append(record_df.assign(record=record))
    # reset_index() moves the per-file index into the first column, which is
    # then given a fixed name regardless of what the files call it.
    all_df = pd.concat(df_list).reset_index()
    all_df.columns = ["record_index"] + all_df.columns[1:].to_list()
    return all_df
# Load the chosen features for every shortlisted record and preview the result.
all_df = load_data(fields_df_shortlisted, feat_set)
all_df.head()

In [None]:
def plot_columns(df):
    """Show one box plot per feature column of ``df`` in a grid of subplots.

    The "record_index" and "record" columns are dropped and rows containing any
    NaN are removed before plotting. The grid is three plots wide and has at
    least two rows; further rows are added when there are more than six
    columns to plot.

    Args:
        df: DataFrame containing "record_index", "record" and the columns to plot.
    """
    import math
    import seaborn as sns

    # drop() and dropna() both return new frames, so the caller's df is untouched.
    plot_df = df.drop(["record_index", "record"], axis=1).dropna()
    print(plot_df.columns)

    num_cols = 3
    # At least two rows (the original layout); more when the columns need them.
    num_rows = max(2, math.ceil(len(plot_df.columns) / num_cols))

    # Create subplots; the height scales with the row count (12 x 6 for two rows)
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 3 * num_rows), dpi=300)

    # Flatten the axes array for easier iteration
    axes = axes.flatten()

    # Draw a box plot for each column
    for ax, column in zip(axes, plot_df.columns):
        sns.boxplot(data=plot_df, y=column, ax=ax)
        ax.set_title(column)
        ax.set_ylabel('Value')

    # Hide any remaining empty subplots
    for ax in axes[len(plot_df.columns):]:
        ax.axis('off')

    plt.tight_layout()  # Ensures proper spacing between subplots
    plt.show()
    
# Box plots of the raw features, before any outlier handling.
plot_columns(all_df)

In [None]:
# Write the concatenated data to disk without the DataFrame index.
all_df.to_csv(fp_downsampled_concatenated_file, index=False)

In [None]:
# Reload the concatenated data from disk (presumably so later cells can be run
# without repeating load_data); the index is a fresh default index.
all_df = pd.read_csv(fp_downsampled_concatenated_file)

In [None]:
# Summary statistics of the numeric columns before outlier handling.
all_df.describe()

## Set outlier values to NA

In [None]:
# Number of missing values per column before outliers are masked.
all_df.isna().sum()

- Set any value that is 0 (or negative) to NA, for every feature in the dataset – a zero reading basically means the machine was disconnected.
- Set BP values > 250 to NA – such readings are highly unlikely (he said he will check whether some other threshold is better, but 250 is a very safe cutoff).

In [None]:
# Show the selected feature names as a list.
list(feat_set)

In [None]:
def set_outliers_to_na(df, predictors, bp_cols=None, bp_max=250):
    """Return a copy of ``df`` with implausible readings replaced by NaN.

    Two rules are applied:

    1. Any value <= 0 in a predictor column becomes NaN (a zero reading means
       the machine was disconnected).
    2. Any value above ``bp_max`` in a blood-pressure column becomes NaN.

    Args:
        df: Input DataFrame; it is not modified.
        predictors: Names of the columns checked by rule 1.
        bp_cols: Names of the columns checked by rule 2. Defaults to the three
            arterial blood pressure columns (mean, diastolic, systolic).
        bp_max: Upper limit for rule 2; values strictly above it are masked.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        KeyError: if a name in ``predictors`` or ``bp_cols`` is not a column of ``df``.
    """
    if bp_cols is None:
        bp_cols = ["ABPmean (mmHg)", 'ABPdias (mmHg)', 'ABPsys (mmHg)']
    predictors = list(predictors)
    bp_cols = list(bp_cols)

    df = df.copy()
    # 1. Mask anything that is 0 or below. Only the predictor columns are
    #    touched, so identifier columns such as "record" are left alone.
    df[predictors] = df[predictors].mask(df[predictors] <= 0)

    # 2. Mask blood pressure above the limit - this is highly unlikely.
    df[bp_cols] = df[bp_cols].mask(df[bp_cols] > bp_max)
    return df
    
# Apply the outlier rules to every selected feature column.
all_df = set_outliers_to_na(all_df, predictors=list(feat_set))

In [None]:
# Display the data after outliers were replaced with NaN.
all_df

In [None]:
# Summary statistics after outlier masking, for comparison with the earlier summary.
all_df.describe()

In [None]:
# Box plots again, now with outliers masked.
plot_columns(all_df)

## Standardise Data

In [None]:
def standardise_data(df, train_records, valid_records, test_records, predictors):
    """Standardise the predictor columns using statistics from the training rows.

    A ``StandardScaler`` is fitted on the rows whose "record" is in
    ``train_records`` and then applied to the train, validation and test rows.

    Args:
        df: DataFrame with a "record" column and the predictor columns.
        train_records: Record values that form the training split.
        valid_records: Record values that form the validation split.
        test_records: Record values that form the test split.
        predictors: Names of the columns to standardise.

    Returns:
        ``(scaled_df, scaler)``. ``scaled_df`` stacks the train, validation and
        test rows in that order, keeping their original index labels; rows
        whose record is in none of the three splits are not included.
        ``scaler`` is the fitted ``StandardScaler``.
    """
    from sklearn.preprocessing import StandardScaler

    predictors = list(predictors)
    # Copy each filtered split so the scaled values are written into an
    # independent frame rather than a slice of df (avoids SettingWithCopyWarning).
    splits = [
        df[df["record"].isin(split_records)].copy()
        for split_records in (train_records, valid_records, test_records)
    ]

    scaler = StandardScaler()
    # Fit on the training split only, so validation/test statistics never leak in.
    scaler.fit(splits[0][predictors])
    for split_df in splits:
        # transform() rejects input with zero rows, so an empty split is left as is.
        if len(split_df):
            split_df[predictors] = scaler.transform(split_df[predictors])

    return pd.concat(splits), scaler

# Standardise all selected features; the returned rows are ordered train, valid, test.
all_df, scaler = standardise_data(all_df, train_records, valid_records, test_records, list(feat_set))
all_df.head()

In [None]:
# Row and column count after standardisation.
all_df.shape

In [None]:
def save_scaler(scaler, fp_downsampled_scaler_file):
    """Serialise ``scaler`` to ``fp_downsampled_scaler_file`` using pickle.

    The file is created or overwritten; the highest available pickle protocol
    is used.
    """
    from pickle import HIGHEST_PROTOCOL, dump

    # Pickle output is binary, so the file must be opened in 'wb' mode.
    with open(fp_downsampled_scaler_file, 'wb') as out_file:
        dump(scaler, out_file, protocol=HIGHEST_PROTOCOL)
# Persist the fitted scaler to the scaler.pkl path defined at the top of the notebook.
save_scaler(scaler, fp_downsampled_scaler_file)

In [None]:
# Row count of record '052n' divided by 60 — presumably the number of 60-row
# windows to expect after downsampling; TODO confirm the sampling rate.
all_df[all_df["record"]=='052n'].shape[0]/60

In [None]:
# Records still present after standardisation.
all_df["record"].unique()

## Downsample Time Series Data

In [None]:
def downsample_all_df(all_df, feat_set, window=60):
    """Downsample each record by summarising consecutive windows of rows.

    Within every record, rows are grouped in order into chunks of ``window``
    rows (the last chunk may be shorter). Each chunk becomes one output row
    holding the mean and the sample standard deviation of every feature.

    Args:
        all_df: DataFrame with a "record" column and the feature columns; the
            rows of each record are expected to be in time order.
        feat_set: Iterable of feature column names to summarise.
        window: Number of consecutive rows per chunk.

    Returns:
        DataFrame with columns "<feat>_mean" and "<feat>_std" for every
        feature, plus "record" and "record_index" (the chunk number within the
        record, starting at 0).
    """
    feat_set = list(feat_set)
    all_downsampled_df_list = []
    for record, record_df in all_df.groupby("record"):
        # Number the rows from the start of *this* record. The frame's own index
        # runs across all records, so using it would start a record in the
        # middle of a window and produce partial first/last chunks.
        window_ids = np.arange(len(record_df)) // window
        window_groups = record_df.groupby(window_ids)[feat_set]
        # Downsample with mean and std, side by side
        downsampled_df = pd.concat(
            [
                window_groups.mean().add_suffix("_mean"),
                window_groups.std().add_suffix("_std"),
            ],
            axis=1,
        )
        downsampled_df["record"] = record
        # Get record df index
        downsampled_df = downsampled_df.reset_index(drop=True)
        downsampled_df["record_index"] = downsampled_df.index
        all_downsampled_df_list.append(downsampled_df)
    return pd.concat(all_downsampled_df_list)
# Summarise every 60 rows of each record into one row, then preview the first ten rows.
downsampled_df = downsample_all_df(all_df, feat_set, window=60)
downsampled_df.head(10)

In [None]:
# Row and column count after downsampling.
downsampled_df.shape

In [None]:
# Inspect the downsampled rows of record '052n'.
downsampled_df[downsampled_df["record"]=='052n']

In [None]:
# Write the downsampled data to disk without the DataFrame index.
downsampled_df.to_csv(fp_downsampled_continuous_file, index=False)

## Flatten Data and Generate Data for Training

In [None]:
# Update feat set
# After downsampling each feature exists as a "<feat>_mean" and a "<feat>_std"
# column; all means are listed first, then all stds.
downsampled_feat_set = [f"{feat}_mean" for feat in feat_set] + [f"{feat}_std" for feat in feat_set] 
print(downsampled_feat_set)

In [None]:
# Prediction targets are the per-window means only (no "_std" columns).
pred_cols = [f"{feat}_mean" for feat in feat_set]
print(pred_cols)

In [None]:
def generate_data_for_training(
    downsampled_df, downsampled_feat_set, input_mins, prediction_mins, pred_cols):
    """Flatten sliding windows of each record into one training row per window.

    For every record and every possible last input row ``end_input``, the
    ``input_mins`` consecutive rows ending at ``end_input`` form the inputs, and
    the rows ``end_input + m`` for each ``m`` in ``prediction_mins`` form the
    targets. Windows lacking enough earlier rows, or whose furthest target lies
    past the end of the record, are not emitted.

    Args:
        downsampled_df: DataFrame with "record", "record_index" and the columns
            named in ``downsampled_feat_set`` and ``pred_cols``.
        downsampled_feat_set: Input column names.
        input_mins: Number of consecutive rows used as input.
        prediction_mins: Offsets (in rows after the last input row) to predict.
            They may be given in any order; the caller's list is not modified.
        pred_cols: Target column names.

    Returns:
        DataFrame with columns "<feat> Min<i>" for each input row i and feature,
        "<col> PredMin<m>" for each offset m (ascending) and target column,
        followed by "target_index" (list of target row indices),
        "input_start_index", "input_end_index" and "record".
    """
    downsampled_feat_set = list(downsampled_feat_set)
    pred_cols = list(pred_cols)
    # Sort prediction mins e.g. 1, 3, 2 -> 1, 2, 3. This is done on a copy and
    # *before* the column names are built, so the "PredMin<m>" labels always
    # match the order in which the target values are laid out below.
    prediction_mins = sorted(prediction_mins)
    max_horizon = max(prediction_mins, default=0)

    # The flattened features for the past input_mins rows, then the targets
    new_feat_set = (
        [feat + " Min" + str(i) for i in range(input_mins) for feat in downsampled_feat_set]
        + [feat + " PredMin" + str(i) for i in prediction_mins for feat in pred_cols]
        + ["target_index", "input_start_index", "input_end_index", "record"]
    )

    data = []
    # For each record
    for record, record_df in tqdm(downsampled_df.groupby("record")):
        record_df = record_df.set_index("record_index")
        # Select the column subsets once per record instead of once per window.
        input_df = record_df[downsampled_feat_set]
        target_df = record_df[pred_cols]
        last_index = record_df.index[-1]
        # Go through possible end_input indices
        for end_input in record_df.index:
            # Not enough input information
            if end_input < input_mins - 1:
                continue
            # Not enough pred information; later windows reach even further
            if end_input + max_horizon > last_index:
                break
            start_input = end_input - input_mins + 1
            # (input) 0 1 2 3 4, (output) 5, 6, 7
            pred_indices = [end_input + pred_min for pred_min in prediction_mins]
            # Get the rows we are predicting
            pred_rows = target_df.loc[pred_indices]
            # Get rows we are using to predict (label slice, both ends included)
            input_rows = input_df.loc[start_input:end_input]

            # Add row: inputs, targets, then bookkeeping columns
            new_row = (
                list(input_rows.values.flatten())
                + list(pred_rows.values.flatten())
                + [pred_rows.index.tolist(), start_input, end_input, record]
            )
            data.append(new_row)
    return pd.DataFrame(data, columns=new_feat_set)

# Use 5 consecutive downsampled rows as input and predict the rows 1, 2 and 3 steps ahead.
flattened_df = generate_data_for_training(
    downsampled_df, downsampled_feat_set, input_mins=5, prediction_mins=[1, 2, 3], pred_cols=pred_cols)
flattened_df.head()

In [None]:
# Number of flattened samples before rows with NaN are removed.
flattened_df.shape # previously, 119780

## Remove Rows With NaN Values

In [None]:
# Discard every sample that has a NaN in any input or target column.
flattened_df = flattened_df.dropna()
flattened_df.head()

In [None]:
# Number of samples left after dropping rows with NaN.
flattened_df.shape # previously 90000+

In [None]:
# Distribution of the one-step-ahead mean ABP target (standardised values).
flattened_df["ABPmean (mmHg)_mean PredMin1"].hist()

In [None]:
def label_train_valid_test_samples(df, train_records, valid_records, test_records):
    """Return a copy of ``df`` with boolean "train", "valid" and "test" columns.

    Each flag is True when the row's "record" value belongs to the
    corresponding collection of records. The input frame is left unchanged.
    """
    labelled = df.copy()
    split_membership = (
        ("train", train_records),
        ("valid", valid_records),
        ("test", test_records),
    )
    for split_name, split_records in split_membership:
        labelled[split_name] = labelled["record"].isin(split_records)
    return labelled

# Flag every sample with the split its record belongs to.
train_valid_test_labelled_df = label_train_valid_test_samples(flattened_df, train_records, valid_records, test_records)
train_valid_test_labelled_df.head()

In [None]:
# Write the final labelled samples to disk.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is not
# passed here, so the index is written as the first CSV column — confirm that
# readers of this file expect it.
train_valid_test_labelled_df.to_csv(fp_downsampled_dropna_file)