# Data Preprocessing

In [8]:
# Important libraries
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import csv
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

# File paths (all relative; resolved against the notebook's working directory)
fp_notebooks_folder = "./"
fp_code_folder = "../"
# Processed data sits one level above the code folder.
fp_processed_folder = os.path.join(fp_code_folder, "../processed_data")
# Inputs: the per-record CSV files and the fields summary are read from here.
fp_converted_folder = os.path.join(fp_processed_folder, "converted_to_csv")
# Outputs: the CSV files written by this notebook go here.
fp_downsampled_folder = os.path.join(fp_processed_folder, "downsampled")
# NOTE(review): the file name is spelled "continous.csv"; renaming it would
# change the path any existing consumer reads, so it is left as is.
fp_downsampled_continuous_file = os.path.join(fp_downsampled_folder, "continous.csv")
fp_downsampled_dropna_file = os.path.join(fp_downsampled_folder, "dropna.csv")
fp_fields_file = os.path.join(fp_converted_folder, "fields.csv")

In [None]:
# Load the fields summary file that the feature and alarm selection below works on.
# Parsing uses QUOTE_NONE with '/' as the escape character, and the first
# CSV column becomes the index.
fields_df = pd.read_csv(fp_fields_file, quoting=csv.QUOTE_NONE, escapechar='/', index_col=0)
fields_df

In [None]:
def add_feat_names(fields_df):
    """Build the set of "<signal> (<unit>)" feature names for every row.

    Parameters
    ----------
    fields_df : pandas.DataFrame
        Must provide the columns ``"units"`` and ``"sig_name"``; each cell is
        the text of a Python list literal (e.g. ``"['bpm', '%']"``).

    Returns
    -------
    list of set of str
        One set per row of ``fields_df``, in row order.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal.
    """
    # Local import keeps the cell runnable without touching the import cell.
    import ast

    feat_names = []
    for units_text, sig_text in zip(fields_df["units"], fields_df["sig_name"]):
        # literal_eval only accepts literals, so text read from the CSV can
        # never execute code the way eval() would.
        units_list = ast.literal_eval(units_text)
        sig_list = ast.literal_eval(sig_text)
        # Signals and units are paired by position; zip stops at the shorter list.
        feat_names.append(
            {sig + " (" + unit + ")" for sig, unit in zip(sig_list, units_list)}
        )
    return feat_names

# Store each row's feature-name set in a new "feat_cols" column.
fields_df["feat_cols"] = add_feat_names(fields_df)
fields_df

## Select Feature Set

In [None]:
# Find the most common subsets of features
def most_comment_subset_of_features(fields_df):
    """Count how often each feature set occurs, exactly and as a subset.

    Parameters
    ----------
    fields_df : pandas.DataFrame
        Needs a ``"feat_cols"`` column holding one feature set per row.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First frame: one row per distinct feature set with the columns
        ``feat_set``, ``count`` (exact occurrences) and ``num_features``,
        sorted by ``num_features`` ascending.
        Second frame: columns ``feat_set`` and ``inc_count``, where
        ``inc_count`` adds up the ``count`` of every set at or after it in the
        size-sorted order that contains it; sorted by ``inc_count`` descending.
    """
    exact_df = fields_df["feat_cols"].value_counts().reset_index()
    exact_df.columns = ["feat_set", "count"]
    exact_df["num_features"] = list(map(len, exact_df["feat_set"]))
    exact_df = exact_df.sort_values("num_features", ascending=True)

    # Work on plain lists in size-sorted order; a superset can never be
    # smaller than the set it contains, so only later positions are scanned.
    ordered_sets = exact_df["feat_set"].tolist()
    ordered_counts = exact_df["count"].tolist()

    inclusive_rows = []
    for pos, candidate in enumerate(ordered_sets):
        inclusive_total = 0
        for other_set, other_count in zip(ordered_sets[pos:], ordered_counts[pos:]):
            if candidate.issubset(other_set):
                inclusive_total += other_count
        inclusive_rows.append({"feat_set": candidate, "inc_count": inclusive_total})

    inclusive_df = pd.DataFrame(inclusive_rows).sort_values("inc_count", ascending=False)
    return exact_df, inclusive_df
# feat_set_stats[0]: exact count per distinct feature set;
# feat_set_stats[1]: inclusive count (the set itself plus every set containing it).
feat_set_stats = most_comment_subset_of_features(fields_df)
display(feat_set_stats[0])
display(feat_set_stats[1])

In [None]:
# Pick the feature set in the third row of the inclusive-count table.
# The value is addressed by its column label: an integer key on a
# label-indexed Series only works through a deprecated positional fallback.
feat_set = feat_set_stats[1].iloc[2]["feat_set"]
# Check above count
def check_feat_set_count(fields_df, cur_feat_set):
    """Return how many rows have a ``"feat_cols"`` set containing ``cur_feat_set``.

    Parameters
    ----------
    fields_df : pandas.DataFrame
        Needs a ``"feat_cols"`` column of feature sets.
    cur_feat_set : set
        The feature set that must be a subset of a row's set to be counted.
    """
    return sum(
        1 for candidate in fields_df["feat_cols"] if cur_feat_set.issubset(candidate)
    )
# Recount the chosen feature set with a direct scan over all rows.
check_feat_set_count(fields_df, cur_feat_set=feat_set)

In [None]:
# Shortlist patients with feat_set
def shortlist_patients_with_feat_set(fields_df, cur_feat_set):
    """List the records whose feature set contains ``cur_feat_set``.

    Parameters
    ----------
    fields_df : pandas.DataFrame
        Needs the columns ``"feat_cols"`` (feature sets) and ``"record"``.
    cur_feat_set : set
        Features every shortlisted record must provide.

    Returns
    -------
    list
        The ``"record"`` values of the matching rows, in row order.
    """
    record_col = fields_df["record"]
    return [
        record_col.iloc[pos]
        for pos, candidate in enumerate(fields_df["feat_cols"])
        if cur_feat_set.issubset(candidate)
    ]
# Records whose feature set contains the chosen feat_set.
shortlisted_records = shortlist_patients_with_feat_set(fields_df, cur_feat_set=feat_set)
print(len(shortlisted_records), "Records:", shortlisted_records)

In [None]:
# Keep only the rows of fields_df that belong to a shortlisted record.
fields_df_shortlisted = fields_df[fields_df["record"].isin(shortlisted_records)]
fields_df_shortlisted

In [None]:
# Preview the first rows of the shortlisted records.
fields_df_shortlisted.head()

## Select Alarms to Predict

In [None]:
def most_common_alarms(fields_df_shortlisted):
    """Total the occurrences of every alarm across the given rows.

    Parameters
    ----------
    fields_df_shortlisted : pandas.DataFrame
        Needs a ``"unique_alarms"`` column where each cell is the text of a
        dict literal mapping an alarm name to its count.

    Returns
    -------
    pandas.DataFrame
        Columns ``alarms`` and ``count``, sorted by ``count`` descending.
        Alarm names are whitespace-stripped before their counts are merged.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal.
    """
    # Local import keeps the cell runnable without touching the import cell.
    import ast

    alarm_count = {}
    for alarms_text in fields_df_shortlisted["unique_alarms"]:
        # literal_eval parses the dict text without executing it, unlike eval().
        for alarm, count in ast.literal_eval(alarms_text).items():
            alarm = alarm.strip()
            alarm_count[alarm] = alarm_count.get(alarm, 0) + count
    alarm_series = pd.Series(alarm_count).sort_values(ascending=False).reset_index()
    alarm_series.columns = ["alarms", "count"]
    return alarm_series

# Alarm totals over the shortlisted records; show the ten most frequent.
alarm_count = most_common_alarms(fields_df_shortlisted)
alarm_count.head(10)

In [None]:
# Show word cloud of alarms
def generate_alarm_wordcloud(alarm_count):
    """Plot a word cloud built from the alarm texts.

    Parameters
    ----------
    alarm_count : pandas.DataFrame
        Needs the columns ``"alarms"`` (text) and ``"count"`` (int). Each
        alarm text is repeated ``count`` times in the text given to WordCloud.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    repeated_alarms = []
    for alarm, count in alarm_count[["alarms", "count"]].values:
        for _ in range(count):
            repeated_alarms.append(alarm)
    text = " ".join(repeated_alarms)

    builder = WordCloud(
        width=1000,
        height=500,
        stopwords=set(STOPWORDS),
        background_color="white",
        collocations=False,
        max_words=20,
    )
    cloud = builder.generate(text)

    plt.figure(figsize=(6, 3))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Draw the word cloud for the alarm totals computed above.
generate_alarm_wordcloud(alarm_count)

In [None]:
# Find out how many alarms contain "VPB"
def query_alarm_count(alarm_count, keyword):
    """Select the alarms whose text contains ``keyword`` and total their counts.

    Parameters
    ----------
    alarm_count : pandas.DataFrame
        Needs the columns ``"alarms"`` (text) and ``"count"``.
    keyword : str
        Substring to look for in each alarm text.

    Returns
    -------
    tuple
        ``(matching_rows, total_count)``: the rows of ``alarm_count`` whose
        alarm matched, and the sum of their ``count`` values.
    """
    matches = [
        (alarm, count)
        for alarm, count in alarm_count[["alarms", "count"]].values
        if keyword in alarm
    ]
    matched_names = [alarm for alarm, _ in matches]
    total_count = sum(count for _, count in matches)
    return alarm_count[alarm_count["alarms"].isin(matched_names)], total_count
# vpb_alarm_stats[0]: the alarms containing "VPB"; vpb_alarm_stats[1]: their summed count.
vpb_alarm_stats = query_alarm_count(alarm_count, keyword="VPB")
print(vpb_alarm_stats[1], "VPBs:")
display(vpb_alarm_stats[0])

In [None]:
# How many patients have a VPB alarm
def query_records_with_alarm_keyword(fields_df_shortlisted, keyword_list):
    """For each keyword, find the records whose alarm text contains it.

    Parameters
    ----------
    fields_df_shortlisted : pandas.DataFrame
        Needs the columns ``"record"`` and ``"unique_alarms"`` (text).
    keyword_list : list of str
        Substrings to search for in the ``"unique_alarms"`` text.

    Returns
    -------
    pandas.DataFrame
        One row per keyword with the columns ``keyword``, ``count`` (number
        of matching records) and ``records`` (list of those records).
    """
    record_alarm_pairs = fields_df_shortlisted[["record", "unique_alarms"]].values
    summary_rows = []
    for keyword in keyword_list:
        matched = [
            record for record, alarm_str in record_alarm_pairs if keyword in alarm_str
        ]
        summary_rows.append(
            {"keyword": keyword, "count": len(matched), "records": matched}
        )
    return pd.DataFrame(summary_rows)
# Query the generic "VPB" keyword first, followed by every individual alarm that contains it.
vbp_record_stats = query_records_with_alarm_keyword(fields_df_shortlisted, keyword_list=["VPB"]+vpb_alarm_stats[0]["alarms"].tolist())
vbp_record_stats

In [None]:
# Shortlist patients with alarm
def shortlist_patients_with_alarm(vbp_record_stats, keyword):
    """Return the record list stored for ``keyword``.

    Parameters
    ----------
    vbp_record_stats : pandas.DataFrame
        Needs the columns ``"keyword"`` and ``"records"``.
    keyword : str
        Keyword whose record list is wanted.

    Returns
    -------
    list
        The ``"records"`` value of the first row whose keyword matches.

    Raises
    ------
    KeyError
        If no row has the given keyword.
    """
    matches = vbp_record_stats.loc[vbp_record_stats["keyword"] == keyword, "records"]
    if matches.empty:
        raise KeyError(keyword)
    # Take the first match by position: the filtered row keeps its original
    # index label, which is 0 only when the keyword sits in the first row.
    return matches.iloc[0]
# Keep only the shortlisted records listed under the "VPB" keyword.
shortlisted_records_alarm = shortlist_patients_with_alarm(vbp_record_stats, keyword="VPB")
fields_df_shortlisted_alarm = fields_df_shortlisted[fields_df_shortlisted["record"].isin(shortlisted_records_alarm)]
display(fields_df_shortlisted_alarm.head())
print(fields_df_shortlisted_alarm.shape)

In [None]:
# Sum of "sig_len" over the alarm-shortlisted records:
# 6.22 million rows in total (with nan values)
fields_df_shortlisted_alarm["sig_len"].sum() 

## Downsample Time Series Data

In [None]:
def downsample_all_csvs(fp_converted_folder, feat_set, fields_df_shortlisted, window=60):
    """Downsample every listed record's CSV and stack the results.

    Parameters
    ----------
    fp_converted_folder : str
        Folder containing one ``<record>.csv`` file per record.
    feat_set : iterable of str
        Feature columns passed on to ``downsample_ts``.
    fields_df_shortlisted : pandas.DataFrame
        Needs a ``"record"`` column naming the records to process.
    window : int, default 60
        Number of consecutive rows merged into one downsampled row.

    Returns
    -------
    pandas.DataFrame
        All downsampled records concatenated, with a fresh 0..n-1 index. The
        former per-record index is kept as the first column, named
        ``record_index``, and a ``record`` column identifies the source.
    """
    per_record_frames = []
    for record in tqdm(fields_df_shortlisted["record"].tolist()):
        raw_df = pd.read_csv(os.path.join(fp_converted_folder, record + ".csv"))
        reduced_df = downsample_ts(raw_df, window, feat_set)
        reduced_df["record"] = record
        per_record_frames.append(reduced_df)

    stacked = pd.concat(per_record_frames).reset_index()
    # reset_index puts the old per-record index first; give that column its name.
    remaining_columns = stacked.columns[1:].to_list()
    stacked.columns = ["record_index"] + remaining_columns
    return stacked

def _parse_alarm_cell(text):
    """Turn the text of one alarm cell into the collection of alarms it spells out."""
    # Local import keeps the cell runnable without touching the import cell.
    import ast

    text = text.strip()
    # "set()" is not accepted by literal_eval on every Python version.
    if text == "set()":
        return set()
    # literal_eval parses the text without executing it, unlike eval().
    return ast.literal_eval(text)


def downsample_ts(df, window, feat_set, alarm_col="alarms"):
    """Downsample one record by merging every ``window`` consecutive rows.

    Feature columns are averaged per group; the alarms of a group are merged
    into a single set. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the ``feat_set`` columns and ``alarm_col``. Rows are grouped by
        ``index / window`` truncated to an integer, so the index is expected
        to be numeric (presumably the default 0..n-1 index from ``read_csv``).
    window : int
        Number of consecutive rows per group.
    feat_set : iterable of str
        Feature columns to average.
    alarm_col : str, default "alarms"
        Column whose non-null text cells each spell out a set of alarms.

    Returns
    -------
    pandas.DataFrame
        One row per group with the averaged features and an ``"alarms"``
        column (always named ``"alarms"``, whatever ``alarm_col`` is) holding
        the union of the group's alarm sets; empty set when there are none.
    """
    feat_cols = list(feat_set)
    selected = df[feat_cols + [alarm_col]]
    grouped = selected.groupby((selected.index / window).astype('i'))
    df_downsampled = grouped[feat_cols].mean()

    alarm_sets = []
    for _, alarms in grouped[alarm_col]:
        # Only text cells are parsed: this skips missing values and also copes
        # with a column that holds no text at all (e.g. all NaN), where the
        # .str accessor would raise.
        parsed = [
            _parse_alarm_cell(cell)
            for cell in alarms[alarms.notna()]
            if isinstance(cell, str)
        ]
        alarm_sets.append(set().union(*parsed))
    df_downsampled["alarms"] = alarm_sets

    return df_downsampled

# Downsample every shortlisted record, merging 60 consecutive rows into one.
# NOTE(review): this passes fields_df_shortlisted rather than the
# alarm-filtered fields_df_shortlisted_alarm — confirm that is intended.
df_ds = downsample_all_csvs(fp_converted_folder, feat_set, fields_df_shortlisted, window=60)
df_ds

In [None]:
# Frequency of each distinct alarm set among the downsampled rows.
df_ds["alarms"].value_counts()

In [None]:
def label_rows(df_ds, keyword):
    """Flag each row whose alarms mention ``keyword``.

    The test is a substring match against ``str()`` of the row's ``"alarms"``
    value, so it matches any alarm text that contains ``keyword``.

    Returns
    -------
    list of bool
        One flag per row of ``df_ds``, in row order.
    """
    flags = []
    for alarm_value in df_ds["alarms"]:
        flags.append(keyword in str(alarm_value))
    return flags
# Positive label: the row's alarms contain the text "VPBs".
# NOTE(review): the keyword here is "VPBs", while the record shortlisting
# above used "VPB" — confirm the stricter match is intended.
df_ds["VPBs"] = label_rows(df_ds, keyword="VPBs")
df_ds

In [None]:
# Class balance of the "VPBs" label.
df_ds["VPBs"].value_counts()

In [None]:
def label_true_negative(df_ds):
    """Flag each row whose ``"alarms"`` value is empty (length zero).

    Returns
    -------
    list of bool
        One flag per row of ``df_ds``, in row order.
    """
    flags = []
    for alarm_value in df_ds["alarms"]:
        flags.append(len(alarm_value) == 0)
    return flags
# Label rows with no alarms at all; these serve as the negative cases later on.
df_ds["no_alarms"] = label_true_negative(df_ds)
df_ds

In [None]:
# Class balance of the "no_alarms" label.
df_ds["no_alarms"].value_counts()

In [None]:
# Create the output folder if it is missing; exist_ok makes the call safe
# when the folder already exists, without a separate existence check.
os.makedirs(fp_downsampled_folder, exist_ok=True)
# NOTE: the set-valued "alarms" column is stored as text in the CSV file.
df_ds.to_csv(fp_downsampled_continuous_file)

In [9]:
# Reload the saved downsampled data, using the first CSV column as the index.
# NOTE(review): after this round trip "alarms" presumably holds text such as
# "set()" rather than set objects, so length-based checks on it no longer apply.
df_ds = pd.read_csv(fp_downsampled_continuous_file, index_col=0)
df_ds

Unnamed: 0,record_index,ABPmean (mmHg),HR (bpm),ABPsys (mmHg),RESP (bpm),SpO2 (%),ABPdias (mmHg),alarms,record,VPBs,no_alarms
0,0,84.800000,63.850000,120.483333,0.0,96.300000,67.416667,set(),048n,False,True
1,1,86.033333,83.416667,127.183333,0.0,93.600000,63.300000,set(),048n,False,True
2,2,83.350000,82.466667,125.050000,0.0,93.366667,59.100000,set(),048n,False,True
3,3,88.850000,83.950000,131.766667,0.0,93.150000,63.833333,set(),048n,False,True
4,4,109.400000,87.183333,156.550000,0.0,82.400000,86.066667,set(),048n,False,True
...,...,...,...,...,...,...,...,...,...,...,...
120149,1031,,91.900000,,,96.716667,,set(),476n,False,True
120150,1032,,81.150000,,,97.100000,,{'IRREGULAR HR'},476n,False,False
120151,1033,,96.366667,,,96.650000,,{'IRREGULAR HR'},476n,False,False
120152,1034,,94.850000,,,96.716667,,{'IRREGULAR HR'},476n,False,False


In [10]:
# Inspect positions 130 through 148, i.e. nine rows either side of position 139.
df_ds.iloc[139-9:139+10]

Unnamed: 0,record_index,ABPmean (mmHg),HR (bpm),ABPsys (mmHg),RESP (bpm),SpO2 (%),ABPdias (mmHg),alarms,record,VPBs,no_alarms
130,130,,77.483333,,22.366667,97.4,,set(),048n,False,True
131,131,,81.866667,,21.066667,98.3,,set(),048n,False,True
132,132,,81.466667,,20.333333,99.0,,set(),048n,False,True
133,133,,79.733333,,18.366667,98.9,,set(),048n,False,True
134,134,,79.716667,,11.7,99.966667,,set(),048n,False,True
135,135,,79.85,,13.516667,99.383333,,set(),048n,False,True
136,136,,78.833333,,22.533333,100.0,,set(),048n,False,True
137,137,,78.4,,21.8,99.85,,set(),048n,False,True
138,138,,78.633333,,12.983333,100.0,,set(),048n,False,True
139,139,,138.883333,,13.833333,100.0,,"{'*TACHY 215 > 180', '*TACHY 216 > 180', 'PAIR...",048n,True,False


## Generate Training/Validation Data

In [11]:
# Take the feature columns by position (columns 1 to 6 of the reloaded frame)
# and order them alphabetically.
feat_set = sorted(df_ds.columns.tolist()[1:7])
print("Features:", feat_set)

Features: ['ABPdias (mmHg)', 'ABPmean (mmHg)', 'ABPsys (mmHg)', 'HR (bpm)', 'RESP (bpm)', 'SpO2 (%)']


In [18]:
def generate_all_possible_pos_neg_data(df_ds, feat_set, input_days, prediction_day, pred_col):
    """Build one flattened sample for every usable target row of every record.

    A target row is usable when it is positive (``pred_col`` is truthy) or a
    clean negative (``"no_alarms"`` is truthy); other rows are skipped. The
    features of a sample are the ``input_days`` consecutive rows that end
    ``prediction_day`` rows before the target row, within the same record.

    Parameters
    ----------
    df_ds : pandas.DataFrame
        Needs the ``feat_set`` columns plus ``pred_col``, ``"no_alarms"`` and
        ``"record"``.
    feat_set : list of str
        Feature columns, in the order they are laid out in each sample.
    input_days : int
        Number of consecutive rows used as input (despite the name, a "day"
        here is one row of ``df_ds``).
    prediction_day : int
        Number of rows between the last input row and the target row.
    pred_col : str
        Column holding the label to predict.

    Returns
    -------
    pandas.DataFrame
        Columns ``"<feature> <step>"`` for every input step and feature,
        followed by ``target`` (the label), ``target_index`` (the target
        row's index label in ``df_ds``) and ``record``.
    """
    # One column per (input step, feature), then the bookkeeping columns.
    column_names = []
    for step in range(input_days):
        for feat in feat_set:
            column_names.append(feat + " " + str(step))
    column_names.extend(["target", "target_index", "record"])

    # The first row that has a complete input window before it.
    earliest_target = input_days + prediction_day - 1

    samples = []
    for _, record_df in tqdm(df_ds.groupby("record")):
        feature_frame = record_df[feat_set]
        for target_pos in range(earliest_target, len(record_df)):
            target_row = record_df.iloc[target_pos]
            label = target_row[pred_col]
            # Skip rows that are neither positive nor alarm-free.
            if not (label or target_row["no_alarms"]):
                continue
            window_end = target_pos - prediction_day
            window_start = window_end - input_days + 1
            flat_window = feature_frame.iloc[window_start:window_end + 1].values.flatten()
            samples.append(
                list(flat_window) + [label, target_row.name, target_row["record"]]
            )
    return pd.DataFrame(samples, columns=column_names)

# Five input rows per sample, with the target five rows after the last input row.
pos_neg = generate_all_possible_pos_neg_data(df_ds, feat_set, input_days=5, prediction_day=5, pred_col="VPBs")      
pos_neg

100%|███████████████████████████████████████████| 57/57 [00:32<00:00,  1.77it/s]


Unnamed: 0,ABPdias (mmHg) 0,ABPmean (mmHg) 0,ABPsys (mmHg) 0,HR (bpm) 0,RESP (bpm) 0,SpO2 (%) 0,ABPdias (mmHg) 1,ABPmean (mmHg) 1,ABPsys (mmHg) 1,HR (bpm) 1,...,SpO2 (%) 3,ABPdias (mmHg) 4,ABPmean (mmHg) 4,ABPsys (mmHg) 4,HR (bpm) 4,RESP (bpm) 4,SpO2 (%) 4,target,target_index,record
0,67.416667,84.800000,120.483333,63.850000,0.0,96.300000,63.300000,86.033333,127.183333,83.416667,...,93.150000,86.066667,109.400000,156.550000,87.183333,0.0,82.400000,False,9,048n
1,63.300000,86.033333,127.183333,83.416667,0.0,93.600000,59.100000,83.350000,125.050000,82.466667,...,82.400000,85.716667,108.866667,158.133333,86.583333,0.0,96.750000,False,10,048n
2,59.100000,83.350000,125.050000,82.466667,0.0,93.366667,63.833333,88.850000,131.766667,83.950000,...,96.750000,88.466667,112.650000,160.550000,86.250000,0.0,97.283333,False,11,048n
3,63.833333,88.850000,131.766667,83.950000,0.0,93.150000,86.066667,109.400000,156.550000,87.183333,...,97.283333,103.750000,128.666667,181.050000,93.166667,0.0,95.200000,False,12,048n
4,86.066667,109.400000,156.550000,87.183333,0.0,82.400000,85.716667,108.866667,158.133333,86.583333,...,95.200000,101.116667,123.700000,174.750000,92.183333,0.0,90.616667,False,13,048n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112637,,,,89.050000,,95.850000,,,,89.466667,...,96.350000,,,,88.833333,,96.083333,False,120138,476n
112638,,,,91.116667,,95.766667,,,,108.066667,...,96.483333,,,,92.333333,,96.200000,False,120147,476n
112639,,,,108.066667,,95.733333,,,,108.950000,...,96.200000,,,,92.733333,,96.000000,False,120148,476n
112640,,,,108.950000,,95.800000,,,,87.533333,...,96.000000,,,,94.133333,,95.766667,False,120149,476n


In [19]:
# Look up the sample whose target row is index 139.
pos_neg[pos_neg["target_index"]==139]

Unnamed: 0,ABPdias (mmHg) 0,ABPmean (mmHg) 0,ABPsys (mmHg) 0,HR (bpm) 0,RESP (bpm) 0,SpO2 (%) 0,ABPdias (mmHg) 1,ABPmean (mmHg) 1,ABPsys (mmHg) 1,HR (bpm) 1,...,SpO2 (%) 3,ABPdias (mmHg) 4,ABPmean (mmHg) 4,ABPsys (mmHg) 4,HR (bpm) 4,RESP (bpm) 4,SpO2 (%) 4,target,target_index,record
130,,,,77.483333,22.366667,97.4,,,,81.866667,...,98.9,,,,79.716667,11.7,99.966667,True,139,048n


In [20]:
# Class balance of the generated samples.
pos_neg["target"].value_counts()

target
False    101523
True      11119
Name: count, dtype: int64

In [21]:
# Drop every sample that has a missing value in any column, then recheck the class balance.
pos_neg_dropna = pos_neg.dropna()
pos_neg_dropna["target"].value_counts()

target
False    79664
True      9034
Name: count, dtype: int64

In [22]:
# Show the samples that remain after dropping missing values.
pos_neg_dropna

Unnamed: 0,ABPdias (mmHg) 0,ABPmean (mmHg) 0,ABPsys (mmHg) 0,HR (bpm) 0,RESP (bpm) 0,SpO2 (%) 0,ABPdias (mmHg) 1,ABPmean (mmHg) 1,ABPsys (mmHg) 1,HR (bpm) 1,...,SpO2 (%) 3,ABPdias (mmHg) 4,ABPmean (mmHg) 4,ABPsys (mmHg) 4,HR (bpm) 4,RESP (bpm) 4,SpO2 (%) 4,target,target_index,record
0,67.416667,84.800000,120.483333,63.850000,0.000000,96.300000,63.300000,86.033333,127.183333,83.416667,...,93.150000,86.066667,109.400000,156.550000,87.183333,0.000000,82.400000,False,9,048n
1,63.300000,86.033333,127.183333,83.416667,0.000000,93.600000,59.100000,83.350000,125.050000,82.466667,...,82.400000,85.716667,108.866667,158.133333,86.583333,0.000000,96.750000,False,10,048n
2,59.100000,83.350000,125.050000,82.466667,0.000000,93.366667,63.833333,88.850000,131.766667,83.950000,...,96.750000,88.466667,112.650000,160.550000,86.250000,0.000000,97.283333,False,11,048n
3,63.833333,88.850000,131.766667,83.950000,0.000000,93.150000,86.066667,109.400000,156.550000,87.183333,...,97.283333,103.750000,128.666667,181.050000,93.166667,0.000000,95.200000,False,12,048n
4,86.066667,109.400000,156.550000,87.183333,0.000000,82.400000,85.716667,108.866667,158.133333,86.583333,...,95.200000,101.116667,123.700000,174.750000,92.183333,0.000000,90.616667,False,13,048n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112560,51.000000,76.000000,131.733333,91.683333,21.733333,96.766667,50.050000,74.100000,126.983333,90.133333,...,96.333333,49.583333,74.633333,130.533333,90.933333,19.600000,96.000000,False,120058,476n
112561,50.050000,74.100000,126.983333,90.133333,21.766667,96.166667,50.833333,75.383333,131.400000,91.500000,...,96.000000,47.583333,73.350000,132.166667,91.850000,20.416667,96.733333,False,120059,476n
112562,50.833333,75.383333,131.400000,91.500000,20.966667,96.350000,50.200000,73.883333,128.050000,90.466667,...,96.733333,47.750000,73.750000,130.850000,92.300000,23.033333,97.000000,False,120060,476n
112563,50.200000,73.883333,128.050000,90.466667,18.866667,96.333333,49.583333,74.633333,130.533333,90.933333,...,97.000000,49.066667,76.933333,139.483333,92.533333,23.616667,97.116667,False,120061,476n


In [23]:
# Save the complete-case samples for the training/validation step.
pos_neg_dropna.to_csv(fp_downsampled_dropna_file)