# Converting Data to CSV

In [None]:
# Important libraries
import os
import wfdb
import numpy as np
import pandas as pd
from tqdm import tqdm

# File paths (all expressed relative to the notebooks folder)
fp_notebooks_folder = "./"
fp_code_folder = "../"
# Raw database and processed output both sit one level above the code folder
fp_raw_data_folder = os.path.join(fp_code_folder, "../mimic-database-1.0.0")
fp_processed_folder = os.path.join(fp_code_folder, "../processed_data")
# Converted CSV files go in a subfolder of the processed-data folder
fp_converted_folder = os.path.join(fp_processed_folder, "converted_to_csv")

## Signal Data

In [None]:
# Get Record Names
def get_record_names(fp_raw_data_folder, remove_trailing=2):
    """Read the record names listed in the ``RECORDS`` file of a folder.

    Each non-blank line of the file yields one name. The line terminator
    counts as the first of the ``remove_trailing`` characters, so the default
    of 2 drops the newline plus one more character (the trailing slash),
    while 1 drops only the newline.

    Args:
        fp_raw_data_folder: Folder that contains the ``RECORDS`` file.
        remove_trailing: Number of trailing characters to drop from every
            line, line terminator included.

    Returns:
        List of record names, in file order.

    Raises:
        FileNotFoundError: If the ``RECORDS`` file does not exist.
    """
    fp_records_file = os.path.join(fp_raw_data_folder, "RECORDS")
    if not os.path.exists(fp_records_file):
        raise FileNotFoundError("Records file does not exist!")

    # Characters to drop in addition to the line terminator itself
    extra_chars = max(remove_trailing - 1, 0)
    records = []
    with open(fp_records_file, "r") as f:
        for line in f:
            # Strip the terminator explicitly: the last line of the file may
            # not have one, and blind slicing would then eat a real character
            name = line.rstrip("\r\n")
            if not name:
                continue
            if extra_chars:
                name = name[:-extra_chars]
            records.append(name)
    return records
    
# List the records named in the raw-data folder's RECORDS file, using the
# default remove_trailing=2, and show how many were found.
records = get_record_names(fp_raw_data_folder)
print(len(records), "Records:", records)

In [None]:
def filter_record_names(records, remove_trailing=None, filter_limit=55):
    """Keep the records whose numeric identifier is at least ``filter_limit``.

    Only records after 055 contain alarms:
    "al (annotations for alarms related to changes in the patient's status) and
    in (annotations related to changes in the functioning of the monitor) for
    all records after 055"

    Args:
        records: Iterable of record names.
        remove_trailing: When truthy, number of trailing characters to drop
            from a name before converting it to an integer.
        filter_limit: Smallest record number that is kept.

    Returns:
        List of the original (untrimmed) names that pass the filter.
    """
    kept = []
    for name in records:
        # Trim the non-numeric suffix only for the comparison
        numeric_part = name[:-remove_trailing] if remove_trailing else name
        if int(numeric_part) >= filter_limit:
            kept.append(name)
    return kept

# Keep only the records whose number is at least the default filter_limit (55)
filtered_records = filter_record_names(records)
print(len(filtered_records), "Filtered Records:", filtered_records)

In [None]:
# Get data in dataframe format for a single record's files
def get_record_data(fp_raw_data_folder, record_name):
    """Load one record's signals and annotations into a single DataFrame.

    The record is read from ``<fp_raw_data_folder>/<record_name>/<record_name>``.
    Signal columns are named ``"<signal name> (<units>)"``. The "al" and "abp"
    annotations are added as the columns ``alarms`` and ``abp``, aligned on
    the sample index; rows without an annotation hold NA in those columns.

    Args:
        fp_raw_data_folder: Folder holding one subfolder per record.
        record_name: Name of the record to read.

    Returns:
        Tuple ``(data_df, fields)``: the combined DataFrame and the fields
        description returned by ``wfdb.rdsamp``, extended with a
        ``unique_<name>`` entry (stringified value counts) per annotation.

    Raises:
        AssertionError: If the signal length differs from ``fields["sig_len"]``
            or the number of signal names differs from the number of units.
    """
    fp_record_folder = os.path.join(fp_raw_data_folder, record_name, record_name)

    # 1. Get signal data
    signal, fields = wfdb.rdsamp(fp_record_folder)
    # Check that signal length is the same as signal length in fields description
    assert signal.shape[0] == fields["sig_len"]
    # Check that we have units for all features
    colnames, colunits = fields['sig_name'], fields['units']
    assert len(colnames) == len(colunits)
    # Column names = feature (units)
    new_colnames = [sig + " (" + unit + ")" for sig, unit in zip(colnames, colunits)]
    data_df = pd.DataFrame(signal, columns=new_colnames)

    # 2. Get annotations
    def join_notes(notes):
        # Combine every note recorded at one sample into a single string
        return ", ".join(notes.dropna().astype(str))

    def add_annotation(ann_type, data_df, name):
        ann = wfdb.rdann(fp_record_folder, ann_type)
        ann_df = pd.DataFrame({"index": ann.sample, name: ann.aux_note})
        # Remove duplicate alarms
        ann_df = ann_df.drop_duplicates()
        fields[f"unique_{name}"] = str(ann_df[name].value_counts().to_dict())
        # If multiple alarms occur at the same time, keep all of them.
        # (Previously the notes were joined with the literal word "first"
        # as separator, which corrupted the combined value.)
        ann_df = ann_df.groupby('index').agg({name: join_notes})
        # Add annotation to existing data_df (joined on the sample index)
        return data_df.join(ann_df, how='left')

    data_df = add_annotation("al", data_df, "alarms")
    data_df = add_annotation("abp", data_df, "abp")

    return data_df, fields

# Example of how a record data would appear: testing[0] is the DataFrame,
# testing[1] is the fields description returned alongside it
testing = get_record_data(fp_raw_data_folder, '409')
display(testing[0])
print(testing[1])

In [None]:
# NA values exist in columns: count them per column
testing[0].isna().sum()

In [None]:
# Show the last 50 rows that have a value in the "alarms" column
testing[0][testing[0]["alarms"].notna()].tail(50)

In [None]:
# Inspect the rows around one position: fgap rows before `index`,
# then bgap rows starting at `index` itself
fgap, bgap = 5, 5
index =13905952
testing[0].iloc[index-fgap:index+bgap]

## Periodic Measurements "Numerics"


In [None]:
# Record names from the RECORDS file of the "numerics" subfolder, read with
# remove_trailing=1
records_numerics = get_record_names(os.path.join(fp_raw_data_folder, "numerics"), 1)
print(len(records_numerics), "Records:", records_numerics)

In [None]:
# Keep numerics records numbered 48 or higher; the final character of each
# name is dropped before the integer comparison
filtered_records_numerics = filter_record_names(records_numerics, 1, filter_limit=48)
print(len(filtered_records_numerics), "Filtered Records:", filtered_records_numerics)

In [None]:
# Get data in dataframe format for a single record's files
def get_record_data_numeric(fp_raw_data_folder, record_name):
    """Load one numerics record and its alarm annotations into a DataFrame.

    The record is read from ``<fp_raw_data_folder>/numerics/<record_name>``.
    Signal columns are named ``"<signal name> (<units>)"``. The "al"
    annotations are attached as an ``alarms`` column holding, per sample
    index, the set of distinct (whitespace-stripped) notes; the column is all
    NaN when the record has no annotations.

    Args:
        fp_raw_data_folder: Raw-data folder containing the "numerics" subfolder.
        record_name: Name of the numerics record to read.

    Returns:
        Tuple ``(data_df, fields)``: the combined DataFrame and the fields
        description returned by ``wfdb.rdsamp``, extended with
        ``unique_alarms`` (stringified value counts) when annotations exist.
    """
    fp_record_folder = os.path.join(fp_raw_data_folder, "numerics", record_name)

    # 1. Signal data, with sanity checks against the fields description
    signal, fields = wfdb.rdsamp(fp_record_folder)
    assert signal.shape[0] == fields["sig_len"]
    colnames, colunits = fields['sig_name'], fields['units']
    assert len(colnames) == len(colunits)
    # Column names = feature (units)
    labelled = [sig + " (" + unit + ")" for sig, unit in zip(colnames, colunits)]
    data_df = pd.DataFrame(signal, columns=labelled)

    # 2. Annotations
    def add_annotation(ann_type, data_df, name):
        ann = wfdb.rdann(fp_record_folder, ann_type)
        ann_df = pd.DataFrame({"index": ann.sample, name: ann.aux_note})
        if len(ann_df) == 0:
            # Nothing annotated: still provide the column, filled with NaN
            data_df[name] = np.nan
            return data_df
        ann_df[name] = ann_df[name].str.strip()
        # Remove duplicate alarms
        ann_df = ann_df.drop_duplicates()
        fields[f"unique_{name}"] = str(ann_df[name].value_counts().to_dict())
        # Several alarms may share a sample index: gather them into one set
        notes_by_sample = {
            sample_idx: set(notes[notes.notna()])
            for sample_idx, notes in ann_df.groupby('index')[name]
        }
        ann_df = pd.DataFrame(
            {"index": list(notes_by_sample.keys()), name: list(notes_by_sample.values())}
        ).set_index("index")

        # Attach the annotation column to the signal data by sample index
        return data_df.join(ann_df, how='left')

    data_df = add_annotation("al", data_df, "alarms")

    return data_df, fields

# Example numerics record: testing2[0] is the DataFrame, testing2[1] the
# fields description returned alongside it
testing2 = get_record_data_numeric(fp_raw_data_folder, "055n")
display(testing2[0])
print(testing2[1])

In [None]:
# Show only the rows that have a value in the "alarms" column
testing2[0][testing2[0]["alarms"].notna()]

In [None]:
# NA values exist in columns: count them per column
testing2[0].isna().sum()

## Output Converted Record (Numerics)

In [None]:
def get_all_records(fp_raw_data_folder, fp_output, records):
    """Convert numerics records to CSV files and write their fields table.

    For each record a ``<record>.csv`` file is written to ``fp_output``;
    records whose CSV file already exists are skipped. Afterwards
    ``fields.csv`` is (re)written in the same folder.

    NOTE: skipped records contribute nothing to ``fields.csv``, so that file
    only describes the records converted during this call.

    Args:
        fp_raw_data_folder: Raw-data folder passed to
            ``get_record_data_numeric``.
        fp_output: Output folder; created when it does not exist.
        records: Iterable of numerics record names.
    """
    import csv
    if not os.path.exists(fp_output):
        os.makedirs(fp_output)

    collected_fields = []
    progress = tqdm(records)
    for record in progress:
        progress.set_description(f"Getting record data [{record}]...")
        record_csv = os.path.join(fp_output, record+".csv")
        if not os.path.exists(record_csv):
            data_df, fields = get_record_data_numeric(fp_raw_data_folder, record)
            progress.set_description(f"Outputing record data [{record}]...")
            data_df.to_csv(record_csv)
            # Remember which record these fields describe
            fields["record"] = record
            collected_fields.append(fields)

    fields_csv = os.path.join(fp_output, "fields.csv")
    # Written without quoting; "/" serves as the escape character
    pd.DataFrame(collected_fields).to_csv(fields_csv, quoting=csv.QUOTE_NONE, escapechar='/')
    print("All records converted to CSV!")
    
# Convert the filtered numerics records to CSV files in fp_converted_folder
get_all_records(fp_raw_data_folder=fp_raw_data_folder, fp_output=fp_converted_folder, records=filtered_records_numerics)

In [None]:
def get_all_fields(fp_raw_data_folder, fp_output, records):
    """Write ``fields.csv`` describing every given numerics record.

    Each record is loaded with ``get_record_data_numeric``; only its fields
    description is kept, tagged with the record name. The per-record CSV
    files are neither read nor written here.

    Args:
        fp_raw_data_folder: Raw-data folder passed to
            ``get_record_data_numeric``.
        fp_output: Output folder for ``fields.csv``; created when it does not
            exist.
        records: Iterable of numerics record names.
    """
    import csv
    if not os.path.exists(fp_output):
        os.makedirs(fp_output)
    fields_list = []
    pbar = tqdm(records)
    for record in pbar:
        pbar.set_description(f"Getting field data [{record}]...")
        # The signal DataFrame is not needed here, only the fields
        _, fields = get_record_data_numeric(fp_raw_data_folder, record)
        pbar.set_description(f"Outputing record data [{record}]...")
        fields["record"] = record
        fields_list.append(fields)
    fp_output_file = os.path.join(fp_output, "fields.csv")
    fields_df = pd.DataFrame(fields_list)
    # Written without quoting; "/" serves as the escape character
    fields_df.to_csv(fp_output_file, quoting=csv.QUOTE_NONE, escapechar='/')
    print("All records converted to CSV!")
# Write fields.csv for all filtered numerics records into fp_converted_folder
get_all_fields(fp_raw_data_folder=fp_raw_data_folder, fp_output=fp_converted_folder, records=filtered_records_numerics)