In [None]:
import pandas as pd
import numpy as np
import json
import simplejson

In [None]:
# Read the Excel workbook at data/dataset.xlsx into the working DataFrame `df`
df = pd.read_excel('data/dataset.xlsx')

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# Swap every '<1000' entry of 'Urine - Leukocytes' for '0' (note: a string, not a number).
# The stated intent is to keep this attribute usable as a continuous one.
df["Urine - Leukocytes"] = df["Urine - Leukocytes"].replace('<1000', '0')

In [None]:
# Show the 'Urine - pH' entry of row 177. Per the original note it holds
# "Nao Realizado", which is effectively a missing value and is to be converted to NaN.
df.loc[177, "Urine - pH"]

In [None]:
# Mark the 'Urine - pH' entry of row 177 as missing.
# A single .loc assignment is used on purpose: chained indexing
# (df[col][row] = value) can write to a temporary copy and leave df unchanged.
df.loc[177, "Urine - pH"] = float('NaN')

In [None]:
# Build df1 without the columns listed here (described as the ones that hold only NaNs)
df1 = df.drop(
    columns=[
        'Mycoplasma pneumoniae',
        'Urine - Nitrite',
        'Urine - Sugar',
        'Partial thromboplastin time\xa0(PTT)\xa0',
        'Prothrombin time (PT), Activity',
        'D-Dimer',
        'Fio2 (venous blood gas analysis)',
    ]
)

In [None]:
# New dataset with attribute metadata: number of unique values, datatype, number of nulls, unique values if datatype is categorical
def getAttrInfo(df):
    """Describe every column of ``df`` in a plain, JSON-friendly dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    dict
        Maps each column name to a dict with the keys

        * ``'numDistinctValues'`` - number of distinct entries (NaN counts as one),
        * ``'type'`` - ``'continuous'`` or ``'categorical'``,
        * ``'numNull'`` - number of missing entries,
        * ``'values'`` - distinct entries in order of appearance (categorical only),
        * ``'min'`` / ``'max'`` - smallest / largest entry (continuous only).

        NumPy scalars are converted to built-in Python numbers.
    """
    # Columns that must be categorical even though they have many distinct
    # values. 'Patient age quantile' is included because a later cell reads
    # its 'values' list. NOTE(review): confirm it is meant to be a discrete bucket.
    forcedCategorical = {'Patient ID', 'Urine - pH', 'Urine - Leukocytes', 'Patient age quantile'}

    def toNative(value):
        # np.int64 and friends are rejected by JSON encoders; .item() unwraps them.
        return value.item() if isinstance(value, np.generic) else value

    attrInfo = {}
    for name in df.columns:
        column = df[name]
        uniqueValues = column.unique()
        num = len(uniqueValues)

        # Use the number of unique values as a loose proxy for the datatype.
        # The membership test replaces a condition that was always true
        # (`... or "Urine - Leukocytes"`) and labelled every column categorical.
        # Only numeric columns can be continuous: min/max of text is meaningless
        # and may raise when strings are mixed with NaN.
        isContinuous = (
            num > 6
            and name not in forcedCategorical
            and pd.api.types.is_numeric_dtype(column)
        )
        varType = 'continuous' if isContinuous else 'categorical'

        attrInfo[name] = {
            'numDistinctValues': num,
            'type': varType,
            'numNull': int(column.isna().sum()),
        }

        if varType == 'categorical':
            attrInfo[name]['values'] = [toNative(value) for value in uniqueValues]
        else:
            # add min and max value for continuous variables
            attrInfo[name]['min'] = toNative(column.min())
            attrInfo[name]['max'] = toNative(column.max())

    return attrInfo

In [None]:
# Collect per-column metadata for df1; the recoding loop further down reads it
attrInfo = getAttrInfo(df1)

In [None]:
# Starts out empty; none of the cells visible here fills or reads it
hasToBeEncoded = {}
# Separate copy, so later changes to df1_encoded do not alter df1
df1_encoded = df1.copy()

In [None]:
# convert dummy binary variables with values 0 and 1 to yes and no for readability in tool
for item, info in attrInfo.items():
    # unique() lists values in order of appearance, so a column whose first row
    # is 1 yields [1, 0]. Comparing as a set recodes every 0/1 column whatever
    # its row order.
    if info['type'] == 'categorical' and set(info['values']) == {0, 1}:
        df1_encoded[item] = df1_encoded[item].replace([0, 1], ['no', 'yes'])

df1_encoded.head()

In [None]:
# Strip the " (1=yes, 0=no)" suffix from the three admission column names
df1_enc_renamed = df1_encoded.rename(
    columns={
        'Patient addmited to regular ward (1=yes, 0=no)': 'Patient addmited to regular ward',
        'Patient addmited to semi-intensive unit (1=yes, 0=no)': 'Patient addmited to semi-intensive unit',
        'Patient addmited to intensive care unit (1=yes, 0=no)': 'Patient addmited to intensive care unit',
    }
)

In [None]:
# Export the recoded, renamed frame to ./data/data.json as a JSON array with one object per row
df1_enc_renamed.to_json('./data/data.json', orient='records')

In [None]:
# Recompute the column metadata from the recoded, renamed frame (the one written to data.json)
attrInfo_json_serializable = getAttrInfo(df1_enc_renamed)

In [None]:
# Cast each entry of the age-quantile value list to a built-in int so the list is JSON serializable
attrInfo_json_serializable['Patient age quantile']['values'] = [
    int(value)
    for value in attrInfo_json_serializable['Patient age quantile']['values']
]

In [None]:
# Write attribute info dataset to file
# The with-block closes the file handle even if serialization fails.
with open('./data/attrInfo.json', 'w') as ai:
    # ignore_nan=True makes simplejson emit NaN values as JSON null
    simplejson.dump(attrInfo_json_serializable, ai, ignore_nan=True)