In [4]:
import pandas as pd 
import numpy as np

In [6]:
# Read the tab-separated source file into a DataFrame.
data = pd.read_csv('../../../synthetic-data-service/isaFull.tsv', sep='\t')

#### Prepare a clean version of the prism csv

In [7]:
# Raw column -> short name used in the rest of the notebook.
# The order of this mapping is the column order of the cleaned frame.
raw_to_clean = {
    'Participant_Id': 'id',
    'Visit date [EUPATH_0000091]': 'date',
    'Abdominal pain duration (days) [EUPATH_0000154]': 'ab_pain_dur',
    'Age at visit (years) [EUPATH_0000113]': 'age',
    'Anorexia duration (days) [EUPATH_0000155]': 'aneroxia_dur',
    'Asexual Plasmodium parasite density, by microscopy [EUPATH_0000092]': 'plasmodium_density',
    'Cough duration (days) [EUPATH_0000156]': 'cough_dur',
    'Diarrhea duration (days) [EUPATH_0000157]': 'diarrhea_dur',
    'Fatigue duration (days) [EUPATH_0000158]': 'fatigue_dur',
    'Fever, subjective duration (days) [EUPATH_0000164]': 'fever_dur',
    'Headache duration (days) [EUPATH_0000159]': 'headache_dur',
    'Height (cm) [EUPATH_0010075]': 'height',
    'Hemoglobin (g/dL) [EUPATH_0000047]': 'hemoglobin',
    'Joint pains duration (days) [EUPATH_0000161]': 'joint_pain_dur',
    'Muscle aches duration (days) [EUPATH_0000162]': 'muscle_ache_dur',
    'Temperature (C) [EUPATH_0000110]': 'temp',
    'Vomiting duration (days) [EUPATH_0000165]': 'vomit_dur',
    'Weight (kg) [EUPATH_0000732]': 'weight',
    'Complicated malaria [EUPATH_0000040]': 'complicated_malaria',
    'Febrile [EUPATH_0000097]': 'febrile',
    'ITN last night [EUPATH_0000216]': 'ITN',
    'Malaria diagnosis [EUPATH_0000090]': 'malaria',
    'Malaria diagnosis and parasite status [EUPATH_0000338]': 'malaria_parasite',
    'Malaria treatment [EUPATH_0000740]': 'malaria_treatment',
    'Plasmodium gametocytes present, by microscopy [EUPATH_0000207]': 'plasmodium_gametocytes',
    'Submicroscopic Plasmodium present, by LAMP [EUPATH_0000487]': 'plasmodium_lamp',
    'Visit type [EUPATH_0000311]': 'visit_type',
}

# Keep only the wanted columns, ordered by participant and then by visit date,
# with a fresh 0..n-1 index.
data2 = (
    data[list(raw_to_clean)]
    .copy()
    .sort_values(by=['Participant_Id', 'Visit date [EUPATH_0000091]'])
    .reset_index(drop=True)
)

# Switch to the short column names.
data2.columns = list(raw_to_clean.values())

# Drop the row labelled 44432 in the sorted, re-indexed frame; the original
# author's note says this row is mostly NAs.
data2 = data2.drop(44432)

# Missing durations / parasite density become 0; missing categorical values get an
# explicit placeholder category ("no result", "not applicable", ...).
dur_cols = ['ab_pain_dur', 'aneroxia_dur', 'plasmodium_density', 'cough_dur', 'diarrhea_dur', 'fatigue_dur', 'fever_dur', 'headache_dur', 'joint_pain_dur', 'muscle_ache_dur', 'vomit_dur']
na_replacements = dict.fromkeys(dur_cols, 0)
na_replacements.update({
    'plasmodium_lamp': 'no_result',
    'ITN': 'not applicable',
    'complicated_malaria': 'not_assessed',
    'plasmodium_gametocytes': 'No',
})
data2 = data2.fillna(value=na_replacements)

# Gaps in the numerical measurements are filled by linear interpolation down each column.
# NOTE(review): rows are ordered by participant, so at a participant boundary the
# interpolation can draw on the neighbouring participant's values - confirm this is intended.
num_cols = ['height', 'hemoglobin', 'temp', 'weight']
data2[num_cols] = data2[num_cols].interpolate(method='linear')

# Replace spaces with underscores in every string value (e.g. 'not applicable' ->
# 'not_applicable') so that they do not cause trouble later.
data2 = data2.replace(to_replace=' ', value='_', regex=True)

# Lower-case the values of every categorical column, then one-hot encode it.
# The original categorical columns are kept here; the indicator columns are appended
# after them, grouped per variable in the order of cat_cols.
cat_cols = ['complicated_malaria', 'febrile', 'ITN', 'malaria', 'malaria_parasite', 'malaria_treatment', 'plasmodium_gametocytes', 'plasmodium_lamp', 'visit_type']
one_hot_frames = []
for col in cat_cols:
    # non-string entries (e.g. NaN) are passed through untouched
    data2[col] = data2[col].map(lambda x: x.lower() if isinstance(x, str) else x)
    # dtype is pinned: pandas >= 2.0 returns bool indicators by default, which would be
    # written as True/False and break the numeric min-max normalisation done on the
    # cleaned CSV. uint8 is what older pandas versions produced.
    one_hot_cols = pd.get_dummies(data2[col], prefix=col, dtype='uint8')
    one_hot_frames.append(one_hot_cols)

# a single concat instead of re-building the frame once per categorical column
data2 = pd.concat([data2] + one_hot_frames, axis=1)

# Time between consecutive visits.
# - id_diff: difference between consecutive participant ids; a non-zero value (or NaN on
#   the very first row) marks the first visit of a participant.
# - date:    parsed to datetime.
# - dday:    difference between consecutive visit dates; the value on a participant's
#            first visit is replaced by 0 in a later step.
data2 = data2.assign(
    id_diff=lambda frame: frame['id'].diff(),
    date=lambda frame: pd.to_datetime(frame['date']),
    dday=lambda frame: frame['date'].diff(),
)
def fill_first_dday(row):
    """
    Return the delta-day value for one visit row.

    Rows whose 'id_diff' is 0 keep their 'dday' value. Any other row (id changed,
    or NaN on the very first row) gets a zero delta, built as date - date so that
    it has the same type as the other deltas.
    """
    if row['id_diff'] == 0:
        return row['dday']
    visit_date = row['date']
    return visit_date - visit_date
# Apply the first-visit rule row by row, then keep only the whole-day count as int16.
data2['dday'] = data2.apply(fill_first_dday, axis=1).dt.days.astype('int16')

# Reference date for the first-visit offset: the earliest visit date in the frame.
# The offset is kept in its own column because it is treated as an attribute
# rather than a feature.
earliest_date = min(data2['date'])
def get_first_dday(row, earliest_date):
    """
    Offset of a participant's first visit from earliest_date.

    Returns row['date'] - earliest_date when 'id_diff' is anything other than 0
    (id changed, or NaN on the very first row); returns None for every other row.
    """
    if row['id_diff'] == 0:
        return None
    return row['date'] - earliest_date
# get_first_dday only yields a value on each participant's first visit ...
data2['first_dday'] = data2.apply(get_first_dday, args=(earliest_date,), axis=1)
# ... so carry that value forward over the participant's remaining visits.
# (.ffill() replaces the deprecated fillna(method='ffill') spelling.)
data2['first_dday'] = data2['first_dday'].ffill()
# keep only the whole-day count
data2['first_dday'] = data2['first_dday'].dt.days.astype('int16')

# only keep participants with at least 5 visits
data_5above = data2[(data2.groupby('id')['id'].transform('size') >= 5)].reset_index(drop=True)
data_5above = data_5above.sort_values(by=['id', 'date'])

# remove unwanted columns: the raw date, the categorical columns (their one-hot
# versions stay) and the id_diff helper
data_5above = data_5above.drop(columns=['date','complicated_malaria', 'febrile', 'ITN', 'malaria', 'malaria_parasite', 'malaria_treatment', 'plasmodium_gametocytes', 'plasmodium_lamp', 'visit_type', 'id_diff'])

# write data into a csv
# data_5above.to_csv('gan/data/ori_prism_cleaned.csv', index=False)



In [84]:
import pandas as pd
import numpy as np

def real_data_loading(path='data/ori_prism_cleaned.csv', seq_len=130, min_visits=5, feature_cols=None):
    """
    Load the cleaned dataset and turn it into padded per-participant arrays
    (to be saved in data_train.npz), plus the bounds needed for renormalization.

    Args:
    path: CSV file holding the cleaned dataset
    seq_len: length every participant's time series is zero-padded to
    min_visits: participants with fewer rows than this are skipped
    feature_cols: feature column names; defaults to the built-in list below

    Returns (in this order):
    features: array of shape (n_participants, seq_len, len(feature_cols))
    attributes: array of shape (n_participants,), the normalized first_dday of
        each participant's first row
    gen_flag: array of shape (n_participants, seq_len); 1 for real time steps,
        0 for padding
    min_val, max_val: per-column minimum / maximum before normalization
        (pandas Series, 'id' removed)
    feature_cols: the list of feature column names that was used

    Raises:
    ValueError: if a participant has more than seq_len rows
    """
    if feature_cols is None:
        feature_cols = ['ab_pain_dur', 'age', 'aneroxia_dur', 'plasmodium_density', 'cough_dur', 'diarrhea_dur',
            'fatigue_dur', 'fever_dur', 'headache_dur', 'height', 'hemoglobin', 'joint_pain_dur', 'muscle_ache_dur',
            'temp', 'vomit_dur', 'weight', 'complicated_malaria_no', 'complicated_malaria_not_assessed',
            'complicated_malaria_yes', 'febrile_no', 'febrile_yes', 'ITN_no', 'ITN_not_applicable', 'ITN_yes',
            'malaria_no', 'malaria_yes',
            'malaria_parasite_blood_smear_indicated_but_not_done',
            'malaria_parasite_blood_smear_negative_/_lamp_negative',
            'malaria_parasite_blood_smear_negative_/_lamp_not_done',
            'malaria_parasite_blood_smear_negative_/_lamp_positive',
            'malaria_parasite_blood_smear_not_indicated',
            'malaria_parasite_blood_smear_positive_/_no_malaria',
            'malaria_parasite_symptomatic_malaria',
            'malaria_treatment_artmether-lumefantrine_for_uncomplicated_malaria',
            'malaria_treatment_no_malaria_medications_given',
            'malaria_treatment_quinine_for_uncomplicated_malaria_in_the_1st_trimester_of_pregnancy',
            'malaria_treatment_quinine_for_uncomplicated_malaria_within_14_days_of_a_previous_treatment_for_malaria',
            'malaria_treatment_quinine_or_artesunate_for_complicated_malaria',
            'plasmodium_gametocytes_no', 'plasmodium_gametocytes_yes',
            'plasmodium_lamp_negative', 'plasmodium_lamp_no_result',
            'plasmodium_lamp_positive', 'visit_type_enrollment',
            'visit_type_scheduled_visit', 'visit_type_unscheduled_visit', 'dday']
    else:
        feature_cols = list(feature_cols)

    # fill in any NAs that have been missed out
    data = pd.read_csv(path).interpolate(method='linear')

    # min-max normalization of every column; the small constant avoids a division
    # by zero for constant columns. Note that 'id' is scaled as well, which is fine
    # because it is only used as a grouping key below.
    min_val = data.min()
    max_val = data.max()
    data = (data - min_val) / (max_val - min_val + 1e-7)

    # The attribute is first_dday. Look it up by name; fall back to the last column
    # (the position the attribute was read from before) if the name is absent.
    attribute_col = 'first_dday' if 'first_dday' in data.columns else data.columns[-1]

    features = []
    attributes = []
    gen_flag = []

    # one group per participant, visited in order of first appearance
    for _, child in data.groupby('id', sort=False):
        n_visits = len(child)
        if n_visits < min_visits:
            continue
        if n_visits > seq_len:
            raise ValueError(
                'participant has %d visits, which exceeds seq_len=%d' % (n_visits, seq_len)
            )
        n_pad = seq_len - n_visits

        # gen_flag marks real time steps (1) versus padding (0)
        gen_flag.append(np.concatenate([np.ones(n_visits), np.zeros(n_pad)]))
        # zero-pad the feature rows at the end of the series up to seq_len
        features.append(np.pad(child[feature_cols].to_numpy(), ((0, n_pad), (0, 0))))
        # attribute value taken from the participant's first row
        attributes.append(child[attribute_col].iloc[0])

    min_val = min_val.drop('id')
    max_val = max_val.drop('id')

    return np.array(features), np.array(attributes), np.array(gen_flag), min_val, max_val, feature_cols

In [None]:
# WIP
def gen_feature_loading(path, feature_cols, seq_len=130, min_val=None, max_val=None):
    """
    loads in generated npz file and returns a DataFrame for generated data
    Args:
    path: path to npz file from doppelganger (must contain a 'data_feature' array)
    feature_cols: column names for features in dataset; must include 'weight'
        and 'height', which are used to detect padded rows
    seq_len: max sequence length for each patient
    min_val: per-feature minimum used when the training data was normalized,
        e.g. the Series returned by real_data_loading (extra entries such as
        'first_dday' are ignored); an array-like in feature_cols order also works
    max_val: per-feature maximum, same conventions as min_val

    Returns:
    DataFrame with one row per generated (non-padded) time step, the feature
    columns plus an 'id' column numbering the generated series from 1. Values are
    renormalized when min_val and max_val are given, otherwise left in [0, 1].

    Raises:
    ValueError: if only one of min_val / max_val is given
    """
    if (min_val is None) != (max_val is None):
        raise ValueError('min_val and max_val must be given together')

    feature_cols = list(feature_cols)

    def _bounds_for_features(bounds):
        # A Series is aligned by label so that column order cannot silently
        # mismatch; anything else is assumed to already be in feature_cols order.
        if isinstance(bounds, pd.Series):
            return bounds[feature_cols].to_numpy(dtype=float)
        return np.asarray(bounds, dtype=float)

    # np.load returns an open archive for .npz files; close it once the array is read
    with np.load(path) as archive:
        data_feature = archive['data_feature']

    # generated values are expected in [0, 1]; clamp anything outside that range
    data_out = np.clip(data_feature, 0, 1)

    # stack all series on top of each other: one row per time step
    dim = data_feature.shape[2]
    data_stack = data_out.reshape((-1, dim))

    # create a dataframe from the array
    data_df = pd.DataFrame(data_stack, columns=feature_cols)
    data_df['id'] = data_df.index // seq_len + 1

    # remove padded rows (rows where both weight and height are exactly 0)
    is_padding = (data_df['weight'] == 0) & (data_df['height'] == 0)
    data_df = data_df.loc[~is_padding].reset_index(drop=True)

    # renormalization: inverse of the min-max scaling (the small stabilising
    # constant added to the denominator when normalizing is ignored here)
    if min_val is not None:
        lower = _bounds_for_features(min_val)
        upper = _bounds_for_features(max_val)
        data_df[feature_cols] = data_df[feature_cols] * (upper - lower) + lower

    #data_df.to_csv('gen_prism.csv', index=False)
    return data_df

#### Example: saving the data in the format expected by DoppelGANger

In [85]:
# Build the training arrays; the normalization bounds and the feature names come back too.
(data_feature,
 data_attribute,
 data_gen_flag,
 min_,
 max_,
 feature_cols) = real_data_loading()

# turn the 1-D attribute vector into a single-column 2-D array
data_attribute = data_attribute.reshape(-1, 1)

# one shape per line: features, attributes, gen_flag
print(data_feature.shape, data_attribute.shape, data_gen_flag.shape, sep='\n')
#np.savez('data/data_train.npz', data_feature=data_feature, data_attribute=data_attribute, data_gen_flag=data_gen_flag)

(1347, 130, 47)
(1347, 1)
(1347, 130)


In [86]:
import pickle
from output import *
# Output description for every feature, in the order of feature_cols in real_data_loading:
# 16 continuous measurements, 9 one-hot encoded categorical variables, then dday.
# The dims add up to 16 + 30 + 1 = 47 feature columns.
data_feature_output = (
    # ab_pain_dur ... weight: continuous, scaled to [0, 1]
    [
        Output(type_=OutputType.CONTINUOUS, dim=1, normalization=Normalization.ZERO_ONE, is_gen_flag=False)
        for _ in range(16)
    ]
    # categorical variables: dim is the number of one-hot columns of each variable
    + [
        Output(type_=OutputType.DISCRETE, dim=n_categories, normalization=None, is_gen_flag=False)
        for n_categories in (
            3,  # complicated_malaria
            2,  # febrile
            3,  # ITN
            2,  # malaria
            7,  # malaria_parasite
            5,  # malaria_treatment
            2,  # plasmodium_gametocytes
            3,  # plasmodium_lamp
            3,  # visit_type
        )
    ]
    # dday: continuous, scaled to [0, 1]
    + [
        Output(type_=OutputType.CONTINUOUS, dim=1, normalization=Normalization.ZERO_ONE, is_gen_flag=False)
    ]
)

# with open('data/data_feature_output.pkl', 'wb') as f:
#     pickle.dump(data_feature_output, f)

In [79]:
# Output description for the attributes: a single continuous value scaled to [0, 1].
data_attribute_output = [
    Output(
        type_=OutputType.CONTINUOUS,
        dim=1,
        normalization=Normalization.ZERO_ONE,
        is_gen_flag=False,
    ),
]

# with open('data/data_attribute_output.pkl', 'wb') as f:
#     pickle.dump(data_attribute_output, f)