In [1]:
import sys
sys.path.append("..")

from gan import output

import pandas as pd
import numpy as np

import pickle

from copy import deepcopy

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Original data

### Load

In [2]:
# Load the preprocessed 2017 training pickles; the static frame is re-indexed on `eid`.
static_pickle_path = '../../../../../../Master Thesis/data/preprocessed/tumours_patients_2017_train.pickle'
treatments_pickle_path = '../../../../../../Master Thesis/data/preprocessed/treatments_2017_train.pickle'

df_static = pd.read_pickle(static_pickle_path).set_index('eid')
df_treatments = pd.read_pickle(treatments_pickle_path)

### Transformer

In [3]:
class DoppelGANgerTransformer:
    """
    Convert patient covariates and treatment sequences to and from the array format used by DoppelGANger.

    Parameters
    ----------
    primary_key : str
        Name of the key that links the static frame (as its index) to the treatments frame (as a column).
    treatment_col : str
        Column in the treatments frame holding the treatment code.
    order_col : str
        Column in the treatments frame holding the order of a treatment within one primary key.
    max_nr_treatments : int
        Fixed sequence length; shorter sequences are zero padded up to this length.
    transformer_static, transformer_treatments :
        ColumnTransformer-like objects whose first (and only used) transformer is the categorical encoder.
        After fitting they must expose ``transformers_[0][1]`` (encoder with ``inverse_transform``) and
        ``transformers_[0][2]`` (the encoded column names).

    Notes
    -----
    ``transform`` re-fits both transformers on every call. ``inverse_transform`` reads the fitted state, so it
    only works after ``transform`` has been called on the same instance.
    """
    def __init__(self, primary_key, treatment_col, order_col, max_nr_treatments, transformer_static, transformer_treatments):
        self.primary_key = primary_key
        self.treatment_col = treatment_col
        self.order_col = order_col
        self.max_nr_treatments = max_nr_treatments
        self.transformer_static = transformer_static
        self.transformer_treatments = transformer_treatments

    @staticmethod
    def _to_dense(encoded):
        """Return ``encoded`` as a dense ndarray, whether the transformer produced a sparse matrix or an array."""
        if hasattr(encoded, 'toarray'):
            return encoded.toarray()
        return np.asarray(encoded)

    def transform(self, df_static, df_treatments):
        """
        Fit both transformers and encode the data as DoppelGANger arrays.

        Parameters
        ----------
        df_static : pd.DataFrame
            Static covariates, indexed by the primary key.
        df_treatments : pd.DataFrame
            One row per treatment, containing the primary key, order and treatment columns.

        Returns
        -------
        data_feature : np.ndarray, shape (n_keys, max_nr_treatments, n_treatment_features)
            Encoded treatments per primary key, zero padded after the last real treatment.
        data_gen_flag : np.ndarray, shape (n_keys, max_nr_treatments)
            1.0 for real time steps, 0.0 for padded ones.
        data_attribute : np.ndarray, shape (n_keys, n_static_features)
            Encoded static covariates, in the same (sorted) primary-key order as the other two arrays.

        Raises
        ------
        ValueError
            If a primary key has more than ``max_nr_treatments`` treatments, or if the static frame does not
            yield exactly one row per primary key found in the treatments frame.
        KeyError
            If a primary key of the treatments frame is missing from the static frame (raised by ``.loc``).
        """
        ##### TREATMENTS #####

        # Sort on primary key and order number; sort_values returns a new frame, so the input is not mutated
        treatments = (df_treatments
                      .sort_values([self.primary_key, self.order_col])
                      .set_index(self.primary_key))

        ## Encoding
        encoded = self._to_dense(self.transformer_treatments.fit_transform(treatments))

        ## Sequence length per primary key (groupby sorts the keys and drops missing keys)
        lengths = treatments.groupby(self.primary_key).size()
        counts = lengths.to_numpy()

        too_long = lengths[lengths > self.max_nr_treatments]
        if len(too_long) > 0:
            raise ValueError(
                '{} primary key(s) have more than max_nr_treatments={} treatments (longest sequence: {})'.format(
                    len(too_long), self.max_nr_treatments, int(too_long.max())))

        n_samples = len(counts)
        n_rows = int(counts.sum())

        # For every encoded row: which sample it belongs to and its position within that sample
        sample_idx = np.repeat(np.arange(n_samples), counts)
        step_idx = np.arange(n_rows) - np.repeat(np.cumsum(counts) - counts, counts)

        ## Zero padded features and gen flag
        # Rows with a missing primary key are sorted last and are not part of any group, hence the slice
        data_feature = np.zeros((n_samples, self.max_nr_treatments, encoded.shape[1]), dtype=encoded.dtype)
        data_feature[sample_idx, step_idx] = encoded[:n_rows]

        data_gen_flag = np.zeros((n_samples, self.max_nr_treatments))
        data_gen_flag[sample_idx, step_idx] = 1.0

        ##### STATIC #####

        # Only tumours in treatments df, in the same order as the treatment arrays
        static = df_static.loc[lengths.index]
        if len(static) != n_samples:
            raise ValueError('Primary keys of static and treatments dataframe do not match')

        ## One-hot encoding
        data_attribute = self._to_dense(self.transformer_static.fit_transform(static))

        return data_feature, data_gen_flag, data_attribute

    def inverse_transform(self, data_feature, data_gen_flag, data_attribute):
        """
        Decode DoppelGANger arrays back into a static frame and a treatments frame.

        Requires that ``transform`` was called before, because the fitted encoders are used for decoding.

        Parameters
        ----------
        data_feature : array-like, shape (n_samples, n_steps, n_treatment_features)
        data_gen_flag : array-like, one row of flags per sample
            The number of flags equal to 1 is taken as the sequence length; the first that many
            time steps of the sample are decoded.
        data_attribute : array-like, shape (n_samples, n_static_features)

        Returns
        -------
        df_static_inverse : pd.DataFrame
            Decoded static covariates, one row per sample, with a default integer index.
        df_treatments_inverse : pd.DataFrame
            Columns ``[primary_key, treatment_col, order_col]``; the primary key is the sample position
            (row number in ``df_static_inverse``) and the order starts at 1 within each primary key.
            Samples without any active gen flag contribute no rows.
        """
        ##### STATIC #####

        ## Inverse transform column transformers (One-hot encoding)
        static_encoder = self.transformer_static.transformers_[0][1]
        df_static_inverse = pd.DataFrame(static_encoder.inverse_transform(data_attribute),
                                         columns=self.transformer_static.transformers_[0][2])

        ##### TREATMENTS #####

        ## Sequence length per sample = number of active gen flags
        lengths = np.array([np.count_nonzero(np.asarray(sample) == 1) for sample in data_gen_flag],
                           dtype=np.int64)

        ## One row per treatment, multiple rows per primary key
        sample_idx = np.repeat(np.arange(len(lengths)), lengths)
        step_idx = np.arange(len(sample_idx)) - np.repeat(np.cumsum(lengths) - lengths, lengths)

        output_columns = [self.primary_key, self.treatment_col, self.order_col]
        if len(sample_idx) == 0:
            # Nothing to decode; the encoder cannot be called on zero rows
            return df_static_inverse, pd.DataFrame(columns=output_columns)

        encoded_rows = np.asarray(data_feature)[sample_idx, step_idx]

        ## Inverse transform column transformers (One-hot encoding)
        treatment_encoder = self.transformer_treatments.transformers_[0][1]
        df_treatments_inverse = pd.DataFrame(treatment_encoder.inverse_transform(encoded_rows),
                                             columns=self.transformer_treatments.transformers_[0][2])
        df_treatments_inverse.insert(0, self.primary_key, sample_idx)

        # Number the treatments per primary key (the key column is configurable, not necessarily 'eid')
        df_treatments_inverse[self.order_col] = df_treatments_inverse.groupby(self.primary_key).cumcount() + 1

        return df_static_inverse, df_treatments_inverse[output_columns].copy()

### Transform and save original data

In [4]:
# One-hot encode the treatment code column
treatment_encoding = ('categorical_encoding', OneHotEncoder(), ['gbs_gebeurtenis_code'])
transformer_treatments = ColumnTransformer(transformers=[treatment_encoding])

# One-hot encode every column of the static frame
static_encoding = ('categorical_encoding', OneHotEncoder(), df_static.columns)
transformer_static = ColumnTransformer(transformers=[static_encoding])

In [5]:
# Column names and maximum sequence length used for the DoppelGANger conversion
transformer_settings = {
    'primary_key': 'eid',
    'treatment_col': 'gbs_gebeurtenis_code',
    'order_col': 'gbs_vnr',
    'max_nr_treatments': 5,
    'transformer_static': transformer_static,
    'transformer_treatments': transformer_treatments,
}
tf = DoppelGANgerTransformer(**transformer_settings)

In [6]:
# transform() also fits both encoders; the fitted state is needed later for inverse_transform()
transformed_arrays = tf.transform(df_static, df_treatments)
data_feature, data_gen_flag, data_attribute = transformed_arrays

In [7]:
# Shapes of the three arrays, in the order feature / gen flag / attribute
tuple(array.shape for array in (data_feature, data_gen_flag, data_attribute))

((191913, 5, 47), (191913, 5), (191913, 38))

In [8]:
# Metadata needed by DoppelGANger (see their GitHub repository for more information)
data_feature_output = [
    output.Output(type_=output.OutputType.DISCRETE, dim=df_treatments['gbs_gebeurtenis_code'].nunique(), normalization=None, is_gen_flag=False)]

data_attribute_output = [
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['tum_topo_code'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['pat_geslacht_code'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['tum_differentiatiegraad_code'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['tum_lymfklieren_positief_atl'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['age_at_diagnosis'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['tum_topo_sublokalisatie_code'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['stadium'].nunique(), normalization=None, is_gen_flag=False),
    output.Output(type_=output.OutputType.DISCRETE, dim=df_static['survival_1'].nunique(), normalization=None, is_gen_flag=False)]

In [9]:
# Persist the feature metadata for the DoppelGANger training script
feature_output_path = '../../../../../../Master Thesis/data/doppelGANger/final/data_feature_output.pkl'
with open(feature_output_path, mode='wb') as feature_output_file:
    pickle.dump(data_feature_output, feature_output_file)

In [10]:
# Persist the attribute metadata for the DoppelGANger training script
attribute_output_path = '../../../../../../Master Thesis/data/doppelGANger/final/data_attribute_output.pkl'
with open(attribute_output_path, mode='wb') as attribute_output_file:
    pickle.dump(data_attribute_output, attribute_output_file)

In [11]:
# Store the three training arrays in a single .npz archive under their DoppelGANger names
train_arrays = {
    'data_feature': data_feature,
    'data_attribute': data_attribute,
    'data_gen_flag': data_gen_flag,
}
np.savez('../../../../../../Master Thesis/data/doppelGANger/final/data_train.npz', **train_arrays)

## Synthetic data

After training DoppelGANger (training.py) and generating data (generating.ipynb), the generated data must be converted back to the format of the original data. This is done by calling `inverse_transform` on the fitted `DoppelGANgerTransformer` instance with the generated arrays.

In [12]:
# Load data
generated_data = np.load('./generated_data/generated_data.npz')

# Pull the three arrays out of the archive by their DoppelGANger names
generated_feature, generated_attribute, generated_gen_flag = (
    generated_data[array_name] for array_name in ("data_feature", "data_attribute", "data_gen_flag")
)

In [13]:
# Transform
# Uses the encoders fitted by tf.transform() above to decode the generated arrays
synthetic_frames = tf.inverse_transform(generated_feature, generated_gen_flag, generated_attribute)
df_static_synth, df_treatments_synth = synthetic_frames

In [14]:
# Save the decoded synthetic static covariates
static_synth_path = '../../synthetic_data/DGNP_tumours.pickle'
df_static_synth.to_pickle(static_synth_path)

In [15]:
# Save the decoded synthetic treatment sequences
treatments_synth_path = '../../synthetic_data/DGNP_treatments.pickle'
df_treatments_synth.to_pickle(treatments_synth_path)