# Generating Sequential Data with PrivBayes

# Environment

## Library Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys
from itertools import product
# Put the directory three levels above the working directory on sys.path so
# that project-local packages imported further down can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from copy import deepcopy
import math
from dython.nominal import associations
import matplotlib.pyplot as plt
import random

In [3]:
print(module_path)

C:\Users\mwo2008.54063\Documents\GitHub\thesis


## Jupyter-specific Imports and Settings

In [4]:
from importlib import reload

from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell

# Printing options: never truncate numpy arrays, allow wide/long pandas frames
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

# Display the value of every expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Handle on the running shell, used for the magics below
ipython = get_ipython()

# Load the autoreload extension once, then reload all modules before each run
if 'autoreload' not in ipython.extension_manager.loaded:
    ipython.run_line_magic('load_ext', 'autoreload')

ipython.run_line_magic('autoreload', '2')

## Import local libraries

In [5]:
from synthesis.synthesizers.privbayes import PrivBayes, NodeParentPair

## Load original data

In [6]:
# Static table, re-indexed by 'eid' so it can later be merged on its index
# with the pivoted treatments.
# NOTE(review): unpickling executes arbitrary code; only load pickle files
# from a trusted source.
tumours = pd.read_pickle('../../../../../Master Thesis/data/preprocessed/tumours_patients_2017_train.pickle').set_index('eid')
# Treatment records in long format; 'eid' stays a regular column here
treatments = pd.read_pickle('../../../../../Master Thesis/data/preprocessed/treatments_2017_train.pickle')

In [7]:
tumours.shape, treatments.shape

((191913, 8), (317592, 4))

## Sequential Data Pivoting + Post-processing

In [8]:
class SequentialDataPivotingMS:
    """
    Basic class for Sequential Data Pivoting as described in Section 4.2.2.

    ``transform`` pivots a long treatments table into one column per position
    in the sequence ('treatment_1', 'treatment_2', ...) and joins those columns
    to the static table. ``inverse_transform`` reverses this and is extended
    with post-processing such that the first 'empty' treatment sampled by the
    generative model marks the end of the treatment sequence for a patient
    (Section 4.2.4).

    Parameters
    ----------
    max_n_treatments : int
        Number of pivoted treatment columns created by ``transform``.
    primary_key : str
        Name of the key shared by both tables: the index of the static table
        and a column of the treatments table.
    treatment_col : str
        Column of the treatments table holding the treatment value.
    order_col : str
        Column of the treatments table holding the position in the sequence.
    """

    def __init__(self, max_n_treatments, primary_key, treatment_col, order_col):
        self.max_n_treatments = max_n_treatments
        self.primary_key = primary_key
        self.treatment_col = treatment_col
        self.order_col = order_col

    def transform(self, df_static, df_treatments):
        """
        Flattens df_treatments and appends it to df_static.

        Treatments beyond ``max_n_treatments`` are not pivoted, and shorter
        sequences are padded with NaN. The merge is an inner join on the index,
        so rows of ``df_static`` without any treatment are not part of the
        result.

        Returns
        -------
        pandas.DataFrame
            The static columns followed by 'treatment_1' ... 'treatment_<max>'.
        """
        # Sort values by order col so the per-key lists are in sequence order
        df_treatments = df_treatments.sort_values([self.primary_key, self.order_col])

        # Group treatments by primary key, list treatments
        df_treatments_grouped = pd.DataFrame(
            df_treatments.groupby(self.primary_key)[self.treatment_col].apply(list))

        # Instantiate flat treatments dataframe
        df_treatments_flat = pd.DataFrame(index=df_treatments_grouped.index)

        # Flatten treatment columns: one column per position in the sequence
        for nr in range(1, self.max_n_treatments + 1):
            df_treatments_flat['treatment_' + str(nr)] = (
                df_treatments_grouped[self.treatment_col].apply(self.get_treatment_nr, args=(nr,)))

        # Merge flat treatments dataframe with static covariates dataframe
        df = df_static.merge(df_treatments_flat, left_index=True, right_index=True)

        return df

    def inverse_transform(self, df):
        """
        Transforms a flat dataframe back to df_static and df_treatments.

        Post-processes treatment sequences such that the first 'empty'
        treatment (NaN or the literal string 'nan') marks the end of the
        treatment sequence for a patient (Section 4.2.4).

        The treatment number is taken from the complete trailing digit run of
        the column name and compared numerically, so sequences with ten or
        more treatment columns are ordered and truncated correctly.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame)
            ``df_static`` holds every column that neither starts with
            'treatment' nor equals 'sequence_length'. ``df_treatments`` has
            the columns ``[primary_key, treatment_col, order_col]``, with the
            order stored as a string ('1', '2', ...).

        Raises
        ------
        ValueError
            If a column starting with 'treatment' does not end in digits.
        """
        # Define static and treatment columns
        static_cols = [col for col in df.columns
                       if not (col.startswith('treatment') or col == 'sequence_length')]
        position_of = {col: self._treatment_position(col)
                       for col in df.columns if col.startswith('treatment')}

        # df_static
        df_static = df[static_cols].copy()

        # One df per treatment column to reverse flatten, in sequence order
        frames = [pd.DataFrame({self.treatment_col: df[col], 'nr_treatment': position})
                  for col, position in sorted(position_of.items(), key=lambda item: item[1])]

        if not frames:
            # Nothing to unpivot: return an empty treatments table
            empty = pd.DataFrame(columns=[self.primary_key, self.treatment_col, self.order_col])
            return df_static, empty

        # Concat all treatment columns to one dataframe. rename_axis works on
        # a new object, so the index of the input frame is never renamed.
        treatments = pd.concat(frames).rename_axis(self.primary_key).reset_index()

        # Replace the string 'nan' (produced by casting NaN to str), drop empties
        treatments = treatments.replace('nan', np.nan).dropna()

        # Sort by primary key and (numeric) order of treatments
        treatments = treatments.sort_values([self.primary_key, 'nr_treatment']).reset_index(drop=True)

        # Drop all treatments after first nan: a remaining row is kept only
        # when its treatment number equals its rank within the patient, i.e.
        # when no earlier position of that patient was empty.
        rank_in_sequence = treatments.groupby(self.primary_key).cumcount() + 1
        treatments = treatments[treatments['nr_treatment'] == rank_in_sequence].copy()

        # The order is emitted as a string, as before
        treatments['nr_treatment'] = treatments['nr_treatment'].astype(str)
        df_treatments = treatments.rename(columns={'nr_treatment': self.order_col})

        return df_static, df_treatments

    @staticmethod
    def _treatment_position(col):
        """Return the integer formed by the trailing digits of a column name."""
        stem = col.rstrip('0123456789')
        digits = col[len(stem):]
        if not digits:
            raise ValueError(
                "Cannot derive a treatment number from column '{}': "
                "expected a name ending in digits, e.g. 'treatment_3'".format(col))
        return int(digits)

    @staticmethod
    def get_treatment_nr(treatments, nr):
        """Return the nr-th (1-based) element of treatments, or NaN if it is shorter."""
        if len(treatments) >= nr:
            return treatments[nr - 1]
        return np.nan

In [9]:
# develop transformer to flatten the table:
# at most 5 treatments per 'eid' are pivoted into treatment_1 ... treatment_5,
# ordered by 'gbs_vnr' and filled with the values of 'gbs_gebeurtenis_code'
fts = SequentialDataPivotingMS(max_n_treatments=5,
                               primary_key='eid',
                               treatment_col='gbs_gebeurtenis_code',
                               order_col='gbs_vnr')

# Flat table: static tumour columns followed by the pivoted treatment columns
df = fts.transform(tumours, treatments)

In [10]:
df.shape

(191913, 13)

In [11]:
# Set all dtypes to string. Missing treatments are cast to str together with
# the rest; fts.inverse_transform replaces the literal string 'nan' by NaN
# again before dropping empty treatments.
df = df.astype(str)

# Generate data

### Epsilon 1

In [12]:
epsilon = 1

#### Static data only to find static network

In [13]:
# The first 8 columns of the flat table are the static tumour columns
# (tumours has 8 columns; the remaining 5 are treatment_1 ... treatment_5)
df_static = df.iloc[:, :8]

In [18]:
# Fit PrivBayes on the static columns only, to obtain a network over them;
# verbose=True prints the node/parent selection steps shown below
pb = PrivBayes(epsilon=epsilon, verbose=True)
pb.fit(df_static)

1/8 - Root of network: age_at_diagnosis

2/8 - Evaluating next node to add to network
Number of NodeParentPair candidates: 7
Candidates: [NodeParentPair(node='survival_1', parents=('age_at_diagnosis',)), NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('age_at_diagnosis',)), NodeParentPair(node='tum_topo_code', parents=('age_at_diagnosis',)), NodeParentPair(node='tum_topo_sublokalisatie_code', parents=('age_at_diagnosis',)), NodeParentPair(node='pat_geslacht_code', parents=('age_at_diagnosis',)), NodeParentPair(node='stadium', parents=('age_at_diagnosis',)), NodeParentPair(node='tum_differentiatiegraad_code', parents=('age_at_diagnosis',))]
Selected node: 'survival_1' - with parents: ('age_at_diagnosis',)

3/8 - Evaluating next node to add to network
Number of NodeParentPair candidates: 6
Candidates: [NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('survival_1', 'age_at_diagnosis')), NodeParentPair(node='tum_topo_code', parents=('survival_1', 'age_at_diagnosis'

Selected node: 'tum_differentiatiegraad_code' - with parents: ('tum_topo_code', 'tum_topo_sublokalisatie_code', 'stadium')

Learned Network Structure

Learning conditional probabilities: age_at_diagnosis - with parents None ~ estimated size: 6
Learning conditional probabilities: survival_1 - with parents ('age_at_diagnosis',) ~ estimated size: 12
Learning conditional probabilities: tum_topo_sublokalisatie_code - with parents ('survival_1', 'age_at_diagnosis') ~ estimated size: 120
Learning conditional probabilities: tum_topo_code - with parents ('survival_1', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis') ~ estimated size: 360
Learning conditional probabilities: pat_geslacht_code - with parents ('survival_1', 'tum_topo_code', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis') ~ estimated size: 720
Learning conditional probabilities: tum_lymfklieren_positief_atl - with parents ('survival_1', 'tum_topo_code', 'age_at_diagnosis', 'pat_geslacht_code') ~ estimated size: 216
Learning c

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1e2578408>

In [19]:
# Keep an independent copy of the fitted static network; `pb` is rebound to a
# new model further down
network_static = deepcopy(pb.network_)

In [20]:
network_static

[NodeParentPair(node='age_at_diagnosis', parents=None),
 NodeParentPair(node='survival_1', parents=('age_at_diagnosis',)),
 NodeParentPair(node='tum_topo_sublokalisatie_code', parents=('survival_1', 'age_at_diagnosis')),
 NodeParentPair(node='tum_topo_code', parents=('survival_1', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis')),
 NodeParentPair(node='pat_geslacht_code', parents=('survival_1', 'tum_topo_code', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis')),
 NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('survival_1', 'tum_topo_code', 'age_at_diagnosis', 'pat_geslacht_code')),
 NodeParentPair(node='stadium', parents=('survival_1', 'tum_lymfklieren_positief_atl', 'age_at_diagnosis', 'tum_topo_code', 'pat_geslacht_code')),
 NodeParentPair(node='tum_differentiatiegraad_code', parents=('tum_topo_code', 'tum_topo_sublokalisatie_code', 'stadium'))]

#### Sequential network definition

In [21]:
# Static columns that every treatment node is conditioned on
context_columns = ['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']
max_treatments = 5

# Chain of treatment nodes: treatment_1 depends on the context columns only,
# every later treatment_k additionally depends on treatment_(k-1)
init_network_sequential = []
for treatment_nr in range(1, max_treatments + 1):
    if treatment_nr == 1:
        init_network_sequential.append(
            NodeParentPair('treatment_' + str(treatment_nr), context_columns))
    else:
        init_network_sequential.append(
            NodeParentPair('treatment_' + str(treatment_nr),
                           context_columns + ['treatment_' + str(treatment_nr - 1)]))

In [22]:
init_network_sequential

[NodeParentPair(node='treatment_1', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']),
 NodeParentPair(node='treatment_2', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_1']),
 NodeParentPair(node='treatment_3', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_2']),
 NodeParentPair(node='treatment_4', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_3']),
 NodeParentPair(node='treatment_5', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_4'])]

#### Add sequential network to static network

In [23]:
# Full network: the static nodes first, then the treatment chain, so every
# parent of a treatment node precedes it in the list
init_network = network_static + init_network_sequential

#### Run PrivBayes with initialized network

In [24]:
# Train on the full flat table with the network fixed in advance
# (the log below shows "init node ..." for each of the 13 nodes)
# NOTE(review): the static structure came from a separate fit with the same
# epsilon — confirm how the overall privacy budget is accounted for.
pb = PrivBayes(epsilon=epsilon, verbose=True)
pb.set_network(init_network)
pb.fit(df)

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1ddd9b688>

1/13 - init node age_at_diagnosis - with parents: None
2/13 - init node survival_1 - with parents: ('age_at_diagnosis',)
3/13 - init node tum_topo_sublokalisatie_code - with parents: ('survival_1', 'age_at_diagnosis')
4/13 - init node tum_topo_code - with parents: ('survival_1', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis')
5/13 - init node pat_geslacht_code - with parents: ('survival_1', 'tum_topo_code', 'tum_topo_sublokalisatie_code', 'age_at_diagnosis')
6/13 - init node tum_lymfklieren_positief_atl - with parents: ('survival_1', 'tum_topo_code', 'age_at_diagnosis', 'pat_geslacht_code')
7/13 - init node stadium - with parents: ('survival_1', 'tum_lymfklieren_positief_atl', 'age_at_diagnosis', 'tum_topo_code', 'pat_geslacht_code')
8/13 - init node tum_differentiatiegraad_code - with parents: ('tum_topo_code', 'tum_topo_sublokalisatie_code', 'stadium')
9/13 - init node treatment_1 - with parents: ['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']
10/13 

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1ddd9b688>

In [26]:
# Synthesize data: called without arguments, the run below produced 191913
# records, the same number of rows as the training table
df_pb = pb.sample()

Number of records generated: 191913 / 191913
Synthetic Data Generated



#### Post-processing

In [27]:
# Static part of the original (stringified) flat table; the unpivoted
# treatments of the original data are discarded here
df_static, _ = fts.inverse_transform(df)
# Original treatment records, ordered by 'eid' and then 'gbs_vnr'
df_treatments = treatments.sort_values(['eid', 'gbs_vnr'])
# Split the synthetic flat table into a static table and a treatments table;
# each synthetic sequence ends at its first empty treatment
df_static_pb, df_treatments_pb = fts.inverse_transform(df_pb)

In [28]:
# Persist the synthetic tables generated with epsilon = 1
# (presumably ../synthetic_data must already exist — TODO confirm)
df_static_pb.to_pickle('../synthetic_data/PBS_eps1_Full_tumours.pickle')
df_treatments_pb.to_pickle('../synthetic_data/PBS_eps1_Full_treatments.pickle')

### Epsilon 0.1

In [52]:
epsilon = 0.1

#### Static data only to find static network

In [53]:
# The first 8 columns of the flat table are the static tumour columns
# (tumours has 8 columns; the remaining 5 are treatment_1 ... treatment_5)
df_static = df.iloc[:, :8]

In [56]:
# Fit PrivBayes on the static columns only, to obtain a network over them;
# verbose=True prints the node/parent selection steps shown below
pb = PrivBayes(epsilon=epsilon, verbose=True)
pb.fit(df_static)

1/8 - Root of network: pat_geslacht_code

2/8 - Evaluating next node to add to network
Number of NodeParentPair candidates: 7
Candidates: [NodeParentPair(node='survival_1', parents=('pat_geslacht_code',)), NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('pat_geslacht_code',)), NodeParentPair(node='tum_topo_code', parents=('pat_geslacht_code',)), NodeParentPair(node='tum_topo_sublokalisatie_code', parents=('pat_geslacht_code',)), NodeParentPair(node='age_at_diagnosis', parents=('pat_geslacht_code',)), NodeParentPair(node='stadium', parents=('pat_geslacht_code',)), NodeParentPair(node='tum_differentiatiegraad_code', parents=('pat_geslacht_code',))]
Selected node: 'survival_1' - with parents: ('pat_geslacht_code',)

3/8 - Evaluating next node to add to network
Number of NodeParentPair candidates: 6
Candidates: [NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('survival_1', 'pat_geslacht_code')), NodeParentPair(node='tum_topo_code', parents=('survival_1', 'pat_gesl

Selected node: 'age_at_diagnosis' - with parents: ('pat_geslacht_code', 'tum_topo_sublokalisatie_code')

8/8 - Evaluating next node to add to network
Number of NodeParentPair candidates: 12
Candidates: [NodeParentPair(node='tum_differentiatiegraad_code', parents=('tum_topo_code', 'pat_geslacht_code', 'age_at_diagnosis')), NodeParentPair(node='tum_differentiatiegraad_code', parents=('survival_1', 'pat_geslacht_code', 'age_at_diagnosis')), NodeParentPair(node='tum_differentiatiegraad_code', parents=('survival_1', 'tum_topo_code', 'age_at_diagnosis')), NodeParentPair(node='tum_differentiatiegraad_code', parents=('survival_1', 'pat_geslacht_code', 'tum_topo_sublokalisatie_code')), NodeParentPair(node='tum_differentiatiegraad_code', parents=('tum_topo_code', 'tum_topo_sublokalisatie_code')), NodeParentPair(node='tum_differentiatiegraad_code', parents=('tum_lymfklieren_positief_atl', 'survival_1', 'pat_geslacht_code', 'tum_topo_code')), NodeParentPair(node='tum_differentiatiegraad_code', par

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1e77fa108>

In [57]:
# Keep an independent copy of the fitted static network; `pb` is rebound to a
# new model further down
network_static = deepcopy(pb.network_)

In [58]:
network_static

[NodeParentPair(node='pat_geslacht_code', parents=None),
 NodeParentPair(node='survival_1', parents=('pat_geslacht_code',)),
 NodeParentPair(node='tum_lymfklieren_positief_atl', parents=('survival_1', 'pat_geslacht_code')),
 NodeParentPair(node='tum_topo_code', parents=('survival_1', 'tum_lymfklieren_positief_atl', 'pat_geslacht_code')),
 NodeParentPair(node='tum_topo_sublokalisatie_code', parents=('tum_lymfklieren_positief_atl', 'tum_topo_code', 'pat_geslacht_code')),
 NodeParentPair(node='stadium', parents=('tum_lymfklieren_positief_atl', 'survival_1', 'tum_topo_code')),
 NodeParentPair(node='age_at_diagnosis', parents=('pat_geslacht_code', 'tum_topo_sublokalisatie_code')),
 NodeParentPair(node='tum_differentiatiegraad_code', parents=('tum_lymfklieren_positief_atl', 'pat_geslacht_code', 'age_at_diagnosis'))]

#### Sequential network definition

In [59]:
# Static columns that every treatment node is conditioned on
context_columns = ['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']
max_treatments = 5

# Chain of treatment nodes: treatment_1 depends on the context columns only,
# every later treatment_k additionally depends on treatment_(k-1)
init_network_sequential = []
for treatment_nr in range(1, max_treatments + 1):
    if treatment_nr == 1:
        init_network_sequential.append(
            NodeParentPair('treatment_' + str(treatment_nr), context_columns))
    else:
        init_network_sequential.append(
            NodeParentPair('treatment_' + str(treatment_nr),
                           context_columns + ['treatment_' + str(treatment_nr - 1)]))

In [60]:
init_network_sequential

[NodeParentPair(node='treatment_1', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']),
 NodeParentPair(node='treatment_2', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_1']),
 NodeParentPair(node='treatment_3', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_2']),
 NodeParentPair(node='treatment_4', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_3']),
 NodeParentPair(node='treatment_5', parents=['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1', 'treatment_4'])]

#### Add sequential network to static network

In [61]:
# Full network: the static nodes first, then the treatment chain, so every
# parent of a treatment node precedes it in the list
init_network = network_static + init_network_sequential

#### Run PrivBayes with initialized network

In [62]:
# Train on the full flat table with the network fixed in advance
# (the log below shows "init node ..." for the preset nodes)
# NOTE(review): the static structure came from a separate fit with the same
# epsilon — confirm how the overall privacy budget is accounted for.
pb = PrivBayes(epsilon=epsilon, verbose=True)
pb.set_network(init_network)
pb.fit(df)

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1dd552f08>

1/13 - init node pat_geslacht_code - with parents: None
2/13 - init node survival_1 - with parents: ('pat_geslacht_code',)
3/13 - init node tum_lymfklieren_positief_atl - with parents: ('survival_1', 'pat_geslacht_code')
4/13 - init node tum_topo_code - with parents: ('survival_1', 'tum_lymfklieren_positief_atl', 'pat_geslacht_code')
5/13 - init node tum_topo_sublokalisatie_code - with parents: ('tum_lymfklieren_positief_atl', 'tum_topo_code', 'pat_geslacht_code')
6/13 - init node stadium - with parents: ('tum_lymfklieren_positief_atl', 'survival_1', 'tum_topo_code')
7/13 - init node age_at_diagnosis - with parents: ('pat_geslacht_code', 'tum_topo_sublokalisatie_code')
8/13 - init node tum_differentiatiegraad_code - with parents: ('tum_lymfklieren_positief_atl', 'pat_geslacht_code', 'age_at_diagnosis')
9/13 - init node treatment_1 - with parents: ['tum_topo_sublokalisatie_code', 'age_at_diagnosis', 'stadium', 'survival_1']
10/13 - init node treatment_2 - with parents: ['tum_topo_sublok

<synthesis.synthesizers.privbayes.PrivBayes at 0x2e1dd552f08>

In [63]:
# Synthesize data: called without arguments, the run below produced 191913
# records, the same number of rows as the training table
df_pb = pb.sample()

Number of records generated: 191913 / 191913
Synthetic Data Generated



#### Post-processing

In [64]:
# Static part of the original (stringified) flat table; the unpivoted
# treatments of the original data are discarded here
df_static, _ = fts.inverse_transform(df)
# Original treatment records, ordered by 'eid' and then 'gbs_vnr'
df_treatments = treatments.sort_values(['eid', 'gbs_vnr'])
# Split the synthetic flat table into a static table and a treatments table;
# each synthetic sequence ends at its first empty treatment
df_static_pb, df_treatments_pb = fts.inverse_transform(df_pb)

In [65]:
# Persist the synthetic tables generated with epsilon = 0.1
# (presumably ../synthetic_data must already exist — TODO confirm)
df_static_pb.to_pickle('../synthetic_data/PBS_eps01_Full_tumours.pickle')
df_treatments_pb.to_pickle('../synthetic_data/PBS_eps01_Full_treatments.pickle')