# Generating Sequential Data with Marginal Synthesizer

# Environment

## Library Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys
from itertools import product
module_path = os.path.abspath(os.path.join('../../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from copy import deepcopy
import math
from dython.nominal import associations
import matplotlib.pyplot as plt
import random

In [3]:
# Show the absolute path that was appended to sys.path so local packages resolve.
print(module_path)

C:\Users\mwo2008.54063\Documents\GitHub\thesis


## Jupyter-specific Imports and Settings

In [4]:
# Printing options: never summarise numpy arrays and let pandas show wide/long
# frames (up to 500 rows/columns, 1000 characters wide, no line wrapping).
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

# Display the value of every top-level expression in a cell, not only the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell (None outside IPython, in which case the
# attribute access below would fail).
from IPython import get_ipython
ipython = get_ipython()

# Load the autoreload extension only once; it must be loaded before the
# `%autoreload` magic below can be used.
if 'autoreload' not in ipython.extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

# Mode '2': re-import modules before running each cell, so edits to the local
# package are picked up without restarting the kernel.
get_ipython().run_line_magic('autoreload', '2')
from importlib import reload

## Import local libraries

In [5]:
from synthesis.synthesizers.marginal import MarginalSynthesizer

## Load original data

In [6]:
# NOTE(review): read_pickle executes whatever the pickle contains - only load
# files from a trusted source.
# `tumours` is indexed by 'eid'; `treatments` keeps 'eid' as a regular column.
tumours = pd.read_pickle('../../../../../Master Thesis/data/preprocessed/tumours_patients_2017_train.pickle').set_index('eid')
treatments = pd.read_pickle('../../../../../Master Thesis/data/preprocessed/treatments_2017_train.pickle')

In [7]:
# (rows, columns) of the static table and of the treatments table.
tumours.shape, treatments.shape

((191913, 8), (317592, 4))

## Sequential Data Pivoting

In [8]:
class SequentialDataPivotingMS:
    """Pivot a long table of ordered treatments into fixed-width columns and back.

    ``transform`` turns the per-key treatment sequence into the columns
    ``treatment_1`` ... ``treatment_<max_n_treatments>`` and joins them onto the
    static table; ``inverse_transform`` splits such a flat table back into a
    static table and a long treatments table.

    Parameters
    ----------
    max_n_treatments : int
        Number of treatment columns to generate; later treatments are dropped.
    primary_key : str
        Column in the treatments table (and index of the static table) that
        identifies a record.
    treatment_col : str
        Column in the treatments table holding the treatment value.
    order_col : str
        Column in the treatments table defining the order of the treatments.
    """

    # Prefix of the generated treatment columns; the suffix is the 1-based position.
    _TREATMENT_PREFIX = 'treatment_'

    def __init__(self, max_n_treatments, primary_key, treatment_col, order_col):
        self.max_n_treatments = max_n_treatments
        self.primary_key = primary_key
        self.treatment_col = treatment_col
        self.order_col = order_col

    def transform(self, df_static, df_treatments):
        """
        Flattens df_treatments and appends to df_static

        ``df_static`` must be indexed by the primary key. The join is an inner
        join on the index, so static records without any treatment are not part
        of the result. Positions beyond the end of a record's treatment list
        are filled with NaN.
        """
        # Order the treatments within each record
        df_treatments = df_treatments.sort_values([self.primary_key, self.order_col])

        # One row per record holding the ordered list of its treatments
        df_treatments_grouped = pd.DataFrame(
            df_treatments.groupby(self.primary_key)[self.treatment_col].apply(list)
        )

        # Instantiate flat treatments dataframe
        df_treatments_flat = pd.DataFrame(index=df_treatments_grouped.index)

        # Generate one column per treatment position
        for nr in range(1, self.max_n_treatments + 1):
            df_treatments_flat[self._TREATMENT_PREFIX + str(nr)] = (
                df_treatments_grouped[self.treatment_col].apply(self.get_treatment_nr, args=(nr,))
            )

        # Merge flat treatments dataframe with static covariates dataframe
        return df_static.merge(df_treatments_flat, left_index=True, right_index=True)

    def inverse_transform(self, df):
        """
        Transforms flat dataframe back to df_static and df_treatments

        Columns named ``treatment_<n>`` are treated as treatment columns; every
        other column is static. The string ``'nan'`` and real NaN both count as
        "no treatment", and for each record all treatments after the first
        missing position are discarded.

        Returns
        -------
        (df_static, df_treatments)
            ``df_treatments`` has the columns primary key, treatment column and
            order column. The order column holds the 1-based position as a
            string, as this method has always emitted it.
        """
        # Map every treatment column to its numeric position. Parsing the whole
        # suffix (instead of the last character) keeps positions >= 10 correct.
        nr_by_col = {}
        for col in df.columns:
            nr = self._treatment_position(col)
            if nr is not None:
                nr_by_col[col] = nr

        # df_static
        static_cols = [col for col in df.columns if col not in nr_by_col]
        df_static = df[static_cols].copy()

        out_cols = [self.primary_key, self.treatment_col, self.order_col]
        if not nr_by_col:
            # Nothing to unpivot: return an empty, correctly labelled table.
            return df_static, pd.DataFrame(columns=out_cols)

        # df_treatments: stack the treatment columns into long format
        frames = []
        for col in sorted(nr_by_col, key=nr_by_col.get):
            frame = df[col].to_frame(name=self.treatment_col)
            frame[self.order_col] = nr_by_col[col]
            frames.append(frame)

        # rename_axis returns a new object, so the index of `df` is never renamed
        treatments = pd.concat(frames).rename_axis(self.primary_key).reset_index()

        df_treatments = treatments.replace('nan', np.nan).dropna()
        # Numeric sort on the position, grouped by the configured primary key
        df_treatments = df_treatments.sort_values(
            [self.primary_key, self.order_col]
        ).reset_index(drop=True)

        # Drop all treatments after first nan: a kept treatment's position must
        # equal its rank among the record's non-missing treatments.
        rank_in_record = df_treatments.groupby(self.primary_key).cumcount() + 1
        df_treatments = df_treatments[
            df_treatments[self.order_col].astype(int) == rank_in_record
        ].copy()

        # Keep the historic output dtype of the order column (string digits).
        df_treatments[self.order_col] = df_treatments[self.order_col].astype(str)

        return df_static, df_treatments

    @classmethod
    def _treatment_position(cls, col):
        """Return the 1-based position encoded in a ``treatment_<n>`` column name, else None."""
        if not isinstance(col, str) or not col.startswith(cls._TREATMENT_PREFIX):
            return None
        suffix = col[len(cls._TREATMENT_PREFIX):]
        return int(suffix) if suffix.isdecimal() else None

    @staticmethod
    def get_treatment_nr(treatments, nr):
        """Return the ``nr``-th (1-based) item of ``treatments``, or NaN if the list is shorter."""
        if len(treatments) >= nr:
            return treatments[nr - 1]
        return np.nan

In [9]:
# develop transformer to flatten the table
fts = SequentialDataPivotingMS(max_n_treatments=5,
                               primary_key='eid',
                               treatment_col='gbs_gebeurtenis_code',
                               order_col='gbs_vnr')

df = fts.transform(tumours, treatments)

In [10]:
# Expect the 8 static columns plus the 5 generated treatment columns.
df.shape

(191913, 13)

In [11]:
# Set all dtypes to string
df = df.astype(str)

# Generate data

In [12]:
# Privacy parameter handed to the synthesizer.
# NOTE(review): np.inf presumably means "no differential-privacy noise" - confirm
# against MarginalSynthesizer.
epsilon = np.inf

In [13]:
# Fit the marginal synthesizer on the flat, all-string table and draw a
# synthetic table from it (per the log output, one marginal is fitted and
# sampled per column).
ms = MarginalSynthesizer(epsilon=epsilon)
ms.fit(df)
df_ms = ms.sample()

Marginal fitted: tum_topo_code
Marginal fitted: pat_geslacht_code
Marginal fitted: tum_differentiatiegraad_code
Marginal fitted: tum_lymfklieren_positief_atl
Marginal fitted: age_at_diagnosis
Marginal fitted: tum_topo_sublokalisatie_code
Marginal fitted: stadium
Marginal fitted: survival_1
Marginal fitted: treatment_1
Marginal fitted: treatment_2
Marginal fitted: treatment_3
Marginal fitted: treatment_4
Marginal fitted: treatment_5


<synthesis.synthesizers.marginal.MarginalSynthesizer at 0x27268eee288>

Column sampled: tum_topo_code
Column sampled: pat_geslacht_code
Column sampled: tum_differentiatiegraad_code
Column sampled: tum_lymfklieren_positief_atl
Column sampled: age_at_diagnosis
Column sampled: tum_topo_sublokalisatie_code
Column sampled: stadium
Column sampled: survival_1
Column sampled: treatment_1
Column sampled: treatment_2
Column sampled: treatment_3
Column sampled: treatment_4
Column sampled: treatment_5


In [14]:
# Original data: take the static part from the flat table, but use the real
# treatments table (ordered per record) instead of the un-pivoted one.
df_static, _ = fts.inverse_transform(df)
df_treatments = treatments.sort_values(['eid', 'gbs_vnr'])
# Synthetic data: split the sampled flat table into static and treatment parts.
df_static_ms, df_treatments_ms = fts.inverse_transform(df_ms)

In [15]:
# Persist the synthetic static and treatment tables.
# NOTE(review): the target directory '../synthetic_data' must already exist.
df_static_ms.to_pickle('../synthetic_data/MSNP_tumours.pickle')
df_treatments_ms.to_pickle('../synthetic_data/MSNP_treatments.pickle')