# DataSynthesizer Usage (correlated attribute mode)



In [None]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [None]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

import pandas as pd

### Step 2 user-defined parameters

In [None]:
import os

# Input dataset (CSV) that the describer reads.
input_data = '/content/real_train.csv'

# Both output files are written under a directory named after the mode.
mode = 'correlated_attribute_mode'
# Dataset description (JSON): saved in Step 3, read back in Steps 4 and 5.
# It must be defined here; the later cells reference it by this name.
description_file = f'{mode}/description.json'
# Synthetic dataset (CSV): saved in Step 4, read back in Step 5.
# (The file name's spelling is kept unchanged so existing outputs still match.)
synthetic_data = f'{mode}/sythetic_data.csv'

# Make sure the output directory exists before the save calls write into it.
os.makedirs(mode, exist_ok=True)

In [None]:
# Domain-size cutoff: an attribute whose domain size is below this value is
# treated as categorical.
# NOTE(review): 3 presumably targets the two-valued indicator columns listed
# below - confirm against the input dataset.
threshold_value = 3

# Attributes explicitly declared categorical. Every listed column maps to True;
# the column names must match the input CSV header exactly (including the
# trailing space in 'Throat culture (procedure) ').
categorical_attributes = dict.fromkeys(
    [
        'careplan_within_24',
        'Pneumococcal conjugate PCV 13',
        'MMR',
        'DTaP',
        'varicella',
        'Acetaminophen 160 MG',
        'Ibuprofen 200 MG Oral Tablet',
        'Naproxen sodium 220 MG Oral Tablet',
        'Dextromethorphan Hydrobromide 1 MG/ML',
        'Amoxicillin 250 MG / Clavulanate 125 MG [Augmentin]',
        'Documentation of current medications',
        'Measurement of respiratory function (procedure)',
        'Spirometry (procedure)',
        'Throat culture (procedure) ',
        'Sputum examination (procedure)',
        'Viral_sinusitis_present',
        'MARITAL_M',
        'MARITAL_S',
        'RACE_asian',
        'RACE_black',
        'RACE_hispanic',
        'RACE_native',
        'RACE_white',
        'ETHNICITY_african',
        'ETHNICITY_american',
        'ETHNICITY_american_indian',
        'ETHNICITY_asian_indian',
        'ETHNICITY_central_american',
        'ETHNICITY_chinese',
        'ETHNICITY_dominican',
        'ETHNICITY_english',
        'ETHNICITY_french',
        'ETHNICITY_french_canadian',
        'ETHNICITY_german',
        'ETHNICITY_irish',
        'ETHNICITY_italian',
        'ETHNICITY_mexican',
        'ETHNICITY_polish',
        'ETHNICITY_portuguese',
        'ETHNICITY_puerto_rican',
        'ETHNICITY_russian',
        'ETHNICITY_scottish',
        'ETHNICITY_swedish',
        'ETHNICITY_west_indian',
        'GENDER_F',
        'GENDER_M',
        'encounter_type_emergency room visit',
        'encounter_type_inpatient',
        'encounter_type_outpatient',
    ],
    True,
)

# Candidate keys of the input dataset would be declared here; none are set.
#candidate_keys = {'ssn': True}

# Differential-privacy parameter. Roughly: removing one row from the input
# changes the probability of any given output by at most a factor of
# exp(epsilon). Larger values inject less noise; 0 turns differential
# privacy off.
epsilon = 0

# Maximum number of parents (incoming edges) per node in the Bayesian network.
degree_of_bayesian_network = 5

# Number of rows to generate for the synthetic dataset.
# NOTE(review): 60593 presumably equals the row count of the input dataset -
# confirm; any other number may be used.
num_tuples_to_generate = 60593

### Step 3 DataDescriber

1. Instantiate a DataDescriber.
2. Compute the statistics of the dataset.
3. Save dataset description to a file on local machine.

In [None]:
# Build the describer with the categorical cutoff from the parameters cell.
describer = DataDescriber(category_threshold=threshold_value)

# Describe the input dataset in correlated attribute mode, passing the
# privacy budget, the Bayesian-network degree and the categorical flags.
describer.describe_dataset_in_correlated_attribute_mode(
    dataset_file=input_data,
    epsilon=epsilon,
    k=degree_of_bayesian_network,
    attribute_to_is_categorical=categorical_attributes,
)

# Persist the description; Steps 4 and 5 read it back from this file.
describer.save_dataset_description_to_file(description_file)

Adding ROOT ETHNICITY_french_canadian
Adding attribute RACE_white
Adding attribute RACE_hispanic
Adding attribute ETHNICITY_puerto_rican
Adding attribute RACE_black
Adding attribute RACE_asian
Adding attribute ETHNICITY_chinese
Adding attribute ETHNICITY_asian_indian
Adding attribute ETHNICITY_dominican


In [None]:
# Print the network built by the describer: one line per attribute with its parents.
display_bayesian_network(describer.bayesian_network)

Constructed Bayesian network:
    marital-status has parents ['relationship'].
    age            has parents ['marital-status', 'relationship'].
    sex            has parents ['marital-status', 'relationship'].
    education      has parents ['sex', 'age'].
    income         has parents ['sex', 'age'].


### Step 4 generate synthetic dataset

1. Instantiate a DataGenerator.
2. Generate a synthetic dataset.
3. Save it to local machine.

In [None]:
# Generate the requested number of rows from the saved dataset description,
# then write the synthetic dataset to disk.
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(
    num_tuples_to_generate,
    description_file,
)
generator.save_synthetic_data(synthetic_data)

### Step 5 compare the statistics of input and synthetic data (optional)

The synthetic data is already saved in a file by step 4. The ModelInspector is for a quick test on the similarity between input and synthetic datasets.

#### 5.1 instantiate a ModelInspector.

It needs input dataset, synthetic dataset, and attribute description.

In [None]:
# Per-attribute metadata comes from the description file saved in Step 3.
attribute_description = read_json_file(description_file)['attribute_description']

# Load the real and the synthetic data as DataFrames; leading spaces after
# delimiters are skipped for the input file only.
synthetic_df = pd.read_csv(synthetic_data)
input_df = pd.read_csv(input_data, skipinitialspace=True)

# The inspector takes the real frame, the synthetic frame and the metadata.
inspector = ModelInspector(
    input_df,
    synthetic_df,
    attribute_description,
)

#### 5.2 compare histograms between input and synthetic datasets.

In [None]:
# One histogram comparison per column of the synthetic dataset
# (iterating a DataFrame yields its column labels).
for attribute in synthetic_df:
    inspector.compare_histograms(attribute)

#### 5.3 compare pairwise mutual information

In [None]:
# Pairwise mutual information shown as a heatmap.
# NOTE(review): presumably drawn for both the input and the synthetic data - confirm.
inspector.mutual_information_heatmap()