### Install SDV Community

In [None]:
# Install the SDV package using the IPython %pip magic.
%pip install sdv

### Install ipykernel and nbformat to avoid MIME type rendering issues with visualizations
* `ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed`

In [None]:
# Install ipykernel, then upgrade nbformat so plot output can be rendered
# (the rendering error quoted above requires nbformat>=4.2.0).
%pip install ipykernel
%pip install --upgrade nbformat

### Load demo data

In [None]:
from sdv.datasets.demo import download_demo

# Fetch the single-table 'fake_hotel_guests' demo; the call yields the
# dataset together with the metadata that describes it.
real_data, metadata = download_demo(
    dataset_name='fake_hotel_guests',
    modality='single_table',
)

### Preview demo data

In [None]:
# Preview the first rows of the real dataset.
real_data.head()

### Visualize metadata

In [None]:
# Render a visual overview of the metadata loaded with the demo dataset.
metadata.visualize()

### Create a synthesizer

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

# Build a Gaussian copula synthesizer from the metadata, then fit it on the real data.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)

### Generate synthetic data

In [None]:
# Draw 500 synthetic rows from the fitted synthesizer and preview the first few.
synthetic_data = synthesizer.sample(num_rows=500)
synthetic_data.head()

### Evaluate real vs. synthetic data
* Run diagnostic to ensure data is valid

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Run the diagnostic checks on the synthetic data, using the real data and
# the metadata as the reference.
diagnostic = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

* Measure data quality between real and synthetic data

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Build the quality report comparing the synthetic data with the real data.
# Arguments are passed by keyword, mirroring the run_diagnostic call.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

### Get details on column shapes

In [None]:
# Show the detailed results for the 'Column Shapes' property of the quality report.
quality_report.get_details('Column Shapes')

### Visualize real vs. synthetic data

In [None]:
from sdv.evaluation.single_table import get_column_plot

# Plot the 'room_rate' column for the real vs. the synthetic data.
fig = get_column_plot(
    column_name='room_rate',
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)
fig.show()

### Visualize in 2D (column pair plot)

In [None]:
from sdv.evaluation.single_table import get_column_pair_plot

# Plot the 'room_rate' / 'room_type' column pair for the real vs. the synthetic data.
fig = get_column_pair_plot(
    column_names=['room_rate', 'room_type'],
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)
fig.show()

### Anonymize sensitive data (PII)

In [None]:
# Columns treated as sensitive (PII) in this walkthrough.
sensitive_column_names = ['guest_email', 'billing_address', 'credit_card_number']

# Preview those columns in the real data.
real_data[sensitive_column_names].head()

### Compare the anonymized synthetic data with the real data (above)

In [None]:
# Preview the same sensitive columns in the synthetic data.
synthetic_data[sensitive_column_names].head()

### Save the current state of the trained synthesizer
* Useful when sharing with others or synthesizing more data in the future

In [None]:
# Bind the file path once so the save and the load below cannot drift apart.
synthesizer_path = 'my_synthesizer.pkl'

# Persist the fitted synthesizer to disk.
synthesizer.save(synthesizer_path)

# Reload it from the same file. NOTE: the .pkl suffix suggests a pickle-based
# format, so only load synthesizer files that come from a trusted source.
synthesizer = GaussianCopulaSynthesizer.load(synthesizer_path)

## Gaussian Copula customization

In [None]:
# Build a second synthesizer with customized marginal distributions:
# 'truncnorm' is the default, and three columns get explicit overrides.
custom_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    default_distribution='truncnorm',
    numerical_distributions={
        'checkin_date': 'uniform',
        'checkout_date': 'uniform',
        'room_rate': 'gaussian_kde'
    }
)

# Fit the customized synthesizer on the same real data.
custom_synthesizer.fit(real_data)

#### Inspect the distributions after training.
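For each column, the synthesizer reports the distribution it used and the parameters it learned; the column inspected below was fitted with the default `truncnorm` distribution.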
* <a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html">More information</a> available in `scipy` truncnorm distribution documentation.  

In [None]:
# Retrieve what the customized synthesizer learned, then look up the entry
# for the 'has_rewards' column.
learned_distributions = custom_synthesizer.get_learned_distributions()
learned_distributions['has_rewards']

* By strategically setting distributions, tradeoffs can be made in the quality of synthetic data.

In [None]:
# Draw 500 rows from the customized synthesizer.
synthetic_data_customized = custom_synthesizer.sample(num_rows=500)

# Re-run the quality evaluation on the customized sample; this rebinds
# quality_report to the new report.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data_customized,
    metadata=metadata,
)

### Visualize customized synthetic data

In [None]:
# Plot the 'room_rate' column for the real data vs. the customized synthetic data.
fig = get_column_plot(
    column_name='room_rate',
    real_data=real_data,
    synthetic_data=synthetic_data_customized,
    metadata=metadata,
)
fig.show()

### Conditional Sampling
* Simulate hypothetical scenarios by using Gaussian Copula to efficiently sample conditions.
* In the example below, every guest is staying in a `SUITE` room <i>(half with rewards, half without)</i>

In [None]:
from sdv.sampling import Condition

# Two conditions of 250 rows each: suite guests with rewards ...
suite_guests_with_rewards = Condition(
    column_values={'room_type': 'SUITE', 'has_rewards': True},
    num_rows=250,
)

# ... and suite guests without rewards.
suite_guests_without_rewards = Condition(
    column_values={'room_type': 'SUITE', 'has_rewards': False},
    num_rows=250,
)

### Simulate the scenario using the trained synthesizer

In [None]:
# Sample rows that satisfy both conditions, using the customized synthesizer.
simulated_synthetic_data = custom_synthesizer.sample_from_conditions(
    conditions=[
        suite_guests_with_rewards,
        suite_guests_without_rewards,
    ],
)

### Visualize the data

In [None]:
# Plot the 'room_type' column for the real data vs. the simulated scenario data.
fig = get_column_plot(
    column_name='room_type',
    real_data=real_data,
    synthetic_data=simulated_synthetic_data,
    metadata=metadata,
)

# Give the figure a descriptive title before displaying it.
fig.update_layout(title='Using synthetic data to simulate room_type scenario')
fig.show()