In [3]:
from sdv.datasets.demo import download_demo

# Fetch the single-table demo dataset named 'fake_hotel_guests'. The call
# returns a pair, unpacked here into the data table and its metadata object;
# both names are reused by the later cells.
real_data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests')

In [19]:
# Show the loaded table as the cell's output (bare last expression).
# NOTE(review): this cell's execution count (19) is out of sequence with the
# cells around it (3 before, 6 after), which suggests it was re-run later.
# Confirm the notebook still reproduces under Restart Kernel -> Run All.
real_data

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,2020-12-27,2020-12-29,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,2020-12-30,2021-01-02,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.00,2020-09-17,2020-09-18,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,2020-12-28,2020-12-31,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,2020-04-05,NaT,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983
...,...,...,...,...,...,...,...,...,...
495,laurabennett@jones-duncan.net,False,BASIC,8.71,2021-01-04,2021-01-06,103.25,"5678 Office Road\nSan Francisco, CA 94103",3505516387300030
496,johnny71@cook.info,False,BASIC,16.31,2020-08-24,2020-08-26,115.81,"953 White Island\nChristopherside, TN 91366",2224524502892552
497,ygarcia@ballard-lopez.net,False,BASIC,30.59,2020-11-11,2020-11-13,141.61,"5678 Office Road\nSan Francisco, CA 94103",180096250673548
498,thomasdale@hall.com,False,BASIC,1.93,2020-07-16,2020-07-18,136.92,"5678 Office Road\nSan Francisco, CA 94103",4488223821722


In [6]:
from sdv.single_table import GaussianCopulaSynthesizer

# Create the synthesizer from the demo metadata. Nothing is learned yet:
# fitting happens in a later cell via synthesizer.fit(...).
synthesizer = GaussianCopulaSynthesizer(metadata=metadata)

In [7]:
# Fit the synthesizer on the real table; sampling in the next cell relies on
# this call having run first.
synthesizer.fit(real_data)

In [8]:
# Draw 500 synthetic rows from the fitted synthesizer and preview the first few.
# NOTE(review): in the recorded preview some rows have checkout_date earlier
# than checkin_date (e.g. row 0: 27 Mar 2020 -> 09 Mar 2020). If that ordering
# matters downstream, consider constraining the two columns before fitting;
# check the constraint API of the installed SDV version.
synthetic_data = synthesizer.sample(num_rows=500)

synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,False,BASIC,0.29,27 Mar 2020,09 Mar 2020,135.15,"90469 Karla Knolls Apt. 781\nSusanberg, CA 70033",5161033759518983
1,steven59@example.org,False,DELUXE,8.15,07 Sep 2020,25 Jun 2020,183.24,"6108 Carla Ports Apt. 116\nPort Evan, MI 71694",4133047413145475690
2,brandon15@example.net,False,BASIC,11.65,22 Mar 2020,01 Apr 2020,163.57,86709 Jeremy Manors Apt. 786\nPort Garychester...,4977328103788
3,humphreyjennifer@example.net,False,BASIC,48.12,04 Jun 2020,14 May 2020,127.75,"8906 Bobby Trail\nEast Sandra, NY 43986",3524946844839485
4,joshuabrown@example.net,False,DELUXE,11.07,08 Jan 2020,13 Jan 2020,180.12,"732 Dennis Lane\nPort Nicholasstad, DE 49786",4446905799576890978


In [9]:
from sdv.evaluation.single_table import run_diagnostic

# Run the diagnostic report comparing the synthetic table against the real one
# under the shared metadata. The recorded output reports two stages
# (Data Validity, Data Structure) and their average as the overall score.
diagnostic = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

Generating report ...

(1/2) Evaluating Data Validity: |███████████████████████████████████████████████████████| 9/9 [00:00<00:00, 671.42it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 180.56it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [10]:
from sdv.evaluation.single_table import evaluate_quality

# Build the quality report for the synthetic table. Arguments are passed by
# keyword, mirroring the run_diagnostic call, so each table's role is explicit.
# The recorded output reports Column Shapes and Column Pair Trends scores.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |███████████████████████████████████████████████████████| 9/9 [00:00<00:00, 474.34it/s]|
Column Shapes Score: 90.06%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 36/36 [00:00<00:00, 270.57it/s]|
Column Pair Trends Score: 89.29%

Overall Score (Average): 89.68%



In [11]:
# Show the per-column breakdown behind the 'Column Shapes' score.
# In the recorded output the lowest scores are room_rate (0.742) and
# amenities_fee (~0.765); the remaining listed columns score above 0.96.
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score
0,has_rewards,TVComplement,0.982
1,room_type,TVComplement,0.984
2,amenities_fee,KSComplement,0.764778
3,checkin_date,KSComplement,0.962
4,checkout_date,KSComplement,0.96875
5,room_rate,KSComplement,0.742


In [12]:
# Columns this notebook treats as sensitive; the list is reused by the next
# cell to show the same columns from the synthetic table.
sensitive_column_names = [
    'guest_email',
    'billing_address',
    'credit_card_number',
]

# Preview the first three real rows, restricted to those columns.
real_data[sensitive_column_names].head(3)

Unnamed: 0,guest_email,billing_address,credit_card_number
0,michaelsanders@shaw.net,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380


In [13]:
# Same three-row preview for the synthetic table, using the column list
# defined in the previous cell (that cell must have run first).
# In the recorded output these values differ from the real rows shown for
# the same columns (e.g. the emails use example.net / example.org domains).
synthetic_data[sensitive_column_names].head(3)

Unnamed: 0,guest_email,billing_address,credit_card_number
0,dsullivan@example.net,"90469 Karla Knolls Apt. 781\nSusanberg, CA 70033",5161033759518983
1,steven59@example.org,"6108 Carla Ports Apt. 116\nPort Evan, MI 71694",4133047413145475690
2,brandon15@example.net,86709 Jeremy Manors Apt. 786\nPort Garychester...,4977328103788


In [14]:
from sdv.evaluation.single_table import get_column_plot

# Build a figure for the 'room_rate' column from the real and synthetic tables.
# room_rate has the lowest Column Shapes score in the recorded report (0.742),
# which makes it a natural column to inspect visually.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='room_rate',
    metadata=metadata
)

# Render the figure as the cell output.
fig.show()

In [15]:
from sdv.evaluation.single_table import get_column_pair_plot

# Build a figure for the (checkin_date, checkout_date) column pair from the
# real and synthetic tables. Note that `fig` is rebound here, replacing the
# figure object created by the previous plotting cell.
# NOTE(review): the recorded synthetic preview contains rows whose checkout
# precedes checkin (e.g. 27 Mar 2020 -> 09 Mar 2020); check this plot for it.
fig = get_column_pair_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_names=['checkin_date', 'checkout_date'],
    metadata=metadata
)

# Render the figure as the cell output.
fig.show()