In [6]:
!pip install -q -r requirements.txt

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sdv
from ucimlrepo import fetch_ucirepo
from DataSynthesizer import DataSynthesizer
from metrics import evaluate

# Credit Default Dataset

### Data Preprocessing

In [6]:
!unzip data/default\ of\ credit\ card\ clients.zip -d data

Archive:  data/default of credit card clients.zip
replace data/default of credit card clients.xls? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [107]:
data = pd.read_excel('data/default of credit card clients.xls', skiprows=[0])

In [108]:
data = data.drop(columns=['ID'])

In [109]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [110]:
label = 'default payment next month'

In [111]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [112]:
num_samples = train.shape[0]

In [113]:
num_samples

24000

### Directory Structure

In [114]:
!mkdir images

mkdir: cannot create directory ‘images’: File exists


In [115]:
!mkdir images/credit

mkdir: cannot create directory ‘images/credit’: File exists


In [116]:
!mkdir images/credit/tvae

mkdir: cannot create directory ‘images/credit/tvae’: File exists


In [117]:
!mkdir images/credit/ctgan

mkdir: cannot create directory ‘images/credit/ctgan’: File exists


In [118]:
!mkdir images/credit/findiff

mkdir: cannot create directory ‘images/credit/findiff’: File exists


In [119]:
!mkdir images/credit/tabula

mkdir: cannot create directory ‘images/credit/tabula’: File exists


### TVAE

In [120]:
tvae = DataSynthesizer(data, "tvae")

Initializing TVAE synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [121]:
tvae.fit()

Fitting TVAE synthesizer



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



In [122]:
synthetic_data_tvae = tvae.sample(num_samples)

Sampling from TVAE synthesizer


In [123]:
synthetic_data_tvae.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,319702,2,1,2,28,-2,-2,-2,-2,-2,...,13751,346,30297,26115,8186,6567,2643,9010,15189,0
1,55300,2,2,1,43,0,0,0,0,0,...,21804,24765,44160,4737,3500,217,381,2837,1618,0
2,187559,1,2,1,50,0,0,0,0,0,...,111820,111610,76770,3813,4051,5512,7438,4218,3668,0
3,204109,2,2,1,45,-1,-1,-1,-1,-1,...,1174,1520,1244,1211,243,0,0,170,0,0
4,14815,2,2,1,37,0,0,0,0,0,...,968,178,2043,25,2560,377,0,289,98,0


In [124]:
evaluate(train, test, synthetic_data_tvae, label=label, visualization=True, directory="images/credit/tvae")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9159704861111111
Row Fidelity: 0.6826595898730077
Privacy Score: 0.9954047948122025
Utility Score: 0.8051666666666667


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9159704861111111,
 'row_fidelity': 0.6826595898730077,
 'privacy_score': 0.9954047948122025,
 'utility_score': 0.8051666666666667}

### CTGAN

In [125]:
ctgan = DataSynthesizer(data, "ctgan")

Initializing CTGAN synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [126]:
ctgan.fit()

Fitting CTGAN synthesizer



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



In [127]:
synthetic_data_ctgan = ctgan.sample(num_samples)

Sampling from CTGAN synthesizer


In [128]:
synthetic_data_ctgan.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,83892,2,2,3,24,0,0,0,0,0,...,13694,29252,8698,302,3744,2004,2739,2042,7507,0
1,138890,1,2,1,52,0,0,0,0,0,...,79458,77819,47363,3766,4387,4338,3354,793,969,1
2,195184,2,1,1,38,-1,-1,-1,-1,-1,...,2474,1261,1957,5008,50253,5159,5350,5678,1810,0
3,168597,2,2,2,34,-2,-1,-2,-2,-2,...,8481,824,460,4004,803,1644,6880,4876,1573,0
4,77652,2,3,0,55,-1,0,0,0,0,...,20908,51596,13494,2563,3160,3172,1657,3456,1851,0


In [129]:
evaluate(train, test, synthetic_data_ctgan, label=label, visualization=True, directory="images/credit/ctgan")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9160347222222223
Row Fidelity: 0.6865705123683858
Privacy Score: 0.9927716497331858
Utility Score: 0.7971666666666667


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9160347222222223,
 'row_fidelity': 0.6865705123683858,
 'privacy_score': 0.9927716497331858,
 'utility_score': 0.7971666666666667}

### FinDiff

In [130]:
findiff = DataSynthesizer(train, "findiff")

Initializing FinDiff synthesizer


In [131]:
findiff.fit()

Fitting FinDiff synthesizer


[LOG 2024-05-27 01:00:02] epoch: 0029, train-loss: 0.27899778: 100%|██████████| 30/30 [00:45<00:00,  1.53s/it]


In [132]:
synthetic_data_findiff = findiff.sample(num_samples)

Sampling from FinDiff synthesizer


[LOG 2024-05-27 01:00:06] Diffusion Step: 0000: : 10it [00:04,  2.44it/s]

X does not have valid feature names, but QuantileTransformer was fitted with feature names



In [133]:
synthetic_data_findiff.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,290000,1,3,0,33,1,0,-1,0,0,...,79194,0,17336,15343,1654,0,500,1000,870,1
1,30000,1,5,0,22,-1,0,0,-1,0,...,25527,11419,14159,0,447,2000,5035,2000,1000,1
2,110000,2,3,0,51,-1,-1,-2,-1,-1,...,169,2612,20157,1306,10469,15000,492,0,6653,1
3,100000,1,2,0,35,2,0,0,0,-1,...,5527,29724,34642,0,2146,1617,5961,1497,781,1
4,462977,1,0,0,46,-1,0,-2,-2,0,...,138033,18771,2677,3500,5255,45766,0,2824,2001,1


In [134]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [135]:
evaluate(train, test, synthetic_data_findiff, label=label, visualization=True, directory="images/credit/findiff")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.8973732638888888
Row Fidelity: 0.6104828585903839
Privacy Score: 0.974627610296011
Utility Score: 0.21883333333333332


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.8973732638888888,
 'row_fidelity': 0.6104828585903839,
 'privacy_score': 0.974627610296011,
 'utility_score': 0.21883333333333332}

### TabuLa

In [136]:
tabula = DataSynthesizer(train, "tabula")

Initializing Tabula synthesizer



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [137]:
tabula.fit()

Fitting Tabula synthesizer


Step,Training Loss
500,2.119
1000,1.6474
1500,1.5592
2000,1.5026
2500,1.4747
3000,1.4572
3500,1.4285
4000,1.4256
4500,1.4043
5000,1.3842


In [138]:
synthetic_data_tabula = tabula.sample(num_samples)

Sampling from Tabula synthesizer


24066it [04:09, 96.55it/s]


In [139]:
synthetic_data_tabula.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,70000.0,1.0,2.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,...,42096.0,4403.0,40212.0,2500.0,2000.0,2000.0,2000.0,3000.0,2000.0,0.0
1,50000.0,2.0,2.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,...,89111.0,91284.0,96273.0,4000.0,3248.0,3166.0,7000.0,3500.0,10000.0,0.0
2,80000.0,2.0,2.0,2.0,34.0,-1.0,-1.0,-1.0,-1.0,0.0,...,11069.0,10068.0,7957.0,14587.0,7749.0,11069.0,201.0,2000.0,0.0,0.0
3,30000.0,2.0,2.0,1.0,38.0,-1.0,-1.0,-1.0,0.0,0.0,...,1048.0,1768.0,948.0,5772.0,360.0,0.0,0.0,0.0,0.0,0.0
4,110000.0,1.0,1.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,...,47837.0,48260.0,49553.0,3000.0,3000.0,2000.0,3000.0,3000.0,3000.0,0.0


In [140]:
evaluate(train, test, synthetic_data_tabula, label=label, visualization=True, directory="images/credit/tabula")

Data Validity Score: 0.9995520833333332
Data Structure Score: 1.0
Column Fidelity: 0.8852534722245325
Row Fidelity: 0.6271373481820456
Privacy Score: 0.9987639207392931
Utility Score: 0.8118333333333333


{'data_validity_score': 0.9995520833333332,
 'data_structure_score': 1.0,
 'column_fidelity': 0.8852534722245325,
 'row_fidelity': 0.6271373481820456,
 'privacy_score': 0.9987639207392931,
 'utility_score': 0.8118333333333333}

# Rice Classification Dataset

### Data Preprocessing

In [8]:
rice_cammeo_and_osmancik = fetch_ucirepo(id=545)

In [9]:
X = rice_cammeo_and_osmancik.data.features
y = rice_cammeo_and_osmancik.data.targets
data = pd.concat([X, y], axis=1)

In [10]:
data.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo


In [11]:
label = 'Class'

In [12]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
num_samples = train.shape[0]

In [14]:
num_samples

3048

### Directory Structure

In [15]:
!mkdir images

In [16]:
!mkdir images/rice

In [17]:
!mkdir images/rice/tvae

In [18]:
!mkdir images/rice/ctgan

In [19]:
!mkdir images/rice/findiff

In [20]:
!mkdir images/rice/tabula

### TVAE

In [21]:
tvae = DataSynthesizer(data, "tvae")

Initializing TVAE synthesizer




In [22]:
tvae.fit()

Fitting TVAE synthesizer


  pid = os.fork()


In [23]:
synthetic_data_tvae = tvae.sample(num_samples)

Sampling from TVAE synthesizer


In [24]:
synthetic_data_tvae.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,13759,486.393293,202.209772,86.97011,0.903903,14076,0.607758,Cammeo
1,11819,417.917737,173.505496,86.55104,0.843816,12241,0.714305,Osmancik
2,14628,480.293794,203.432967,95.441687,0.889452,14946,0.794663,Cammeo
3,12709,438.053341,177.30194,90.036869,0.855265,13188,0.780136,Osmancik
4,13434,468.022081,190.058772,92.039016,0.871129,13837,0.801722,Cammeo


In [25]:
evaluate(train, test, synthetic_data_tvae, label=label, visualization=True, directory="images/rice/tvae")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9534940944881889
Row Fidelity: 0.7251515903899086
Privacy Score: 0.9851025762036443
Utility Score: 0.9278215223097113


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9534940944881889,
 'row_fidelity': 0.7251515903899086,
 'privacy_score': 0.9851025762036443,
 'utility_score': 0.9278215223097113}

### CTGAN

In [26]:
ctgan = DataSynthesizer(data, "ctgan")

Initializing CTGAN synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [27]:
ctgan.fit()

Fitting CTGAN synthesizer


In [28]:
synthetic_data_ctgan = ctgan.sample(num_samples)

Sampling from CTGAN synthesizer


In [29]:
synthetic_data_ctgan.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,11710,413.359538,173.302066,82.801039,0.856694,13219,0.637124,Osmancik
1,15216,478.641865,193.976793,98.066235,0.906942,15997,0.565663,Cammeo
2,13124,394.207632,178.984848,88.90271,0.885306,10978,0.596614,Osmancik
3,14568,473.609619,222.367146,101.309163,0.892569,13761,0.514421,Cammeo
4,15049,446.965405,162.861546,86.577197,0.865252,11935,0.589289,Osmancik


In [30]:
evaluate(train, test, synthetic_data_ctgan, label=label, visualization=True, directory="images/rice/ctgan")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.8841043307086613
Row Fidelity: 0.6858827265297867
Privacy Score: 0.96873489767313
Utility Score: 0.9225721784776902


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.8841043307086613,
 'row_fidelity': 0.6858827265297867,
 'privacy_score': 0.96873489767313,
 'utility_score': 0.9225721784776902}

### FinDiff

In [31]:
findiff = DataSynthesizer(train, "findiff")

Initializing FinDiff synthesizer


In [32]:
findiff.fit()

Fitting FinDiff synthesizer


[LOG 2024-05-27 07:56:17] epoch: 0029, train-loss: 0.23659146: 100%|██████████| 30/30 [00:05<00:00,  5.24it/s]


In [33]:
synthetic_data_findiff = findiff.sample(num_samples)

Sampling from FinDiff synthesizer


[LOG 2024-05-27 07:56:18] Diffusion Step: 0000: : 10it [00:00, 24.08it/s]

X does not have valid feature names, but QuantileTransformer was fitted with feature names



In [34]:
synthetic_data_findiff.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,14847,441.553345,226.78746,84.84269,0.923866,11510,0.569836,Osmancik
1,13077,416.727081,224.566025,82.714729,0.888116,15627,0.703681,Cammeo
2,12741,488.908081,181.523712,89.191452,0.875773,16304,0.701691,Cammeo
3,10157,451.775543,214.150482,83.132286,0.872224,11082,0.582853,Osmancik
4,11492,487.390106,192.241913,83.142677,0.876527,11497,0.642816,Osmancik


In [36]:
evaluate(train, test, synthetic_data_findiff, label=label, visualization=True, directory="images/rice/findiff")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9533300524934383
Row Fidelity: 0.5567000965581385
Privacy Score: 0.9563017040491104
Utility Score: 0.5183727034120735


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9533300524934383,
 'row_fidelity': 0.5567000965581385,
 'privacy_score': 0.9563017040491104,
 'utility_score': 0.5183727034120735}

### TabuLa

In [37]:
tabula = DataSynthesizer(train, "tabula")

Initializing Tabula synthesizer




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [38]:
tabula.fit()

Fitting Tabula synthesizer


Step,Training Loss
500,3.2324
1000,2.4257
1500,2.0457
2000,1.6515
2500,1.3022
3000,1.0322
3500,0.8391
4000,0.7058
4500,0.6204


In [39]:
synthetic_data_tabula = tabula.sample(num_samples)

Sampling from Tabula synthesizer


3114it [00:27, 113.46it/s]


In [40]:
synthetic_data_tabula.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,10803.0,12.057007,171.396243,84.949575,0.879059,12755.0,0.626924,Osmancik
1,12172.0,436.184998,177.93849,86.000385,0.872734,12771.0,0.67607,Osmancik
2,13589.0,479.647003,199.9083,93.505301,0.892444,14275.0,0.625423,Cammeo
3,10367.0,421.822998,176.590719,79.34763,0.880328,11581.0,0.624811,Osmancik
4,10765.0,426.677002,163.852051,84.60763,0.851069,11302.0,0.695066,Osmancik


In [41]:
evaluate(train, test, synthetic_data_tabula, label=label, visualization=True, directory="images/rice/tabula")

Data Validity Score: 0.9688320209973753
Data Structure Score: 1.0
Column Fidelity: 0.916871719160105
Row Fidelity: 0.5669173885869342
Privacy Score: 0.997712844517082
Utility Score: 0.9173228346456693


{'data_validity_score': 0.9688320209973753,
 'data_structure_score': 1.0,
 'column_fidelity': 0.916871719160105,
 'row_fidelity': 0.5669173885869342,
 'privacy_score': 0.997712844517082,
 'utility_score': 0.9173228346456693}

# Car Evaluation Dataset

### Data Preprocessing

In [175]:
car_evaluation = fetch_ucirepo(id=19)

In [176]:
X = car_evaluation.data.features
y = car_evaluation.data.targets

data = pd.concat([X, y], axis=1)

In [177]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [178]:
label = 'class'

In [179]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [180]:
num_samples = train.shape[0]

In [181]:
num_samples

1382

### Directory Structure

In [182]:
!mkdir images

mkdir: cannot create directory ‘images’: File exists


In [183]:
!mkdir images/car

mkdir: cannot create directory ‘images/car’: File exists


In [184]:
!mkdir images/car/tvae

mkdir: cannot create directory ‘images/car/tvae’: File exists


In [185]:
!mkdir images/car/ctgan

mkdir: cannot create directory ‘images/car/ctgan’: File exists


In [186]:
!mkdir images/car/findiff

mkdir: cannot create directory ‘images/car/findiff’: File exists


In [187]:
!mkdir images/car/tabula

mkdir: cannot create directory ‘images/car/tabula’: File exists


### TVAE

In [188]:
tvae = DataSynthesizer(data, "tvae")

Initializing TVAE synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [189]:
tvae.fit()

Fitting TVAE synthesizer



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



In [190]:
synthetic_data_tvae = tvae.sample(num_samples)

Sampling from TVAE synthesizer


In [191]:
synthetic_data_tvae.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,high,5more,2,big,low,unacc
1,med,high,2,4,small,med,unacc
2,vhigh,med,2,2,big,low,unacc
3,vhigh,low,3,4,med,low,unacc
4,vhigh,med,5more,4,big,low,unacc


In [192]:
evaluate(train, test, synthetic_data_tvae, label=label, visualization=True, directory="images/car/tvae")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9659913169319826
Row Fidelity: 0.9252635931362414
Privacy Score: 1.0
Utility Score: 0.7976878612716763


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9659913169319826,
 'row_fidelity': 0.9252635931362414,
 'privacy_score': 1.0,
 'utility_score': 0.7976878612716763}

### CTGAN

In [193]:
ctgan = DataSynthesizer(data, "ctgan")

Initializing CTGAN synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [194]:
ctgan.fit()

Fitting CTGAN synthesizer


In [195]:
synthetic_data_ctgan = ctgan.sample(num_samples)

Sampling from CTGAN synthesizer


In [196]:
synthetic_data_ctgan.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,med,2,4,big,med,acc
1,low,med,2,2,small,low,unacc
2,high,med,4,4,big,med,unacc
3,high,vhigh,5more,2,big,low,unacc
4,low,med,4,2,med,low,unacc


In [197]:
evaluate(train, test, synthetic_data_ctgan, label=label, visualization=True, directory="images/car/ctgan")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.8782303080421748
Row Fidelity: 0.8205154710219834
Privacy Score: 0.8571428507566452
Utility Score: 0.6473988439306358


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.8782303080421748,
 'row_fidelity': 0.8205154710219834,
 'privacy_score': 0.8571428507566452,
 'utility_score': 0.6473988439306358}

### FinDiff

In [198]:
findiff = DataSynthesizer(train, "findiff")

Initializing FinDiff synthesizer


In [199]:
findiff.fit()

Fitting FinDiff synthesizer


[LOG 2024-05-27 02:38:40] epoch: 0029, train-loss: 0.37690958: 100%|██████████| 30/30 [00:02<00:00, 10.86it/s]


In [200]:
synthetic_data_findiff = findiff.sample(num_samples)

Sampling from FinDiff synthesizer


[LOG 2024-05-27 02:38:41] Diffusion Step: 0000: : 10it [00:00, 48.33it/s]


In [201]:
synthetic_data_findiff.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,med,med,5more,more,med,med,good
1,high,med,3,more,big,low,vgood
2,low,med,5more,4,med,low,good
3,med,med,3,more,med,low,good
4,med,low,3,2,big,med,unacc


In [202]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [203]:
evaluate(train, test, synthetic_data_findiff, label=label, visualization=True, directory="images/car/findiff")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.7644200951002688
Row Fidelity: 0.6502308593480808
Privacy Score: 0.8571428507566452
Utility Score: 0.22832369942196531


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.7644200951002688,
 'row_fidelity': 0.6502308593480808,
 'privacy_score': 0.8571428507566452,
 'utility_score': 0.22832369942196531}

### TabuLa

In [204]:
tabula = DataSynthesizer(train, "tabula")

Initializing Tabula synthesizer



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [205]:
tabula.fit()

Fitting Tabula synthesizer


Step,Training Loss
500,0.8565
1000,0.638
1500,0.6302
2000,0.6262


In [206]:
synthetic_data_tabula = tabula.sample(num_samples)

Sampling from Tabula synthesizer


1400it [00:06, 210.00it/s]


In [207]:
synthetic_data_tabula.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5more,more,med,med,acc
1,med,vhigh,2,2,small,high,unacc
2,low,vhigh,5more,more,big,high,acc
3,med,high,3,4,med,low,unacc
4,high,med,3,4,big,low,unacc


In [208]:
evaluate(train, test, synthetic_data_tabula, label=label, visualization=True, directory="images/car/tabula")

Data Validity Score: 0.7798222038453586
Data Structure Score: 1.0
Column Fidelity: 0.7691751094425747
Row Fidelity: 0.5710495486182896
Privacy Score: 0.8571428507566452
Utility Score: 0.8815028901734104


{'data_validity_score': 0.7798222038453586,
 'data_structure_score': 1.0,
 'column_fidelity': 0.7691751094425747,
 'row_fidelity': 0.5710495486182896,
 'privacy_score': 0.8571428507566452,
 'utility_score': 0.8815028901734104}

# Adult Census Income Dataset

### Data Preprocessing

In [2]:
adult = fetch_ucirepo(id=2)

In [3]:
X = adult.data.features
y = adult.data.targets

data = pd.concat([X, y], axis=1)

In [4]:
data.isna().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [6]:
data = data.dropna(subset=['workclass', 'occupation', 'native-country'])

In [7]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
label = 'income'

In [10]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [11]:
num_samples = train.shape[0]

In [12]:
num_samples

38096

### Directory Structure

In [68]:
!mkdir images

mkdir: cannot create directory ‘images’: File exists


In [13]:
!mkdir images/income

In [14]:
!mkdir images/income/tvae

In [15]:
!mkdir images/income/ctgan

In [16]:
!mkdir images/income/findiff

In [17]:
!mkdir images/income/tabula

### TVAE

In [18]:
tvae = DataSynthesizer(data, "tvae")

Initializing TVAE synthesizer




In [19]:
tvae.fit()

Fitting TVAE synthesizer


  pid = os.fork()


In [20]:
synthetic_data_tvae = tvae.sample(num_samples)

Sampling from TVAE synthesizer


In [21]:
synthetic_data_tvae.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,56,State-gov,117218,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1,64,United-States,>50K
1,52,Private,76802,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,1,0,40,United-States,<=50K.
2,60,Self-emp-not-inc,148824,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,45,United-States,<=50K
3,51,Federal-gov,204549,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,50,United-States,<=50K
4,57,Federal-gov,149927,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K


In [23]:
evaluate(train, test, synthetic_data_tvae, label=label, visualization=True, directory="images/income/tvae")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.9156534369312614
Row Fidelity: 0.43918605279632067
Privacy Score: 0.9905765671283007
Utility Score: 0.5408923884514436


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.9156534369312614,
 'row_fidelity': 0.43918605279632067,
 'privacy_score': 0.9905765671283007,
 'utility_score': 0.5408923884514436}

### CTGAN

In [24]:
ctgan = DataSynthesizer(data, "ctgan")

Initializing CTGAN synthesizer



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [25]:
ctgan.fit()

Fitting CTGAN synthesizer



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



In [26]:
synthetic_data_ctgan = ctgan.sample(num_samples)

Sampling from CTGAN synthesizer


In [27]:
synthetic_data_ctgan.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,26,Private,208916,HS-grad,9,Never-married,Transport-moving,Other-relative,White,Male,0,1,35,Canada,<=50K
1,27,Private,120176,Bachelors,13,Never-married,Other-service,Not-in-family,White,Female,2,0,40,United-States,<=50K
2,17,Private,194439,11th,7,Never-married,Adm-clerical,Own-child,White,Male,0,0,23,United-States,<=50K
3,17,Federal-gov,88188,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,13,United-States,<=50K
4,22,?,213981,Some-college,10,Never-married,?,Not-in-family,White,Female,5,0,40,United-States,<=50K


In [28]:
evaluate(train, test, synthetic_data_ctgan, label=label, visualization=True, directory="images/income/ctgan")

Data Validity Score: 0.9999667506649867
Data Structure Score: 1.0
Column Fidelity: 0.8908319333629315
Row Fidelity: 0.4345198456687712
Privacy Score: 0.9870952107012272
Utility Score: 0.5463517060367454


{'data_validity_score': 0.9999667506649867,
 'data_structure_score': 1.0,
 'column_fidelity': 0.8908319333629315,
 'row_fidelity': 0.4345198456687712,
 'privacy_score': 0.9870952107012272,
 'utility_score': 0.5463517060367454}

### FinDiff

In [29]:
findiff = DataSynthesizer(train, "findiff")

Initializing FinDiff synthesizer


In [30]:
findiff.fit()

Fitting FinDiff synthesizer


[LOG 2024-05-27 10:42:41] epoch: 0029, train-loss: 0.24558991: 100%|██████████| 30/30 [01:11<00:00,  2.40s/it]


In [31]:
synthetic_data_findiff = findiff.sample(num_samples)

Sampling from FinDiff synthesizer


[LOG 2024-05-27 10:42:47] Diffusion Step: 0000: : 10it [00:05,  1.70it/s]

X does not have valid feature names, but QuantileTransformer was fitted with feature names



In [32]:
synthetic_data_findiff.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,26,Without-pay,88817,Assoc-voc,14,Widowed,?,Not-in-family,Black,Female,0,0,40,Hungary,<=50K.
1,43,Private,182490,Assoc-voc,9,Married-civ-spouse,Other-service,Not-in-family,Amer-Indian-Eskimo,Female,0,0,42,Vietnam,<=50K.
2,48,Federal-gov,180177,Preschool,6,Widowed,Other-service,Not-in-family,Black,Female,0,0,50,Columbia,<=50K.
3,27,Federal-gov,169888,Prof-school,9,Married-spouse-absent,Priv-house-serv,Not-in-family,Black,Female,0,0,40,Outlying-US(Guam-USVI-etc),<=50K.
4,47,State-gov,68829,12th,9,Married-spouse-absent,?,Not-in-family,Black,Female,2596,0,45,South,>50K.


In [34]:
evaluate(train, test, synthetic_data_findiff, label=label, visualization=True, directory="images/income/findiff")

Data Validity Score: 1.0
Data Structure Score: 1.0
Column Fidelity: 0.6593220635587289
Row Fidelity: 0.22870656877773393
Privacy Score: 0.7754897326231003
Utility Score: 0.2358005249343832


{'data_validity_score': 1.0,
 'data_structure_score': 1.0,
 'column_fidelity': 0.6593220635587289,
 'row_fidelity': 0.22870656877773393,
 'privacy_score': 0.7754897326231003,
 'utility_score': 0.2358005249343832}

### TabuLa

In [35]:
tabula = DataSynthesizer(train, "tabula")

Initializing Tabula synthesizer




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [36]:
tabula.fit()

Fitting Tabula synthesizer


Step,Training Loss
500,0.7509
1000,0.4351
1500,0.4231
2000,0.418
2500,0.4134
3000,0.4102
3500,0.4087
4000,0.403
4500,0.4027
5000,0.4


In [37]:
synthetic_data_tabula = tabula.sample(num_samples)

Sampling from Tabula synthesizer


38176it [06:13, 102.11it/s]


In [38]:
synthetic_data_tabula.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,20.0,Private,205839.0,Some-college,10.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,16.0,United-States,<=50K
1,36.0,Private,188834.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
2,32.0,Private,133861.0,Bachelors,13.0,Never-married,Exec-managerial,Not-in-family,White,Male,13550.0,0.0,48.0,United-States,>50K
3,34.0,Private,209900.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,55.0,United-States,>50K.
4,51.0,Private,171914.0,9th,5.0,Widowed,Craft-repair,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.


In [39]:
evaluate(train, test, synthetic_data_tabula, label=label, visualization=True, directory="images/income/tabula")

Data Validity Score: 0.5956215875682486
Data Structure Score: 1.0
Column Fidelity: 0.581425871744624
Row Fidelity: 0.17020115200227887
Privacy Score: 0.5999999940395355
Utility Score: 0.5589501312335958


{'data_validity_score': 0.5956215875682486,
 'data_structure_score': 1.0,
 'column_fidelity': 0.581425871744624,
 'row_fidelity': 0.17020115200227887,
 'privacy_score': 0.5999999940395355,
 'utility_score': 0.5589501312335958}