In [34]:
import pandas as pd
import tensorflow as tf
from sdmetrics.reports.single_table import QualityReport
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer


In [60]:
# Load the cleaned training set into a pandas DataFrame.
# A forward slash keeps this relative path portable across Windows, Linux
# and macOS (a backslash separator only resolves on Windows).
path = "data/pokemon_train_set.csv"
df = pd.read_csv(path, encoding="UTF-8")
df.head()

Unnamed: 0,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Evolution
0,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,Basic
1,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,Stage1
2,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,Stage2
3,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,Mega
4,Charmander,Fire,,39,52,43,60,50,65,1,False,Basic


In [61]:
# Load every CSV found in the data folder through SDV's CSV loader,
# forwarding the parsing options to the underlying CSV reader.
datasets = load_csvs(
    folder_name='data/',
    read_csv_parameters=dict(skipinitialspace=True, encoding='utf-8'),
)

# Each table is stored under its file name; pick the training set.
data = datasets['pokemon_train_set']

In [62]:
# Create metadata
# Start from an empty single-table metadata object and populate it from the
# training DataFrame via detect_from_dataframe.
# NOTE(review): the detected column types are never inspected here and may
# differ from the hand-written `metadata_dict` used for evaluation - confirm.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)


In [63]:
# Try the Gaussian copula synthesizer

# Step 1: Create the synthesizer.
# It is kept under a model-specific name so that re-running cells out of
# order can never fit or sample a different model by accident.
gaussian_synthesizer = GaussianCopulaSynthesizer(metadata)

# Step 2: Train the synthesizer on the training DataFrame
gaussian_synthesizer.fit(df)

# Step 3: Generate 100 synthetic rows
synthetic_data_gaussian = gaussian_synthesizer.sample(num_rows=100)



In [76]:
# Configure the CTGAN synthesizer; training happens in a separate cell.
synthesizer = CTGANSynthesizer(
    metadata,  # required: table description built earlier in the notebook
    epochs=1500,
    enforce_min_max_values=True,
    enforce_rounding=False,
    verbose=True,
)

In [77]:
# Training and Generating Data
# Fit whichever model `synthesizer` currently refers to (the CTGAN model when
# the cells are run in notebook order) on the training DataFrame.

synthesizer.fit(df)

# Draw 400 synthetic rows from the fitted model.
synthetic_data_ctgan = synthesizer.sample(num_rows=400)

  col for col, dtype in data.dtypes.items() if pd.api.types.is_categorical_dtype(dtype)
Gen. (-1.69) | Discrim. (-0.11): 100%|██████████| 1500/1500 [04:08<00:00,  6.05it/s]


In [66]:
# Preview the first rows of the CTGAN sample as a sanity check.
synthetic_data_ctgan.head()

Unnamed: 0,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Evolution
0,sdv-pii-64jr8,Dragon,Ground,67,13,52,28,85,39,4,False,Stage1
1,sdv-pii-xnuld,Psychic,Dragon,5,58,66,41,21,86,2,False,Stage2
2,sdv-pii-23pgo,Ground,Flying,37,5,186,32,44,152,4,False,Stage1
3,sdv-pii-8mult,Bug,Steel,31,6,5,72,29,54,3,False,Basic
4,sdv-pii-pjbi7,Fire,Ground,49,46,130,183,28,99,4,True,Basic


In [67]:
# Persist both synthetic samples as UTF-8 CSV files, leaving out the index.
synthetic_data_ctgan.to_csv("CTGAN_Pokemon.csv", index=False, encoding="UTF-8")
synthetic_data_gaussian.to_csv("Gaussian_Pokemon.csv", index=False, encoding="UTF-8")

In [68]:
# Quick model evaluation
# QualityReport is imported with the other dependencies in the first cell,
# so this cell only creates the (still empty) report object.
report = QualityReport()

In [71]:
# Hand-written single-table metadata (one sdtype per column) that is handed
# to the quality report together with the real and synthetic tables.
metadata_dict = {
    "columns": {
        # identifier column, also declared as the primary key below
        "Name": {"sdtype": "unknown"},
        # type labels
        "Type 1": {"sdtype": "categorical"},
        "Type 2": {"sdtype": "categorical"},
        # base stats
        "HP": {"sdtype": "numerical"},
        "Attack": {"sdtype": "numerical"},
        "Defense": {"sdtype": "numerical"},
        "Sp. Atk": {"sdtype": "numerical"},
        "Sp. Def": {"sdtype": "numerical"},
        "Speed": {"sdtype": "numerical"},
        # remaining columns are all treated as categories
        "Generation": {"sdtype": "categorical"},
        "Legendary": {"sdtype": "categorical"},
        "Evolution": {"sdtype": "categorical"},
    },
    "primary_key": "Name",
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
}


In [72]:
# Score the CTGAN sample against the real training data.
report.generate(
    real_data=df,
    synthetic_data=synthetic_data_ctgan,
    metadata=metadata_dict,
)

Generating report ...

(1/2) Evaluating Column Shapes: |          | 0/12 [00:00<?, ?it/s]|

(1/2) Evaluating Column Shapes: |██████████| 12/12 [00:00<00:00, 124.61it/s]|
Column Shapes Score: 76.16%

(2/2) Evaluating Column Pair Trends: |██████████| 66/66 [00:02<00:00, 27.02it/s]|
Column Pair Trends Score: 70.06%

Overall Score (Average): 73.11%



In [73]:
# Overall score of ~73% is not very good -> ideally it should be close to 100%

In [74]:
# Per-column breakdown of the 'Column Shapes' property for the report that was
# generated most recently (the CTGAN report when cells run in notebook order).
report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,Type 1,TVComplement,0.865
1,Type 2,TVComplement,0.899574
2,HP,KSComplement,0.535
3,Attack,KSComplement,0.6275
4,Defense,KSComplement,0.78875
5,Sp. Atk,KSComplement,0.615
6,Sp. Def,KSComplement,0.64125
7,Speed,KSComplement,0.6725
8,Generation,TVComplement,0.83
9,Legendary,TVComplement,0.99125


In [75]:
# Score the Gaussian copula sample against the real training data.
# NOTE(review): this reuses the same `report` object, presumably replacing
# the results of the previous generate() call - confirm.
report.generate(
    real_data=df,
    synthetic_data=synthetic_data_gaussian,
    metadata=metadata_dict,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 12/12 [00:00<00:00, 354.53it/s]|
Column Shapes Score: 90.06%

(2/2) Evaluating Column Pair Trends: |██████████| 66/66 [00:01<00:00, 50.55it/s]| 
Column Pair Trends Score: 75.38%

Overall Score (Average): 82.72%

