# Install libraries

In [None]:
!pip install sdv

In [None]:
import sdv
print(sdv.__version__)

In [None]:
# !pip install sdmetrics

In [None]:
!pip install plotly>=5.10.0

In [None]:
!pip install -U kaleido

# Import libraries

In [None]:
from ctgan import CTGAN
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# import sdv
# from sdv.tabular import CTGAN
# from sdv.evaluation import evaluate
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
import plotly
import json
import kaleido
import time

In [None]:
!python --version

In [None]:
pd.__version__

In [None]:
plotly.__version__

# Global variables

In [None]:
# ------------- Input Data ------------- #
# CSV file holding the training table.
TRAIN_DF_PATH = 'data/train_df.csv'

# ------------- Model Training ------------- #
# Training hyper-parameters.
EPOCHS, BATCH_SIZE = 1000, 100

# ------------- Output Locations ------------- #
# File paths for the pickled model and for the generated CSV.
MODEL_PATH = 'model/ctgan.pkl'
SAVING_PATH = 'results/ctgan.csv'

# ------------- Generating Data ------------- #
# Number of rows to generate.
GENERATING_SIZE = 7000

# Echo the output locations so they are visible in the notebook output.
print(f'{MODEL_PATH = }')
print(f'{SAVING_PATH = }')

# Training phase

## Import Training Set

In [None]:
# Read the training table from the CSV file configured in TRAIN_DF_PATH.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_DF_PATH)

In [None]:
train_df.info()

In [None]:
train_df.head()

## Train model

In [None]:
# Column names that hold discrete (non-continuous) values.
discrete_columns = [
    'sex',
    'age2',
    'edattain',
    'classwk',
    'marst',
    'geo2_th2000',
]

In [None]:
# Describe the training table: start from automatic detection ...
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_df)

# ... then explicitly declare the columns listed in discrete_columns as
# categorical. Automatic detection only looks at the data, so discrete columns
# may otherwise be modelled as continuous (NOTE(review): likely if they are
# stored as numeric codes -- confirm against train_df.info()).
for column_name in discrete_columns:
    metadata.update_column(column_name=column_name, sdtype='categorical')

synthesizer = CTGANSynthesizer(
    metadata=metadata,  # required
    enforce_rounding=False,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,  # use the configured value, not the library default
    verbose=True,
)

# Time the fit with perf_counter: it is monotonic, so the measured interval is
# not affected by system clock adjustments during a long training run.
st = time.perf_counter()

synthesizer.fit(train_df)

et = time.perf_counter()

# Report the training duration in seconds.
elapsed_time = et - st
print(f"Training time: {elapsed_time:.2f} seconds")

In [None]:
# Make sure the destination folder exists before persisting the trained
# synthesizer, so a missing directory cannot make the save fail after training.
from pathlib import Path

Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
synthesizer.save(MODEL_PATH)

# Generating phase

In [None]:
# Restore the synthesizer previously stored at MODEL_PATH.
model = CTGANSynthesizer.load(filepath=MODEL_PATH)

In [None]:
# Time the sampling step with perf_counter: it is monotonic, so the measured
# interval is not affected by system clock adjustments.
st = time.perf_counter()

# Draw GENERATING_SIZE synthetic rows from the loaded synthesizer.
synthetic_data = model.sample(num_rows=GENERATING_SIZE)

et = time.perf_counter()

# Report the generation duration in seconds.
elapsed_time = et - st
print(f"Generating time: {elapsed_time:.2f} seconds")

In [None]:
synthetic_data.head()

In [None]:
# Save the generated data. Create the destination folder first so that a
# missing directory does not make the CSV export fail.
from pathlib import Path

Path(SAVING_PATH).parent.mkdir(parents=True, exist_ok=True)
synthetic_data.to_csv(SAVING_PATH, index=False, header=True)
print(f'{SAVING_PATH = }')