In [32]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import wgan

# Root directory holding the Malawi data files. Set the MALAWI_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
malawi_path = Path(os.environ.get('MALAWI_DATA_DIR', '/home/selker/eop/data/malawi'))


In [33]:
# Load the 2019 Malawi parquet file (per its filename: cleaned, not one-hot encoded).
malawi_2019 = pd.read_parquet(
    malawi_path.joinpath('malawi_cleaned_2019_not_one_hot_encoded.parquet')
)

In [34]:
# Classify the columns of malawi_2019 as continuous or categorical, then
# integer-encode the categorical ones.
#
# The column lists are built in DataFrame column order rather than through set
# arithmetic: iterating a set of strings can change between interpreter sessions
# (string hash randomization), which made the order of these lists -- and hence
# what is handed to wgan later -- differ from one kernel restart to the next.
numeric_dtype_columns = malawi_2019.select_dtypes(include=[np.number]).columns
# Datetime columns are excluded here and are not numeric, so they end up in
# neither list.
non_numeric_dtype_columns = malawi_2019.select_dtypes(
    exclude=[np.number, np.datetime64]
).columns

# Numeric columns whose name ends in '_nan' are forced to be categorical.
enforced_categorical = {c for c in numeric_dtype_columns if c.endswith('_nan')}
categorical_members = set(non_numeric_dtype_columns) | enforced_categorical

malawi_numeric_columns = [
    c for c in numeric_dtype_columns if c not in enforced_categorical
]
malawi_non_numeric_columns = [
    c for c in malawi_2019.columns if c in categorical_members
]

# Replace every categorical column by its integer category codes
# (pandas encodes missing values as -1).
non_numeric_converted = (
    malawi_2019[malawi_non_numeric_columns].astype('category').apply(lambda c: c.cat.codes)
)

# NOTE: this overwrites malawi_2019 in place, so the cell is not idempotent --
# on a second run the already-encoded columns have an integer dtype and would be
# classified as numeric. Reload malawi_2019 before re-running this cell.
malawi_2019[malawi_non_numeric_columns] = non_numeric_converted

In [35]:
# Build a frame of standard-normal noise with the same shape as malawi_2019
# (one noise column per data column) and append it column-wise to the data.
noise_generator = np.random.default_rng(seed=1234)  # fixed seed => reproducible noise
noise_shape = malawi_2019.shape

noise = noise_generator.normal(size=noise_shape)
noise = pd.DataFrame(
    noise,
    columns=[f'noise_{i}' for i in range(noise_shape[1])],
    # Reuse the data's index: concat(axis=1) aligns rows on index labels, so a
    # default RangeIndex here would produce NaN-padded, misaligned rows whenever
    # malawi_2019's index is not exactly 0..n-1.
    index=malawi_2019.index,
)
with_noise = pd.concat((malawi_2019, noise), axis=1)


In [36]:
# Tell wgan which columns of with_noise are continuous, which are categorical,
# and which ones (the noise columns) are passed as context variables.
noise_column_names = list(noise.columns)
data_wrapper = wgan.DataWrapper(
    with_noise,
    continuous_vars=malawi_numeric_columns,
    categorical_vars=malawi_non_numeric_columns,
    context_vars=noise_column_names,
)

# Training configuration; any setting not given here keeps the wgan default.
specifications = wgan.Specifications(
    data_wrapper,
    batch_size=4096,
    max_epochs=1000,
    critic_lr=1e-3,
    generator_lr=1e-3,
    print_every=100,
    device="cuda",
)

# Keep this construction order (generator first, then critic).
generator = wgan.Generator(specifications)
critic = wgan.Critic(specifications)

# Convert the frame into the (training data, context) pair consumed by wgan.train.
training_data, context = data_wrapper.preprocess(with_noise)

settings: {'optimizer': <class 'torch.optim.adam.Adam'>, 'critic_d_hidden': [128, 128, 128], 'critic_dropout': 0, 'critic_steps': 15, 'critic_lr': 0.001, 'critic_gp_factor': 5, 'generator_d_hidden': [128, 128, 128], 'generator_dropout': 0.1, 'generator_lr': 0.001, 'generator_d_noise': 24774, 'generator_optimizer': 'optimizer', 'max_epochs': 1000, 'batch_size': 4096, 'test_set_size': 16, 'load_checkpoint': None, 'save_checkpoint': None, 'save_every': 100, 'print_every': 100, 'device': 'cuda'}


In [37]:
# Train the generator and critic on the preprocessed data and context using the
# settings in `specifications`.
# NOTE(review): in the run saved with this notebook the reported WD_test/WD_train
# grow from single digits at epoch 0 to ~6e8 by epoch 200, which looks like the
# training diverged -- TODO revisit the learning rates / critic settings.
wgan.train(generator, critic, training_data, context, specifications)

epoch 0 | step 4 | WD_test 6.14 | WD_train 2.27 | sec passed 8 |
epoch 100 | step 304 | WD_test 31211786.0 | WD_train 30246374.67 | sec passed 620 |
epoch 200 | step 604 | WD_test 641735744.0 | WD_train 629866837.33 | sec passed 611 |
exited gracefully.


In [38]:
# Run the trained generator over the same frame used for training (real data
# plus noise context columns).
# NOTE(review): presumably returns a copy of with_noise whose non-context columns
# are replaced by generated values -- confirm against the wgan documentation.
generated_data = data_wrapper.apply_generator(generator, with_noise)