# Reproduce GP-VAE
## Instructions for use:
1. Update `DRIVE_ROOT_DIR` and `DRIVE_DATA_DIR` in code cell 1 so they point to the cloned repo folder and the dataset file in your Google Drive
2. **Only the first time you run on a Colab host**: perform the initial installations and fixes:
  * Uncomment the install line in code cell 2
  * Execute code cells 1 and 2
  * Comment the install line in cell 2 out again
  * Restart the host
3. Run the notebook

**Note**: The train / test outputs (model, results) are copied at the end into the repository clone on Google Drive under `outputs/<timestamp>/` (the trained model is in its `model` subfolder), as intended by the original authors.

In [24]:
import time
from pathlib import Path
from google.colab import drive

# Google Drive linkage
drive.mount('/content/drive')
DRIVE_ROOT_DIR = Path('/content/drive/MyDrive/colab_data/aml_project/gp_vae_for_data_generation')
DRIVE_DATA_DIR = Path('/content/drive/MyDrive/colab_data/aml_project/data/hmnist_full.npz')

# copy code to colab host
!rm -rf outputs/*
!cp -R $DRIVE_ROOT_DIR/* .
!ls -la $ROOT_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 92
drwxr-xr-x 1 root root  4096 Aug 16 12:07 .
drwxr-xr-x 1 root root  4096 Aug 16 11:59 ..
drwxr-xr-x 4 root root  4096 Aug  3 20:20 .config
drwx------ 2 root root  4096 Aug 16 12:05 data
drwx------ 5 root root  4096 Aug 16 12:05 drive
drwx------ 2 root root  4096 Aug 16 12:05 figures
-rw------- 1 root root  8525 Aug 16 12:18 generate.py
drwx------ 3 root root  4096 Aug 16 12:05 lib
-rw------- 1 root root  1087 Aug 16 12:18 LICENSE
drwx------ 3 root root  4096 Aug 16 12:18 outputs
drwx------ 2 root root  4096 Aug 16 12:17 __pycache__
-rw------- 1 root root  2364 Aug 16 12:18 README.md
-rw------- 1 root root   146 Aug 16 12:18 requirements.txt
drwxr-xr-x 1 root root  4096 Aug  3 20:21 sample_data
-rw------- 1 root root 18845 Aug 16 12:18 train.py
drwx------ 2 root root  4096 Aug 16 12:07 utils
-rw------- 1 root root  2579 Aug 16 12:18 utils.py


In [25]:
# One-time setup on a fresh Colab host: uncomment the line below, run this cell,
# then comment it out again and restart the host (see the instructions at the top).

# !pip install -r requirements.txt

## Train model

In [26]:
# Set up a timestamped output folder for this run, both locally and on Drive.

# The same relative path is used under the local (Colab) working directory
# and under the repository clone on Google Drive.
datetime = time.strftime("%Y%m%d-%H%M%S")
OUTPUTS_DIR = Path(f'outputs/{datetime}')
DRIVE_OUTPUTS_DIR = DRIVE_ROOT_DIR / OUTPUTS_DIR

# Folder on the local (Colab) VM
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
print(f'OUTPUTS_DIR = {OUTPUTS_DIR}, exists = {OUTPUTS_DIR.exists()}')

# Matching folder inside the Drive clone
DRIVE_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
print(f'DRIVE_OUTPUTS_DIR = {DRIVE_OUTPUTS_DIR}, exists = {DRIVE_OUTPUTS_DIR.exists()}')

OUTPUTS_DIR = outputs/20220816-121854, exists = True
DRIVE_OUTPUTS_DIR = /content/drive/MyDrive/colab_data/aml_project/gp_vae_for_data_generation/outputs/20220816-121854, exists = True


In [None]:
# #Train on limited dataset
SAMPLES_PER_DIGIT = 200
NUM_EPOCHS = 25
WHITE_FLIP_RATIO = 0.6
BLACK_FLIP_RATIO = 0.8

!python train.py --model_type gp-vae --seed 123 --data_type hmnist --banded_covar --latent_dim 256 --encoder_sizes=256,256 --decoder_sizes=256,256,256 --window_size 3 --sigma 1 --length_scale 2 --beta 0.8 --num_epochs $NUM_EPOCHS --train_class_number $SAMPLES_PER_DIGIT --data_dir $DRIVE_DATA_DIR --base_dir $OUTPUTS_DIR --black_flip_ratio $BLACK_FLIP_RATIO --white_flip_ratio $WHITE_FLIP_RATIO

Seed: {FLAGS.seed}
Outputs will be saved to:  outputs/20220816-121854/model
tcmalloc: large alloc 3763200000 bytes == 0x59aa000 @  0x7fd1175331e7 0x7fd113f88cf1 0x7fd113fed7c8 0x7fd113f8c4f1 0x5947d6 0x548cc1 0x5127f1 0x549576 0x593fce 0x548ae9 0x5127f1 0x4bc98a 0x533274 0x4d3969 0x512147 0x549e0e 0x593fce 0x5118f8 0x593dd7 0x5118f8 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x604173 0x5f5506 0x5f8c6c 0x5f9206 0x64faf2 0x64fc4e
tcmalloc: large alloc 1881604096 bytes == 0xe5ec8000 @  0x7fd1175331e7 0x7fd113f88cf1 0x7fd113ff05d0 0x7fd114080ab2 0x59b1b0 0x515655 0x549e0e 0x593fce 0x5118f8 0x593dd7 0x5118f8 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x604173 0x5f5506 0x5f8c6c 0x5f9206 0x64faf2 0x64fc4e 0x7fd117130c87 0x5b621a
Loading data...Done!
4000
Saved train dataset to: outputs/20220816-121854/train_data.npz
2022-08-16 12:19:13.580182: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-08-16 12:19:13.653761: I tens

In [None]:
# Copy trained models back to drive
!cp -R $OUTPUTS_DIR/* $DRIVE_OUTPUTS_DIR
print(f'DRIVE_OUTPUTS_DIR = {DRIVE_OUTPUTS_DIR}')
!ls -la $DRIVE_OUTPUTS_DIR


## Generate new samples from the training dataset and the trained model

In [None]:
#Train on limited dataset
WHITE_FLIP_RATIO = 0.6
BLACK_FLIP_RATIO = 0.8

!python generate.py --model_type gp-vae --data_type hmnist --seed 123 --banded_covar --latent_dim 256 --encoder_sizes=256,256 --decoder_sizes=256,256,256 --window_size 3 --sigma 1 --length_scale 2 --beta 0.8 --num_epochs 20 --base_dir $OUTPUTS_DIR --black_flip_ratio $BLACK_FLIP_RATIO --white_flip_ratio $WHITE_FLIP_RATIO

In [None]:
# Copy trained models back to drive
!cp -R $OUTPUTS_DIR/generated_data $DRIVE_OUTPUTS_DIR
print(f'DRIVE_OUTPUTS_DIR = {DRIVE_OUTPUTS_DIR}')
!ls -la $DRIVE_OUTPUTS_DIR/generated_data



In [None]:
# Clean up: remove everything under the relative outputs/ folder of the current working directory.
!rm -rf outputs/*
