# Variational AutoEncoder Digits Example

Check the README for an introduction to the project and how to get started!

## Imports and Setup

In [1]:
import os
import numpy as np
import pandas as pd

os.environ["KERAS_BACKEND"] = "tensorflow"

# ml dependencies
import tensorflow as tf
import keras
from keras import ops
from keras import layers

# mlflow dependencies
import mlflow
from mlflow import MlflowClient
from pprint import pprint

2024-08-13 03:43:49.379064: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 03:43:49.379131: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 03:43:49.380830: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-13 03:43:49.386924: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# create the model directory for saving outputs
model_dir = './model-dir'
os.makedirs(model_dir, exist_ok = True)

env_name = "digits_env" # <name of your env>

## Create the ML Model

### Create sampling layer

In [3]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.seed_generator = keras.random.SeedGenerator(1337)

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = ops.shape(z_mean)[0]
        dim = ops.shape(z_mean)[1]
        epsilon = keras.random.normal(shape=(batch, dim), seed=self.seed_generator)
        return z_mean + ops.exp(0.5 * z_log_var) * epsilon

### Build the encoder

In [4]:
latent_dim = 2

encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

2024-08-13 03:43:54.934211: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-13 03:43:54.983218: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-13 03:43:54.987069: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

### Build the decoder

In [5]:
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(7 * 7 * 64, activation="relu")(latent_inputs)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

### Define the VAE as a `Model` with a custom `train_step`

In [6]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = ops.mean(
                ops.sum(
                    keras.losses.binary_crossentropy(data, reconstruction),
                    axis=(1, 2),
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var))
            kl_loss = ops.mean(ops.sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

## DVC

In the lines below, the `&&` symbol is used multiple times. This symbol is originally a logical operator (the command on the right will only run if the command on the left executes successfully). However, when using `!` in a Jupyter notebook, the Linux commands are executed within the directory where the notebook is currently located. This behavior prevents commands from being run in separate directories if `!cd` is on its own line. Using `&&` ensures that the `dvc` commands are executed within the `dvc` submodule directory, without affecting the repository where the notebook resides.

In [7]:
dvc_repo_link = "git@github.com:oobielodan/digits_dvc.git" # <ssh link to the repo you set aside for dvc>
dvc_storage = "/demo-bucket" # <complete path to the mounted storage you have set up for dvc>

In [8]:
# grab your dvc repository -> the --force flag allows for this to still run if the submodule had already been created at a prior time
!git submodule add --force "{dvc_repo_link}"

Cloning into '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-demos/digits-demo/digits_dvc'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (28/28), done.[K
Receiving objects: 100% (37/37), 6.69 KiB | 6.69 MiB/s, done.
Resolving deltas: 100% (14/14), done.
remote: Total 37 (delta 14), reused 17 (delta 3), pack-reused 0[K


In [9]:
dvc_repo = "digits_dvc" # <name of the repository/submodule you just added for dvc> -> should appear as a folder in the current directory

In [10]:
# DVC initialization and storage set up
!cd "{dvc_repo}" && dvc init
!cd "{dvc_repo}" && dvc remote add -d dvcstorage "{dvc_storage}"

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0mSetting 'dvcstorage' as a default remote.
[0m

In [11]:
# initial commit to git
!cd "{dvc_repo}" && git add .
!cd "{dvc_repo}" && git commit -m "loaded dependencies, mkdir -p, DVC init"

[main 7e684aa] loaded dependencies, mkdir -p, DVC init
 Committer: Parallel Works app-run user <lobielodan@mgmt-lobielodan-cvaetraining-00035.pw-canary-us-east-1.pw.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 3 files changed, 10 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


## MLFlow
MLflow is designed to help simplify the ML workflow, assisting users throughout the various stages of development and deployment. In this notebook, we use its ... capabilities to ... Documentation and more information can be found at [the MLFlow website](https://mlflow.org/docs/latest/index.html).


To get started with MLflow, run `mlflow server --host 127.0.0.1 --port 8080` in the command line. The `mlflow server` command needs to run in the background and therefore cannot be executed directly in a Jupyter notebook, as each cell must complete execution before the next one can run.

### Configuration
*If you used a different host and/or port during initialization, make sure to update the following URIs accordingly.*

In [34]:
# utilize and set up the initialized server for tracking 
client = MlflowClient(tracking_uri = "http://127.0.0.1:8080")
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [35]:
# view the metadata associated with all the experiments that are currently on the server. 
all_experiments = client.search_experiments()
print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1723520735255, experiment_id='0', last_update_time=1723520735255, lifecycle_stage='active', name='Default', tags={}>]


In [36]:
# example for accessing elements from returned collections of experiments
default_experiment = [
    {"name": experiment.name, "lifecycle_stage": experiment.lifecycle_stage}
    for experiment in all_experiments
    if experiment.name == "Default"
][0]

pprint(default_experiment)

{'lifecycle_stage': 'active', 'name': 'Default'}


In [37]:
# working on getting the server to display in the notebook --------------------------------------
# !curl http://127.0.0.1:8080
# %%javascript
# alert("JavaScript is working!");
# from IPython.display import IFrame
# IFrame("http://127.0.0.1:8080", 900,500)

### Experiment 1
In Experiment 1, we train the Digit CVAE model on multiple datasets. To create these datasets, we split the original dataset into five equal, randomized parts. After each training session, we save the weights and use them as the starting point for retraining the model on the next dataset.

In [39]:
# provide an experiment description that will appear in the UI
experiment1_description = (
    "This is the digits forecasting project."
    "This experiment contains the digit model for randomized numbers (0-9) trained separately."
)

# provide searchable tags for the experiment
experiment1_tags = {
    "project_name": "digit-forecasting",
    "model_type": "randomzied",
    "team": "digit-ml",
    "project_quarter": "Q3-2024",
    "mlflow.note.content": experiment1_description,
}

# create the experiment and give it a unique name
digit_experiment1 = client.create_experiment(
    name="Randomize_Model", tags=experiment1_tags
)

### Experiment 2
In Experiment 2, we train the Digit CVAE model on all digit samples simultaneously, without any subsequent retraining using the weights.

In [40]:
# provide an experiment description that will appear in the UI
experiment2_description = (
    "This is the digits forecasting project."
    "This experiment contains the digit model for numbers (0-9) trained all together."
)

# provide searchable tags for the experiment
experiment2_tags = {
    "project_name": "digit-forecasting",
    "model_type": "all digits",
    "team": "digit-ml",
    "project_quarter": "Q3-2024",
    "mlflow.note.content": experiment2_description,
}

# create the experiment and give it a unique name
digit_experiment2 = client.create_experiment(
    name="Together_Model", tags=experiment2_tags
)

### Experiment 3
In Experiment 3, we revisit the approach used in Experiment 1 - initializing the model with the weights from a previous training session and retraining it from there. However in this experiment, we train the Digit CVAE model sequentially on each of the 10 digits (0–9), one digit at a time. After each training session, we save the weights and use them to retrain the model on the next digit. This approach induces a 'forgetting' effect, where the model gradually loses its ability to recognize previous digits with each subsequent training session.

In [41]:
# provide an experiment description that will appear in the UI
experiment3_description = (
    "This is the digits forecasting project."
    "This experiment contains the digit model for each of the numbers (0-9) trained separately."
)

# provide searchable tags that define characteristics of the runs that will be in this experiment
experiment3_tags = {
    "project_name": "digit-forecasting",
    "model_type": "sequential",
    "team": "digit-ml",
    "project_quarter": "Q3-2024",
    "mlflow.note.content": experiment3_description,
}

# create the experiment and give it a unique name
digit_experiment3 = client.create_experiment(
    name="Sequenced_Model", tags=experiment3_tags
)

### Experiment Set Up

In [42]:
# save each of the experiment's metadata
digit_experiment1 = mlflow.set_experiment("Randomize_Model")
digit_experiment2 = mlflow.set_experiment("Together_Model")
digit_experiment3 = mlflow.set_experiment("Sequenced_Model")

## Train the VAE

*Make sure that 'vae.weights.h5' does not already exist in the model directory if you want to training from the beginning.*

In [43]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
(x_train, Y_train), (x_test, Y_test) = keras.datasets.mnist.load_data()

In [44]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True) # stops training early if the validation loss does not improve

def train_model(num, model, data, experiment):
    if os.path.exists(os.path.join(model_dir, 'vae.weights.h5')): # if the model has already been trained at least once, load that model
        model.load_weights(os.path.join(model_dir, 'vae.weights.h5'))
    
    mlflow.autolog()
    
    run_name = f"{num}_test" # define a run name for this iteration of training
    artifact_path = f"{num}"  # define an artifact path that the model will be saved to
    
    # initiate the MLflow run context
    with mlflow.start_run(run_name = run_name, experiment_id = experiment) as run:
        mlflow.log_params({"num": num}) # log the parameters used for the model fit
        # mlflow.log_metrics(history.history) #  log the error metrics that were calculated during validation
        mlflow.keras.save.log_model(model, "model") # log an instance of the trained model for later use
    
    history = model.fit(data, epochs=30, batch_size=128, callbacks = [early_stopping_cb])
    model.save_weights(os.path.join(model_dir, 'vae.weights.h5')) # save model weights after training
    
    hist_pd = pd.DataFrame(history.history)
    hist_pd.to_csv(os.path.join(model_dir, f'history_{num}.csv'), index = False)

### Experiment 1
*How to filter mnist data found here: https://stackoverflow.com/questions/51202181/how-do-i-select-only-a-specific-digit-from-the-mnist-dataset-provided-by-keras*

In [45]:
# retraining the model n times
count = 0
n = 5

mnist_digits = np.expand_dims(np.concatenate([x_train, x_test], axis=0), -1).astype("float32") / 255

for arr in np.array_split(mnist_digits, n):
    count += 1
    train_model(f"rand_{count}", vae, arr, digit_experiment3.experiment_id)

2024/08/13 04:23:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:23:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run rand_1_test at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/b6fe3b85684743428fe5662326561678.
2024/08/13 04:23:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:23:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'baf43fd8594449a58fa29e1bc24c1bf4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current keras workflow


Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - kl_loss: 5.2232 - loss: 185.8567 - reconstruction_loss: 180.6335
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.0889 - loss: 166.8121 - reconstruction_loss: 161.7232
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2281 - loss: 164.2390 - reconstruction_loss: 159.0108
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3337 - loss: 161.7626 - reconstruction_loss: 156.4289
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4579 - loss: 159.1759 - reconstruction_loss: 153.7180
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.5395 - loss: 158.0799 - reconstruction_loss: 152.5404
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/

2024/08/13 04:23:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run judicious-roo-840 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/baf43fd8594449a58fa29e1bc24c1bf4.
2024/08/13 04:23:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:23:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:23:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run rand_2_test at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/0263cdf65b97430eadcc20cbbb6c1c60.
2024/08/13 04:23:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:23:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9892484877334628b1b0c66741862717', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the

Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.1595 - loss: 153.6005 - reconstruction_loss: 147.4410
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.0341 - loss: 151.8808 - reconstruction_loss: 145.8467
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.1260 - loss: 151.2448 - reconstruction_loss: 145.1187
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.1962 - loss: 150.3224 - reconstruction_loss: 144.1262
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2037 - loss: 150.9980 - reconstruction_loss: 144.7944
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.1850 - loss: 150.2426 - reconstruction_loss: 144.0576
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/s

2024/08/13 04:24:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-bear-806 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/9892484877334628b1b0c66741862717.
2024/08/13 04:24:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:24:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run rand_3_test at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/3f4698b51d11448488156b50f0b5621c.
2024/08/13 04:24:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9d24eb68afe94bb0b79bed0c11cff4e2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the

Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3169 - loss: 151.9627 - reconstruction_loss: 145.6458
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2032 - loss: 150.1854 - reconstruction_loss: 143.9823
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2448 - loss: 150.2710 - reconstruction_loss: 144.0262
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2529 - loss: 149.3618 - reconstruction_loss: 143.1089
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2699 - loss: 148.3064 - reconstruction_loss: 142.0365
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2780 - loss: 149.1591 - reconstruction_loss: 142.8811
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/s

2024/08/13 04:24:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-sheep-15 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/9d24eb68afe94bb0b79bed0c11cff4e2.
2024/08/13 04:24:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:24:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run rand_4_test at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/b04317dec82146bdaaa7ecb8b758a1dd.
2024/08/13 04:24:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '716496d8dc9644d1b9ecf0543d09769c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for th

Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3566 - loss: 151.0983 - reconstruction_loss: 144.7416
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.2793 - loss: 148.6919 - reconstruction_loss: 142.4127
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.2745 - loss: 149.0372 - reconstruction_loss: 142.7627
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2965 - loss: 148.4876 - reconstruction_loss: 142.1911
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3401 - loss: 148.2284 - reconstruction_loss: 141.8883
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.3447 - loss: 147.7461 - reconstruction_loss: 141.4014
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/s

2024/08/13 04:24:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-cub-795 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/716496d8dc9644d1b9ecf0543d09769c.
2024/08/13 04:24:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:24:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run rand_5_test at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/14a98f5f0fba49129b6c9e847fa35295.
2024/08/13 04:24:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:24:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '52d92316a155490d8debcbf2b7ada58e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the

Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.4252 - loss: 149.3366 - reconstruction_loss: 142.9113
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3578 - loss: 147.7439 - reconstruction_loss: 141.3861
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3465 - loss: 147.3868 - reconstruction_loss: 141.0403
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3809 - loss: 146.4050 - reconstruction_loss: 140.0241
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3872 - loss: 146.2565 - reconstruction_loss: 139.8693
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.4320 - loss: 146.1986 - reconstruction_loss: 139.7666
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/s

2024/08/13 04:24:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run loud-sow-301 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/52d92316a155490d8debcbf2b7ada58e.
2024/08/13 04:24:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.


In [46]:
# add model 1 to dvc 
!cp ./"{model_dir}"/vae.weights.h5 "{dvc_repo}"/experiment_1.weights.h5
!sh dvcgit.sh experiment_1.weights.h5 "digit experiment 1" "{dvc_repo}" "{env_name}"

!rm "{dvc_storage}"/experiment_1.weights.h5
!rm ./"{model_dir}"/vae.weights.h5

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in experiment_1.weights.h5 |0.00 [00:00,  [A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-demos/dig[A
                                                                                [A
![A
  0%|          |Adding experiment_1.weights.h5 to cach0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/lobielodan/parsl_mp0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 37.35file/s][A

To track the changes with git, run:

	git add experiment_1.weights.h5.dvc

To enable auto staging, run:

	dvc config core.autostage true
Collecti

### Experiment 2

In [47]:
# train all numbers at the same time
train_model("all", vae, mnist_digits, digit_experiment2.experiment_id)

2024/08/13 04:24:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:25:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run all_test at: http://127.0.0.1:8080/#/experiments/691869195488029627/runs/dc88245d2f3448b89e327a625498e4f7.
2024/08/13 04:25:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/691869195488029627.
2024/08/13 04:25:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '15b9bdf8f3914dd7be463e5674f9335c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current keras workflow


Epoch 1/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - kl_loss: 6.4688 - loss: 147.7049 - reconstruction_loss: 141.2361
Epoch 2/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.4687 - loss: 147.0574 - reconstruction_loss: 140.5887
Epoch 3/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.4498 - loss: 146.2116 - reconstruction_loss: 139.7618
Epoch 4/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.4746 - loss: 147.1335 - reconstruction_loss: 140.6589
Epoch 5/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 6.4850 - loss: 146.4447 - reconstruction_loss: 139.9596
Epoch 6/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - kl_loss: 6.4629 - loss: 146.0515 - reconstruction_loss: 139.5885
Epoch 7/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/s

2024/08/13 04:25:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-pig-77 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/15b9bdf8f3914dd7be463e5674f9335c.
2024/08/13 04:25:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.


In [48]:
# add model 2 to dvc 
!cp ./"{model_dir}"/vae.weights.h5 "{dvc_repo}"/experiment_2.weights.h5
!sh dvcgit.sh experiment_2.weights.h5 "digit experiment 2" "{dvc_repo}" "{env_name}"

!rm "{dvc_repo}"/experiment_2.weights.h5
!rm ./"{model_dir}"/vae.weights.h5

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in experiment_2.weights.h5 |0.00 [00:00,  [A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-demos/dig[A
                                                                                [A
![A
  0%|          |Adding experiment_2.weights.h5 to cach0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/lobielodan/parsl_mp0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 27.47file/s][A

To track the changes with git, run:

	git add experiment_2.weights.h5.dvc

To enable auto staging, run:

	dvc config core.autostage true
Collecti

### Experiment 3

In [49]:
# training one number at a time
for num in np.arange(10):
    train_filter = np.where(Y_train == num)
    test_filter = np.where(Y_test == num)
    
    x_trn = x_train[train_filter]
    x_tst = x_test[test_filter]
    
    mnist_digits = np.expand_dims(np.concatenate([x_trn, x_tst], axis=0), -1).astype("float32") / 255
    train_model(num, vae, mnist_digits, digit_experiment1.experiment_id)

2024/08/13 04:25:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:25:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run 0_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/f337231f0ffc46e89ce4d2ef3c4e0e49.
2024/08/13 04:25:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:25:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0223759b3b5f420c9bec6eb374b319d3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current keras workflow


Epoch 1/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - kl_loss: 7.3741 - loss: 154.6270 - reconstruction_loss: 147.2530
Epoch 2/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 7.2313 - loss: 151.9733 - reconstruction_loss: 144.7419 
Epoch 3/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 7.0763 - loss: 151.1368 - reconstruction_loss: 144.0605
Epoch 4/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.9729 - loss: 150.1927 - reconstruction_loss: 143.2198
Epoch 5/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.9205 - loss: 150.4737 - reconstruction_loss: 143.5532
Epoch 6/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.8527 - loss: 149.9870 - reconstruction_loss: 143.1342
Epoch 7/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_los

2024/08/13 04:26:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run carefree-skink-468 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/0223759b3b5f420c9bec6eb374b319d3.
2024/08/13 04:26:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:26:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:26:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run 1_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/bef2acf78a9b48a5ba15e5a4098de16a.
2024/08/13 04:26:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:26:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '56d00fab2c2b4799ae6a6c82ab328b30', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the cur

Epoch 1/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - kl_loss: 7.1960 - loss: 63.9316 - reconstruction_loss: 56.7355
Epoch 2/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.5884 - loss: 58.4960 - reconstruction_loss: 51.9076
Epoch 3/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.3943 - loss: 57.2860 - reconstruction_loss: 50.8916
Epoch 4/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.2778 - loss: 57.0965 - reconstruction_loss: 50.8187
Epoch 5/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.1209 - loss: 56.6502 - reconstruction_loss: 50.5292
Epoch 6/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.0893 - loss: 56.6756 - reconstruction_loss: 50.5863
Epoch 7/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.0141 - l

2024/08/13 04:26:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run luxuriant-hen-541 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/56d00fab2c2b4799ae6a6c82ab328b30.
2024/08/13 04:26:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:26:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:26:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run 2_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/3f2f503e3c6f4b22995bf2f8b6b4eade.
2024/08/13 04:26:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:26:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3a42682208cd4549a4dfab9ea8faf6a5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the curr

Epoch 1/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - kl_loss: 6.0680 - loss: 210.8275 - reconstruction_loss: 204.7595
Epoch 2/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.7774 - loss: 177.8335 - reconstruction_loss: 172.0561
Epoch 3/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.8700 - loss: 175.0369 - reconstruction_loss: 169.1669
Epoch 4/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.9326 - loss: 174.2699 - reconstruction_loss: 168.3372
Epoch 5/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.0334 - loss: 173.0288 - reconstruction_loss: 166.9954
Epoch 6/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 6.0389 - loss: 172.3272 - reconstruction_loss: 166.2884
Epoch 7/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss

2024/08/13 04:26:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run adorable-dog-39 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/3a42682208cd4549a4dfab9ea8faf6a5.
2024/08/13 04:26:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:26:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:26:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run 3_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/627fa3541e154608b9194c32787de3ed.
2024/08/13 04:26:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:26:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '656352783e7f4229af4ca27a33530661', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the curren

Epoch 1/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - kl_loss: 5.6709 - loss: 188.5913 - reconstruction_loss: 182.9204
Epoch 2/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 5.2268 - loss: 159.4611 - reconstruction_loss: 154.2342 
Epoch 3/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3386 - loss: 157.8038 - reconstruction_loss: 152.4652
Epoch 4/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3551 - loss: 156.0963 - reconstruction_loss: 150.7412
Epoch 5/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3195 - loss: 154.4827 - reconstruction_loss: 149.1632
Epoch 6/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3368 - loss: 152.6404 - reconstruction_loss: 147.3036
Epoch 7/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_los

2024/08/13 04:26:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run gaudy-jay-480 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/656352783e7f4229af4ca27a33530661.
2024/08/13 04:26:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:26:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:26:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run 4_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/779da648d42e426e94895545cd2b478f.
2024/08/13 04:26:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:26:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0489c631f43d4b89bc9d90d7a93d3798', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current 

Epoch 1/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - kl_loss: 5.5612 - loss: 186.1803 - reconstruction_loss: 180.6191
Epoch 2/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2008 - loss: 142.5547 - reconstruction_loss: 137.3539
Epoch 3/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.3540 - loss: 138.4836 - reconstruction_loss: 133.1296
Epoch 4/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4771 - loss: 136.1691 - reconstruction_loss: 130.6920
Epoch 5/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4902 - loss: 135.2910 - reconstruction_loss: 129.8008
Epoch 6/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4767 - loss: 134.5769 - reconstruction_loss: 129.1002
Epoch 7/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss

2024/08/13 04:27:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run upset-ant-282 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/0489c631f43d4b89bc9d90d7a93d3798.
2024/08/13 04:27:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:27:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:27:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run 5_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/6352f1bd7b904b54b3bdb1bdcad1e511.
2024/08/13 04:27:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:27:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0b1dd046978f45caa687c00ba0eb8fe7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current 

Epoch 1/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - kl_loss: 5.6773 - loss: 218.1777 - reconstruction_loss: 212.5004
Epoch 2/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 4.9369 - loss: 156.5666 - reconstruction_loss: 151.6298 
Epoch 3/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.0945 - loss: 152.2926 - reconstruction_loss: 147.1982
Epoch 4/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.1624 - loss: 149.7402 - reconstruction_loss: 144.5779
Epoch 5/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2655 - loss: 149.2747 - reconstruction_loss: 144.0092
Epoch 6/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2677 - loss: 147.8607 - reconstruction_loss: 142.5930
Epoch 7/30
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_los

2024/08/13 04:27:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run delicate-stoat-808 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/0b1dd046978f45caa687c00ba0eb8fe7.
2024/08/13 04:27:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:27:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:27:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run 6_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/41329bfed9934d4598bf59dfadf8c4c2.
2024/08/13 04:27:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:27:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e315bdebb2ec4dd8aa6a1e42fc37c704', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the cur

Epoch 1/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - kl_loss: 5.7980 - loss: 177.9513 - reconstruction_loss: 172.1533
Epoch 2/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4817 - loss: 139.9636 - reconstruction_loss: 134.4819
Epoch 3/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.5327 - loss: 135.1525 - reconstruction_loss: 129.6198
Epoch 4/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.6102 - loss: 133.2983 - reconstruction_loss: 127.6881
Epoch 5/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.6733 - loss: 131.9673 - reconstruction_loss: 126.2939
Epoch 6/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.7481 - loss: 132.0035 - reconstruction_loss: 126.2553
Epoch 7/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss

2024/08/13 04:27:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run upbeat-wolf-76 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/e315bdebb2ec4dd8aa6a1e42fc37c704.
2024/08/13 04:27:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:27:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:27:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run 7_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/ac183b8ed847473181363e4423aff690.
2024/08/13 04:27:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:27:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3d7f2e86623c4c16a0eaa8b317cee11d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current

Epoch 1/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - kl_loss: 5.6549 - loss: 199.6274 - reconstruction_loss: 193.9725
Epoch 2/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - kl_loss: 5.4118 - loss: 128.5738 - reconstruction_loss: 123.1620 
Epoch 3/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.6506 - loss: 122.1373 - reconstruction_loss: 116.4867
Epoch 4/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.7022 - loss: 119.9138 - reconstruction_loss: 114.2117
Epoch 5/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.7341 - loss: 117.8687 - reconstruction_loss: 112.1345
Epoch 6/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.7357 - loss: 118.4526 - reconstruction_loss: 112.7169
Epoch 7/30
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_los

2024/08/13 04:27:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-snail-986 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/3d7f2e86623c4c16a0eaa8b317cee11d.
2024/08/13 04:27:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:27:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:27:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run 8_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/199e1731dd344d7dbf638676fa488a8f.
2024/08/13 04:27:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:27:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '70086c7ffbbb4ffba0a3d32c8549bf9d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the curren

Epoch 1/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.4597 - loss: 225.0519 - reconstruction_loss: 219.5922
Epoch 2/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.6358 - loss: 165.0537 - reconstruction_loss: 160.4179
Epoch 3/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.8764 - loss: 161.1864 - reconstruction_loss: 156.3100
Epoch 4/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.8432 - loss: 158.6170 - reconstruction_loss: 153.7738
Epoch 5/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.8504 - loss: 158.3408 - reconstruction_loss: 153.4904
Epoch 6/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.8971 - loss: 157.4110 - reconstruction_loss: 152.5139
Epoch 7/30
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss:

2024/08/13 04:28:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run clean-carp-90 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/70086c7ffbbb4ffba0a3d32c8549bf9d.
2024/08/13 04:28:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.
2024/08/13 04:28:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/08/13 04:28:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run 9_test at: http://127.0.0.1:8080/#/experiments/852214238853281471/runs/78005ae8554e42339d1a33ce613dc17e.
2024/08/13 04:28:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/852214238853281471.
2024/08/13 04:28:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '507adac378b341e0ad596e992b2398df', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current 

Epoch 1/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - kl_loss: 4.8946 - loss: 158.2163 - reconstruction_loss: 153.3218
Epoch 2/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 4.9802 - loss: 124.9583 - reconstruction_loss: 119.9782
Epoch 3/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.1333 - loss: 122.1214 - reconstruction_loss: 116.9881
Epoch 4/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.1959 - loss: 121.6360 - reconstruction_loss: 116.4401
Epoch 5/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2653 - loss: 119.9013 - reconstruction_loss: 114.6359
Epoch 6/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss: 5.2689 - loss: 120.0936 - reconstruction_loss: 114.8247
Epoch 7/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss

2024/08/13 04:28:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-lynx-847 at: http://127.0.0.1:8080/#/experiments/232904901056209151/runs/507adac378b341e0ad596e992b2398df.
2024/08/13 04:28:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/232904901056209151.


In [50]:
# add model 3 to dvc 
!cp ./"{model_dir}"/vae.weights.h5 "{dvc_repo}"/experiment_3.weights.h5
!sh dvcgit.sh experiment_3.weights.h5 "digit experiment 3" "{dvc_repo}" "{env_name}"

!rm "{dvc_repo}"/experiment_3.weights.h5
!rm ./"{model_dir}"/vae.weights.h5

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in experiment_3.weights.h5 |0.00 [00:00,  [A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-demos/dig[A
                                                                                [A
![A
  0%|          |Adding experiment_3.weights.h5 to cach0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/lobielodan/parsl_mp0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 10.47file/s][A

To track the changes with git, run:

	git add experiment_3.weights.h5.dvc

To enable auto staging, run:

	dvc config core.autostage true
Collecti

------------------------------------------------------------------------------------------------
*`dvcgit.sh` is a script used for dvc and git tracking. The correct call is as follows (all arguments are required):* `sh dvcgit.sh <file_name> <commit_message> <dvc_repo_name> <conda_env_name>`

## Display a grid of reconstructed digits in the latent space

In [None]:
import matplotlib.pyplot as plt

def plot_latent_space(vae, n=30, figsize=15):
    # display a n*n 2D manifold of digits
    digit_size = 28
    scale = 1.0
    figure = np.zeros((digit_size * n, digit_size * n))
    # linearly spaced coordinates corresponding to the 2D plot
    # of digit classes in the latent space
    grid_x = np.linspace(-scale, scale, n)
    grid_y = np.linspace(-scale, scale, n)[::-1]

    for i, yi in enumerate(grid_y):
        for j, xi in enumerate(grid_x):
            z_sample = np.array([[xi, yi]])
            x_decoded = vae.decoder.predict(z_sample, verbose=0)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[
                i * digit_size : (i + 1) * digit_size,
                j * digit_size : (j + 1) * digit_size,
            ] = digit

    plt.figure(figsize=(figsize, figsize))
    start_range = digit_size // 2
    end_range = n * digit_size + start_range
    pixel_range = np.arange(start_range, end_range, digit_size)
    sample_range_x = np.round(grid_x, 1)
    sample_range_y = np.round(grid_y, 1)
    plt.xticks(pixel_range, sample_range_x)
    plt.yticks(pixel_range, sample_range_y)
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.imshow(figure, cmap="Greys_r")
    plt.show()

plot_latent_space(vae)

## Display how the latent space clusters digits

In [None]:
def plot_label_clusters(vae, data, labels):
    # display a 2D plot of the digit classes in the latent space
    z_mean, _, _ = vae.encoder.predict(data, verbose=0)
    plt.figure(figsize=(12, 10))
    plt.scatter(z_mean[:, 0], z_mean[:, 1], c=labels)
    plt.colorbar()
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.show()

(x_train, y_train), _ = keras.datasets.mnist.load_data()
x_train = np.expand_dims(x_train, -1).astype("float32") / 255

plot_label_clusters(vae, x_train, y_train)