# **CVAE Training**

In [1]:
import os, json

import papermill as pm
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import tensorflow as tf
import netCDF4
import cartopy

from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split 

print("TF version:", tf.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

2024-07-25 14:33:24.314602: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 14:33:24.314670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 14:33:24.316256: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-25 14:33:24.322678: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TF version: 2.16.2
GPU is available


2024-07-25 14:33:26.057549: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-25 14:33:26.103956: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-25 14:33:26.105981: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# Download and Convert Data

In [2]:
data_pdir = "./gefs_data"
data_dir = "./gefs_data/converted/"
model_dir = './model_dir'

In [3]:
# data definitions

from scripts.get_data import download_file
from scripts.get_data import convert_file
from scripts.get_data import subset_file
from scripts.get_data import remove_data # removes all data

# data loading
def load_data(data_dir):      
    files = [f for f in os.listdir(data_dir) if ('subset' in f and 'tmp' not in f)]
    
    all_data = ((np.expand_dims(
        np.concatenate(
            [netCDF4.Dataset(data_dir + converted_file)['msl'][:] for converted_file in files]
        ),
        -1
    ).astype("float32") - 85000) / (110000 - 85000)).astype("float16")
    
    return all_data

# Neural Network Design

We need to get to a small latent space. Conv2D networks are good because they help reduce the number of connections in a network in a meaningful way.  I'm using terms as defined in [this definition of conv2D](https://towardsdatascience.com/conv2d-to-finally-understand-what-happens-in-the-forward-pass-1bbaafb0b148).

**Definitions:**
K -> kernel size;
P -> padding;
S -> stride;
D -> Dilation;
G -> Groups

**Filter options:**
Longitude is easy because it is large and even, so as long as you have an even stride, you get integer results when dividing.
e.g. lon 9: stride 4, lat 7: stride 5

- Latitude - whole numbers occurr for P = 2 & K = 3 or K = 11.
- 11 grid points * 0.25 deg * 100 km/deg = 275 km filter window (a good scale for weather)
- 9 grid points * 0.25 deg * 100 km/deg = 225 km
- Longitude - whole numbers occur for P = 0 & K = 11 (nice match with Latitude), P = 1 & K = 3 or 13, P = 2 & K = 5.

For a 5 x 7 filter with 3 stride (no overlap) and no padding:
- lat: (721 - 4) / 3 = 239 possible steps (good whole number!)
- lon: (1440 - 4) / 3 = 478.6666 possible steps

## Load and Preprocess Training Data:
The standard way of manipulating arrays in Conv2D layers in TF is to use arrays in the shape:
`batch_size,  height, width, channels = data.shape`
In our case, the the `batch_size` is the number of image frames (i.e. separate samples or rows in a `.csv` file), the `height` and `width` define the size of the image frame in number of pixels, and the `channels` are the number of layers in the frames.  Typically, channels are color layers (e.g. RGB or CMYK) but in our case, we could use different metereological variables.  However, for this first experiment, **we only need one channel** because we're only going to use mean sea level pressure (msl).

## Build the Encoder:
GFS grids I have available here are at 0.25 degree resolution.  I'm doing this as a "worst case" scenario since there are also 0.5 and 1.0 degree grids with lower resolution but I can't find that data quickly and don't know what's available.

These 0.25 degree grids are 721 x 1440.
Each forecast file is 3 hourly for 10 days = 8 steps/forecast * 10 days = 80 "frames"
This demo is only using two forecasts from the control ensemble
(one launched Jan 01, 2019 and one launched Jan 02, 2019) -> this is only 
a small subset of the variability possible in the model.

This particular data set spans 2000-2019 and there are 5 ensemble members.

## Build the Decoder:
With the 11 x 11 and 5 x 5 filters, non-overlapping stride, applied here, we have a final "image" size of 14 x 27 and 64 channels.

In [4]:
# model defintions

class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim):
    encoder_inputs = keras.Input(shape=(721, 1440, 1))
    
    x = layers.Conv2D(32, 11, activation = "relu", strides = [9, 10], padding = "valid")(encoder_inputs)
    x = layers.Conv2D(64, [5,9], activation = "relu", strides = [5, 9], padding = "valid")(x)
    x = layers.Flatten()(x)
    x = layers.Dense(16, activation="relu")(x)
    
    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    
    encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name = "encoder")
    
    print(encoder.summary())
    return encoder

def build_decoder(latent_dim):
    latent_inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(15 * 15 * 64, activation="relu")(latent_inputs)
    x = layers.Reshape((15, 15, 64))(x)
    # FIXME - there is something wrong here, but at least there is a pattern.
    # Using output_padding as a fudge factor -> it may be that there is exactly
    # one "missing" filter stamp/convolution because for both Conv2DTranspose
    # operations, output_padding is set to maximum it could be in both dims
    # (i.e. exactly one less than the stride of each filter).
    x = layers.Conv2DTranspose(64, [5, 9], activation = "relu", strides = [5,9], padding = "valid", output_padding = [4, 8])(x)
    x = layers.Conv2DTranspose(32, 11, activation = "relu", strides = [9,10], padding = "valid", output_padding = [8, 9])(x)
    decoder_outputs = layers.Conv2DTranspose(1, 3, activation = "sigmoid", padding = "same")(x)
    decoder = keras.Model(latent_inputs, decoder_outputs, name = "decoder")
    
    print(decoder.summary())
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name = "total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name = "reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name = "kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # FIXME: Normalize loss with the number of features (28 * 28)
            n_features = 28 * 28
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.binary_crossentropy(data, reconstruction), axis = (1, 2)
                )
            ) / n_features
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis = 1)) / n_features
            total_loss = (reconstruction_loss + kl_loss)
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    # Needed to validate (validation loss) and to evaluate
    def test_step(self, data):
        if type(data) == tuple:
            data, _ = data
            
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        # FIXME: Normalize loss with the number of features (28 * 28)
        n_features = 28 * 28
        reconstruction_loss = tf.reduce_mean(
            tf.reduce_sum(
                keras.losses.binary_crossentropy(data, reconstruction), axis = (1, 2)
            )
        ) / n_features
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis = 1)) / n_features
        total_loss = (reconstruction_loss + kl_loss)
        # grads = tape.gradient(total_loss, self.trainable_weights)
        # self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [5]:
# training definitions

def train_model(X_train, X_test, X_valid, date, vae, model_dir):
    early_stopping_cb = keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True) # stops training early if the validation loss does not improve
    
    if os.path.exists(os.path.join(model_dir, 'vae.weights.h5')): # if the model has already been trained at least once, load that model
        vae.load_weights(os.path.join(model_dir, 'vae.weights.h5'))

    history = vae.fit(
        X_train, epochs = 50, batch_size = 40,
        callbacks = [early_stopping_cb],
        validation_data = (X_valid,)
    )

    vae.save_weights(os.path.join(model_dir, 'vae.weights.h5')) # save model weights after training
    !cp model_dir/vae.weights.h5 model_dir/vae.weights_{date}.h5 # make a copy to dvc save
    
    hist_pd = pd.DataFrame(history.history)
    hist_pd.to_csv(os.path.join(model_dir, f'history_{date}.csv'), index = False)

    test_loss = vae.evaluate(X_test)
    test_loss = dict(zip(["loss", "reconstruction_loss", "kl_loss"], test_loss))

    print('Test loss:', test_loss)

    with open(os.path.join(model_dir, f'test_loss_{date}.json'), 'w') as json_file:
        json.dump(test_loss, json_file, indent = 4)
        
    print(date)
    !sh scripts/run_dvcgit.sh model_dir/history_{date}.csv f"{date}"
    !sh scripts/run_dvcgit.sh model_dir/vae.weights_{date}.h5 f"{date}"
    !rm model_dir/vae.weights_{date}.h5 # delete copy
    
def run_train(num_files, date, vae, data_dir, model_dir):
    slp = load_data(data_dir) # load data
    print("shape:", np.shape(slp)) # verify data shape
    
    # split the data - y values are throw away
    X_train, X_test, y_train, y_test = train_test_split(slp[0:(num_files * 40 - 1), :, :, :], np.arange(0, num_files * 40 - 1), test_size = 0.2, random_state = 1)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state = 1) # 0.25 x 0.8 = 0.2

    train_model(X_train, X_test, X_valid, date, vae, model_dir)
    remove_data(data_pdir)

# Train the VAE model

In [6]:
# model build

latent_dim = 2

# build encoder
encoder = build_encoder(latent_dim)
print("Memory usage after building encoder:", tf.config.experimental.get_memory_info('GPU:0'))

# build decoder
decoder = build_decoder(latent_dim)
print("Memory usage after building decoder:", tf.config.experimental.get_memory_info('GPU:0'))

# build VAE (variational autoencoder)
vae = VAE(encoder, decoder)
vae.compile(optimizer = 'rmsprop') 
print("Memory usage after building VAE:", tf.config.experimental.get_memory_info('GPU:0'))

2024-07-25 14:33:26.248557: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


2024-07-25 14:33:26.250546: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-25 14:33:26.252578: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-25 14:33:26.384473: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

None
Memory usage after building encoder: {'current': 1311232, 'peak': 3153408}


None
Memory usage after building decoder: {'current': 3216640, 'peak': 6049536}
Memory usage after building VAE: {'current': 3219200, 'peak': 6049536}


In [7]:
# parameter cell for pm 
year = "2018"
day = "01"
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] 
ensembles = ["c00"] #, "p01", "p02", "p03", "p04"]

In [8]:
# Parameters
year = "2003"
day = "20"


In [9]:
# training

num_files = 0

# get wanted data -------------------------------------------------------------------------
for month in months:
    for ensemble in ensembles:
        download_file(year, month, day, ensemble, data_pdir)
        convert_file(year, month, day, ensemble, data_dir)
        subset_file(f'pres_msl_{year}{month}{day}00_{ensemble}.nc', data_dir)

        if f'pres_msl_{year}{month}{day}00_{ensemble}.nc' in os.listdir(data_dir):
            num_files += 1
# ------------------------------------------------------------------------------------------          
     
run_train(num_files, year + day, vae, data_dir, model_dir) # run training 
    
print("Memory usage after training:", tf.config.experimental.get_memory_info('GPU:0'))

shape: (480, 721, 1440, 1)


  saveable.load_own_variables(weights_store.get(inner_path))


Epoch 1/50


I0000 00:00:1721918226.962689   82762 service.cc:145] XLA service 0x15222c00c080 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1721918226.962736   82762 service.cc:153]   StreamExecutor device (0): NVIDIA A10G, Compute Capability 8.6
2024-07-25 14:37:07.028323: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


2024-07-25 14:37:07.216872: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


2024-07-25 14:37:11.537023: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.07GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-25 14:37:11.537091: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.07GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-25 14:37:11.537113: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.07GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-25 14:37:11.537134: W external/local_tsl/tsl/framework/bfc_

2024-07-25 14:37:14.091869: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[40,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[40,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:37:58.156554: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 45.064752505s
Trying algorithm eng0{} for conv (f32[40,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[40,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:38:20.227982: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[64,32,11,11]{3,2,1,0}, u8[0]{0}) custom-call(f32[40,32,721,1440]{3,2,1,0}, f32[40,64,79,143]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:38:24.929055: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 5.701130817s
Trying algorithm eng0{} for conv (f32[64,32,11,11]{3,2,1,0}, u8[0]{0}) custom-call(f32[40,32,721,1440]{3,2,1,0}, f32[40,64,79,143]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


I0000 00:00:1721918311.974033   82762 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m10:05[0m 87s/step - kl_loss: 4.5086e-04 - loss: 860.0428 - reconstruction_loss: 860.0424

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 238ms/step - kl_loss: 4.5014e-04 - loss: 860.0762 - reconstruction_loss: 860.0758 

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 237ms/step - kl_loss: 4.4938e-04 - loss: 860.0538 - reconstruction_loss: 860.0533

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 4.4880e-04 - loss: 860.0824 - reconstruction_loss: 860.0819

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 4.4841e-04 - loss: 860.0728 - reconstruction_loss: 860.0724

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.4814e-04 - loss: 860.0573 - reconstruction_loss: 860.0568

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.4794e-04 - loss: 860.0480 - reconstruction_loss: 860.0475

2024-07-25 14:38:37.027997: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[7,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[7,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:38:43.915898: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 7.887961793s
Trying algorithm eng0{} for conv (f32[7,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[7,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - kl_loss: 4.4780e-04 - loss: 860.0314 - reconstruction_loss: 860.0309   




2024-07-25 14:38:57.834935: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[16,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[16,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:39:14.861863: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 18.026990723s
Trying algorithm eng0{} for conv (f32[16,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[16,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 7s/step - kl_loss: 4.4769e-04 - loss: 860.0184 - reconstruction_loss: 860.0179 - val_kl_loss: 4.4777e-04 - val_loss: 859.8553 - val_reconstruction_loss: 859.8549


Epoch 2/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 342ms/step - kl_loss: 4.4777e-04 - loss: 859.7046 - reconstruction_loss: 859.7042

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.4817e-04 - loss: 859.6471 - reconstruction_loss: 859.6466

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.4860e-04 - loss: 859.6536 - reconstruction_loss: 859.6532

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.4904e-04 - loss: 859.6844 - reconstruction_loss: 859.6841

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.4950e-04 - loss: 859.7070 - reconstruction_loss: 859.7065

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.4997e-04 - loss: 859.7130 - reconstruction_loss: 859.7125

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.5040e-04 - loss: 859.7195 - reconstruction_loss: 859.7191

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.5111e-04 - loss: 859.7892 - reconstruction_loss: 859.7888 - val_kl_loss: 4.5665e-04 - val_loss: 859.8659 - val_reconstruction_loss: 859.8655


Epoch 3/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 4.5665e-04 - loss: 859.8461 - reconstruction_loss: 859.8456

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.5693e-04 - loss: 859.7028 - reconstruction_loss: 859.7024

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.5723e-04 - loss: 859.6498 - reconstruction_loss: 859.6494

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.5749e-04 - loss: 859.6769 - reconstruction_loss: 859.6764

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.5777e-04 - loss: 859.6855 - reconstruction_loss: 859.6851

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.5800e-04 - loss: 859.7079 - reconstruction_loss: 859.7074

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.5821e-04 - loss: 859.7261 - reconstruction_loss: 859.7256

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.5854e-04 - loss: 859.7801 - reconstruction_loss: 859.7796 - val_kl_loss: 4.6128e-04 - val_loss: 859.9121 - val_reconstruction_loss: 859.9116


Epoch 4/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 4.6128e-04 - loss: 860.0392 - reconstruction_loss: 860.0388

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.6151e-04 - loss: 859.9750 - reconstruction_loss: 859.9745

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.6181e-04 - loss: 859.8921 - reconstruction_loss: 859.8916

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6213e-04 - loss: 859.8243 - reconstruction_loss: 859.8239

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6238e-04 - loss: 859.8041 - reconstruction_loss: 859.8036

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6262e-04 - loss: 859.7940 - reconstruction_loss: 859.7935

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6284e-04 - loss: 859.7901 - reconstruction_loss: 859.7896

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.6324e-04 - loss: 859.8203 - reconstruction_loss: 859.8198 - val_kl_loss: 4.6776e-04 - val_loss: 859.9106 - val_reconstruction_loss: 859.9102


Epoch 5/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 4.6776e-04 - loss: 859.4241 - reconstruction_loss: 859.4236

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.6826e-04 - loss: 859.5529 - reconstruction_loss: 859.5524

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.6863e-04 - loss: 859.7089 - reconstruction_loss: 859.7084

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6894e-04 - loss: 859.7869 - reconstruction_loss: 859.7864

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6920e-04 - loss: 859.8174 - reconstruction_loss: 859.8169

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6941e-04 - loss: 859.8249 - reconstruction_loss: 859.8244

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.6960e-04 - loss: 859.8242 - reconstruction_loss: 859.8237

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 4.6993e-04 - loss: 859.8341 - reconstruction_loss: 859.8336 - val_kl_loss: 4.7354e-04 - val_loss: 859.8754 - val_reconstruction_loss: 859.8749


Epoch 6/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 4.7354e-04 - loss: 859.7825 - reconstruction_loss: 859.7820

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.7405e-04 - loss: 859.9767 - reconstruction_loss: 859.9763

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.7442e-04 - loss: 859.8633 - reconstruction_loss: 859.8629

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.7473e-04 - loss: 859.8477 - reconstruction_loss: 859.8472

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.7501e-04 - loss: 859.8429 - reconstruction_loss: 859.8424

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.7527e-04 - loss: 859.8288 - reconstruction_loss: 859.8283

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.7550e-04 - loss: 859.8293 - reconstruction_loss: 859.8289

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.7588e-04 - loss: 859.8460 - reconstruction_loss: 859.8455 - val_kl_loss: 4.8137e-04 - val_loss: 859.8542 - val_reconstruction_loss: 859.8537


Epoch 7/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 333ms/step - kl_loss: 4.8137e-04 - loss: 859.7552 - reconstruction_loss: 859.7547

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.8155e-04 - loss: 859.6024 - reconstruction_loss: 859.6019

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.8163e-04 - loss: 859.5427 - reconstruction_loss: 859.5422

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8176e-04 - loss: 859.5388 - reconstruction_loss: 859.5383

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8187e-04 - loss: 859.5613 - reconstruction_loss: 859.5608

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8197e-04 - loss: 859.5768 - reconstruction_loss: 859.5763

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8208e-04 - loss: 859.5994 - reconstruction_loss: 859.5989

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.8229e-04 - loss: 859.6816 - reconstruction_loss: 859.6812 - val_kl_loss: 4.8430e-04 - val_loss: 859.8930 - val_reconstruction_loss: 859.8925


Epoch 8/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 4.8430e-04 - loss: 859.9863 - reconstruction_loss: 859.9858

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.8459e-04 - loss: 859.9562 - reconstruction_loss: 859.9557

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.8488e-04 - loss: 859.9028 - reconstruction_loss: 859.9023

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8505e-04 - loss: 859.8663 - reconstruction_loss: 859.8658

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8519e-04 - loss: 859.8542 - reconstruction_loss: 859.8538

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8529e-04 - loss: 859.8557 - reconstruction_loss: 859.8552

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8538e-04 - loss: 859.8604 - reconstruction_loss: 859.8599

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 4.8554e-04 - loss: 859.8136 - reconstruction_loss: 859.8131 - val_kl_loss: 4.8840e-04 - val_loss: 859.8639 - val_reconstruction_loss: 859.8634


Epoch 9/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 4.8840e-04 - loss: 859.5508 - reconstruction_loss: 859.5503

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.8863e-04 - loss: 859.6238 - reconstruction_loss: 859.6233

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.8912e-04 - loss: 859.7040 - reconstruction_loss: 859.7035

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8952e-04 - loss: 859.7659 - reconstruction_loss: 859.7654

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.8981e-04 - loss: 859.7698 - reconstruction_loss: 859.7693

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9000e-04 - loss: 859.7625 - reconstruction_loss: 859.7620

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9015e-04 - loss: 859.7621 - reconstruction_loss: 859.7617

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 4.9037e-04 - loss: 859.7936 - reconstruction_loss: 859.7931 - val_kl_loss: 4.9007e-04 - val_loss: 859.8329 - val_reconstruction_loss: 859.8325


Epoch 10/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 4.9007e-04 - loss: 859.5696 - reconstruction_loss: 859.5691

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.9024e-04 - loss: 859.7482 - reconstruction_loss: 859.7477

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 4.9050e-04 - loss: 859.7393 - reconstruction_loss: 859.7388

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9075e-04 - loss: 859.7216 - reconstruction_loss: 859.7211

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9101e-04 - loss: 859.7126 - reconstruction_loss: 859.7121

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9127e-04 - loss: 859.7160 - reconstruction_loss: 859.7155

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9150e-04 - loss: 859.7308 - reconstruction_loss: 859.7303

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 4.9186e-04 - loss: 859.7720 - reconstruction_loss: 859.7715 - val_kl_loss: 4.9529e-04 - val_loss: 859.9004 - val_reconstruction_loss: 859.8999


Epoch 11/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 4.9529e-04 - loss: 859.8552 - reconstruction_loss: 859.8547

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.9537e-04 - loss: 859.9011 - reconstruction_loss: 859.9006

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 4.9551e-04 - loss: 859.9147 - reconstruction_loss: 859.9142

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9561e-04 - loss: 859.8934 - reconstruction_loss: 859.8929

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9571e-04 - loss: 859.8773 - reconstruction_loss: 859.8768

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9579e-04 - loss: 859.8518 - reconstruction_loss: 859.8513

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 4.9591e-04 - loss: 859.8433 - reconstruction_loss: 859.8428

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 4.9610e-04 - loss: 859.8348 - reconstruction_loss: 859.8344 - val_kl_loss: 5.0073e-04 - val_loss: 859.8491 - val_reconstruction_loss: 859.8486


Epoch 12/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 5.0073e-04 - loss: 859.0171 - reconstruction_loss: 859.0166

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0114e-04 - loss: 859.2046 - reconstruction_loss: 859.2041

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0135e-04 - loss: 859.3253 - reconstruction_loss: 859.3248

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0149e-04 - loss: 859.4317 - reconstruction_loss: 859.4312

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0158e-04 - loss: 859.5093 - reconstruction_loss: 859.5088

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0164e-04 - loss: 859.5577 - reconstruction_loss: 859.5573

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0169e-04 - loss: 859.5980 - reconstruction_loss: 859.5975

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 5.0178e-04 - loss: 859.6241 - reconstruction_loss: 859.6237 - val_kl_loss: 5.0626e-04 - val_loss: 859.8555 - val_reconstruction_loss: 859.8550


Epoch 13/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 5.0626e-04 - loss: 860.2395 - reconstruction_loss: 860.2390

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0647e-04 - loss: 860.0966 - reconstruction_loss: 860.0961

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0650e-04 - loss: 859.9975 - reconstruction_loss: 859.9970

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0648e-04 - loss: 859.9813 - reconstruction_loss: 859.9808

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0645e-04 - loss: 859.9832 - reconstruction_loss: 859.9827

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0642e-04 - loss: 859.9811 - reconstruction_loss: 859.9806

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0642e-04 - loss: 859.9553 - reconstruction_loss: 859.9548

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 5.0643e-04 - loss: 859.9203 - reconstruction_loss: 859.9198 - val_kl_loss: 5.0684e-04 - val_loss: 859.9199 - val_reconstruction_loss: 859.9194


Epoch 14/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 5.0684e-04 - loss: 859.8661 - reconstruction_loss: 859.8657

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0694e-04 - loss: 859.7830 - reconstruction_loss: 859.7825

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0702e-04 - loss: 859.8770 - reconstruction_loss: 859.8765

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0706e-04 - loss: 859.8978 - reconstruction_loss: 859.8973

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0706e-04 - loss: 859.9019 - reconstruction_loss: 859.9014

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0708e-04 - loss: 859.8967 - reconstruction_loss: 859.8962

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0711e-04 - loss: 859.8823 - reconstruction_loss: 859.8818

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 5.0719e-04 - loss: 859.8572 - reconstruction_loss: 859.8567 - val_kl_loss: 5.0865e-04 - val_loss: 859.8293 - val_reconstruction_loss: 859.8288


Epoch 15/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 331ms/step - kl_loss: 5.0865e-04 - loss: 859.4210 - reconstruction_loss: 859.4205

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0864e-04 - loss: 859.4585 - reconstruction_loss: 859.4580

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0874e-04 - loss: 859.4952 - reconstruction_loss: 859.4948

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0883e-04 - loss: 859.5371 - reconstruction_loss: 859.5366

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0889e-04 - loss: 859.5828 - reconstruction_loss: 859.5823

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0891e-04 - loss: 859.6042 - reconstruction_loss: 859.6038

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0891e-04 - loss: 859.6296 - reconstruction_loss: 859.6291

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 5.0887e-04 - loss: 859.6744 - reconstruction_loss: 859.6739 - val_kl_loss: 5.0649e-04 - val_loss: 859.9767 - val_reconstruction_loss: 859.9763


Epoch 16/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 373ms/step - kl_loss: 5.0649e-04 - loss: 859.7143 - reconstruction_loss: 859.7138

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0635e-04 - loss: 859.7323 - reconstruction_loss: 859.7318

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0620e-04 - loss: 859.8226 - reconstruction_loss: 859.8221

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0608e-04 - loss: 859.8377 - reconstruction_loss: 859.8372

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0604e-04 - loss: 859.8436 - reconstruction_loss: 859.8431

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0603e-04 - loss: 859.8306 - reconstruction_loss: 859.8301

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0606e-04 - loss: 859.8278 - reconstruction_loss: 859.8273

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 5.0613e-04 - loss: 859.8234 - reconstruction_loss: 859.8229 - val_kl_loss: 5.0658e-04 - val_loss: 859.8441 - val_reconstruction_loss: 859.8436


Epoch 17/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 373ms/step - kl_loss: 5.0658e-04 - loss: 859.1378 - reconstruction_loss: 859.1373

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0633e-04 - loss: 859.2604 - reconstruction_loss: 859.2599

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0626e-04 - loss: 859.3093 - reconstruction_loss: 859.3088

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0628e-04 - loss: 859.3726 - reconstruction_loss: 859.3721

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0628e-04 - loss: 859.4044 - reconstruction_loss: 859.4039

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0623e-04 - loss: 859.4454 - reconstruction_loss: 859.4449

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 236ms/step - kl_loss: 5.0617e-04 - loss: 859.4939 - reconstruction_loss: 859.4934

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 259ms/step - kl_loss: 5.0604e-04 - loss: 859.5651 - reconstruction_loss: 859.5646 - val_kl_loss: 5.0495e-04 - val_loss: 859.9072 - val_reconstruction_loss: 859.9067


Epoch 18/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 334ms/step - kl_loss: 5.0495e-04 - loss: 859.6920 - reconstruction_loss: 859.6915

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0535e-04 - loss: 859.6962 - reconstruction_loss: 859.6957

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 235ms/step - kl_loss: 5.0561e-04 - loss: 859.7247 - reconstruction_loss: 859.7242

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0572e-04 - loss: 859.7672 - reconstruction_loss: 859.7667

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0580e-04 - loss: 859.7898 - reconstruction_loss: 859.7893

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0584e-04 - loss: 859.8082 - reconstruction_loss: 859.8077

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0589e-04 - loss: 859.8043 - reconstruction_loss: 859.8038

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 5.0598e-04 - loss: 859.8282 - reconstruction_loss: 859.8278 - val_kl_loss: 5.0742e-04 - val_loss: 859.8666 - val_reconstruction_loss: 859.8661


Epoch 19/50


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 332ms/step - kl_loss: 5.0742e-04 - loss: 859.6811 - reconstruction_loss: 859.6806

[1m2/8[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0731e-04 - loss: 859.7928 - reconstruction_loss: 859.7924

[1m3/8[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1s[0m 236ms/step - kl_loss: 5.0728e-04 - loss: 859.8513 - reconstruction_loss: 859.8508

[1m4/8[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0725e-04 - loss: 859.8356 - reconstruction_loss: 859.8351

[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0727e-04 - loss: 859.8212 - reconstruction_loss: 859.8207

[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0728e-04 - loss: 859.8064 - reconstruction_loss: 859.8059

[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 235ms/step - kl_loss: 5.0728e-04 - loss: 859.8033 - reconstruction_loss: 859.8029

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 258ms/step - kl_loss: 5.0728e-04 - loss: 859.8211 - reconstruction_loss: 859.8206 - val_kl_loss: 5.0995e-04 - val_loss: 859.8673 - val_reconstruction_loss: 859.8668


2024-07-25 14:40:05.914039: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[32,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


2024-07-25 14:40:40.965805: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 36.051839104s
Trying algorithm eng0{} for conv (f32[32,32,721,1440]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,64,79,143]{3,2,1,0}, f32[64,32,11,11]{3,2,1,0}), window={size=11x11 stride=9x10}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...


[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:35[0m 48s/step - kl_loss: 5.0865e-04 - loss: 860.0677 - reconstruction_loss: 860.0673

[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 89ms/step - kl_loss: 5.0865e-04 - loss: 860.0721 - reconstruction_loss: 860.0717 

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - kl_loss: 5.0865e-04 - loss: 860.1227 - reconstruction_loss: 860.1223

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 90ms/step - kl_loss: 5.0865e-04 - loss: 860.1481 - reconstruction_loss: 860.1476


Test loss: {'loss': 860.2235107421875, 'reconstruction_loss': 0.0005086498567834496, 'kl_loss': 860.2239990234375}
200320


!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      

[?25l[32m⠋[0m Checking graph
[?25h[1A[2K!  0% Adding...|                                      |0/1 [00:00<?,     ?file/s]Adding...                                                                       
![A
Collecting files and computing hashes in model_dir/history_200320.csv |0.00 [00:[A


                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-weather-e[A
                                                                                [A
![A
  0%|          |Adding model_dir/history_200320.csv to0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/lobielodan/parsl_mp0/1 [00:00<?,    ?files/s][A
                                                                                [A100% Adding...|████████████████████████████████████████|1/1 [00:00, 39.55file/s]

To track the changes with git, run:

	git add model_dir/history_200320.csv.dvc model_dir/.gitignore

To enable auto staging, run:

	dvc config core.autostage true


[0m

fatal: pathspec '.gitignore' did not match any files


!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      

!Collecting                                            |0.00 [00:00,    ?entry/s]

Collecting                                            |11.0 [00:00,  102entry/s]

Collecting                                            |22.0 [00:00, 95.3entry/s]

Collecting                                            |23.0 [00:00, 91.1entry/s]
!Pushing
![A
  0% Checking cache in '/aws-dvc-bucket/files/md5'|  |0/? [00:00<?,    ?files/s][A


  4% Querying cache in '/aws-dvc-bucket/files/md5'| |1/23 [00:00<00:03,  6.76fil[A


 13% Querying cache in '/aws-dvc-bucket/files/md5'|▏|3/23 [00:00<00:01, 12.96fil[A


 26% Querying cache in '/aws-dvc-bucket/files/md5'|▎|6/23 [00:00<00:00, 17.76fil[A


 35% Querying cache in '/aws-dvc-bucket/files/md5'|▎|8/23 [00:00<00:00, 17.88fil[A


 48% Querying cache in '/aws-dvc-bucket/files/md5'|▍|11/23 [00:00<00:01, 11.51fi[A


 61% Querying cache in '/aws-dvc-bucket/files/md5'|▌|14/23 [00:01<00:00, 14.01fi[A


 74% Querying cache in '/aws-dvc-bucket/files/md5'|▋|17/23 [00:01<00:00, 16.30fi[A


 87% Querying cache in '/aws-dvc-bucket/files/md5'|▊|20/23 [00:01<00:00, 17.58fi[A


100% Querying cache in '/aws-dvc-bucket/files/md5'|█|23/23 [00:01<00:00, 18.60fi[A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-weather-e[A
                                                                                [A
![A
  0%|          |Pushing to local                      0/1 [00:00<?,     ?file/s][A


100%|██████████|Pushing to local                  1/1 [00:00<00:00,  1.94file/s][A
                                                                                [APushing


1 file pushed


[0m

[main 82f1e98] f200320
 Committer: Parallel Works app-run user <lobielodan@mgmt-lobielodan-cvaetraining-00015.pw-canary-us-east-1.pw.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 1 file changed, 5 insertions(+)
 create mode 100644 run_on_cluster/cvae-weather-ensemble/model_dir/history_200320.csv.dvc




Enumerating objects: 10, done.
Counting objects:  10% (1/10)Counting objects:  20% (2/10)Counting objects:  30% (3/10)Counting objects:  40% (4/10)Counting objects:  50% (5/10)Counting objects:  60% (6/10)Counting objects:  70% (7/10)Counting objects:  80% (8/10)Counting objects:  90% (9/10)Counting objects: 100% (10/10)Counting objects: 100% (10/10), done.
Delta compression using up to 4 threads
Compressing objects:  16% (1/6)Compressing objects:  33% (2/6)Compressing objects:  50% (3/6)Compressing objects:  66% (4/6)Compressing objects:  83% (5/6)Compressing objects: 100% (6/6)Compressing objects: 100% (6/6), done.
Writing objects:  16% (1/6)Writing objects:  33% (2/6)Writing objects:  50% (3/6)Writing objects:  66% (4/6)Writing objects:  83% (5/6)Writing objects: 100% (6/6)Writing objects: 100% (6/6), 605 bytes | 605.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0), pack-reused 0


remote: Resolving deltas:   0% (0/4)[Kremote: Resolving deltas:  25% (1/4)[Kremote: Resolving deltas:  50% (2/4)[Kremote: Resolving deltas:  75% (3/4)[Kremote: Resolving deltas: 100% (4/4)[Kremote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K


To github.com:oobielodan/parsl_mpi.git
   18e8707..82f1e98  main -> main


!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      

[?25l[32m⠋[0m Checking graph
[?25h[1A[2K!  0% Adding...|                                      |0/1 [00:00<?,     ?file/s]Adding...                                                                       
![A
Collecting files and computing hashes in model_dir/vae.weights_200320.h5 |0.00 [[A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-weather-e[A
                                                                                [A
![A
  0%|          |Adding model_dir/vae.weights_200320.h50/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/lobielodan/parsl_mp0/1 [00:00<?,    ?files/s][A


                                                                                [A100% Adding...|████████████████████████████████████████|1/1 [00:00, 11.72file/s]

To track the changes with git, run:

	git add model_dir/.gitignore model_dir/vae.weights_200320.h5.dvc

To enable auto staging, run:

	dvc config core.autostage true


[0m

fatal: pathspec '.gitignore' did not match any files


!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      

!Collecting                                            |0.00 [00:00,    ?entry/s]

Collecting                                            |9.00 [00:00, 88.1entry/s]

Collecting                                            |18.0 [00:00, 85.5entry/s]

Collecting                                            |24.0 [00:00, 82.1entry/s]
!Pushing


![A
  0% Checking cache in '/aws-dvc-bucket/files/md5'|  |0/? [00:00<?,    ?files/s][A


 50% Querying cache in '/aws-dvc-bucket/files/md5'|▌|12/24 [00:00<00:00, 108.09f[A


100% Querying cache in '/aws-dvc-bucket/files/md5'|█|24/24 [00:00<00:00, 112.65f[A
                                                                                [A
![A
  0% Checking cache in '/home/lobielodan/parsl_mpi/run_on_cluster/cvae-weather-e[A
                                                                                [A
![A
  0%|          |Pushing to local                      0/1 [00:00<?,     ?file/s][A


100%|██████████|Pushing to local                  1/1 [00:01<00:00,  1.33s/file][A
                                                                                [APushing


1 file pushed


[0m

[main b816144] f200320
 Committer: Parallel Works app-run user <lobielodan@mgmt-lobielodan-cvaetraining-00015.pw-canary-us-east-1.pw.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 1 file changed, 5 insertions(+)
 create mode 100644 run_on_cluster/cvae-weather-ensemble/model_dir/vae.weights_200320.h5.dvc




Enumerating objects: 10, done.
Counting objects:  10% (1/10)Counting objects:  20% (2/10)Counting objects:  30% (3/10)Counting objects:  40% (4/10)Counting objects:  50% (5/10)Counting objects:  60% (6/10)Counting objects:  70% (7/10)Counting objects:  80% (8/10)Counting objects:  90% (9/10)Counting objects: 100% (10/10)Counting objects: 100% (10/10), done.
Delta compression using up to 4 threads
Compressing objects:  16% (1/6)Compressing objects:  33% (2/6)Compressing objects:  50% (3/6)Compressing objects:  66% (4/6)Compressing objects:  83% (5/6)Compressing objects: 100% (6/6)Compressing objects: 100% (6/6), done.
Writing objects:  16% (1/6)Writing objects:  33% (2/6)Writing objects:  50% (3/6)Writing objects:  66% (4/6)Writing objects:  83% (5/6)Writing objects: 100% (6/6)Writing objects: 100% (6/6), 597 bytes | 597.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0), pack-reused 0


remote: Resolving deltas:   0% (0/4)[Kremote: Resolving deltas:  25% (1/4)[Kremote: Resolving deltas:  50% (2/4)[Kremote: Resolving deltas:  75% (3/4)[Kremote: Resolving deltas: 100% (4/4)[Kremote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K


To github.com:oobielodan/parsl_mpi.git
   82f1e98..b816144  main -> main


Memory usage after training: {'current': 6979584, 'peak': 18227546112}
