# Train the Topology-Optimizing Variational Autoencoder
## <u> Overview</u>: This model uses the same architecture as the TopoGen-VAE and adds fully connected multilayer perceptrons (MLPs), also known as fully connected neural networks, linear neural networks, or feedforward neural networks, to predict material properties from the latent vector. Once trained, the material property prediction (MPP) modules are used to perform gradient descent optimization on the latent vectors to achieve targeted properties. The optimized latent vectors are then decoded into topologies that have improved material properties.

### <u> Model configuration</u>: This model contains eight (8) MPP modules for predicting all nonzero material properties - mechanical, thermal conductivity, and fluid permeability. In the following notebook, "3_2_TO_with_TO_VAE", selected properties can be isolated for optimizing topologies with a smaller number of targeted properties and constraints. Having all properties embedded in the trained model adds robustness to the latent space correlations and organization.

In [1]:
# For setting the directory references for the entire package
from ML_workflow_utils_v3.PackageDirectories import PackageDirectories as PD   

# This code automatically sets the rootpath as the directory the entire package is contained in, which is then called to initialize the PackageDirectories class below
import os
# check current path if desired
# currentpath = os.getcwd()
# print(currentpath)

# Move the working directory three levels up and record the resulting absolute path as the package root.
# NOTE: the path passed to os.chdir is relative, so re-running this cell moves up another three levels each time.
os.chdir('../../../')
rootpath = os.getcwd()
# print(rootpath)

# Alternatively, rootpath can be set manually
# rootpath = 'filepath/containing/entire/ML_package/'

# Object holding the directory references (notebook path, source data path, voxel topology path) used by the cells below
directory = PD(rootpath = rootpath)

## Model configuration - material properties

#### <u>Discussion</u>: The material properties that the model will be trained to predict are set below using two variables, `matprops` (list) and `matprops_by_module` (list of lists)


For example, predicting volume fraction, CH_11, CH_22, vH_12, vH_13, kappaH_11 in four separate modules would be configured as follows:

`matprops = ['volFrac', 'CH_11 scaled', 'CH_22 scaled', 'vH_12 scaled', 'vH_13 scaled', 'kappaH_11 scaled']`

`matprops_by_module = [['volFrac'], ['CH_11 scaled', 'CH_22 scaled',], ['vH_12 scaled', 'vH_13 scaled'], ['kappaH_11 scaled']]`

The order of the material properties is maintained, and `matprops_by_module` is an input to the call to instantiate the model. Its number of elements defines the number of modules and the length of each sub-list defines the output dimension of the module.

__Note:__ Due to the model's ability to separate material properties, using *scaled* values is not necessary. However, if desiring to use unscaled properties, we recommend grouping similar properties together to avoid confounding due to scale mismatch.


In [2]:
#### Material Properties #####
# One inner list per material property prediction (MPP) module; the length of each
# inner list is the number of properties that module predicts.
matprops_by_module = [
    ['volFrac'],
    ['CH_11 scaled', 'CH_22 scaled', 'CH_33 scaled',
     'CH_44 scaled', 'CH_55 scaled', 'CH_66 scaled'],
    ['CH_12 scaled', 'CH_13 scaled', 'CH_23 scaled'],
    ['EH_11 scaled', 'EH_22 scaled', 'EH_33 scaled'],
    ['GH_23 scaled', 'GH_13 scaled', 'GH_12 scaled'],
    ['vH_12 scaled', 'vH_13 scaled', 'vH_23 scaled',
     'vH_21 scaled', 'vH_31 scaled', 'vH_32 scaled'],
    ['KH_11 scaled', 'KH_22 scaled', 'KH_33 scaled'],
    ['kappaH_11 scaled', 'kappaH_22 scaled', 'kappaH_33 scaled'],
]

# Flat list of every property, in the same order as the module groups above
matprops = [prop for module_props in matprops_by_module for prop in module_props]

num_props = len(matprops)  # for determining the output

In [3]:
# Output locations for this notebook: the notebook directory and its checkpoint sub-directory
nbpath = directory.nb_3_1_path
cp_dir = os.path.join(nbpath, 'model_CPs')

# fname_base ("filename base") is used as the base for model checkpoint and training history filenames.
# Set date to True if you want to include the date in the filename base.
date = False

# matprops_abbrev indicates the material properties being predicted, differentiating between different models.
# We recommend setting it to a readable abbreviation.
#
# examples: for all properties             - matprops_abbrev = allprops
#           for all properties *unscaled*  - matprops_abbrev = allprops_unscaled
matprops_abbrev = 'allprops'

# When the date flag is set, it is replaced by the date string that goes into the filename
if date:
    date = '24SEP24'

fname_base = f'TopOpt_VAE_{matprops_abbrev}_{date}' if date else f'TopOpt_VAE_{matprops_abbrev}'

In [4]:
import pandas as pd
import numpy as np
import json
import glob
import os

from ML_workflow_utils_v3.Dataset_Preprocessor import Dataset_Preprocessor as DataP

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F


import torch.utils.data as data
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.optim as optim

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# This section sets up the dataset using the Dataset_Preprocessor class (imported as DataP).

# Paths taken from the directory instance and passed to the Dataset_Preprocessor instance
data_source_directory = directory.source_data_path
meshdir = directory.voxeltopo_path

# The same two paths under alternate names, kept for reference
source_data_path = directory.source_data_path
voxel_dir = directory.voxeltopo_path

# Random seed used for splitting the data, or at least the dataframes, into train/validation/test splits.
seed = 17
testset_seed = seed

# Setting a range of volume fractions removes very low and very high volume fractions (densities) from the data,
# which are not practical for application considerations (additive manufacturing) and can confound model training.
volfrac_range = (0.01, 0.98)

# Name of the CSV file that contains the homogenized material properties
csv_fn = 'topology_multiphysics_database_by_partno.csv'

# Instantiate the dataset splitting object
datasplits = DataP(
    csv_fn=csv_fn,
    data_source_directory=data_source_directory,
    meshdir=meshdir,
    volfrac_range=volfrac_range,
)

In [6]:
# If the test set is chosen randomly, each number in test_set_topo_counts corresponds to the number of
# topologies from the respective topology family that will be in the test set.
# NOTE(review): the original note said the *length* of test_set_topo_counts must match "the number of
# topologies in the test split"; it presumably means the number of topology families being sampled -
# confirm against Dataset_Preprocessor.
#
# By default, every topology family is sampled from to construct the test set. If fewer are desired,
# create a custom list for topfam_sampling_set and construct test_set_topo_counts to reflect the number
# of topology families and number of topologies per family. For example
#
#     topo_families = ['lattice', 'tpms', 'topopt']
#     test_set_topo_counts = [3, 1, 2]
#
# Further configuration options for Dataset_Preprocessor are available in the .py file

# Topology families to sample from: every distinct value of the 'topology_family' column
topo_families = datasplits.matpropcsv['topology_family'].unique()

# Six entries: zero for the first family, then two topologies from each of the other five families
# (interpolated topologies not included)
test_set_topo_counts = [0] + [2] * 5

# Split the dataset with the call to this function
datasplits.TrainTestSplit(
    topfam_sampling_set=topo_families,
    test_set_topo_counts=test_set_topo_counts,
    translate=False,
    testset_seed=seed,
)

In [7]:
# This dataset creator class modified for Regression VAE

from ML_workflow_utils_v3.TO_VAE_training_utils import tovae_train, tovae_validate, TOVAE_loss
from ML_workflow_utils_v3.RVAE_training_utils import RVAEDataset

batch_size = 32


def _make_split(indices):
    """Build the dataset for one split and a shuffling DataLoader over it."""
    dataset = RVAEDataset(indices, directory.voxeltopo_path, matprops)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataset, loader


# Training, validation and test splits, in that order
trdat, trloader = _make_split(datasplits.idxTr)
valdat, valloader = _make_split(datasplits.idxVal)
tedat, teloader = _make_split(datasplits.idxTe)


In [8]:
# This model is very sensitive to learning rate: above 0.0005 the loss diverges to very large numbers,
# while 0.0001 results in stable training.
lrate = 1e-4

# The learning rate as a string in scientific notation with no decimal places
lratestr = format(lrate, '.0e')

In [9]:
# Set dimension of latent vector (passed as the first argument to the TOVAE constructor when the model is created)
latent_dim = 512

In [10]:
from ML_workflow_utils_v3.TO_VAE import TOVAE

In [11]:
# Create the VAE model

# If multiple GPUs are available, set to True
gpu_parallel = False

# Build the model, optionally wrap it in DataParallel, then move it to the selected device
tovae = TOVAE(latent_dim, matprops, matprops_by_module)
if gpu_parallel:
    tovae = torch.nn.DataParallel(tovae)
tovae = tovae.to(device)



In [12]:
# Define the optimizer: Adam over all model parameters, using the learning rate set above
optimizer = optim.Adam(params=tovae.parameters(), lr=lrate)


In [13]:
# Directory for model checkpoints - specifies where the model's weights are saved
cp_dir = os.path.join(nbpath, 'model_CPs')

# cp = 'checkpoint', which is the term for the model's saved state (trainable weights) for an epoch that has improved validation error
cp_name = f'{fname_base}_model_weights.pth'
best_weights_path = os.path.join(cp_dir, cp_name)

# Directory for training history JSONs
histdir = os.path.join(nbpath, 'training_history_JSONs')

# Neither torch.save nor open(..., 'w') creates missing parent directories, so make sure both
# output directories exist before training starts (no effect if they are already there).
os.makedirs(cp_dir, exist_ok=True)
os.makedirs(histdir, exist_ok=True)


In [15]:
# Training loop settings and bookkeeping (model weights are saved during training)
EPOCHS = 500
patience = 100

# Both start at infinity so the first validation loss always counts as an improvement
min_val_loss = best_val_loss = float('inf')

early_stop_counter = 0
earlystop_min_delta = 0.001

best_epoch = 0

lossfunc_name = 'TOVAE_loss'

# Collecting the full loss value as well as its components. This is important data for monitoring
# training performance and ensuring that unacceptable loss behavior in one part of the function is
# not obscured by good performance of another component.
#
# tr = training, val = validation
#
# _losses     = total loss value (sum)
# _recon_loss = reconstruction loss, the inaccuracy between the reconstructed output and the input
# _kld_loss   = KL Divergence loss, measuring the inferred distribution
# _regr_loss  = regression loss, or the difference between the material properties that the MPPs
#               predict and the actual values (from the training dataset)
# _vf_loss    = loss for the prediction of volume fraction, which is produced and trained through
#               the variational sampling structure

# Each history gets its own, initially empty, list
tr_losses, tr_recon_loss, tr_kld_loss, tr_regr_loss, tr_vf_loss = ([] for _ in range(5))
val_losses, val_recon_loss, val_kld_loss, val_regr_loss, val_vf_loss = ([] for _ in range(5))



In [None]:
# Training loop: trains and validates once per epoch, records every loss component, saves the model
# weights whenever validation loss improves, and stops early when the acceptance thresholds are met
# or when validation loss has not improved by more than earlystop_min_delta for `patience` epochs.
epochs_completed = 0

# Setting the beta value for weighting KLD Loss in the loss function - for this value of beta, reconstruction loss converges and KLD Loss is not unacceptably large
beta = 0.01

# Alpha is for weighting the regression loss more strongly - we found that performance was acceptable with alpha = 1.0
alpha = 1.0


def _build_hist_dict():
    """Return the training history as a dictionary of loss lists.

    The dictionary values are the notebook-level history lists themselves (not copies).
    Any torch.Tensor entries in those lists are replaced in place by plain Python numbers
    so that the dictionary can be written with json.dump.
    """
    history = {
        f'train_loss {lossfunc_name}': tr_losses,
        'train_reconstruction_loss': tr_recon_loss,
        'train_kld_loss': tr_kld_loss,
        'train_regression_loss': tr_regr_loss,
        'train_volfrac_loss': tr_vf_loss,
        f'val_loss {lossfunc_name}': val_losses,
        'val_reconstruction_loss': val_recon_loss,
        'val_kld_loss': val_kld_loss,
        'val_regression_loss': val_regr_loss,
        'val_volfrac_loss': val_vf_loss,
    }
    for losses in history.values():
        # Slice assignment keeps the same list objects, so the notebook-level lists are updated too
        losses[:] = [entry.item() if isinstance(entry, torch.Tensor) else entry for entry in losses]
    return history


# True once this run has written a checkpoint to best_weights_path; guards the weight restore below
checkpoint_saved = False

try:
    for epoch in range(EPOCHS):
        # Train the model - outputs 0..4 are the total loss followed by its four components
        train_loss_output = tovae_train(tovae, trloader, optimizer, beta=beta, alpha=alpha)
        train_loss, reconst_loss, kl_loss, reg_loss, vf_loss = (train_loss_output[i] for i in range(5))

        # Validate the model - same output layout as training
        val_loss_output = tovae_validate(tovae, valloader, beta=beta, alpha=alpha)
        val_loss, vreconst_loss, vkl_loss, vreg_loss, vvf_loss = (val_loss_output[i] for i in range(5))

        # Training and validation for this epoch are done, so count it even if a break follows
        epochs_completed += 1

        print(f'train reconst loss: {reconst_loss:.5f}, KL loss: {kl_loss:.5f}, regr loss: {reg_loss:.5f}, volfrac loss: {vf_loss:.5f}')
        print(f'  val reconst loss: {vreconst_loss:.5f}, KL loss: {vkl_loss:.5f}, regr loss: {vreg_loss:.5f}, volfrac loss: {vvf_loss:.5f}')

        # Collect model training history
        tr_losses.append(train_loss)
        tr_recon_loss.append(reconst_loss)
        tr_kld_loss.append(kl_loss)
        tr_regr_loss.append(reg_loss)
        tr_vf_loss.append(vf_loss)

        val_losses.append(val_loss)
        val_recon_loss.append(vreconst_loss)
        val_kld_loss.append(vkl_loss)
        val_regr_loss.append(vreg_loss)
        val_vf_loss.append(vvf_loss)

        # Measured against the best loss *before* this epoch can update it; used for early stopping
        improvement_delta = best_val_loss - val_loss

        # Save model weights and stop training if three components of loss - reconstruction, regression, and volFrac prediction - are at an acceptable level - set based on development experience
        if vreconst_loss < 0.08 and vreg_loss <= 0.009 and vvf_loss < 0.005:
            best_val_loss = val_loss
            torch.save(tovae.state_dict(), best_weights_path)  # Save model weights to file
            checkpoint_saved = True
            best_epoch = epoch

            hist_dict = _build_hist_dict()

            print(f'Epoch [{epoch+1}/{EPOCHS}]\tTrain Loss: {train_loss:.5f}\tValidation Loss: {val_loss:.5f}')
            print('Acceptable loss criteria met, stopping training...')
            break

        # Save the model's weights if validation loss is improved
        if val_loss < best_val_loss:
            if best_val_loss == float('inf'):
                # No previous best yet: a percentage relative to infinity would print as nan
                print(f"Val loss improved from {best_val_loss:.5f} to {val_loss:.5f} saving model state...")
            else:
                pct_improved = (best_val_loss - val_loss) / best_val_loss * 100
                print(f"Val loss improved from {best_val_loss:.5f} to {val_loss:.5f} ({pct_improved:.2f}% improvement) saving model state...")
            best_val_loss = val_loss
            torch.save(tovae.state_dict(), best_weights_path)  # Save model weights to file
            checkpoint_saved = True
            best_epoch = epoch

            # Write an intermediate copy of the history alongside each new checkpoint
            hist_dict = _build_hist_dict()
            histdict_name = f'{fname_base}_traininghist_placeholder_{epoch}.json'
            histpath = os.path.join(histdir, histdict_name)
            with open(histpath, 'w') as f:
                json.dump(hist_dict, f)
        else:
            print(f'Val loss did not improve from {best_val_loss:.5f}.')

        # Only an improvement larger than earlystop_min_delta resets the early stopping counter
        if improvement_delta > earlystop_min_delta:
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        # Check for early stopping (the best weights are restored once, after the loop)
        if early_stop_counter >= patience:
            print(f'Validation loss did not improve for {early_stop_counter} epochs. Early stopping...')
            break

        print(f'Epoch [{epoch+1}/{EPOCHS}]\tTrain Loss: {train_loss:.5f}\tValidation Loss: {val_loss:.5f}')

    # Load the best weights at the end of training, however the loop ended, so the message below is true.
    # Skipped when this run never wrote a checkpoint, to avoid loading a missing or stale file.
    if checkpoint_saved:
        tovae.load_state_dict(torch.load(best_weights_path))  # Load best model weights
        print(f'Training epochs completed, best model weights restored - epoch {best_epoch}')
    else:
        print('Training epochs completed, but no checkpoint was saved during this run - model weights left unchanged')
    min_val_loss = best_val_loss

except KeyboardInterrupt:
    # Manual interruption is allowed: keep whatever history has been collected so far
    print('Training interrupted - collecting the training history recorded so far')
    hist_dict = _build_hist_dict()


In [17]:
# With training complete, define the training history dictionary (saved in a later cell).
# Keys are added in a fixed order: the five training histories first, then the five validation histories.
hist_dict = {}

hist_dict[f'train_loss {lossfunc_name}'] = tr_losses
hist_dict['train_reconstruction_loss'] = tr_recon_loss
hist_dict['train_kld_loss'] = tr_kld_loss
hist_dict['train_regression_loss'] = tr_regr_loss
hist_dict['train_volfrac_loss'] = tr_vf_loss

hist_dict[f'val_loss {lossfunc_name}'] = val_losses
hist_dict['val_reconstruction_loss'] = val_recon_loss
hist_dict['val_kld_loss'] = val_kld_loss
hist_dict['val_regression_loss'] = val_regr_loss
hist_dict['val_volfrac_loss'] = val_vf_loss

In [18]:
# Extract numbers from loss values if any are in torch.Tensor format.
# Entries are replaced in place, so the lists referenced by hist_dict are updated directly.
for losses in hist_dict.values():
    for idx, entry in enumerate(losses):
        if isinstance(entry, torch.Tensor):
            losses[idx] = entry.item()
            


In [19]:
# Save full model training history as JSON in the training history directory.
# The filename records the configured number of epochs and the epoch of the best checkpoint.
histdict_name = f'{fname_base}_model_hist_{EPOCHS}epochs_{best_epoch}_best.json'
histpath = os.path.join(histdir, histdict_name)

with open(histpath, mode='w') as hist_file:
    json.dump(hist_dict, hist_file)

# __End of notebook__ - use notebook "3_2_TO_with_TO_VAE.ipynb" for utilizing trained model to generate topologies