In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

from keras.callbacks import ModelCheckpoint, LearningRateScheduler # type: ignore
from sklearn.model_selection import train_test_split

from mcvd_transformer.dataset.parser import DataParser
from mcvd_transformer.dataset.dataloader import BatchGenerator
from mcvd_transformer.utils.objects import CoordinateSystem
from mcvd_transformer.model.model import create_model
from mcvd_transformer.model.callbacks import lr_scheduler, AdaptiveLossWeight
from mcvd_transformer.utils.postprocessing import PostProcessing
from mcvd_transformer.utils.evaluator import PerformanceEvaluator

EXPERIMENT_NAME = 'MCvD_Transformer'

2024-12-01 23:29:30.836898: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 23:29:30.862037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-01 23:29:30.887946: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-01 23:29:30.894939: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 23:29:30.913061: I tensorflow/core/platform/cpu_feature_guar

Read MCvD Simulations

In [2]:
data_parser = DataParser("data", unwanted_folders=[".git"], include_prism=True)
data_set = data_parser.parse_data()

100%|██████████| 38119/38119 [06:13<00:00, 102.09it/s]


Training Parameters

In [3]:
batch_size = 64
num_epochs = 400

data_seed = 1
numpy_seed = 10
dataloader_seed = 50

Split Dataset into Train, Validation and Test Parts

In [4]:
np.random.seed(numpy_seed)
np.random.shuffle(data_set)

train_set,test_val_set = train_test_split(data_set, test_size=0.3, random_state=data_seed)
val_set,test_set = train_test_split(test_val_set, test_size=0.33, random_state=data_seed)

del test_val_set

print(f"Size of Training Set = {len(train_set)}")
print(f"Size of Validation Set = {len(val_set)}")
print(f"Size of Test Set = {len(test_set)}")

Size of Training Set = 75880
Size of Validation Set = 21788
Size of Test Set = 10732


Create Batch Generators

In [5]:
training_batch_generator = BatchGenerator(
    train_set, 
    batch_size = batch_size, 
    coordinate_system = CoordinateSystem.BOTH, 
    random_rotate = True, 
    entity_order = "shuffle", 
    zero_padding = 10, 
    max_shape = True, 
    shuffle = True, 
    max_spherical_entity = 15,
    flatten = False,
    one_absorber_points = 0,
    random_seed=dataloader_seed
)

validation_batch_generator = BatchGenerator(
    val_set, 
    batch_size = batch_size, 
    coordinate_system = CoordinateSystem.BOTH, 
    random_rotate = True, 
    entity_order = "shuffle", 
    zero_padding = 10, 
    max_shape = True, 
    shuffle = True, 
    max_spherical_entity = 15,
    flatten = False,
    one_absorber_points = 0,
    random_seed=dataloader_seed
)

test_batch_generator = BatchGenerator(
    val_set, 
    batch_size = batch_size, 
    coordinate_system = CoordinateSystem.BOTH, 
    random_rotate = True, 
    entity_order = "shuffle", 
    zero_padding = 10, 
    max_shape = True, 
    shuffle = True, 
    max_spherical_entity = 15,
    flatten = False,
    one_absorber_points = 0,
    random_seed=dataloader_seed
)

Create Model

In [6]:
model, alpha, beta = create_model(training_batch_generator[0][0][0].shape[1:], training_batch_generator[0][0][1].shape[1])
model.summary()

I0000 00:00:1733085348.160286   32647 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1733085348.424252   32647 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1733085348.424822   32647 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1733085348.429503   32647 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1733085348.430122   32647 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

Create Training Callbacks

In [7]:
# Create Experiment Log Folder
os.makedirs('experiments' + os.path.sep + EXPERIMENT_NAME, exist_ok=True)

filepath = 'experiments' + os.path.sep + EXPERIMENT_NAME + os.path.sep + 'model.keras'
checkpoint = ModelCheckpoint(filepath, monitor='val_cir_max_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint, LearningRateScheduler(lr_scheduler, verbose=0), AdaptiveLossWeight(alpha,beta)]

Run Training

In [8]:
history = model.fit(
    training_batch_generator,
    epochs = 5,
    verbose = 1,
    validation_data = validation_batch_generator,
    callbacks=callbacks_list
)

Epoch 1/5


  self._warn_if_super_not_called()
I0000 00:00:1733085395.212980    1409 service.cc:146] XLA service 0x3a56be70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733085395.213066    1409 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3080 Laptop GPU, Compute Capability 8.6
2024-12-01 23:36:35.500130: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-01 23:36:38.998248: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:524] Loaded runtime CuDNN library: 8.1.0 but source was compiled with: 8.9.6.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2024-12-01 23:36:39.559457: E

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/runpy.py", line 87, in _run_code

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/asyncio/base_events.py", line 601, in run_forever

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code

  File "/tmp/ipykernel_32647/3815100869.py", line 1, in <module>

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/home/umut/anaconda3/envs/mcvd_transformer/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_11932]

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('Model Loss Curve')
plt.ylabel('loss')
plt.xlabel('epoch')

plt.legend(['train', 'val'], loc='upper left')

Calculate Performance Metrics

In [None]:
model.load_weights(filepath)

train_error_values = PerformanceEvaluator.create_evaluation_results(
    model,
    data_set = train_set,
    coordinate_system = CoordinateSystem.BOTH,
    max_number_of_spherical = 15,
    order = "shuffle",
    path = 'experiments' + os.path.sep + EXPERIMENT_NAME + os.path.sep + "train_error.json"
)

val_error_values = PerformanceEvaluator.create_evaluation_results(
    model,
    data_set = val_set,
    coordinate_system = CoordinateSystem.BOTH,
    max_number_of_spherical = 15,
    order = "shuffle",
    path = 'experiments' + os.path.sep + EXPERIMENT_NAME + os.path.sep + "val_error.json"
)

test_error_values = PerformanceEvaluator.create_evaluation_results(
    model,
    data_set = test_set,
    coordinate_system = CoordinateSystem.BOTH,
    max_number_of_spherical = 15,
    order = "shuffle",
    path = 'experiments' + os.path.sep + EXPERIMENT_NAME + os.path.sep + "test_error.json"
)

Plot Raw Estimations

In [None]:
index = 277
input_topology = val_set[index].rotate(10,45)

input_top,input_num = input_topology.convert_numpy(CoordinateSystem.BOTH,15,"shuffle",False)
shape,max_value,_ = model.predict([np.expand_dims(input_top,axis=0),np.expand_dims(input_num,axis=0)])

prediction = PostProcessing.postprocessing_separate(shape[0],max_value[0])

PerformanceEvaluator.time_graph(
    time_output_actual = input_topology.time_output,
    time_output_predicted_list = [prediction],
    legend=["Ground Truth", "Prediction"],
    image_loc=None,
    time_res=1,
    expension_ratio=1,
    path=None
)

Visualize Topology

In [None]:
input_topology.visualize()