In [1]:
import os

import numpy as np
import tensorflow as tf

In [2]:
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [3]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model

In [4]:
os.chdir('..')

In [5]:
from src.volumetric_unet import create_volumetric_unet

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
tf.config.list_physical_devices('GPU')

Num GPUs Available:  1


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
# Setting seed for reproducibility
os.environ['PYTHONHASHSEED']=str(123)

np.random.seed(123)
tf.random.set_seed(123)

In [8]:
X_train_combined = np.load('./data/X_train_combined.npy')
y_train_combined = np.load('./data/y_train_combined.npy')

In [9]:
X_val = np.load('./data/X_val.npy')
y_val = np.load('./data/y_val.npy')

X_train_combined.shape, y_train_combined.shape, X_val.shape, y_val.shape

((130050, 32, 32, 32, 1),
 (130050, 32, 32, 32, 1),
 (19125, 32, 32, 32, 1),
 (19125, 32, 32, 32, 1))

In [10]:
sample_size_train = int(0.9 * X_train_combined.shape[0])
sample_size_val = int(0.9 * X_val.shape[0])

indices_train = np.random.choice(X_train_combined.shape[0], sample_size_train, replace=False)
indices_val = np.random.choice(X_val.shape[0], sample_size_val, replace=False)

X_train_sample = X_train_combined[indices_train]
y_train_sample = y_train_combined[indices_train]
X_val_sample = X_val[indices_val]
y_val_sample = y_val[indices_val]

X_train_sample.shape, y_train_sample.shape, X_val_sample.shape, y_val_sample.shape

((117045, 32, 32, 32, 1),
 (117045, 32, 32, 32, 1),
 (17212, 32, 32, 32, 1),
 (17212, 32, 32, 32, 1))

In [11]:
# Parameters
Ngrid = 32  # grid size

In [12]:
unet_model = create_volumetric_unet((Ngrid, Ngrid, Ngrid, 1), num_classes=1)

2024-03-13 14:29:54.456233: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-13 14:30:01.745030: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:214] Using CUDA malloc Async allocator for GPU: 0
2024-03-13 14:30:01.748073: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38420 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0


In [13]:
# Compile the model
unet_model.compile(optimizer=Adam(),
                   loss='mse',
                   run_eagerly=False) # making it false to take benefit of graph execution and memory effeciency.

In [14]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

model_checkpoint = ModelCheckpoint('models/unet/130324_90p_samp_augment.hdf5', 
                                   monitor='val_loss',
                                   verbose=1, 
                                   save_best_only=True)

In [None]:
# Train the model
history = unet_model.fit(x=X_train_sample,
                    y=y_train_sample,
                    validation_data=(X_val_sample, y_val_sample),
                    batch_size=32, 
                    epochs=50,
                    callbacks=[early_stopping, reduce_lr, model_checkpoint])

Epoch 1/50


2024-03-12 23:52:25.545135: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


Epoch 1: val_loss improved from inf to 0.35293, saving model to models/unet/130324_90p_samp_augment.hdf5
Epoch 2/50

### Model saved

In [20]:
unet_model = load_model("./initial-state-universe-prediction-box/models/unet/130324_90p_samp_augment.hdf5")

In [21]:
unet_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 32, 32, 32,  0           []                               
                                 1)]                                                              
                                                                                                  
 conv3d (Conv3D)                (None, 32, 32, 32,   1792        ['input_1[0][0]']                
                                64)                                                               
                                                                                                  
 conv3d_1 (Conv3D)              (None, 32, 32, 32,   110656      ['conv3d[0][0]']                 
                                64)                                                           

In [24]:
history = unet_model.fit(x=X_train_sample,
                         y=y_train_sample,
                         validation_data=(X_val_sample, y_val_sample),
                         batch_size=32,
                         epochs=50,  # Set the total number of epochs
                         initial_epoch=23,  # Start training from epoch 2
                         callbacks=[early_stopping, reduce_lr, model_checkpoint])

Epoch 24/50


2024-03-13 14:34:50.373374: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


Epoch 24: val_loss improved from inf to 0.28878, saving model to models/unet/130324_90p_samp_augment.hdf5
Epoch 25/50
Epoch 34: val_loss did not improve from 0.28616
Epoch 35/50
Epoch 36: val_loss did not improve from 0.28616


In [23]:
plot_model(unet_model, 
           to_file='models/unet/unet_model.png',
           show_shapes=True, 
           show_layer_names=True, 
           dpi=30)

-----