In [None]:
# Standard-library and third-party imports used by the training cells below.
import pickle
import numpy as np
import sys
import json
import os
from pathlib import Path

import keras

import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
# NOTE(review): with Agg, figures are not expected to render inline in the
# notebook — confirm this is intended (plots are presumably written to disk).
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Make the project checkout importable. These are machine-specific absolute
# paths: they must exist on the host running the notebook, otherwise the
# project imports below fail.
sys.path.append('/home/mvdokh/tracking/')
sys.path.append('/home/mvdokh/tracking/DeepLearningUtils')

# Project-local modules (resolved through the sys.path entries added above).
from utils import augmenters

from DeepLearningUtils.src.DeepLearningUtils.TrainingData.Keypoint.keypoint_data_generator import KeypointDataGenerator
from DeepLearningUtils.src.DeepLearningUtils.Optimizers.keras_schedulers import CosineDecayWithWarmup

from DeepLearningUtils.keras_unet_collection import models

from pole_tracker import MetricsLoggerCallback

In [16]:
# ---- Data and checkpoint locations ----
# Directory holding the pickled datasets (trailing slash included).
training_path = '/home/mvdokh/data/'
# File the best weights are written to by the checkpoint callback.
checkpoint_path = 'Tip+Curve+Ryan.weights.h5'

# ---- Model / batching parameters ----
num_classes = 1
input_shape = (256, 256, 3)
BATCH_SIZE = 32

# Augmentation pipeline handed to the training data generator.
augmentation = augmenters.create_contact_augmentation()

# ---- Output directory for training artifacts ----
# When executed as a script (`__file__` is defined) artifacts go next to the
# script; in a notebook kernel they go to ./training_outputs relative to the
# current working directory.
if '__file__' in globals():
    output_dir = Path(__file__).parent / 'training_outputs'
else:
    output_dir = Path('./training_outputs')
output_dir.mkdir(parents=True, exist_ok=True)

In [17]:
# Load the pickled (data, labels) tuples for the training and validation splits.
# The files are opened in `with` blocks so the handles are closed as soon as
# unpickling finishes, and the paths are built with os.path.join so the result
# does not depend on `training_path` ending in a slash.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open(os.path.join(training_path, 'training_data.pkl'), 'rb') as training_file:
    (training_data, training_labels) = pickle.load(training_file)
with open(os.path.join(training_path, 'testing_data.pkl'), 'rb') as validation_file:
    (validation_data, validation_labels) = pickle.load(validation_file)

# The training generator receives the augmentation pipeline; the validation
# generator does not.
training_generator = KeypointDataGenerator(
    training_data,
    training_labels,
    augmentation=augmentation,
    batch_size=BATCH_SIZE,
    training=True,
)
validation_generator = KeypointDataGenerator(
    validation_data,
    validation_labels,
    batch_size=BATCH_SIZE,
    training=False,
)

In [18]:
# Filter counts handed to att_unet_2d. Other configurations that were tried:
#   [64, 128, 256, 512, 1024]    # 256 -> 128 -> 64 -> 32 -> 16
#   [64, 64, 64, 128, 128, 128]
filter_num = [64] * 6

# Hyper-parameters for the attention U-Net, gathered in one place.
attunet_kwargs = dict(
    filter_num=filter_num,
    n_labels=num_classes,
    stack_num_down=2,
    stack_num_up=2,
    activation='ReLU',
    atten_activation='ReLU',
    attention='add',
    output_activation='Sigmoid',  # for multi-class use Softmax
    batch_norm=True,
    dropout=True,
    dropout_rate=0.1,
    l2_regularization=False,
    l2_weight=1e-4,
    pool=False,    # strided convolutions instead of max pooling
    unpool=False,  # transposed convolutions instead of upsampling
    backbone='EfficientNetB1',
    weights='imagenet',
    freeze_backbone=True,
    freeze_batch_norm=True,
    name='attunet',
)
attunet_base = models.att_unet_2d(input_shape, **attunet_kwargs)

# Wrap the U-Net so that the EfficientNet input preprocessing is part of the
# final model graph: raw input -> preprocess_input -> attention U-Net.
inputs = keras.Input(shape=input_shape)
x = keras.applications.efficientnet.preprocess_input(inputs)
outputs = attunet_base(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Learning-rate schedule fed to the optimizer.
# NOTE(review): the argument names suggest a warmup from 1e-5 towards 1e-4
# followed by a cosine phase — confirm against CosineDecayWithWarmup.
warmup_scheduler = CosineDecayWithWarmup(
    initial_learning_rate=1e-5,
    final_learning_rate=1e-4,
    n_warmup_steps=150*5,  # 750 optimizer steps
    n_cosine_steps=100000,
)
# AdamW driven by the schedule above, with gradient-norm clipping.
adamopt = keras.optimizers.AdamW(
        learning_rate=warmup_scheduler,
        clipnorm=0.1,
        #weight_decay=0.1,
    )

# TensorBoard for detailed monitoring.
# The log directory is derived from `output_dir` so TensorBoard logs land in
# the same place as the other training artifacts, instead of a separately
# hard-coded, CWD-relative path that can diverge from `output_dir`.
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=str(output_dir / 'tensorboard'),
    write_graph=True,
    write_images=True,
    histogram_freq=1,
    update_freq='epoch'
)

# Save best model weights (lowest validation loss seen so far)
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=str(checkpoint_path),
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# Early stopping to prevent overfitting: stop after 10 epochs without a
# val_loss improvement and roll back to the best weights.
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True)

# Custom metrics logger for persistent tracking
metrics_logger_callback = MetricsLoggerCallback(
    output_dir=output_dir,
    metrics_filename='training_metrics.json',
    plot_filename='training_progress.png'
)

# Learning rate reduction on plateau (disabled).
# NOTE(review): the optimizer's learning rate is a schedule object here; a
# callback that overwrites the learning rate is presumably incompatible with
# that — confirm before re-enabling.
#reduce_lr_callback = keras.callbacks.ReduceLROnPlateau(
#    monitor='val_loss',
#    factor=0.5,
#    patience=5,
#    min_lr=1e-7,
#    verbose=1
#)

# Mean-squared-error loss; no additional metrics are tracked.
model.compile(
    optimizer=adamopt,
    loss='mean_squared_error',
    metrics=[],
)

In [None]:
# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 150

# Callbacks, in the order they are handed to Keras.
training_callbacks = [
    tensorboard_callback,
    model_checkpoint_callback,
    early_stopping_callback,
    metrics_logger_callback,
]

r = model.fit(
    training_generator,
    epochs=epochs,
    validation_data=validation_generator,
    callbacks=training_callbacks,
)

# Number of epochs actually run (one 'loss' entry is recorded per epoch).
n_epochs_trained = len(r.history['loss'])

Epoch 1/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 302ms/step - loss: 4.4397e-04 - val_loss: 4.4673e-04 - learning_rate: 1.0000e-04
Epoch 2/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 249ms/step - loss: 3.7247e-04 - val_loss: 4.4152e-04 - learning_rate: 1.0000e-04
Epoch 3/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 234ms/step - loss: 3.6080e-04 - val_loss: 4.3948e-04 - learning_rate: 1.0000e-04
Epoch 4/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 237ms/step - loss: 3.5990e-04 - val_loss: 4.3936e-04 - learning_rate: 1.0000e-04
Epoch 5/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 247ms/step - loss: 3.6807e-04 - val_loss: 4.1437e-04 - learning_rate: 1.0000e-04
Epoch 6/150
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 3.4628e-04
Epoch 6: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
[1m194/194[0m

In [1]:
import tensorflow as tf

# Report the TensorFlow build and the GPUs it can see.
print("TensorFlow version:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU devices:", gpu_devices)

# Test GPU computation: multiply two large random matrices while pinned to
# the first GPU, then report the device that holds the result.
matrix_shape = [10000, 10000]
with tf.device('/GPU:0'):
    a = tf.random.normal(matrix_shape)
    b = tf.random.normal(matrix_shape)
    c = tf.matmul(a, b)
print("Matrix multiplication completed successfully on", c.device)

2025-08-28 18:11:11.409549: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-28 18:11:11.604161: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-28 18:11:11.604181: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-28 18:11:11.709234: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-28 18:11:11.815277: I tensorflow/core/platform/cpu_feature_guar

TensorFlow version: 2.15.0
GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


2025-08-28 18:11:22.707432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43622 MB memory:  -> device: 0, name: NVIDIA L40S, pci bus id: 0000:4a:00.0, compute capability: 8.9
2025-08-28 18:11:22.719787: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 43622 MB memory:  -> device: 1, name: NVIDIA L40S, pci bus id: 0000:61:00.0, compute capability: 8.9


Matrix multiplication completed successfully on /job:localhost/replica:0/task:0/device:GPU:0


In [1]:
# Install TensorFlow 2.19.0 GPU wheel with bundled CUDA/cuDNN
import subprocess
import sys

print("Installing TensorFlow 2.19.0 (GPU, bundled CUDA/cuDNN)...")
# Run pip through the kernel's own interpreter so the package is installed
# into the environment this notebook is running in.
pip_install_cmd = [
    sys.executable, "-m", "pip", "install",
    "--quiet", "--upgrade",
    "tensorflow[and-cuda]==2.19.0",
]
subprocess.check_call(pip_install_cmd)

# Import only after the install so the version report reflects the result.
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
print("GPU devices:", tf.config.list_physical_devices('GPU'))
print("NOTE: If TensorFlow was upgraded, restart the kernel and re-run from the top.")


Installing TensorFlow 2.19.0 (GPU, bundled CUDA/cuDNN)...


2025-08-29 10:03:22.511433: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-29 10:03:23.012999: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756476203.184553 1623911 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756476203.230240 1623911 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756476203.620546 1623911 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

TensorFlow version: 2.19.0
GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
NOTE: If TensorFlow was upgraded, restart the kernel and re-run from the top.
