In [1]:
import tensorflow   as tf
from losses import calc_loss
from anchor import multibox_target
from metrics import cls_eval, bbox_eval
from architecture import SSD
from utils import LogWriter
from datasets.face_dataset.face_dataset import CLASS_DICT, NUM_TRAIN_EXAMPLES
import datasets.face_dataset
import tensorflow_datasets as tfds
from batch import BatchDatasetForOD
from mean_average_precision import MetricBuilder
import os
import glob
import re
import numpy as np
import sys

2022-11-14 00:37:08.746361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-14 00:37:09.757400: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:


@tf.function(reduce_retracing=True)
def training_step(net, X, Y):
    with tf.GradientTape() as tape:
        anchors, cls_preds, bbox_preds = net(X)
        bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
        c = tf.nn.softmax(cls_preds[0])
        t = tf.cast(c[:,1] >  0.5, tf.float32)
        k = tf.cast(c[:,1] <=  0.5, tf.float32)
        tf.print(tf.reduce_sum(t),tf.reduce_sum(k), sys.stderr)
        l = calc_loss(anchors, cls_preds, cls_labels, bbox_preds, bbox_labels,
                    bbox_masks, positive_negative_ratio, image_size[0], image_size[1])
        l_mean = tf.reduce_mean(l)
    grads = tape.gradient(l_mean, net.trainable_weights)
    optimizer.apply_gradients(zip(grads, net.trainable_weights))
    return l_mean

@tf.function(reduce_retracing=True)
def test_step(net, X, Y):
    anchors, cls_preds, bbox_preds = net(X, training=False)
    bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
    l = calc_loss(anchors, cls_preds, cls_labels, bbox_preds, bbox_labels,
                    bbox_masks, positive_negative_ratio, image_size[0], image_size[1])
    l_mean = tf.reduce_mean(l)
    acc_err = 1-(cls_eval(cls_preds, cls_labels)/float(tf.size(cls_labels)))
    mae = (bbox_eval(bbox_preds, bbox_labels, bbox_masks))/float(tf.size(bbox_labels))
    return l_mean, acc_err , mae 

num_epoch = 2
start_epoch = -1
checkpoint_interval = 2
checkpoint_dir = './models/tiny_ssd_negative_mining_2'
checkpoint_path = None#'/dl/ssd/models/tiny_ssd_negative_mining_2/9_checkpoint.index'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9)
# optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
batch_size =32
positive_negative_ratio=1000
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, decay=5e-4)
image_size = (300,300)
log_path = "./log/tiny_ssd_negative_mining_2"
if not os.path.exists(log_path):
    os.makedirs(log_path)
logger = LogWriter(log_path = log_path)
strategy = tf.distribute.MirroredStrategy()
dataset = tfds.load('face_dataset')
min_val_loss = float('inf')


with strategy.scope():
    net = SSD(num_classes=len(CLASS_DICT.keys()))

if checkpoint_path is not None:
    net = tf.keras.models.load_model(checkpoint_path)
    start_epoch = int(checkpoint_path.split('/')[-1].split('_')[0])

for epoch in range(start_epoch+1,num_epoch):
    print(f'--- Epoch {epoch} ---')
    train_dataset = dataset['train'].shuffle(NUM_TRAIN_EXAMPLES)
    valid_dataset = dataset['valid']

    batched_train_dataset = BatchDatasetForOD(train_dataset, batch_size, image_size)
    batched_valid_dataset = BatchDatasetForOD(valid_dataset, batch_size,image_size)

    train_total_loss = 0
    for X,Y in batched_train_dataset:
      loss = training_step(net, X, Y)
      train_total_loss += loss
    
    total_acc_err = 0
    total_box_mae = 0
    val_total_loss = 0
    
    for X,Y in batched_valid_dataset:
      loss, acc_err, box_mae = test_step(net, X, Y)
      val_total_loss += loss
      total_acc_err += acc_err
      total_box_mae += box_mae

    print('train', 'total_loss', train_total_loss)
    print('val', 'total_loss', val_total_loss)
    print('val', 'box_mae', total_box_mae)
    print('val', 'acc_err', total_acc_err)

    if (epoch+1)%checkpoint_interval == 0:
        print(val_total_loss, min_val_loss)
        if val_total_loss < min_val_loss:
            print("Smallest val loss!!!")
            dst = os.path.join(checkpoint_dir, 'best_model')
            net.save(dst)
            min_val_loss = val_total_loss
        print('...Saving model...')
        dst = os.path.join(checkpoint_dir, f'{epoch}_model')
        net.save_weights(dst)
        regex = re.compile(f"^{epoch - checkpoint_interval}_model")
        for file in glob.glob(os.path.join(checkpoint_dir, '*')):
            if regex.match(file.split('/')[-1]):
                os.remove(file)

    logger.add_a_point('train', 'total_loss', train_total_loss,epoch)
    logger.add_a_point('val', 'box_mae', total_box_mae,epoch)
    logger.add_a_point('val', 'acc_err', total_acc_err, epoch)
    logger.add_a_point('val', 'total_loss', val_total_loss, epoch)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


2022-11-14 00:37:28.678087: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 00:37:28.704875: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 00:37:28.705610: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 00:37:28.707031: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

--- Epoch 0 ---


2022-11-14 00:38:10.468175: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


982 6182 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
1097 6067 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
893 6271 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
735 6429 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
754 6410 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
823 6341 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
832 6332 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
870 6294 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
820 6344 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
757 6407 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
745 6419 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
931 6233 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
858 6306 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
767 6397 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
764 6400 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
772 6392 <ipykernel.iost



INFO:tensorflow:Assets written to: ./models/tiny_ssd_negative_mining_2/best_model/assets


INFO:tensorflow:Assets written to: ./models/tiny_ssd_negative_mining_2/best_model/assets


...Saving model...


In [8]:
with strategy.scope():
    new_net = SSD(num_classes=len(CLASS_DICT.keys()))
new_net.load_weights('/dl/ssd/models/tiny_ssd_negative_mining_2/1_model')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f7bf9773c40>

In [12]:
new_net = tf.keras.models.load_model('/dl/ssd/models/tiny_ssd_negative_mining_2/best_model')





In [15]:
for X,Y in batched_train_dataset:
    loss = training_step(net, X, Y)

558 6606 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
817 6347 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
735 6429 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
810 6354 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
717 6447 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
731 6433 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
659 6505 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>
672 6492 <ipykernel.iostream.OutStream object at 0x7f7e793884f0>


KeyboardInterrupt: 