In [2]:
import datetime

from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import RandomRotation, RandomZoom, Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
import tensorflow as tf

# Value passed as `epochs` to `model.fit`.
epochs = 100

# Image dimensions handed to both the dataset loader and the model input
# (square images, so both sides share one value).
width = height = 256

# Base name used to build checkpoint filenames and the TensorBoard log directory.
model_name = 'v2.h5'

In [3]:
def get_new_model(width, height):
    """Build and compile a small CNN that classifies single-channel images into two classes.

    The network rescales pixel values by 1/255, applies random rotation and
    zoom augmentation, runs three Conv2D + MaxPooling2D stages (32, 64 and 128
    filters), and ends in a 128-unit dense layer with dropout followed by a
    2-way softmax output.

    Args:
        width: First spatial dimension of the input tensor.
        height: Second spatial dimension of the input tensor.

    Returns:
        A compiled `Sequential` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).

    Note:
        The input shape is declared as `(width, height, 1)`, mirroring the
        `image_size=(width, height)` used by `train_model` in this notebook.
        NOTE(review): Keras reads both tuples as (rows, cols), so the two call
        sites must stay in the same order if non-square images are ever used.
    """
    model = Sequential([
        # Declare the input explicitly; passing `input_shape=` to the first
        # layer of a Sequential model is the deprecated form.
        tf.keras.Input(shape=(width, height, 1)),
        # preprocessing layers
        Rescaling(1./255),
        RandomRotation(0.2),
        RandomZoom(0.2, 0.2),
        # convolutional layers
        Conv2D(32, (3, 3), activation='relu', padding='same', name='conv1'),
        MaxPooling2D(pool_size=(2, 2), name='maxpool1'),
        Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2'),
        MaxPooling2D(pool_size=(2, 2), name='maxpool2'),
        Conv2D(128, (3, 3), activation='relu', padding='same', name='conv3'),
        MaxPooling2D(pool_size=(2, 2), name='maxpool3'),
        # classifier head
        Flatten(name='flatten'),
        Dense(128, activation='relu', name='dense1'),
        Dropout(0.5, name='dropout'),
        Dense(2, activation='softmax', name='output')
    ])

    # Labels are expected one-hot encoded, hence categorical cross-entropy.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [4]:
train_dir = "./data_training/training"
validation_dir = "./data_training/validation"


def _load_image_dataset(directory, batch_size, width, height):
    """Load a batched grayscale image dataset with 'categorical' labels from `directory`."""
    return tf.keras.preprocessing.image_dataset_from_directory(
        directory,
        image_size=(width, height),
        batch_size=batch_size,
        label_mode='categorical',
        color_mode='grayscale')


def train_model(model, batch_size, width, height, epochs, model_name, initial_epoch=0, log_dir=None,
                train_path=None, validation_path=None):
    """Fit `model` on the training images, validating and checkpointing every epoch.

    Args:
        model: Compiled Keras model to train.
        batch_size: Batch size used for both the training and validation datasets.
        width: First element of the `image_size` passed to the dataset loader.
        height: Second element of the `image_size` passed to the dataset loader.
        epochs: Value forwarded to `model.fit(epochs=...)`.
        model_name: Prefix for checkpoint files and, when `log_dir` is not
            given, suffix of the generated log directory name.
        initial_epoch: Value forwarded to `model.fit(initial_epoch=...)`;
            use a non-zero value to resume a previous run.
        log_dir: TensorBoard log directory. When falsy, a new timestamped
            directory under `logs/` is used.
        train_path: Directory with training images. Defaults to the
            module-level `train_dir`.
        validation_path: Directory with validation images. Defaults to the
            module-level `validation_dir`.

    Returns:
        A `(model, history)` tuple, where `history` is the object returned by
        `model.fit`.
    """
    # Fall back to the notebook-level directories rather than repeating the literals here.
    if train_path is None:
        train_path = train_dir
    if validation_path is None:
        validation_path = validation_dir

    train_ds = _load_image_dataset(train_path, batch_size, width, height)
    validation_ds = _load_image_dataset(validation_path, batch_size, width, height)

    # Use the existing log directory if provided, else create a new one
    if not log_dir:
        log_dir = f"logs/{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}_{model_name}"

    tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=1, write_images=True)
    # The doubled braces leave a literal `{epoch:02d}` placeholder in the path
    # for ModelCheckpoint to fill in when it saves.
    checkpoint_path = f"{model_name}_{{epoch:02d}}.weights.h5"
    checkpoint = ModelCheckpoint(checkpoint_path, save_weights_only=True, save_freq='epoch')

    history = model.fit(
        train_ds,
        epochs=epochs,
        initial_epoch=initial_epoch,
        validation_data=validation_ds,
        callbacks=[tensorboard, checkpoint])

    return model, history

In [5]:
# Resume training from a previously saved checkpoint.
# `initial_epoch` is passed to `train_model` and also selects which checkpoint
# file to load, so the two cannot drift apart.
initial_epoch = 32

# Rebuild the architecture, then restore the weights saved by `train_model`
# (same filename pattern as its `checkpoint_path`).
model = get_new_model(width, height)
checkpoint_file = f"{model_name}_{initial_epoch:02d}.weights.h5"
model.load_weights(checkpoint_file)

# Keep logging into the original run's TensorBoard directory.
# Set this to None to let `train_model` create a new timestamped directory instead.
existing_log_dir = "logs/20240503-233657_v2.h5"

model, history = train_model(
    model,
    batch_size=32,
    width=width,
    height=height,
    epochs=epochs,
    model_name=model_name,
    initial_epoch=initial_epoch,
    log_dir=existing_log_dir,
)

  super().__init__(**kwargs)
2024-05-04 07:15:49.739127: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Max
2024-05-04 07:15:49.739153: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 128.00 GB
2024-05-04 07:15:49.739157: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 48.00 GB
2024-05-04 07:15:49.739173: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-04 07:15:49.739184: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  saveable.load_own_variables(weights_store.get(inner_path))


Found 88442 files belonging to 2 classes.
Found 4026 files belonging to 2 classes.
Epoch 33/100


2024-05-04 07:15:52.295148: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 75ms/step - accuracy: 0.9821 - loss: 0.0509 - val_accuracy: 0.9853 - val_loss: 0.0471
Epoch 34/100
[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 79ms/step - accuracy: 0.9820 - loss: 0.0523 - val_accuracy: 0.9903 - val_loss: 0.0405
Epoch 35/100
[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 75ms/step - accuracy: 0.9830 - loss: 0.0476 - val_accuracy: 0.9896 - val_loss: 0.0471
Epoch 36/100
[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 75ms/step - accuracy: 0.9837 - loss: 0.0479 - val_accuracy: 0.9851 - val_loss: 0.0421
Epoch 37/100
[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 75ms/step - accuracy: 0.9831 - loss: 0.0502 - val_accuracy: 0.9886 - val_loss: 0.0557
Epoch 38/100
[1m2764/2764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 75ms/step - accuracy: 0.9837 - loss: 0.0473 - val_accuracy: 0.9883 - val_loss: 0.0440