# 06 - Model 1 Training

In [13]:
# importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, UpSampling2D, concatenate, LeakyReLU, ReLU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import MeanIoU
from sklearn.model_selection import train_test_split
import tifffile
from pathlib import Path
import os
import mlflow
import mlflow.tensorflow
from tqdm import tqdm
import time

# Project root: one level above the current working directory
TERRAFLOOD = Path("..")

In [14]:
# ------------ inputs --------------
# Experiment metadata (these values are also embedded in the output file names below)
experiment_number = 1
model_architecture = 1
epochs = 1
early_stopping_patience = 10
learning_rate_patience = 5

# Input dataset location and the root folder for everything this experiment writes
load_path = TERRAFLOOD / "data/dataset_balanced/"
save_path = TERRAFLOOD / f"experiments/model_{model_architecture}_exp_{experiment_number}/"

# Output layout and naming convention
# Best-model checkpoint written during training
checkpoint_dir = save_path / "checkpoint/"
checkpoint_path = checkpoint_dir.joinpath(
    f"model_{model_architecture}_check_exp_{experiment_number}_epochs_{epochs}_patience_{early_stopping_patience}_{learning_rate_patience}.keras"
)

# TensorBoard event files
tensorboard_logs_dir = save_path / "tensorboard_log/"

# Folder reserved for MLflow logs
mlflow_logs_dir = save_path / "mlflow_log/"

# Final trained model
model_save_dir = save_path / "model/"
model_path = model_save_dir.joinpath(
    f"model_{model_architecture}_exp_{experiment_number}_epochs_{epochs}_patience_{early_stopping_patience}_{learning_rate_patience}.keras"
)

# Create every output folder up front (no error if it already exists)
for output_dir in (checkpoint_dir, tensorboard_logs_dir, mlflow_logs_dir, model_save_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

In [15]:
# Hardware info
# Print TensorFlow version
print("TensorFlow version:", tf.__version__)

# List every physical device TensorFlow can see (CPU and any accelerators)
devices = tf.config.list_physical_devices()
print("Available devices:", devices)

# GPU info in details (assuming nvidia as GPU device)
# The leading `!` is an IPython shell escape, so this line only runs inside a notebook / IPython kernel
!nvidia-smi

TensorFlow version: 2.17.0
Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Mon Aug 12 14:10:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           On  | 00000000:01:00.0 Off |                    0 |
| N/A   30C    P0              31W / 250W |  15812MiB / 16384MiB |      0%      Default |
|                                         |                      |   

In [16]:
# Define Jaccard Loss
def jaccard_loss(y_true, y_pred, smooth=100):
    """
    Smoothed Jaccard (Intersection over Union) loss.

    Both tensors are flattened, so the score is computed over every element
    of the batch at once rather than per sample.

    Args:
        y_true (tensor): Ground truth labels.
        y_pred (tensor): Predicted labels, same number of elements as y_true.
        smooth (float): Constant added to numerator and denominator so the
            ratio stays defined when both tensors are all zeros.
    Returns:
        Scalar tensor equal to 1 - (intersection + smooth) / (union + smooth).
    """
    K = tf.keras.backend

    truth = K.flatten(y_true)
    pred = K.flatten(y_pred)

    overlap = K.sum(truth * pred)
    total = K.sum(truth + pred)
    union = total - overlap  # |A| + |B| - |A ∩ B|

    return 1 - (overlap + smooth) / (union + smooth)

In [17]:
# Define U-Net model (single channel)
def unet_one_channel(input_size=(256, 256, 1), loss_function='jaccard_loss'):
    """
    Build and compile a U-Net that outputs a single-channel sigmoid mask.

    The encoder has four levels (64, 128, 256, 512 filters), each followed by
    2x2 max pooling, and a 1024-filter bottleneck. Dropout (0.3) is applied at
    the fourth level and at the bottleneck. The decoder mirrors the encoder
    with 2x up-sampling and skip connections.

    Args:
        input_size (tuple): Shape of one input sample as (height, width, channels).
            Because of the four pooling / up-sampling stages, height and width
            need to be divisible by 16 for the skip connections to line up.
        loss_function: Loss passed to ``model.compile``. The string
            'jaccard_loss' (default) selects this notebook's ``jaccard_loss``
            function; any other value (Keras loss name or callable) is handed
            to Keras unchanged.
    Returns:
        Compiled ``tf.keras.Model`` (RMSprop, learning rate 1e-4) reporting
        accuracy and mean IoU.
    """
    # Honour the loss_function argument (it used to be ignored in favour of a hard-coded loss)
    loss = jaccard_loss if loss_function == 'jaccard_loss' else loss_function

    inputs = Input(input_size)

    def conv_block(inputs, num_filters):
        """Two stacked 3x3 'same' convolutions with LeakyReLU activation."""
        conv = Conv2D(num_filters, 3, activation=LeakyReLU(), padding="same")(inputs)
        conv = Conv2D(num_filters, 3, activation=LeakyReLU(), padding="same")(conv)
        return conv

    # ---- Encoder ----
    conv1 = conv_block(inputs, 64)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = conv_block(pool1, 128)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = conv_block(pool2, 256)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = conv_block(pool3, 512)
    drop4 = Dropout(0.3)(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(drop4)

    # ---- Bottleneck ----
    conv5 = conv_block(pool4, 1024)
    drop5 = Dropout(0.3)(conv5)

    def up_block(inputs, skip_connection, num_filters):
        """Up-sample by 2, apply a 2x2 convolution, concatenate the skip tensor, then conv_block."""
        up = Conv2D(num_filters, 2, activation=LeakyReLU(), padding="same")(UpSampling2D(size=(2, 2))(inputs))
        merge = concatenate([skip_connection, up], axis=3)
        conv = conv_block(merge, num_filters)
        return conv

    # ---- Decoder ----
    conv6 = up_block(drop5, drop4, 512)
    conv7 = up_block(conv6, conv3, 256)
    conv8 = up_block(conv7, conv2, 128)
    conv9 = up_block(conv8, conv1, 64)

    # ---- Output head: 1x1 convolutions down to a single sigmoid channel ----
    conv9 = Conv2D(2, 1, activation=LeakyReLU(), padding="same")(conv9)
    conv10 = Conv2D(1, 1, activation="sigmoid")(conv9)

    model = Model(inputs, conv10)

    # MeanIoU treats predictions as integer class ids, so sigmoid probabilities
    # would be truncated to 0 and the reported IoU would be wrong.
    # BinaryIoU thresholds the probabilities first and, with both target
    # classes listed, averages the IoU of background and foreground.
    mean_iou = tf.keras.metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.5, name="mean_iou")

    model.compile(
        optimizer=RMSprop(learning_rate=1e-4),
        loss=loss,
        metrics=["accuracy", mean_iou],
    )
    return model

In [18]:
# Load and preprocess the data with progress bar
# (in case of enough ram)
def load_and_preprocess_data(image_paths, mask_paths, image_size=(256, 256), value_range=(-25.0, 5.0)):
    """
    Read every image/mask pair into memory and rescale the images.

    Args:
        image_paths (sequence): Paths of the single-channel input TIFF files.
        mask_paths (sequence): Paths of the mask TIFF files, in the same order
            as image_paths.
        image_size (tuple): Expected (height, width) of every image and mask.
        value_range (tuple): (low, high) pixel values mapped linearly to
            0 and 1. Values outside this range are NOT clipped, so they end up
            outside [0, 1].
    Returns:
        (X, y): float32 arrays of shape (n_samples, height, width, 1) holding
        the rescaled images and the unmodified masks.
    Raises:
        ValueError: if the two path lists differ in length or value_range is
            empty or inverted.
    """
    # zip() would silently stop at the shorter list and leave the remaining
    # rows of the np.empty buffers filled with uninitialised memory.
    if len(image_paths) != len(mask_paths):
        raise ValueError(
            f"image_paths and mask_paths must have the same length "
            f"(got {len(image_paths)} and {len(mask_paths)})"
        )

    low, high = value_range
    if high <= low:
        raise ValueError(f"value_range must satisfy low < high (got {value_range})")
    span = high - low

    n_samples = len(image_paths)
    X = np.empty((n_samples, *image_size, 1), dtype=np.float32)
    y = np.empty((n_samples, *image_size, 1), dtype=np.float32)

    # tqdm already reports elapsed and remaining time, so no manual ETA bookkeeping is needed
    pairs = tqdm(zip(image_paths, mask_paths), total=n_samples, desc="Loading and preprocessing data")
    for i, (image_path, mask_path) in enumerate(pairs):
        img = tifffile.imread(image_path)
        img = np.expand_dims(img, axis=-1) if img.ndim == 2 else img  # add the channel axis
        X[i] = (img - low) / span  # linear rescale: low -> 0, high -> 1

        mask = tifffile.imread(mask_path)
        mask = np.expand_dims(mask, axis=-1) if mask.ndim == 2 else mask
        y[i] = mask

    return X, y

# DataLoader (in case of huge dataset)
class DataGenerator(tf.keras.utils.Sequence):
    """
    Keras Sequence that reads image/mask TIFF pairs from disk one batch at a time.

    Images are rescaled with the same linear mapping as load_and_preprocess_data
    (-25 -> 0, 5 -> 1, no clipping); masks are returned unmodified.
    """

    def __init__(self, image_paths, mask_paths, batch_size=32, image_size=(256, 256), n_channels=1, shuffle=True, **kwargs):
        """
        Args:
            image_paths (sequence): Paths of the input TIFF files.
            mask_paths (sequence): Paths of the mask TIFF files, same order as image_paths.
            batch_size (int): Number of samples per batch (the last batch may be smaller).
            image_size (tuple): Expected (height, width) of every image and mask.
            n_channels (int): Number of channels of the input images.
            shuffle (bool): Reshuffle the sample order at the end of every epoch.
            **kwargs: Forwarded to the Keras base class (e.g. workers,
                use_multiprocessing, max_queue_size in Keras 3).
        Raises:
            ValueError: if image_paths and mask_paths differ in length.
        """
        # Keras 3 expects the base-class constructor to be called
        super().__init__(**kwargs)
        if len(image_paths) != len(mask_paths):
            raise ValueError(
                f"image_paths and mask_paths must have the same length "
                f"(got {len(image_paths)} and {len(mask_paths)})"
            )
        self.image_paths = image_paths
        self.mask_paths = mask_paths
        self.batch_size = batch_size
        self.image_size = image_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.on_epoch_end()  # builds the initial index order

    def __len__(self):
        """Number of batches per epoch (a trailing partial batch counts as one)."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an (X, y) tuple of float32 arrays."""
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
        image_paths_batch = [self.image_paths[i] for i in indexes]
        mask_paths_batch = [self.mask_paths[i] for i in indexes]

        X, y = self.__data_generation(image_paths_batch, mask_paths_batch)

        return X, y

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when shuffle is enabled."""
        self.indexes = np.arange(len(self.image_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, image_paths_batch, mask_paths_batch):
        """Load and preprocess the files of one batch."""
        image_shape = (*self.image_size, self.n_channels)
        mask_shape = (*self.image_size, 1)

        # float32 instead of NumPy's float64 default: half the memory
        X = np.empty((len(image_paths_batch), *image_shape), dtype=np.float32)
        y = np.empty((len(mask_paths_batch), *mask_shape), dtype=np.float32)

        for i, (image_path, mask_path) in enumerate(zip(image_paths_batch, mask_paths_batch)):
            # Load image
            img = tifffile.imread(image_path).astype(np.float32)
            if img.ndim == 2:
                # Add the channel axis. The old code squeezed it away again,
                # which made the assignment below fail for n_channels == 1.
                img = np.expand_dims(img, axis=-1)
            if img.shape != image_shape:
                raise ValueError(f"{image_path}: expected image shape {image_shape}, got {img.shape}")

            X[i] = (img - (-25)) / (5 - (-25))  # linear rescale: -25 -> 0, 5 -> 1

            # Load mask
            mask = tifffile.imread(mask_path).astype(np.float32)
            if mask.ndim == 2:
                mask = np.expand_dims(mask, axis=-1)
            if mask.shape != mask_shape:
                raise ValueError(f"{mask_path}: expected mask shape {mask_shape}, got {mask.shape}")

            y[i] = mask

        return X, y

In [19]:
# Add Loggings
# Define the MetricsLogger class
class MetricsLogger(tf.keras.callbacks.Callback):
    """Keras callback that forwards selected per-epoch metrics to the active MLflow run."""

    # Keys of the Keras `logs` dict that are forwarded to MLflow
    _TRACKED_KEYS = ('loss', 'accuracy', 'val_loss', 'val_accuracy')

    def on_epoch_end(self, epoch, logs=None):
        """
        Log the tracked metrics for this epoch.

        Args:
            epoch (int): Index of the epoch that just finished; used as the MLflow step.
            logs (dict | None): Metric values reported by Keras for this epoch.
        """
        if not logs:
            return

        # Skip keys Keras did not report (e.g. val_* when fitting without
        # validation data); passing None to MLflow would raise an error.
        metrics = {
            key: float(logs[key])
            for key in self._TRACKED_KEYS
            if logs.get(key) is not None
        }
        if metrics:
            mlflow.log_metrics(metrics, step=epoch)

In [20]:
# Loading data vvvvvvvv
# Reading the paths of all files
# Directory listings come back in arbitrary, filesystem-dependent order.
# The train/test split further down is positional (shuffle=False), so the
# listings are sorted to make the split reproducible.
# To keep one scene totally untouched, add e.g. `and d.name != "scene1"` to the filter below.
scene_dirs = sorted(d for d in load_path.iterdir() if d.is_dir())

X_path = []
y_path = []
skipped_dirs = []

for scene_dir in scene_dirs:
    # Only sub-directories are samples; stray files (e.g. .DS_Store) are ignored
    for sample_dir in sorted(p for p in scene_dir.iterdir() if p.is_dir()):
        image_file = sample_dir / "vv.tif"
        mask_file = sample_dir / "mask.tif"
        if image_file.is_file() and mask_file.is_file():
            X_path.append(image_file)
            y_path.append(mask_file)
        else:
            # An incomplete sample would crash the loader later on
            skipped_dirs.append(sample_dir)

if skipped_dirs:
    print(f"Warning: skipped {len(skipped_dirs)} sample folder(s) without vv.tif and mask.tif")

print(len(X_path), len(y_path), sep='\n')

3309
3309


In [21]:
# Train test split
# Positional 80/20 split: with shuffle=False the first 80% of the listed
# paths go to training and the remaining 20% to testing.
(X_train_paths, X_test_paths,
 y_train_paths, y_test_paths) = train_test_split(
    X_path,
    y_path,
    test_size=0.2,
    shuffle=False,
)
print(f"X_train: {len(X_train_paths)}, y_train: {len(y_train_paths)}, X_test: {len(X_test_paths)}, y_test: {len(y_test_paths)}")

X_train: 2647, y_train: 2647, X_test: 662, y_test: 662


In [22]:
# Read both splits fully into RAM (assumes they fit; DataGenerator is the
# alternative for datasets that do not).

# Each call shows its own progress bar
X_train, y_train = load_and_preprocess_data(
    image_paths=X_train_paths,
    mask_paths=y_train_paths,
)
X_test, y_test = load_and_preprocess_data(
    image_paths=X_test_paths,
    mask_paths=y_test_paths,
)

Loading and preprocessing data: 100%|██████████| 2647/2647 [00:06<00:00, 426.59it/s, Elapsed=6.21s, ETA=0.00s]
Loading and preprocessing data: 100%|██████████| 662/662 [00:01<00:00, 363.47it/s, Elapsed=1.82s, ETA=-0.00s]


In [23]:
# Pre-training preparations  vvvvv

# Model Checkpointing: keep only the model with the lowest validation loss
checkpoint = tf.keras.callbacks.ModelCheckpoint(str(checkpoint_path), monitor='val_loss', save_best_only=True, mode='min')

# Early Stopping: stop after `early_stopping_patience` epochs without val_loss improvement
# and roll the model back to its best weights
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=early_stopping_patience, restore_best_weights=True)

# TensorBoard Callback (weight histograms written every epoch)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=str(tensorboard_logs_dir), histogram_freq=1)

# Learning Rate Reducer: multiply the learning rate by 0.2 after `learning_rate_patience`
# epochs without val_loss improvement, never going below 1e-6
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=learning_rate_patience, min_lr=1e-6)

# For loading model from checkpoint
# custom_objects maps serialized names to classes / functions. Keras
# instantiates them itself from the saved config, so the MeanIoU class is
# registered here rather than a pre-built instance.
custom_objects = {
    'jaccard_loss': jaccard_loss,
    'MeanIoU': MeanIoU,
    'LeakyReLU': LeakyReLU,
}

In [27]:
# ------------ Training the model ---------------

# Initialize MLflow (automatic logging of the Keras training run)
mlflow.tensorflow.autolog()

# Start MLflow experiment
with mlflow.start_run() as run:
    # Resume from the best checkpoint of a previous run with the same settings, if there is one
    if checkpoint_path.exists():
        print("Checkpoint found. Loading model from checkpoint...")
        model = tf.keras.models.load_model(str(checkpoint_path), custom_objects=custom_objects)
    else:
        print("No checkpoint found. Initializing new model...")
        model = unet_one_channel(input_size=(256, 256, 1), loss_function='jaccard_loss')

    # Train the model
    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=16,
        validation_data=(X_test, y_test),
        callbacks=[checkpoint, early_stopping, tensorboard_callback, MetricsLogger(), reduce_lr]
    )

    # Define conda environment
    # For compatibility with ml-flow fixed on the pod
    conda_env = {
        'channels': ['defaults'],
        'dependencies': [
            'python=3.11',
            'tensorflow=2.17.0',
            'keras=3.3.3',
            'cloudpickle=3.0.0',
            'pip',
            {
                'pip': ['mlflow', 'cloudpickle==3.0.0']
            }
        ],
        'name': 'mlflow-env'
    }

    # Log the final model to MLflow
    mlflow.keras.log_model(model, "model", conda_env=conda_env)  # Use "model" as a valid path

    # Save model after training ends
    # model_path already ends in ".keras"; appending the suffix again wrote
    # "<name>.keras.keras", which did not match the path printed below.
    model.save(str(model_path))  # Save in Keras format

    print("Model logged and saved successfully!")
    print(mlflow_logs_dir, model_path, sep="\n")


Checkpoint found. Loading model from checkpoint...


[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step - accuracy: 0.9262 - loss: 0.1263 - mean_io_u_1: 0.5621

2024-08-12 14:30:06.832043: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2264924160 exceeds 10% of free system memory.


[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 781ms/step - accuracy: 0.9262 - loss: 0.1263 - mean_io_u_1: 0.5616 - val_accuracy: 0.8684 - val_loss: 0.1254 - val_mean_io_u_1: 0.7660 - learning_rate: 1.0000e-04
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 772ms/step


2024/08/12 14:31:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-newt-844 at: http://ml-flow-dev-mlflow:5000/#/experiments/0/runs/05c3d6f6da594399a0d34d13ae3e4655.
2024/08/12 14:31:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ml-flow-dev-mlflow:5000/#/experiments/0.


Model logged and saved successfully!
../experiments/model_1_exp_1/mlflow_log
../experiments/model_1_exp_1/model/model_1_exp_1_epochs_1_patience_10_5.keras
