In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import mlflow
import mlflow.tensorflow
from sklearn.model_selection import train_test_split
import io, os, time



In [2]:
# Load the dataset
def load_dataset(filename='robot_arm_dataset_10M.npz'):
    """Read the input and output arrays from an ``.npz`` archive under ``./Data``.

    Args:
        filename: Name of the archive file inside the ``./Data`` directory.

    Returns:
        A ``(inputs, outputs)`` tuple with the arrays stored under the
        ``'inputs'`` and ``'outputs'`` keys of the archive.

    Raises:
        FileNotFoundError: If the archive does not exist (raised by ``np.load``).
        KeyError: If either expected key is missing from the archive.
    """
    # An NpzFile keeps the underlying zip file open until it is closed, so use
    # it as a context manager. Indexing reads each array into memory before the
    # archive is closed, so the returned arrays stay valid.
    with np.load(f'./Data/{filename}') as data:
        return data['inputs'], data['outputs']

In [3]:
# Define the model
def create_model(input_shape, output_shape):
    """Assemble the fully connected Keras model used for training.

    The network is an input layer, then two hidden stages of
    ``Dense -> BatchNormalization -> ReLU`` with 128 and 64 units, then a final
    ``Dense`` layer with ``output_shape`` units and no activation argument.

    Args:
        input_shape: Shape passed to ``keras.layers.Input``.
        output_shape: Number of units in the final ``Dense`` layer.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    stack = [keras.layers.Input(shape=input_shape)]
    # Each hidden stage is the same three-layer pattern; only the width differs.
    for width in (128, 64):
        stack.append(keras.layers.Dense(width))
        stack.append(keras.layers.BatchNormalization())
        stack.append(keras.layers.Activation('relu'))
    stack.append(keras.layers.Dense(output_shape))
    return keras.Sequential(stack)

In [4]:
class VerboseLoggingCallback(keras.callbacks.Callback):
    """Keras callback that prints one summary line per epoch and keeps a transcript.

    Every printed line is also appended to an in-memory buffer, which can be
    read back with :meth:`get_output`.
    """

    def __init__(self):
        super().__init__()
        # Accumulates every line emitted by on_epoch_end.
        self.output = io.StringIO()

    def on_epoch_end(self, epoch, logs=None):
        """Print and record ``Epoch i/N - name: value - ...`` for the finished epoch.

        Args:
            epoch: Zero-based epoch index; shown one-based in the output line.
            logs: Mapping of metric names to values; each value is formatted
                with four decimal places. ``None`` or empty yields no metrics.
        """
        metrics = logs if logs else {}
        header = f"Epoch {epoch+1}/{self.params['epochs']} - "
        parts = [f"{name}: {value:.4f}" for name, value in metrics.items()]
        line = header + " - ".join(parts)
        print(line)
        self.output.write(line + "\n")

    def get_output(self):
        """Return everything recorded so far as a single string."""
        return self.output.getvalue()
    
class LearningRateLogger(keras.callbacks.Callback):
    """Callback that records the optimizer's learning rate after every epoch.

    The value is written as a TensorBoard scalar named ``learning_rate``
    through the supplied summary writer, and as an MLflow metric of the same
    name on the active run.
    """

    def __init__(self, tensorboard_writer):
        """Store the summary writer used for the TensorBoard scalar.

        Args:
            tensorboard_writer: Summary writer whose ``as_default()`` context
                is entered when the scalar is emitted.
        """
        super().__init__()
        self.tensorboard_writer = tensorboard_writer

    def _current_learning_rate(self):
        """Return the optimizer's current learning rate as a plain Python float."""
        optimizer = self.model.optimizer
        # `lr` is only a legacy alias and is absent on newer Keras optimizers;
        # prefer `learning_rate` and fall back to `lr` for older ones.
        lr = getattr(optimizer, 'learning_rate', None)
        if lr is None:
            lr = optimizer.lr
        # Some optimizer versions return the schedule object itself; evaluate
        # it at the current step to get a number.
        if isinstance(lr, keras.optimizers.schedules.LearningRateSchedule):
            lr = lr(optimizer.iterations)
        # Variables and tensors both convert through NumPy. This avoids calling
        # `.value()`, which is a method on some variable types and a property
        # on others.
        return float(np.asarray(lr))

    def on_epoch_end(self, epoch, logs=None):
        """Log the current learning rate to TensorBoard and MLflow.

        Args:
            epoch: Zero-based epoch index, used as the step for both sinks.
            logs: Unused; accepted to match the Keras callback signature.
        """
        lr = self._current_learning_rate()
        with self.tensorboard_writer.as_default():
            tf.summary.scalar('learning_rate', data=lr, step=epoch)
        mlflow.log_metric("learning_rate", lr, step=epoch)

In [5]:
def train_model(batch_size, epochs, learning_rate, test_size=0.2, experiment_name="Inverse Kinematics NN", run_name=None):
    """Train the model inside an MLflow run and mirror progress to TensorBoard.

    The dataset is loaded with ``load_dataset()`` and split with
    ``train_test_split`` (``random_state=42``). The model from
    ``create_model`` is compiled with Adam and an MSE loss, then fitted with
    the held-out split passed as ``validation_data``. Callbacks used:
    TensorBoard, EarlyStopping (patience 5, restoring best weights),
    ReduceLROnPlateau (factor 0.2, patience 3) and ``LearningRateLogger``.

    Parameters, the model summary, per-epoch train/validation loss and the
    trained model are logged to MLflow. TensorBoard event files go to
    ``logs/<experiment_name>/<run_name>_<run_id>``.

    Args:
        batch_size: Batch size passed to ``model.fit``.
        epochs: Maximum number of epochs passed to ``model.fit``.
        learning_rate: Initial learning rate for the Adam optimizer.
        test_size: Fraction of the data held out by ``train_test_split``.
        experiment_name: MLflow experiment to log under; also used as a
            directory name for the TensorBoard logs.
        run_name: Optional MLflow run name. When omitted, the name MLflow
            assigns to the run (or the run id) is used for the log directory.

    Returns:
        None. Results are logged to MLflow/TensorBoard and a summary is printed.
    """
    # Set up MLflow
    mlflow.set_experiment(experiment_name)

    # Run names do not need to be unique: the TensorBoard directory below is
    # keyed on the MLflow run id, so repeated runs with one name never collide.
    with mlflow.start_run(run_name=run_name) as run:
        # Create a consistent directory structure for TensorBoard logs
        run_id = run.info.run_id
        run_name = run.data.tags.get('mlflow.runName', run_id)
        log_dir = os.path.join("logs", experiment_name, f"{run_name}_{run_id}")
        os.makedirs(log_dir, exist_ok=True)

        # Load and split the data
        X, y = load_dataset()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        # Log parameters
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("initial_learning_rate", learning_rate)
        mlflow.log_param("test_size", test_size)

        # Create and compile the model
        model = create_model(input_shape=(3,), output_shape=3)
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss='mse')

        # Log model summary
        model_summary = io.StringIO()
        model.summary(print_fn=lambda x: model_summary.write(x + '\n'))
        mlflow.log_text(model_summary.getvalue(), "model_summary.txt")

        # Set up TensorBoard callback and writer
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
        tensorboard_writer = tf.summary.create_file_writer(log_dir)

        # Set up other callbacks
        early_stopping = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
        reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=3)
        lr_logger = LearningRateLogger(tensorboard_writer)

        callbacks = [tensorboard_callback, early_stopping, reduce_lr, lr_logger]

        # Log callback names
        callback_names = [callback.__class__.__name__ for callback in callbacks]
        mlflow.log_param("callbacks", ", ".join(callback_names))

        # Train the model
        try:
            history = model.fit(
                X_train, y_train,
                epochs=epochs,
                batch_size=batch_size,
                validation_data=(X_test, y_test),
                callbacks=callbacks,
                verbose=1
            )
        finally:
            # Flush and release the summary writer even if training fails, so
            # the learning-rate scalars written so far reach disk.
            tensorboard_writer.close()

        # Log metrics
        for epoch, (loss, val_loss) in enumerate(zip(
            history.history['loss'],
            history.history['val_loss']
        )):
            mlflow.log_metric("train_loss", loss, step=epoch)
            mlflow.log_metric("val_loss", val_loss, step=epoch)

        # Log the TensorBoard log directory
        mlflow.log_param("tensorboard_log_dir", log_dir)

        # Log the model
        mlflow.tensorflow.log_model(model, "model")

    print("Training completed and logged with MLflow and TensorBoard.")
    print(f"Experiment name: {experiment_name}")
    print(f"Run name: {run_name}")
    print(f"Run ID: {run_id}")
    print(f"TensorBoard logs saved to: {log_dir}")
    print("To view in TensorBoard, run:")
    # Quote the path: experiment names (including the default) may contain
    # spaces, which would otherwise split the argument in a shell.
    print(f'tensorboard --logdir "logs/{experiment_name}"')

In [6]:
# Launch one training run with the hyperparameters below.
train_model(
    run_name="SimpleLRScheduler",
    learning_rate=0.001,
    batch_size=2**15,
    # A generous epoch budget so that early stopping has room to trigger.
    epochs=50,
    test_size=0.2,
)

2024/08/16 18:03:41 INFO mlflow.tracking.fluent: Experiment with name 'Inverse Kinematics NN' does not exist. Creating a new experiment.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50




INFO:tensorflow:Assets written to: /tmp/tmp4uxsy7ik/model/data/model/assets




Training completed and logged with MLflow and TensorBoard.
Experiment name: Inverse Kinematics NN
Run name: SimpleLRScheduler
Run ID: 4019ecd4c16545dc8e853a45d7c1a0ed
TensorBoard logs saved to: logs/Inverse Kinematics NN/SimpleLRScheduler_4019ecd4c16545dc8e853a45d7c1a0ed
To view in TensorBoard, run:
tensorboard --logdir logs/Inverse Kinematics NN
