In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import time
import pandas as pd
import mlflow
import mlflow.tensorflow
from sklearn.model_selection import train_test_split
import io
import os
from IPython.display import display, HTML

In [2]:
# Constants
L1, L2, L3 = 1.0, 1.5, 0.5  # link lengths

@tf.function
def forward_kinematics_tf(theta):
    theta1, theta2, theta3 = tf.unstack(theta, axis=1)
    
    x = L1 * tf.cos(theta1) * tf.sin(theta2) + L2 * tf.cos(theta1) * tf.sin(theta2 + theta3)
    y = L1 * tf.sin(theta1) * tf.sin(theta2) + L2 * tf.sin(theta1) * tf.sin(theta2 + theta3)
    z = L1 * tf.cos(theta2) + L2 * tf.cos(theta2 + theta3) + L3
    
    return tf.stack([x, y, z], axis=1)

def evaluate_model(model, test_inputs, test_outputs, input_mean, input_std, batch_size=2**16):
    test_inputs = tf.convert_to_tensor(test_inputs, dtype=tf.float32)
    input_mean = tf.convert_to_tensor(input_mean, dtype=tf.float32)
    input_std = tf.convert_to_tensor(input_std, dtype=tf.float32)
    predicted_angles_normalized = model.predict(test_inputs, batch_size=batch_size)
    predicted_angles = predicted_angles_normalized * (np.pi/2)
    true_xyz = test_inputs * input_std + input_mean
    predicted_xyz = forward_kinematics_tf(predicted_angles)
    errors = tf.norm(true_xyz - predicted_xyz, axis=1)
    return errors.numpy(), true_xyz.numpy(), predicted_xyz.numpy()

def custom_loss(fk_weight=1, delta=0.1):
    def loss_fn(y_true, y_pred):
        # Huber loss for joint angles
        angle_loss = huber_loss(y_true, y_pred, delta)
        
        # Forward kinematics loss (using Huber loss)
        fk_true = forward_kinematics_tf(y_true)
        fk_pred = forward_kinematics_tf(y_pred)
        fk_loss = huber_loss(fk_true, fk_pred, delta)
        
        # Combine losses
        total_loss = tf.reduce_mean(angle_loss) + fk_weight * tf.reduce_mean(fk_loss)
        return total_loss
    return loss_fn

def huber_loss(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    is_small_error = tf.abs(error) <= delta
    squared_loss = tf.square(error) / 2
    linear_loss = delta * (tf.abs(error) - delta / 2)
    return tf.where(is_small_error, squared_loss, linear_loss)

def huber_exp_loss(y_true, y_pred, delta=0.2, alpha=1):
    error = y_true - y_pred
    is_small_error = tf.abs(error) <= delta
    squared_loss = tf.square(error) / 2
    exp_loss = alpha * (tf.exp(tf.abs(error) - delta) - 1) + delta * tf.abs(error) - delta**2 / 2
    return tf.where(is_small_error, squared_loss, exp_loss)

def log_cosh_loss(y_true, y_pred):
    return tf.reduce_mean(tf.math.log(tf.math.cosh(y_pred - y_true)))

def create_model(config):
    model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(3,)),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(3, activation='tanh')
    ])
    
    loss_type = config['loss_type']
    if loss_type == 'standard':
        model.compile(optimizer='adam', loss='mse')
    elif loss_type == 'custom':
        model.compile(optimizer='adam', loss=custom_loss())
    elif loss_type == 'modified_custom':
        model.compile(optimizer='adam', loss=custom_loss(fk_weight=config.get('fk_weight', 10)))
    
    return model

def load_and_preprocess_data(filename='robot_arm_dataset_10M.npz'):
    data = np.load(f'./Data/{filename}')
    inputs, outputs = data['inputs'], data['outputs']
    
    input_mean = np.mean(inputs, axis=0)
    input_std = np.std(inputs, axis=0)
    inputs_normalized = (inputs - input_mean) / input_std

    outputs_normalized = outputs / (np.pi/2)

    split_index = int(0.9 * len(inputs))
    train_inputs, test_inputs = inputs_normalized[:split_index], inputs_normalized[split_index:]
    train_outputs, test_outputs = outputs_normalized[:split_index], outputs_normalized[split_index:]

    return (train_inputs, train_outputs), (test_inputs, test_outputs), input_mean, input_std

class VerboseLoggingCallback(keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.output = io.StringIO()

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        output = f"Epoch {epoch+1}/{self.params['epochs']} - "
        output += " - ".join(f"{k}: {v:.4f}" for k, v in logs.items())
        print(output)
        self.output.write(output + "\n")

    def get_output(self):
        return self.output.getvalue()

class LearningRateLogger(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        lr = self.model.optimizer.lr
        if hasattr(lr, 'value'):
            lr = lr.value()
        mlflow.log_metric("learning_rate", lr, step=epoch)
        
class CosineDecayWithWarmupCallback(tf.keras.callbacks.Callback):
    def __init__(self, initial_learning_rate, warmup_steps, total_steps):
        super(CosineDecayWithWarmupCallback, self).__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.current_step = 0

    def on_train_batch_begin(self, batch, logs=None):
        if self.current_step < self.warmup_steps:
            lr = self.initial_learning_rate * (self.current_step / self.warmup_steps)
        else:
            progress = (self.current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
            lr = 0.5 * self.initial_learning_rate * (1 + np.cos(np.pi * progress))

        tf.keras.backend.set_value(self.model.optimizer.lr, lr)
        self.current_step += 1

def plot_error_distribution(errors, title, save_path=None):
    plt.figure(figsize=(12, 6))
    plt.hist(errors, bins=400, alpha=0.5)
    plt.title(title)
    plt.xlabel('Error (Euclidean distance)')
    plt.ylabel('Frequency')
    plt.xlim([0, 0.5])
    
    if save_path:
        plt.savefig(save_path)
        plt.close()
    else:
        plt.show()

def plot_true_vs_predicted(true_xyz, predicted_xyz, title, save_path=None):
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(title)
    
    for i, coord in enumerate(['X', 'Y', 'Z']):
        ax = axes[i]
        ax.scatter(true_xyz[:, i], predicted_xyz[:, i], alpha=0.1)
        ax.plot([true_xyz[:, i].min(), true_xyz[:, i].max()], [true_xyz[:, i].min(), true_xyz[:, i].max()], 'r--')
        ax.set_xlabel(f'True {coord}')
        ax.set_ylabel(f'Predicted {coord}')
        ax.set_title(f'{coord} Coordinate: True vs Predicted')
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path)
        plt.close()
    else:
        plt.show()
        
def train_and_evaluate_model(train_data, valid_data, test_data, input_mean, input_std, config):
    train_inputs, train_outputs = train_data
    valid_inputs, valid_outputs = valid_data
    test_inputs, test_outputs = test_data
    
    experiment_name = config['experiment_name']
    mlflow.set_experiment(experiment_name)
    
    with mlflow.start_run(run_name=config['model_name']):
        # Calculate total steps
        steps_per_epoch = len(train_inputs) // config['batch_size']
        total_steps = steps_per_epoch * epochs
        warmup_steps = int(0.1 * total_steps)  # 10% of total steps for warmup
        
        # Log parameters
        mlflow.log_params(config)
        
        # Create and compile model
        model = create_model(config)
        
        # Log model summary
        model_summary = io.StringIO()
        model.summary(print_fn=lambda x: model_summary.write(x + '\n'))
        mlflow.log_text(model_summary.getvalue(), "model_summary.txt")
        
        # Set up callbacks
        early_stopping = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
        
        lr_scheduler = CosineDecayWithWarmupCallback(
            config['initial_learning_rate'],
            warmup_steps,
            total_steps
        )
        
        verbose_logging = VerboseLoggingCallback()
        lr_logger = LearningRateLogger()
        
        callbacks = [early_stopping, lr_scheduler, verbose_logging, lr_logger]
        
        # Log callback names
        callback_names = [callback.__class__.__name__ for callback in callbacks]
        mlflow.log_param("callbacks", ", ".join(callback_names))
        
        # Train the model
        start_time = time.time()
        history = model.fit(
            train_inputs, train_outputs,
            epochs=config['epochs'],
            batch_size=config['batch_size'],
            validation_data=(valid_inputs, valid_outputs),
            callbacks=callbacks
        )
        training_time = time.time() - start_time
        
        # Log training metrics
        for epoch, (loss, val_loss) in enumerate(zip(history.history['loss'], history.history['val_loss'])):
            mlflow.log_metric("train_loss", loss, step=epoch)
            mlflow.log_metric("val_loss", val_loss, step=epoch)
        
        mlflow.log_metric("training_time", training_time)
        
        # Log model
        mlflow.tensorflow.log_model(model, "model")
        
        # Log training output
        mlflow.log_text(verbose_logging.get_output(), "training_output.txt")
        
        # Evaluate model
        errors, true_xyz, predicted_xyz = evaluate_model(model, test_inputs, test_outputs, input_mean, input_std)
        
        # Log evaluation metrics
        mlflow.log_metric("mean_error", np.mean(errors))
        mlflow.log_metric("median_error", np.median(errors))
        mlflow.log_metric("90th_percentile_error", np.percentile(errors, 90))
        mlflow.log_metric("max_error", np.max(errors))
        
        # Generate and log the true vs predicted plot
        true_vs_pred_plot_path = f"{config['model_name']}_true_vs_predicted.png"
        plot_true_vs_predicted(true_xyz, predicted_xyz, f"{config['model_name']} Model: True vs Predicted", save_path=true_vs_pred_plot_path)
        mlflow.log_artifact(true_vs_pred_plot_path)
        os.remove(true_vs_pred_plot_path)  # Clean up the temporary file
        
        # Generate and log the error distribution plot
        error_dist_plot_path = f"{config['model_name']}_error_distribution.png"
        plot_error_distribution(errors, f"{config['model_name']} Model: Error Distribution", save_path=error_dist_plot_path)
        mlflow.log_artifact(error_dist_plot_path)
        os.remove(error_dist_plot_path)  # Clean up the temporary file
        
        # Print summary statistics
        print(f"\n{config['model_name']} Model:")
        print(f"Mean Error: {np.mean(errors):.4f}")
        print(f"Median Error: {np.median(errors):.4f}")
        print(f"90th Percentile Error: {np.percentile(errors, 90):.4f}")
        print(f"Max Error: {np.max(errors):.4f}")
        print(f"Training Time: {training_time:.2f} seconds")
        
        return {
            'model': model,
            'history': history,
            'errors': errors,
            'true_xyz': true_xyz,
            'predicted_xyz': predicted_xyz,
            'training_time': training_time
        }

def run_single_experiment(config):
    (train_inputs, train_outputs), (test_inputs, test_outputs), input_mean, input_std = load_and_preprocess_data()
    
    # Split test data into validation and test sets
    valid_inputs, test_inputs, valid_outputs, test_outputs = train_test_split(
        test_inputs, test_outputs, test_size=0.5, random_state=42
    )
    
    return train_and_evaluate_model(
        (train_inputs, train_outputs),
        (valid_inputs, valid_outputs),
        (test_inputs, test_outputs),
        input_mean, input_std,
        config
    )

In [3]:
epochs = 200

configs = [
    {
        "model_name": "Huber Loss (both), fk=10e200,d01",
        "loss_type": "modified_custom",
        "fk_weight": 10,
        "epochs": epochs,
        "initial_learning_rate": 1e-3,
        "batch_size": 65536,
        "experiment_name": "Inverse Kinematics NN Comparison"
    }
]

results = {}
for config in configs:
    results[config['model_name']] = run_single_experiment(config)

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/mlflow/store/tracking/file_store.py", line 317, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/usr/local/lib/python3.8/dist-packages/mlflow/store/tracking/file_store.py", line 410, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/usr/local/lib/python3.8/dist-packages/mlflow/store/tracking/file_store.py", line 1341, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/usr/local/lib/python3.8/dist-packages/mlflow/store/tracking/file_store.py", line 1334, in _read_helper
    result = read_yaml(root, file_name)
  File "/usr/local/lib/python3.8/dist-packages/mlflow/utils/file_utils.py", line 309, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist.")
mlflow.exceptions.MissingConfigException: Yaml file '/tf/workdir/mlruns/mlruns/meta.yaml' does not

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200




INFO:tensorflow:Assets written to: /tmp/tmprnah6waq/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmprnah6waq/model/data/model/assets



Huber Loss (both), fk=10e200,d01 Model:
Mean Error: 0.0356
Median Error: 0.0349
90th Percentile Error: 0.0526
Max Error: 3.9077
Training Time: 57.15 seconds


In [4]:
results[configs[0]['model_name']]

{'model': <keras.engine.sequential.Sequential at 0x7efbb666b610>,
 'history': <keras.callbacks.History at 0x7efca8512cd0>,
 'errors': array([0.02859872, 0.02799114, 0.02627422, ..., 0.01310541, 0.03912053,
        0.02646932], dtype=float32),
 'true_xyz': array([[ 0.6068389 , -1.5461596 ,  1.9428234 ],
        [-1.2951391 ,  1.9623394 ,  1.2530879 ],
        [-1.8338947 ,  1.0807508 ,  1.7556641 ],
        ...,
        [ 0.52119195, -1.9730337 , -0.3695886 ],
        [ 0.19297831,  1.5389277 , -0.6839478 ],
        [-1.3054239 ,  1.4633682 ,  0.14486456]], dtype=float32),
 'predicted_xyz': array([[ 0.6174133 , -1.5723236 ,  1.9474621 ],
        [-1.3103077 ,  1.9816096 ,  1.2665814 ],
        [-1.8496726 ,  1.0984539 ,  1.7669777 ],
        ...,
        [ 0.5119995 , -1.9813952 , -0.37375206],
        [ 0.20352566,  1.573889  , -0.6699166 ],
        [-1.2805868 ,  1.4610975 ,  0.13599938]], dtype=float32),
 'training_time': 57.15261268615723}