In [8]:
import numpy as np
from stable_baselines3 import PPO
from gymnasium import spaces, Env
from boat_simulation import Boat, wrap_phase

In [9]:
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomMLP(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=1):
        super(CustomMLP, self).__init__(observation_space, features_dim)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def linear_relu(self, x, in_units, out_units):
        x = nn.Linear(in_units, out_units).to(self.device)(x)
        return nn.ReLU().to(self.device)(x)

    def mlp(self, x, units):
        for i in range(1, len(units)):
            x = self.linear_relu(x, units[i - 1], units[i])
        return x

    def forward(self, observations):
        x = observations[:, [0, 1, 2, 3, 7, 8]].to(self.device)

        return self.mlp(x, [6, 300, 200, self.features_dim])

In [10]:
from collections import deque
from random import randint

class MultiMarkEnv(Env):
    def __init__(self, config, bounds=None):
        if bounds is None:
            self.MIN_X = -250
            self.MAX_X = 250
            self.MIN_Y = 0
            self.MAX_Y = 250
        else:
            self.MIN_X = bounds[0]
            self.MAX_X = bounds[1]
            self.MIN_Y = bounds[2]
            self.MAX_Y = bounds[3]
        self.MAX_SPEED = 10
        self.boat = Boat(mass=960, drag_coefficient=0.003, inertia_factor=0.3)
        self.MAX_MARKS = config['max_marks']
        self.MAX_REMAINING_SECONDS = config['max_seconds_per_leg']
        self.LEG_RADIUS = config['leg_radius']
        self.target_x = np.zeros((self.MAX_MARKS,))
        self.target_y= np.zeros((self.MAX_MARKS,))
        self.current_mark = 0
        self.heading_change = 0

        self.penalty_queue = deque(maxlen=3)
        self.DEAD_ZONE_ANGLE = 30 * np.pi / 180

        self.plot_fn = config['plot_fn']

        self.trajectory = []

        self.MAX_DISTANCE = np.sqrt((self.MAX_X - self.MIN_X) ** 2 + (self.MAX_Y - self.MIN_Y) ** 2)

        self.observation = np.zeros((9,))
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,))

        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(9,))
        self.reward_range = spaces.Box(low=-1, high=self.MAX_SPEED, shape=())

    def reset(self, seed = None, options = None):
        super().reset(seed=seed)
        # Initialization logic
        # Initialize state variables: x, y, speed, etc.
        # Return the initial observation

        # Always start at 0.0, and generate random target marks from there
        if self.trajectory != []:
            bounds = (self.MIN_X, self.MAX_X, self.MIN_Y, self.MAX_Y)
            marks = [None] * self.MAX_MARKS
            for i in range(self.MAX_MARKS):
                marks[i] = (self.target_x[i], self.target_y[i])
            self.plot_fn(self.trajectory, marks, bounds)

        self.x = 0.0
        self.y = 0.0

        current_x = 0
        current_y = 0
        for i in range(self.MAX_MARKS):
            N = 2
            phase_step = randint(0, N)
            phase = 2 * np.pi / (1.0 * N) * phase_step
            current_x = current_x + np.cos(phase + np.pi/2) * self.LEG_RADIUS
            current_y = current_y + np.sin(phase + np.pi/2) * self.LEG_RADIUS
            self.target_x[i] = current_x
            self.target_y[i] = current_y

        self.current_mark = 0
        self.tack_count = 0
        self.heading = np.arctan2(self.target_x[0] - self.x, self.target_y[0] - self.y)
        self.angle_to_mark = 0
        self.speed = 0
        self.vmg = 0
        self.has_tacked = False
        self.remaining_seconds = self.MAX_REMAINING_SECONDS
        self.delta_t = 0
        self.reward = 0
        self.heading_change = 0

        self.boat.position[0] = self.x
        self.boat.position[1] = self.y
        self.boat.speed = self.speed
        self.boat.heading = self.heading

        self.next_target__distance = np.sqrt((self.target_x[1] - self.x) ** 2 + self.target_y[1] ** 2)
        self.next_target__vmg = 0
        self.next_target__heading = np.arctan2(self.target_x[1] - self.x, self.target_y[1] - self.y)
        self.next_target__angle_to_mark = 0

        self.distance = np.sqrt((self.target_x[0] - self.x) ** 2 + self.target_y[0] ** 2)
        self.initial_distance = self.distance

        self.observation[0] = self.distance / self.MAX_DISTANCE

        self.is_terminal = False
        self.is_truncated = False

        self.trajectory = []
        self.append_to_trajectory()

        return self.observation, {}

    def append_to_trajectory(self):
        heading_deg = self.boat.heading * 180 / np.pi
        if heading_deg > 180:
            heading_deg = heading_deg - 360
        meta = {
            'current_mark': self.current_mark,
            'vmg': self.vmg,
            'heading': heading_deg,
            'reward': self.reward
        }
        self.trajectory.append({'x': self.boat.position[0], 'y': self.boat.position[1], 'meta': meta})
        return self

    def step(self, action):
        # each action spans 1 seconds
        dt = 1
        self.apply_action(action[0], dt).calculate_reward().is_terminal_state()
        self.observation = np.stack([
            self.distance / self.MAX_DISTANCE,
            self.vmg,
            self.heading,
            self.angle_to_mark,
            self.next_target__distance / self.MAX_DISTANCE,
            self.next_target__vmg,
            self.next_target__angle_to_mark,
            self.has_tacked,
            self.is_terminal and not self.is_truncated
        ])

        self.append_to_trajectory()

        return self.observation, self.reward, self.is_terminal, self.is_truncated, {}

    def render(self, mode='human'):
        pass

    def calculate_momentum_penalty(self, heading_change):
        penalty = abs(heading_change) / np.pi
        return penalty

    def apply_action(self, action, dt):
        desired_heading = action * np.pi
        prev_heading = self.heading

        self.boat.step(desired_heading, dt)

        heading = self.boat.heading
        self.speed = self.boat.speed
        self.x = self.boat.position[0]
        self.y = self.boat.position[1]

        self.heading_change = heading - prev_heading

        self.heading = heading
        self.has_tacked = (prev_heading < np.pi) != (heading < np.pi)

        dx = self.target_x[self.current_mark] - self.x
        dy = self.target_y[self.current_mark] - self.y

        self.angle_to_mark = wrap_phase(np.arctan2(dx, dy))

        target_unit = np.stack([np.cos(self.angle_to_mark), np.sin(self.angle_to_mark)])
        heading_unit = np.stack([np.cos(heading), np.sin(heading)])

        self.vmg = (target_unit @ heading_unit) * self.speed

        self.distance = np.sqrt(dx ** 2 + dy ** 2)
        self.tack_count = 1 if self.has_tacked else 0
        self.remaining_seconds -= dt
        self.delta_t = dt


        dx = self.target_x[min(self.current_mark + 1, self.MAX_MARKS - 1)] - self.x
        dy = self.target_y[min(self.current_mark + 1, self.MAX_MARKS - 1)] - self.y

        self.next_target__angle_to_mark = wrap_phase(np.arctan2(dx, dy))

        target_unit = np.stack([np.cos(self.next_target__angle_to_mark), np.sin(self.next_target__angle_to_mark)])
        self.next_target__vmg = (target_unit @ heading_unit) * self.speed
        self.next_target__distance = np.sqrt(dx ** 2 + dy ** 2)

        return self

    def is_terminal_state(self):
        if self.distance < 5:
            self.current_mark += 1
            self.remaining_seconds = self.MAX_REMAINING_SECONDS
            if self.current_mark == self.MAX_MARKS:
                self.is_terminal = True
                self.is_truncated = False
                return self

        has_collided = self.x < self.MIN_X or self.x > self.MAX_X or self.y < self.MIN_Y or self.y > self.MAX_Y

        if has_collided or self.remaining_seconds < 1:
            self.is_terminal = True
            self.is_truncated = True
            return self

        self.is_terminal = False
        self.is_truncated = False
        return self

    def calculate_reward(self):
#         penalty = min(1, sum(self.penalty_queue))
#         self.reward = 0.1 * self.vmg - 0.01 * penalty * abs(self.heading_change)
        # self.reward = 0.1 * self.vmg - 0.05 * abs(self.heading_change / np.pi)
        self.reward = 0.1 * self.vmg
        return self

In [11]:
from IPython.core.display import display, HTML, Javascript

disable_scroll_script = """
var idx = Jupyter.notebook.get_selected_index();
var cell = Jupyter.notebook.get_cell(idx);
cell.output_area._should_scroll = function(lines) {
    return false;
}
"""

display(Javascript(disable_scroll_script))


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



<IPython.core.display.Javascript object>

In [12]:
import plotly.graph_objects as go
import time
import ipywidgets as widgets
from IPython.display import display

outer_radius = 2.2 * 250

dt = 1
num_marks = 2
max_seconds = 500 * num_marks / dt

# Initialize the figure and scatter plot
fig = go.FigureWidget()
scatter = fig.add_scatter(mode='markers+lines', name='trajectory')
marks_scatter = fig.add_scatter(mode='markers+text', name='marks')
fig.update_xaxes(range=[-outer_radius, outer_radius],dtick=25)
fig.update_yaxes(range=[-outer_radius, outer_radius],dtick=25)
fig.layout.width=800
fig.layout.height=800

heading_fig = go.FigureWidget()
heading_scatter = heading_fig.add_scatter(mode='markers+lines')
heading_fig.update_xaxes(range=[0, max_seconds],dtick=25)
heading_fig.update_yaxes(range=[-210, 210],dtick=30)
heading_fig.layout.width=800
heading_fig.layout.height=600
heading_fig.layout.title = "Heading over time"

vmg_fig = go.FigureWidget()
vmg_scatter = vmg_fig.add_scatter(mode='markers+lines')
vmg_fig.update_xaxes(range=[0, max_seconds],dtick=25)
vmg_fig.update_yaxes(range=[-4, 4],dtick=2)
vmg_fig.layout.width=800
vmg_fig.layout.height=400
vmg_fig.layout.title = "VMG over time"

# Initialize output for the text
out = widgets.Output()

# Display text and figure
display(out)
display(fig)
display(heading_fig)
display(vmg_fig)

episode = 0

colormap = ['blue', 'red', 'green', 'orange', 'black']

def plot(data, marks, bounds):
    global episode, out, fig
    episode += 1
    if episode % 50 != 0:
        return
    x_values = [point['x'] for point in data]
    y_values = [point['y'] for point in data]
    meta_values = [{k: '%.3f' % v for (k, v) in point['meta'].items()} for point in data]

    min_x, max_x, min_y, max_y = bounds


    color_meta = [point['meta']['current_mark'] for point in data]
    colors = [colormap[color % len(colormap)] for color in color_meta]

    headings = [point['meta']['heading'] for point in data]
    vmg = [point['meta']['vmg'] for point in data]

    global_meta = {'final_position': (x_values[-1], y_values[-1]), 'reward': sum([point['meta']['reward'] for point in data]), 'iters': len(data), 'episode': episode}

    # Update data
    scatter = fig.data[0]
    scatter.x = x_values
    scatter.y = y_values
    scatter.hovertext = meta_values  # Add this line
    scatter.marker.color = colors
    scatter.line.color = 'lightgrey'

    marks_scatter = fig.data[1]
    marks_scatter.x = [x for x, _ in marks]
    marks_scatter.y = [y for _, y in marks]
    marks_scatter.text = list(range(len(marks)))
    marks_scatter.textposition = 'bottom right'
    marks_scatter.marker.color = 'black'
    marks_scatter.marker.symbol = 'x'
    marks_scatter.marker.size = 10

    heading_scatter = heading_fig.data[0]
    heading_scatter.x = list(range(len(headings)))
    heading_scatter.y = headings
    heading_scatter.marker.color = colors
    heading_scatter.line.color = 'lightgrey'

    vmg_scatter = vmg_fig.data[0]
    vmg_scatter.x = list(range(len(vmg)))
    vmg_scatter.y = vmg
    vmg_scatter.marker.color = colors
    vmg_scatter.line.color = 'lightgrey'

    fig.update_xaxes(range=[min_x, max_x],dtick=25)
    fig.update_yaxes(range=[min_y, max_y],dtick=25)
    fig.layout.width=800
    fig.layout.height=800

    heading_fig.update_yaxes(range=[-210, 210],dtick=30)
    heading_fig.layout.width=800
    heading_fig.layout.height=600

    vmg_fig.layout.width=800
    vmg_fig.layout.height=400

    # Update the text output
    with out:
        out.clear_output(wait=True)
        print(f"Global Meta: {str(global_meta)}")

None

Output()

FigureWidget({
    'data': [{'mode': 'markers+lines',
              'name': 'trajectory',
              'type': 'scatter',
              'uid': 'ba3e05ad-890b-4171-a1a7-cb81f2b75d67'},
             {'mode': 'markers+text', 'name': 'marks', 'type': 'scatter', 'uid': '89f212c2-3889-4ad2-ae59-99bda54191ca'}],
    'layout': {'height': 800,
               'template': '...',
               'width': 800,
               'xaxis': {'dtick': 25, 'range': [-550.0, 550.0]},
               'yaxis': {'dtick': 25, 'range': [-550.0, 550.0]}}
})

FigureWidget({
    'data': [{'mode': 'markers+lines', 'type': 'scatter', 'uid': '7516b396-d913-46f7-866c-d54e9d88a0d3'}],
    'layout': {'height': 600,
               'template': '...',
               'title': {'text': 'Heading over time'},
               'width': 800,
               'xaxis': {'dtick': 25, 'range': [0, 1000.0]},
               'yaxis': {'dtick': 30, 'range': [-210, 210]}}
})

FigureWidget({
    'data': [{'mode': 'markers+lines', 'type': 'scatter', 'uid': '1f44e45d-7320-4793-b043-d6c67c26c5b3'}],
    'layout': {'height': 400,
               'template': '...',
               'title': {'text': 'VMG over time'},
               'width': 800,
               'xaxis': {'dtick': 25, 'range': [0, 1000.0]},
               'yaxis': {'dtick': 2, 'range': [-4, 4]}}
})

In [13]:
# Configuration

r = 250

config = {
    'max_marks': 2,
    'max_seconds_per_leg': 500,
    'plot_fn': plot,
    'leg_radius': r
}

outer_radius = config['max_marks'] * r + 0.1 * r
bounds = [-outer_radius, outer_radius, -outer_radius, outer_radius]
# Initialize Environment
env = MultiMarkEnv(config, bounds=bounds)

# Initialize PPO model
model = PPO("MlpPolicy", env, verbose=1, device='cuda', policy_kwargs={"features_extractor_class": CustomMLP, "features_extractor_kwargs": {"features_dim": 1}})

# Train the model
# model.learn(total_timesteps=1_000_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [14]:
model.learn(total_timesteps=4_000_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 381      |
|    ep_rew_mean     | -36.3    |
| time/              |          |
|    fps             | 1084     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 379          |
|    ep_rew_mean          | -36.7        |
| time/                   |              |
|    fps                  | 939          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0048234216 |
|    clip_fraction        | 0.0483       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.43        |
|    explained_variance   | 0.00175      |
|    learning_r

KeyboardInterrupt: 