<a href="https://colab.research.google.com/github/nicoRomeroCuruchet/DynamicProgramming/blob/main/testing_bary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import numpy as np
from PolicyIteration import PolicyIteration
from utils.utils import plot_2D_value_function,\
                        plot_3D_value_function,\
                        test_enviroment

# CartPoleEnv 

### Observation Space

The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

| Num | Observation           | Min                 | Max               |
|-----|-----------------------|---------------------|-------------------|
| 0   | Cart Position         | -4.8                | 4.8               |
| 1   | Cart Velocity         | -Inf                | Inf               |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
| 3   | Pole Angular Velocity | -Inf                | Inf               |

### Action Space

The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
of the fixed force the cart is pushed with.

- 0: Push cart to the left
- 1: Push cart to the right

In [None]:
# Train a policy-iteration agent on the cart-pole environment.

from classic_control.cartpole import CartPoleEnv

env = CartPoleEnv(sutton_barto_reward=True)

# Half-widths of the discretised region; each dimension is gridded symmetrically about zero.
x_lim = 2.5          # cart position
theta_lim = 0.25     # pole angle
x_dot_lim = 2.5      # cart velocity
theta_dot_lim = 2.5  # pole angular velocity

n_bins = 20  # number of grid points along every state dimension

# Keys are listed in observation order: position (0), velocity (1), angle (2), angular velocity (3).
state_limits = {
    "x_space": x_lim,
    "x_dot_space": x_dot_lim,
    "theta_space": theta_lim,
    "theta_dot_space": theta_dot_lim,
}
bins_space = {
    name: np.linspace(-limit, limit, n_bins)
    for name, limit in state_limits.items()
}

# NOTE(review): `theta` here is a PolicyIteration argument (presumably its convergence
# tolerance), unrelated to the pole angle above - confirm against PolicyIteration.
pi = PolicyIteration(env=env, bins_space=bins_space, action_space=[0, 1], gamma=0.99, theta=1e-3)
pi.run()

In [None]:
# Test cartpole environment:

# Imported in this cell too, so a saved policy can be evaluated without
# re-running the (slow) training cell first.
from classic_control.cartpole import CartPoleEnv

# Build the file name from the class itself, not from the kernel-global `env`:
# `env` is rebound by the mountain-car training cell, so after out-of-order
# execution it would point this cell at the wrong policy file.
# NOTE: pickle.load can execute arbitrary code; only load policy files you produced yourself.
with open(CartPoleEnv.__name__ + ".pkl", "rb") as f:
    pi = pickle.load(f)

test_enviroment(CartPoleEnv(sutton_barto_reward=True, render_mode="human"), pi)

# Continuous_MountainCarEnv

## Observation Space

The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:

| Num | Observation                          | Min  | Max | Unit         |
|-----|--------------------------------------|------|-----|--------------|
| 0   | position of the car along the x-axis | -Inf | Inf | position (m) |
| 1   | velocity of the car                  | -Inf | Inf | velocity (v) |

## Action Space

The action is a `ndarray` with shape `(1,)`, representing the directional force applied on the car.
The action is clipped in the range `[-1,1]` and multiplied by a power of 0.0015.


In [1]:
import pickle
import numpy as np
from PolicyIteration import PolicyIteration
from utils.utils import plot_2D_value_function,\
                        plot_3D_value_function,\
                        test_enviroment

# Train a policy-iteration agent on the continuous mountain-car environment.

from classic_control.continuous_mountain_car import Continuous_MountainCarEnv

env = Continuous_MountainCarEnv()

n_bins = 257                      # grid points per state dimension
speed_limit = abs(env.max_speed)  # velocity grid is symmetric about zero

# Keys are listed in observation order: position (0), velocity (1).
bins_space = {
    "x_space": np.linspace(env.min_position, env.max_position, n_bins),
    "x_dot_space": np.linspace(-speed_limit, speed_limit, n_bins),
}

pi = PolicyIteration(env=env, bins_space=bins_space, action_space=[-1, 1], gamma=0.99, theta=1e-3)
pi.run()

[32m2024-08-29 18:39:07.934[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m106[0m - [1mPolicy Iteration was correctly initialized.[0m
[32m2024-08-29 18:39:07.936[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m107[0m - [1mThe enviroment name is: Continuous_MountainCarEnv[0m
[32m2024-08-29 18:39:07.936[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m108[0m - [1mThe action space is: [-1, 1][0m
[32m2024-08-29 18:39:07.937[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m109[0m - [1mNumber of states: 66049[0m
[32m2024-08-29 18:39:07.937[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m409[0m - [1mGenerating transition and reward function table...[0m
100%|██████████| 66049/66049 [00:09<00:00, 6782.66it/s]
[32m2024-08-29 18:39:17.681[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m411[0m - [1mTransition and reward function table gen

In [None]:
import numpy as np
from scipy.spatial import KDTree
import matplotlib.pyplot as plt 
# Debug view of the barycentric interpolation geometry on the mountain-car grid:
# plot the grid, pick a query point, build a triangle (2-simplex) from its nearest
# grid nodes and compute the point's barycentric weights in that triangle.

delta = 257
# Position/velocity grid with the same bounds and resolution as the training grid.
x_values = np.linspace(env.min_position, env.max_position, delta)
y_values = np.linspace(-abs(env.max_speed), abs(env.max_speed), int(delta))
X, Y = np.meshgrid(x_values, y_values)
# One (x, y) row per grid node; built once and shared by the plot, the KD-tree and the lookups.
grid_points = np.c_[X.ravel(), Y.ravel()]
# Sorted, de-duplicated copy of the nodes (its row order differs from `grid_points`).
cartesian_product = np.unique(grid_points, axis=0)
# Query point for which a nearby simplex is built.
highlight_point = (0.25, 0.0606981)

# Create the plot
plt.figure(figsize=(30,30))
plt.plot(grid_points[:, 0], grid_points[:, 1], 'go', label='Data Points', markersize=4)
plt.plot(highlight_point[0], highlight_point[1], 'bx', label='Highlighted Point', markersize=2)

tree = KDTree(grid_points)

k_neighbors = 20
dd, ii = tree.query(highlight_point, k=k_neighbors)
neighbors = grid_points[ii]  # neighbours ordered from nearest to farthest

# The two nearest nodes form one edge of the triangle. The third vertex is the
# nearest remaining node that is NOT collinear with that edge (non-zero 2D cross
# product). When the two nearest nodes share a grid column this is the first
# neighbour with a different x; it also covers the case where the second-nearest
# node is already in another column, which would otherwise repeat a vertex and
# make the matrix below singular.
edge = neighbors[1] - neighbors[0]
offsets = neighbors - neighbors[0]
cross = edge[0] * offsets[:, 1] - edge[1] * offsets[:, 0]
non_collinear = np.flatnonzero(cross != 0)
if non_collinear.size == 0:
    # The grid is much finer in y than in x, so all k neighbours can fall on one line.
    raise ValueError(
        f"the {k_neighbors} nearest grid nodes of {highlight_point} are collinear; "
        "increase k_neighbors to build a simplex"
    )
index_x = int(non_collinear[0])

simplex = np.vstack([neighbors[:2], neighbors[index_x]])

# Barycentric coordinates: solve A @ w = [px, py, 1], where the columns of A are
# the triangle vertices in homogeneous form.
A = np.vstack([simplex.T, np.ones(len(simplex))])
A_inv = np.linalg.inv(A)
# Weights sum to 1; any weight outside [0, 1] means the point lies outside the triangle.
bary_coords = A_inv @ np.append(highlight_point, 1.0)

# Hand-picked triangle (A, B, C) and test point P.
a = np.array([ 0.31875  ,  -0.07      ])
b = np.array([ 0.31875 ,   -0.06945313])
c = np.array ([ 0.32578125, -0.07      ])
p = np.array([ 0.3190625 ,-0.07     ])
plt.plot(a[0], a[1], 'ro', label='A', markersize=2)
plt.plot(b[0], b[1], 'ro', label='B', markersize=2)
plt.plot(c[0], c[1], 'ro', label='C', markersize=2)
plt.plot(p[0], p[1], 'ro', label='P', markersize=5)

# The labels passed above are only rendered once a legend is requested.
plt.xlabel('position')
plt.ylabel('velocity')
plt.legend()
plt.show()



In [2]:
# Test mountain car environment:

# Imported in this cell too, so a saved policy can be evaluated without
# re-running the training cell first.
from classic_control.continuous_mountain_car import Continuous_MountainCarEnv

# Build the file name from the class itself, not from the kernel-global `env`:
# `env` is rebound by whichever training cell ran last, so after out-of-order
# execution it could point this cell at the cart-pole policy file instead.
# NOTE: pickle.load can execute arbitrary code; only load policy files you produced yourself.
with open(Continuous_MountainCarEnv.__name__ + ".pkl", "rb") as f:
    pi: PolicyIteration = pickle.load(f)

test_enviroment(Continuous_MountainCarEnv(render_mode="human"), pi)

Episode 0 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 1 finished after 66 timesteps
Total reward: 93.4
Episode 2 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 3 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 4 finished after 66 timesteps
Total reward: 93.4
Episode 5 finished after 67 timesteps
Total reward: 93.30000000000001
Episode 6 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 7 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 8 finished after 66 timesteps
Total reward: 93.4
Episode 9 finished after 66 timesteps
Total reward: 93.4
Episode 10 finished after 66 timesteps
Total reward: 93.4
Episode 11 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 12 finished after 67 timesteps
Total reward: 93.30000000000001
Episode 13 finished after 65 timesteps
Total reward: 93.50000000000001
Episode 14 finished after 67 timesteps
Total reward: 93.30000000000001
Episode 15

KeyboardInterrupt: 

In [None]:
# Render the value function held by the current policy `pi` (the mountain-car
# policy when the cells above were run in order), first with the 3D plotting
# helper and then with the 2D one.
for render_value_function in (plot_3D_value_function, plot_2D_value_function):
    render_value_function(pi.value_function)