<a href="https://colab.research.google.com/github/nicoRomeroCuruchet/DynamicProgramming/blob/main/testing_bary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import numpy as np
from pathlib import Path
from utils.utils import test_enviroment
from PolicyIteration import PolicyIteration, PolicyIterationConfig

# CartPoleEnv 

### Observation Space

The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

| Num | Observation           | Min                 | Max               |
|-----|-----------------------|---------------------|-------------------|
| 0   | Cart Position         | -4.8                | 4.8               |
| 1   | Cart Velocity         | -Inf                | Inf               |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
| 3   | Pole Angular Velocity | -Inf                | Inf               |

### Action Space

The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
of the fixed force the cart is pushed with.

- 0: Push cart to the left
- 1: Push cart to the right

In [None]:
from classic_control.cartpole import CartPoleEnv

# Environment handed to the policy-iteration solver.
env = CartPoleEnv()

# Half-widths of the symmetric grid used on each state dimension.
x_lim         = 2.4    # cart position
theta_lim     = 0.418  # pole angle
x_dot_lim     = 3.1    # cart velocity
theta_dot_lim = 3.1    # pole angular velocity

# Grid half-width per state dimension; insertion order fixes the
# dimension index: position (0), velocity (1), angle (2), angular velocity (3).
half_widths = {
    "x_space":         x_lim,
    "x_dot_space":     x_dot_lim,
    "theta_space":     theta_lim,
    "theta_dot_space": theta_dot_lim,
}

# Every dimension gets 10 evenly spaced float32 grid points over [-limit, +limit].
bins_space = {
    key: np.linspace(-bound, bound, num=10, dtype=np.float32)
    for key, bound in half_widths.items()
}

# Solver settings for this run.
custom_config = PolicyIterationConfig(
    gamma=0.95,              # discount factor
    theta=1e-3,              # convergence threshold
    n_steps=200,             # number of iterations
    log=True,                # enable logging
    log_interval=50,         # logging interval
    img_path=Path("./img"),  # directory where images are saved
)

# The two discrete actions: 0 = push cart left, 1 = push cart right.
cartpole_actions = np.array([0, 1], dtype=np.int32)

pi = PolicyIteration(
    env=env,
    bins_space=bins_space,
    action_space=cartpole_actions,
    config=custom_config,
)

pi.run()

In [None]:
# Reload the saved CartPole policy and run it in a rendered environment.
# NOTE(review): the .pkl extension suggests a pickle file - only load files you trust.
cartpole_policy_path = Path("CartPoleEnv_policy.pkl")
pi = PolicyIteration.load(cartpole_policy_path)

cartpole_render_env = CartPoleEnv(sutton_barto_reward=True, render_mode="human")
test_enviroment(cartpole_render_env, pi)

# Continuous_MountainCarEnv

## Observation Space

The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:

| Num | Observation                          | Min   | Max  | Unit         |
|-----|--------------------------------------|-------|------|--------------|
| 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m) |
| 1   | velocity of the car                  | -0.07 | 0.07 | velocity (v) |

## Action Space

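The environment takes a single continuous action: the force applied to the car, in the range `[-1, 1]`.
For policy iteration, this notebook discretizes that range into 9 evenly spaced values:

- negative values: Accelerate to the left
- 0: Don't accelerate
- positive values: Accelerate to the right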


In [None]:
from classic_control.continuous_mountain_car import Continuous_MountainCarEnv

# Environment handed to the policy-iteration solver.
env = Continuous_MountainCarEnv()

# The velocity grid is symmetric around zero, bounded by the environment's max speed.
speed_bound = abs(env.max_speed)

# 100 float32 grid points per dimension: position (0) and velocity (1).
bins_space = {
    "x_space": np.linspace(
        env.min_position, env.max_position, num=100, dtype=np.float32
    ),
    "x_dot_space": np.linspace(
        -speed_bound, speed_bound, num=100, dtype=np.float32
    ),
}

# Solver settings for this run.
custom_config = PolicyIterationConfig(
    gamma=0.99,              # discount factor
    theta=1e-3,              # convergence threshold
    n_steps=200,             # number of iterations
    log=True,                # enable logging
    log_interval=50,         # logging interval
    img_path=Path("./img"),  # directory where images are saved
)

# Continuous action range [-1, 1] discretized into 9 evenly spaced values.
mountain_car_actions = np.linspace(-1.0, +1.0, num=9, dtype=np.float32)

pi = PolicyIteration(
    env=env,
    bins_space=bins_space,
    action_space=mountain_car_actions,
    config=custom_config,
)

pi.run()

In [None]:
# Reload the saved mountain-car policy and run it in a rendered environment.
# NOTE(review): the .pkl extension suggests a pickle file - only load files you trust.
mountain_car_policy_path = Path("Continuous_MountainCarEnv_policy.pkl")
pi = PolicyIteration.load(mountain_car_policy_path)

mountain_car_render_env = Continuous_MountainCarEnv(render_mode="human")
test_enviroment(mountain_car_render_env, pi)

In [None]:
import pstats
from pstats import SortKey

# Load the saved profiler output from disk.
p = pstats.Stats("profile-results.prof")

# Drop directory prefixes from file names, then rank by cumulative time.
p.strip_dirs()
p.sort_stats(SortKey.CUMULATIVE)

# Show only the top 20 entries.
p.print_stats(20)

In [None]:
import matplotlib.pyplot as plt

# Grid axes of the 2D state space: position (x) and velocity (x_dot).
x_values = pi.bins_space['x_space']
xdot_values = pi.bins_space['x_dot_space']
x_size = len(x_values)
xdot_size = len(xdot_values)

# Reshape the 1D value function into a (position, velocity) grid.
# NOTE(review): assumes pi.value_function is flattened with position as the
# slowest-varying index - confirm against PolicyIteration.
vf_2d = pi.value_function.reshape(x_size, xdot_size)

# Min-max normalize to [0, 1]. After subtracting the minimum, the maximum is
# the value range; skip the division when it is zero (constant value function)
# so the plot does not fill with NaNs.
vf_2d = vf_2d - np.min(vf_2d)
vf_peak = np.max(vf_2d)
if vf_peak > 0:
    vf_2d = vf_2d / vf_peak

# Coordinate grids with 'ij' indexing, so X, Xdot and vf_2d all have shape
# (x_size, xdot_size).
X, Xdot = np.meshgrid(x_values, xdot_values, indexing='ij')

fig, ax = plt.subplots(figsize=(7, 5))
# Position goes on the horizontal axis and velocity on the vertical axis,
# matching the axis labels below.
contour = ax.contourf(X, Xdot, vf_2d, levels=50, cmap='hot_r')
cbar = fig.colorbar(contour, ax=ax)
cbar.set_label("Value")

ax.set_xlabel("Position (x)")
ax.set_ylabel("Velocity (ẋ)")
ax.set_title("2D Value Function")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Plot the policy_convergence history recorded on the solver, one marker per entry.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(pi.policy_convergence, marker='o', linestyle='-')
ax.set_xlabel('Policy Improvement Iteration')
ax.set_ylabel('Sum of Policy Changes')
ax.set_title('Policy Convergence Over Iterations')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Plot the delta history recorded on the solver, one marker per entry.
# The labels previously duplicated the policy-convergence figure; they now
# name the quantity actually plotted.
# NOTE(review): pi.delta is presumably the per-iteration value-function change
# that is compared against the `theta` convergence threshold - confirm against
# PolicyIteration and refine the labels if it means something else.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(pi.delta, marker='o', linestyle='-')
ax.set_xlabel('Iteration')
ax.set_ylabel('Delta')
ax.set_title('Delta Over Iterations')
ax.grid(True)
plt.show()