<a href="https://colab.research.google.com/github/nicoRomeroCuruchet/DynamicProgramming/blob/main/testing_bary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle
import numpy as np
from PolicyIteration import PolicyIteration
from utils.utils import plot_2D_value_function,\
                        plot_3D_value_function,\
                        test_enviroment

# CartPoleEnv 

### Observation Space

The observation is an `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

| Num | Observation           | Min                 | Max               |
|-----|-----------------------|---------------------|-------------------|
| 0   | Cart Position         | -4.8                | 4.8               |
| 1   | Cart Velocity         | -Inf                | Inf               |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
| 3   | Pole Angular Velocity | -Inf                | Inf               |

### Action Space

The action is an `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
of the fixed force the cart is pushed with.

- 0: Push cart to the left
- 1: Push cart to the right

In [None]:
# Train cartpole environment:
# build a grid over the 4-D state space and solve it with PolicyIteration.
from classic_control.cartpole import CartPoleEnv

env = CartPoleEnv(sutton_barto_reward=True)

# Symmetric half-widths of the grid along each state dimension.
x_lim, theta_lim = 2.4, 0.418          # position thresholds
x_dot_lim, theta_dot_lim = 3.1, 3.1    # velocity thresholds

# One float32 axis per state variable: 10 points for position/angle,
# 7 points for the two velocities. Key order follows the observation indices.
bins_space = dict(
    x_space=np.linspace(-x_lim, x_lim, num=10, dtype=np.float32),                        # position space         (0)
    x_dot_space=np.linspace(-x_dot_lim, x_dot_lim, num=7, dtype=np.float32),             # velocity space         (1)
    theta_space=np.linspace(-theta_lim, theta_lim, num=10, dtype=np.float32),            # angle space            (2)
    theta_dot_space=np.linspace(-theta_dot_lim, theta_dot_lim, num=7, dtype=np.float32), # angular velocity space (3)
)

# The two discrete actions documented above: 0 = push left, 1 = push right.
pi = PolicyIteration(
    env=env,
    bins_space=bins_space,
    action_space=np.array([0, 1], dtype=np.int32),
    gamma=0.99,
    theta=1e-3,
)

pi.run()

In [None]:
# Test cartpole environment:
# load the pickled policy and hand it to `test_enviroment` together with a
# CartPole environment created with render_mode="human".
#
# The file name is derived from the CartPoleEnv class itself instead of the
# global `env`: another cell rebinds `env` to the mountain-car environment, so
# relying on it here could load the wrong policy (or fail with a NameError on a
# fresh kernel where the training cell has not been run).
#
# NOTE(security): pickle.load can execute arbitrary code; only open .pkl files
# that were produced by this project.
with open(CartPoleEnv.__name__ + ".pkl", "rb") as f:
    pi = pickle.load(f)

test_enviroment(CartPoleEnv(sutton_barto_reward=True, render_mode="human"), pi)

# Continuous_MountainCarEnv

## Observation Space

The observation is an `ndarray` with shape `(2,)` where the elements correspond to the following:

| Num | Observation                          | Min   | Max  | Unit         |
|-----|--------------------------------------|-------|------|--------------|
| 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m) |
| 1   | velocity of the car                  | -0.07 | 0.07 | velocity (v) |

## Action Space

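This notebook uses `Continuous_MountainCarEnv`; the next code cell passes 9 evenly spaced action values in `[-1.0, 1.0]` to `PolicyIteration`. For reference, the discrete Mountain Car environment has 3 deterministic actions: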

- 0: Accelerate to the left
- 1: Don't accelerate
- 2: Accelerate to the right


In [2]:
# Mountain car: build a 20 x 20 grid over (position, velocity) and set up
# PolicyIteration with 9 evenly spaced action values in [-1, 1].
from classic_control.continuous_mountain_car import Continuous_MountainCarEnv

env = Continuous_MountainCarEnv()

# Grid bounds are read from the environment; the velocity axis is symmetric
# around zero.
bins_space = dict(
    x_space=np.linspace(env.min_position, env.max_position, num=20, dtype=np.float32),           # position space    (0)
    x_dot_space=np.linspace(-abs(env.max_speed), abs(env.max_speed), num=20, dtype=np.float32),  # velocity space    (1)
)

pi = PolicyIteration(
    env=env,
    bins_space=bins_space,
    action_space=np.linspace(-1.0, 1.0, num=9, dtype=np.float32),
    gamma=0.99,
    theta=1e-3,
)
# pi.run() is left commented out here; a later profiling cell calls it.

[32m2024-09-08 18:22:37.935[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m120[0m - [1mLower bounds: [-1.2  -0.07][0m
[32m2024-09-08 18:22:37.935[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m121[0m - [1mUpper bounds: [0.6  0.07][0m
[32m2024-09-08 18:22:38.015[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m127[0m - [1mCreating Delaunay triangulation...[0m
[32m2024-09-08 18:22:38.019[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m129[0m - [1mDelaunay triangulation created.[0m
[32m2024-09-08 18:22:38.101[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m156[0m - [1mPolicy Iteration was correctly initialized.[0m
[32m2024-09-08 18:22:38.102[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36m__init__[0m:[36m157[0m - [1mThe enviroment name is: Continuous_MountainCarEnv[0m
[32m2024-09-08 18:22:38.103[0m | [1mINFO    [0m | [36mPolicy

In [None]:
# Test mountain car environment:
# load the pickled policy and hand it to `test_enviroment` together with a
# mountain-car environment created with render_mode="human".
#
# The file name is derived from the Continuous_MountainCarEnv class itself
# instead of the global `env`: the CartPole cell also assigns `env`, so relying
# on it here could load the wrong policy when cells are run out of order.
#
# NOTE(security): pickle.load can execute arbitrary code; only open .pkl files
# that were produced by this project.
with open(Continuous_MountainCarEnv.__name__ + ".pkl", "rb") as f:
    pi: PolicyIteration = pickle.load(f)

test_enviroment(Continuous_MountainCarEnv(render_mode="human"), pi)

In [None]:
# Graph the value function of the mountain car environment.
# Both helpers receive `pi.value_function`; this assumes `pi` is the
# mountain-car policy created or loaded by an earlier cell — TODO confirm
# before running, since the CartPole cells assign `pi` as well.
plot_3D_value_function(pi.value_function)  # 3D view
plot_2D_value_function(pi.value_function)  # 2D view

In [3]:
# Profile one full `pi.run()` and print the statistics sorted by cumulative time.
import cProfile
import io
import pstats
from pstats import SortKey

pr = cProfile.Profile()
pr.enable()
try:
    pi.run()
finally:
    # Always stop the profiler, even if pi.run() raises; otherwise it would
    # stay enabled for the rest of the kernel session.
    pr.disable()

# Render the report into an in-memory buffer, then print it in one go.
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

[32m2024-09-08 18:20:53.463[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m462[0m - [1mGenerating transition and reward function table...[0m
[32m2024-09-08 18:20:54.806[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m464[0m - [1mTransition and reward function table generated.[0m
  0%|          | 0/100 [00:00<?, ?it/s][32m2024-09-08 18:20:54.811[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m466[0m - [1msolving step 0[0m
[32m2024-09-08 18:20:54.812[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation[0m:[36m384[0m - [1mStarting policy evaluation[0m
[32m2024-09-08 18:20:55.026[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation[0m:[36m410[0m - [1mMax Error: 99.95833587646484                            | Avg Error: 6.203125                            | 0<0.001[0m
[32m2024-09-08 18:20:58.895[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation

         51850623 function calls (51844773 primitive calls) in 47.769 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000   47.901   23.951 c:\Users\nicor\anaconda3\envs\DynamicProgramming\Lib\site-packages\IPython\core\interactiveshell.py:3541(run_code)
      3/2    0.000    0.000   47.901   23.951 {built-in method builtins.exec}
        1    0.001    0.001   47.901   47.901 c:\users\nicor\dynamicprogramming\src\PolicyIteration.py:458(run)
        5    0.655    0.131   45.690    9.138 c:\users\nicor\dynamicprogramming\src\PolicyIteration.py:377(policy_evaluation)
    24840    0.347    0.000   34.198    0.001 c:\users\nicor\dynamicprogramming\src\PolicyIteration.py:345(get_value)
    55172    0.306    0.000   28.899    0.001 c:\Users\nicor\anaconda3\envs\DynamicProgramming\Lib\site-packages\jax\_src\array.py:318(__getitem__)
    55181    0.221    0.000   28.388    0.001 c:\Users\nicor\anaconda3\env

In [3]:
# Profile one full `pi.run()` and print the statistics sorted by cumulative time.
# NOTE(review): this cell repeats the earlier profiling cell; presumably it is
# kept to compare timings between two runs — consider a shared helper function.
import cProfile
import io
import pstats
from pstats import SortKey

pr = cProfile.Profile()
pr.enable()
try:
    pi.run()
finally:
    # Always stop the profiler, even if pi.run() raises; otherwise it would
    # stay enabled for the rest of the kernel session.
    pr.disable()

# Render the report into an in-memory buffer, then print it in one go.
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

[32m2024-09-08 18:22:41.817[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m462[0m - [1mGenerating transition and reward function table...[0m
[32m2024-09-08 18:22:42.958[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m464[0m - [1mTransition and reward function table generated.[0m
  0%|          | 0/100 [00:00<?, ?it/s][32m2024-09-08 18:22:42.964[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mrun[0m:[36m466[0m - [1msolving step 0[0m
[32m2024-09-08 18:22:42.965[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation[0m:[36m384[0m - [1mStarting policy evaluation[0m
[32m2024-09-08 18:22:43.160[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation[0m:[36m410[0m - [1mMax Error: 99.95833587646484                            | Avg Error: 6.203125                            | 0<0.001[0m
[32m2024-09-08 18:22:45.875[0m | [1mINFO    [0m | [36mPolicyIteration[0m:[36mpolicy_evaluation

         37094980 function calls (37089144 primitive calls) in 36.303 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000   36.377   18.189 c:\Users\nicor\anaconda3\envs\DynamicProgramming\Lib\site-packages\IPython\core\interactiveshell.py:3541(run_code)
      3/2    0.000    0.000   36.377   18.189 {built-in method builtins.exec}
        1    0.001    0.001   36.377   36.377 c:\users\nicor\dynamicprogramming\src\PolicyIteration.py:458(run)
        5    0.635    0.127   34.423    6.885 c:\users\nicor\dynamicprogramming\src\PolicyIteration.py:377(policy_evaluation)
    55172    0.298    0.000   28.988    0.001 c:\Users\nicor\anaconda3\envs\DynamicProgramming\Lib\site-packages\jax\_src\array.py:318(__getitem__)
    55181    0.216    0.000   28.485    0.001 c:\Users\nicor\anaconda3\envs\DynamicProgramming\Lib\site-packages\jax\_src\numpy\lax_numpy.py:7832(_rewriting_take)
    24840    0.609    0.000  