[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rl-tools/pyrltools/blob/master/examples/notebooks/learning_curves.ipynb)

# Learning Curve
Install dependencies

In [None]:
!pip install pyrltools[mkl] gymnasium matplotlib

If in Colab, clone PyRLtools so that we can use the [example C++ Pendulum implementation](https://github.com/rl-tools/pyrltools/tree/master/examples/custom_environment/environment.h)

In [None]:
# Clone the repository only when the COLAB_GPU environment variable is defined:
# "${COLAB_GPU+set}" expands to a non-empty string only if the variable is set, and the trailing
# "|| true" makes the command exit successfully even when the test or the clone fails.
![[ -n "${COLAB_GPU+set}" ]] && git clone https://github.com/rl-tools/pyrltools.git || true

In [None]:
from pyrltools import SAC
import matplotlib.pyplot as plt
import gymnasium as gym
import numpy as np
import os

Define the C++ environment so that the JIT compiler can find it (the location depends on whether the notebook is run locally or in Colab)

In [None]:
# Location of the example C++ environment: relative to the cloned repository when the
# COLAB_GPU environment variable is present, otherwise resolved next to this notebook.
if "COLAB_GPU" in os.environ:
    custom_environment_path = "pyrltools/examples/custom_environment"
else:
    custom_environment_path = os.path.abspath("../custom_environment")

# Description of the custom environment handed to SAC: source path plus interface dimensions.
custom_environment = dict(
    path=custom_environment_path,
    action_dim=1,
    observation_dim=3,
)

In [None]:
# Gymnasium environment used for evaluation (and for training when the Python environment is selected).
environment_name = "Pendulum-v1"
# False trains on the custom C++ environment; True trains on the Gymnasium environment instead.
use_python_environment = False

# Experiment budget: number of independent training runs, training steps per run,
# and the number of training steps between two policy evaluations.
n_seeds = 10
n_steps = 10_000
evaluation_interval = 1_000

In [None]:
def evaluate_policy(policy, n_episodes=10):
    """Roll out ``policy`` in a fresh Gymnasium environment and collect the episode returns.

    Args:
        policy: Callable that maps an observation to an action which is passed
            directly to ``env.step``.
        n_episodes: Number of evaluation episodes to run.

    Returns:
        A list with one entry per episode: the sum of all rewards received in that episode.
    """
    env_replay = gym.make(environment_name)
    try:
        # Seed the environment once up front; the per-episode resets below are unseeded
        # and therefore continue from this seeded state.
        env_replay.reset(seed=0xf00d)
        returns = []
        for _ in range(n_episodes):
            observation, _info = env_replay.reset()
            finished = False
            rewards = 0
            while not finished:
                action = policy(observation)
                observation, reward, terminated, truncated, _info = env_replay.step(action)
                rewards += reward
                # An episode ends on a terminal state or when it is cut off (truncated).
                finished = terminated or truncated
            returns.append(rewards)
        return returns
    finally:
        # Release the environment even if the policy or a step raises; one environment
        # is created per evaluation call, so leaking them adds up over a training run.
        env_replay.close()

def scale_action(action, env):
    """Linearly map ``action`` into the bounds of ``env.action_space``.

    An input of -1 maps to ``action_space.low``, +1 maps to ``action_space.high``
    and 0 maps to the midpoint between them.
    """
    bounds = env.action_space
    lower, upper = bounds.low, bounds.high
    offset_from_center = action * (upper - lower) / 2.0
    center = (upper + lower) / 2.0
    return offset_from_center + center

In [None]:
# Options passed to every SAC instance; identical for all seeds, so built once.
kwargs = {"STEP_LIMIT": n_steps}
# This environment is only consulted for its action bounds (via scale_action), so a single
# instance is shared by all seeds instead of creating a new one per seed.
example_env = gym.make(environment_name)
# returns[seed][evaluation] is the list of episode returns from one evaluate_policy call.
returns = []
try:
    for seed in range(n_seeds):
        if use_python_environment:
            def env_factory():
                env = gym.make(environment_name)
                env.reset(seed=seed)
                return env
            sac = SAC(env_factory, enable_evaluation=False, **kwargs)
        else:
            sac = SAC(custom_environment, enable_evaluation=False, **kwargs)
        state = sac.State(seed)
        returns_seed = []
        for step_i in range(n_steps):
            # Evaluate before training steps 0, evaluation_interval, 2 * evaluation_interval, ...
            # (the policy after the very last training step is not evaluated).
            if step_i % evaluation_interval == 0:
                # NOTE(review): scale_action presumably expects state.action to return
                # normalized actions in [-1, 1] - confirm against the pyrltools API.
                returns_seed.append(evaluate_policy(lambda observation: scale_action(state.action(observation), example_env)))
            state.step()
        returns.append(returns_seed)
finally:
    # Release the shared environment even if training is interrupted.
    example_env.close()

In [None]:
# Stack the collected returns into an array indexed as (seed, evaluation, episode).
returns = np.array(returns)
# Training step at which each evaluation was taken.
horizontal = range(0, n_steps, evaluation_interval)
# Guard against stale kernel state: the data must match the x-axis built from the current config.
assert returns.ndim == 3 and returns.shape[1] == len(horizontal), (
    f"expected returns shaped (seeds, {len(horizontal)}, episodes), got {returns.shape}"
)

# Average over the evaluation episodes first, then compute mean and spread across seeds.
returns_aggregate = returns.mean(axis=-1)
returns_mean = returns_aggregate.mean(axis=0)
returns_std = returns_aggregate.std(axis=0)

fig, ax = plt.subplots()
# Shaded band: one standard deviation across seeds around the mean curve.
ax.fill_between(horizontal, returns_mean - returns_std, returns_mean + returns_std, alpha=0.1)
ax.plot(horizontal, returns_mean, label="PyRLtools")
ax.set_title(f"SAC learning curve on {environment_name} (mean ± std over {returns.shape[0]} seeds)")
ax.set_xlabel("Steps")
ax.set_ylabel("Returns")
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend();