In [2]:
%load_ext autoreload
%autoreload 1

from environment import TorchLikeCarEnv, CarEnv
%aimport environment
from policy import QNetwork, recover_flattened
%aimport policy

[Evotorch RL example](https://github.com/nnaisense/evotorch?tab=readme-ov-file#a-reinforcement-learning-example)
In a GymNE problem, the solver tries to maximize the total reward

In [3]:
import torch
from evotorch.algorithms import CMAES, CEM
from evotorch.logging import StdOutLogger, PicklingLogger
from evotorch.neuroevolution import GymNE

def gymNE_env(*args, **kwargs):
    """Environment factory handed to `GymNE`.

    Builds a `TorchLikeCarEnv` with the fixed leading arguments `10` and
    `False` and with `evaluation=False`. Any extra positional or keyword
    arguments are forwarded to the constructor unchanged.

    NOTE(review): `10` is presumably the episode time limit and `False` the
    display flag -- confirm against `environment.TorchLikeCarEnv`.
    """
    return TorchLikeCarEnv(10, False, *args, evaluation=False, **kwargs)


# Number of discrete actions, read from a throwaway environment instance.
action_dim = gymNE_env().action_space.n

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
import torch

# Policy network that the evolutionary search will optimize.
q_network = QNetwork(action_dim).to(device)

# Neuroevolution problem: each candidate is evaluated on a single episode.
problem = GymNE(env=gymNE_env, network=q_network, num_episodes=1)

# Cross-entropy method (CEM). The keyword arguments below are specific to
# this algorithm and should be tuned for the problem at hand.
searcher = CEM(
    problem,
    # population size
    popsize=15,
    # better solutions become parents
    parenthood_ratio=0.3,
    # initial standard deviation of the search distribution
    stdev_init=10.0,
)

# Print the search status to standard output.
StdOutLogger(searcher)

# Optional: pickle and save the results periodically into "EA_results".
# CEM is distribution-based, so the saved results include the center of the
# search distribution.
PicklingLogger(searcher, interval=10, directory="EA_results")

# Run the search for 10 generations.
searcher.run(10)

[2024-03-10 20:50:53] INFO     <13036> evotorch.core: Instance of `GymNE` (id:1805835953488) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 20:50:53] INFO     <13036> evotorch.core: Instance of `GymNE` (id:1805835953488) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 20:50:53] INFO     <13036> evotorch.core: Instance of `GymNE` (id:1805835953488) -- The `device` of the problem is set as cpu
[2024-03-10 20:50:53] INFO     <13036> evotorch.core: Instance of `GymNE` (id:1805835953488) -- The number of actors that will be allocated for parallelized evaluation is 0


KeyboardInterrupt: 

In [11]:
from evotorch.algorithms import SNES
# New QNetwork instance for the SNES run.
q_network = QNetwork(action_dim).to(device)

# Single-episode evaluation; `action_noise_stdev=0.1` is passed through to GymNE.
problem = GymNE(env=gymNE_env, network=q_network, num_episodes=1, action_noise_stdev=0.1)

# SNES with an initial standard deviation of 5 and the population-size
# settings popsize=20 / popsize_max=40.
searcher = SNES(
    problem,
    stdev_init=5,
    popsize=20,
    popsize_max=40,
)

# Print the search status to standard output, then run 50 generations.
StdOutLogger(searcher)
searcher.run(50)

[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The `device` of the problem is set as cpu
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The number of actors that will be allocated for parallelized evaluation is 0
                   iter : 1
              mean_eval : 0.04982553794980049
          pop_best_eval : 10.844755172729492
            median_eval : 3.2594640254974365
              best_eval : 10.844755172729492
             worst_eval : -44.87783432006836
total_interaction_count : 898
    total_episode_count : 20

                   iter 

In [11]:
import pickle
# Reload a pickled checkpoint written by PicklingLogger.
# NOTE(review): pickle.load can execute arbitrary code; only open checkpoint
# files produced by your own runs.
with open("EA_results/GymNE_2024-03-10-20.21.07_5548_generation000090.pickle", "rb") as f:
    results = pickle.load(f)
best = results["best"]

# Copy the flattened best solution into a fresh network and save it to disk.
from policy import SimpleQNetwork
q_network = SimpleQNetwork(action_dim)
recover_flattened(best, q_network)
torch.save(q_network, "models/CMAES.pth")
# model = torch.load("models/CEM.pth", map_location=device)

# Observations are converted with `obs2tensor(obs, device)` below, so the
# network must be on the same device or the forward pass fails on CUDA
# machines. The move happens after `torch.save`, so the saved file is unchanged.
q_network = q_network.to(device)

# Environment built with display=True for a visual check of the policy.
env = CarEnv(300, display=True, evaluation=False, draw_central_line=True)

obs = env.reset()
score = 0
done = False

# Greedy rollout: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    while not done:
        obstensor = CarEnv.obs2tensor(obs, device)
        # Pick the highest-valued action along dim 1
        # (assumes a (batch, n_actions) output -- TODO confirm).
        action = q_network(obstensor).argmax(dim=1)
        obs, reward, done, _ = env.step(action)
        score += reward
print(score)

10.625791735721206


With both algorithms (SNES and CEM), the resulting policy simply drives the car straight ahead. We therefore fine-tune it using longer episodes.
We also use stronger noise on the road, to force the policy to learn in difficult situations.

In [13]:
from evotorch.algorithms import SNES
def gymNE_env(*args, **kwargs):
    """Environment factory for the fine-tuning run.

    Builds a `TorchLikeCarEnv` with the leading arguments `30` and `True`,
    `evaluation=False`, `scale=0.002` and `persistence=0.01`. Extra
    positional and keyword arguments are forwarded unchanged.

    NOTE(review): `30` is presumably the longer episode time and
    `scale` / `persistence` the road-noise settings -- confirm against
    `environment.TorchLikeCarEnv`.
    """
    return TorchLikeCarEnv(
        30,
        True,
        *args,
        evaluation=False,
        scale=0.002,
        persistence=0.01,
        **kwargs,
    )


# Start from the previously saved network instead of a fresh one.
q_network = torch.load("models/SNES1.pth", map_location=device)

# Single-episode evaluation; `action_noise_stdev=0.1` is passed through to GymNE.
problem = GymNE(env=gymNE_env, network=q_network, num_episodes=1, action_noise_stdev=0.1)

searcher = SNES(
    problem,
    stdev_init=5,
    popsize=20,
    popsize_max=40,
)

# Log to standard output and pickle the results into "EA_results" (interval=10).
StdOutLogger(searcher)
PicklingLogger(searcher, interval=10, directory="EA_results")

# Run the search for 100 generations.
searcher.run(100)

[2024-03-10 20:03:30] INFO     <16244> evotorch.core: Instance of `GymNE` (id:2570471146192) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 20:03:30] INFO     <16244> evotorch.core: Instance of `GymNE` (id:2570471146192) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 20:03:30] INFO     <16244> evotorch.core: Instance of `GymNE` (id:2570471146192) -- The `device` of the problem is set as cpu
[2024-03-10 20:03:30] INFO     <16244> evotorch.core: Instance of `GymNE` (id:2570471146192) -- The number of actors that will be allocated for parallelized evaluation is 0


TypeError: 'NoneType' object is not subscriptable

Results tend to be worse...

=> Next step: use `alive_bonus_schedule` to encourage long-term reward.

In [11]:
from policy import SimpleQNetwork
from evotorch.algorithms import PyCMAES

def gymNE_env(*args, **kwargs):
    """Environment factory for the PyCMAES run.

    Builds a `TorchLikeCarEnv` with the leading arguments `30` and `False`,
    `evaluation=False`, `scale=0.002` and `persistence=0.01`. Extra
    positional and keyword arguments are forwarded unchanged.

    NOTE(review): the meaning of the two positional arguments is not visible
    here -- confirm against `environment.TorchLikeCarEnv`.
    """
    return TorchLikeCarEnv(
        30,
        False,
        *args,
        evaluation=False,
        scale=0.002,
        persistence=0.01,
        **kwargs,
    )


# Start from the network previously saved as "models/CMAES.pth".
q_network = torch.load("models/CMAES.pth", map_location=device)

# Each candidate is evaluated on a single episode.
problem = GymNE(env=gymNE_env, network=q_network, num_episodes=1)

# CMA-ES via the PyCMAES wrapper, with an initial standard deviation of 5.
searcher = PyCMAES(problem, stdev_init=5)

# Log to standard output and pickle the results into "EA_results" (interval=10).
StdOutLogger(searcher)
PicklingLogger(searcher, interval=10, directory="EA_results")

# Run the search for 10 generations.
searcher.run(10)

[2024-03-10 20:21:02] INFO     < 5548> evotorch.core: Instance of `GymNE` (id:1793141663504) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 20:21:02] INFO     < 5548> evotorch.core: Instance of `GymNE` (id:1793141663504) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 20:21:02] INFO     < 5548> evotorch.core: Instance of `GymNE` (id:1793141663504) -- The `device` of the problem is set as cpu
[2024-03-10 20:21:02] INFO     < 5548> evotorch.core: Instance of `GymNE` (id:1793141663504) -- The number of actors that will be allocated for parallelized evaluation is 0
(15_w,31)-aCMA-ES (mu_w=8.9,w_1=20%) in dimension 10838 (seed=3690875053, Sun Mar 10 20:21:07 2024)
                   iter : 1
              mean_eval : 0.3022964298725128
            median_eval : -2.4058985710144043
          pop_best_eval : 45.930137634277344
              best_eval : 45.930137634277344
             worst_eval : -10