In [1]:
%load_ext autoreload
%autoreload 1

from environment import TorchLikeCarEnv, CarEnv
%aimport environment
from policy import QNetwork, recover_flattened
%aimport policy

Reference: [EvoTorch RL example](https://github.com/nnaisense/evotorch?tab=readme-ov-file#a-reinforcement-learning-example).
In a `GymNE` problem, the solver tries to maximize the total reward.

In [6]:
import torch
from evotorch.algorithms import CMAES, CEM
from evotorch.logging import StdOutLogger, PicklingLogger
from evotorch.neuroevolution import GymNE

def gymNE_env(*args, **kwargs):
    """Environment factory passed to ``GymNE`` as its ``env`` argument.

    Builds a ``TorchLikeCarEnv`` whose first two positional arguments are
    fixed to ``10`` and ``False`` and whose ``evaluation`` flag is ``False``.
    Any additional positional and keyword arguments are forwarded unchanged.
    """
    # NOTE(review): what 10 and False stand for is decided by
    # TorchLikeCarEnv's signature in environment.py (not visible here) -- confirm.
    return TorchLikeCarEnv(10, False, *args, evaluation=False, **kwargs)
# Size of the action space, read from a throwaway environment instance
# (presumably a Discrete-style space exposing `.n` -- confirm in environment.py).
action_dim = gymNE_env().action_space.n

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [24]:
import torch

# Policy network handed to GymNE as the model whose parameters are searched.
# NOTE(review): `q_network`, `problem` and `searcher` are re-bound by the SNES
# cell further down; later cells see whichever of the two cells ran last.
q_network = QNetwork(action_dim).to(device)

# Neuroevolution problem: each candidate solution is evaluated on one episode.
problem = GymNE(
    network=q_network,
    env=gymNE_env,
    num_episodes=1,
)

# Cross entropy method searcher. The keyword arguments are hyperparameters
# specific to CEM; tune them according to the problem at hand.
searcher = CEM(
    problem,
    stdev_init=10.0,  # initial standard deviation of the search distribution
    popsize=15,  # population size
    parenthood_ratio=0.3,  # better solutions become parents
)

# Print the searcher's status to standard output.
StdOutLogger(searcher)

# Optional: pickle and save the results periodically into "EA_results".
# Among the saved results should be the center of the search distribution,
# since CEM (the algorithm used here) is distribution-based.
PicklingLogger(searcher, directory="EA_results", interval=10)

# Run the algorithm for 10 generations.
searcher.run(10)

[2024-03-10 18:55:02] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674949213840) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 18:55:02] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674949213840) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 18:55:02] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674949213840) -- The `device` of the problem is set as cpu
[2024-03-10 18:55:02] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674949213840) -- The number of actors that will be allocated for parallelized evaluation is 0
                   iter : 1
              mean_eval : 3.5604984760284424
          pop_best_eval : 19.455232620239258
            median_eval : 4.084464073181152
              best_eval : 19.455232620239258
             worst_eval : -8.768762588500977
total_interaction_count : 625
    total_episode_count : 15



KeyboardInterrupt: 

In [11]:
from evotorch.algorithms import SNES
# Fresh policy network and problem for the SNES run.
# NOTE(review): this re-binds `q_network`, `problem` and `searcher`, which the
# CEM cell also defines; later cells see whichever of the two cells ran last.
q_network = QNetwork(action_dim).to(device)

# Same environment factory and episode count as the CEM setup, plus
# action_noise_stdev=0.1 (see the GymNE docs for how the noise is applied).
problem = GymNE(
    network=q_network,
    env=gymNE_env,
    action_noise_stdev=0.1,
    num_episodes=1,
)

# SNES searcher with its own hyperparameters.
searcher = SNES(
    problem,
    popsize=20,
    popsize_max=40,
    stdev_init=5,
)

# Print the searcher's status to standard output, then run 50 generations.
StdOutLogger(searcher)
searcher.run(50)

[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The `dtype` for the problem's decision variables is set as torch.float32
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- `eval_dtype` (the dtype of the fitnesses and evaluation data) is set as torch.float32
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The `device` of the problem is set as cpu
[2024-03-10 18:06:06] INFO     <11760> evotorch.core: Instance of `GymNE` (id:1674915931152) -- The number of actors that will be allocated for parallelized evaluation is 0
                   iter : 1
              mean_eval : 0.04982553794980049
          pop_best_eval : 10.844755172729492
            median_eval : 3.2594640254974365
              best_eval : 10.844755172729492
             worst_eval : -44.87783432006836
total_interaction_count : 898
    total_episode_count : 20

                   iter 

In [18]:
import os

# Best solution reported in the status of whichever searcher ran last.
best = searcher.status["pop_best"]

# Presumably writes the solution's flat parameter vector back into `q_network`
# in place (the return value is discarded) -- confirm in policy.py.
recover_flattened(best, q_network)

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoint.
os.makedirs("models", exist_ok=True)

# NOTE(review): the file is always called "CEM.pth", even when `searcher` is
# the SNES instance from the cell above; the evaluation cell loads this exact
# path, so rename both together if the name should follow the algorithm.
# The whole module object is pickled here, not just its state_dict.
torch.save(q_network, "models/CEM.pth")

In [23]:
# Load the pickled policy from disk. torch.load unpickles arbitrary Python
# objects, so only point it at checkpoints produced by this notebook.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects a fully pickled module -- pass weights_only=False there if needed.
model = torch.load("models/CEM.pth", map_location=device)
env = CarEnv(300, display=True, evaluation=False, draw_central_line=True)

obs = env.reset()
score = 0
done = False

# Greedy rollout of the *loaded* model. The original loop called `q_network`,
# so the checkpoint that had just been loaded was never the one evaluated.
# No gradients are needed for inference, hence torch.no_grad().
with torch.no_grad():
    while not done:
        obstensor = CarEnv.obs2tensor(obs, device)
        # Pick the highest-scoring output along dim 1 as the action.
        action = model(obstensor).argmax(dim=1)
        obs, reward, done, _ = env.step(action)
        score += reward
print(score)

35.44203730871672
