# Training the agent

## Settings

In [2]:
# workspace directory (on the mounted Google Drive); checkpoints, saved policies
# and exported artifacts are written below this folder
WORKSPACE_DIR = "/content/gdrive/MyDrive/RLF002/vse-004-from-basestrat"

# environment parameters
# set race (see racesim/input/parameters for possible races)
race = "Shanghai_2019"
race_pars_file = f"/content/racesim/input/parameters/pars_{race}.ini"
mcs_pars_file = "/content/racesim/input/parameters/pars_mcs.ini"
# VSE type for other drivers: 'basestrategy', 'realstrategy', 'supervised', 'reinforcement' (if already available),
# 'multi_agent' (if VSE should learn for all drivers at once)
vse_others = "realstrategy"

# hyperparameters
# (num_iterations is defined exactly once, under "training options" below)
replay_buffer_max_length = 200_000
initial_collect_steps = 200
collect_steps_per_iteration = 1

fc_layer_params = (64, 64,)
batch_size = 64
learning_rate = 1e-3
gamma = 1.0  # discount rate
n_step_update = 1
target_update_period = 1
dueling_q_net = False

# training options (the trailing comments are the values of a full-length run)
num_iterations      = 5_000   # 100_000
log_interval        = 5_000   # 100_000
eval_interval       = 100     # 50_000
checkpoint_interval = 5_000
num_eval_episodes   = 10

# postprocessing (currently not implemented for multi-agent environment)
calculate_final_positions = False  # activate or deactivate evaluation after training
num_races_postproc = 1000  # 10_000
# VSE type for other drivers: 'basestrategy', 'realstrategy', 'supervised', 'reinforcement' (if already available)
vse_others_postproc = "realstrategy"

# paths of the pre-trained VSE models and preprocessors handed to the race simulation
vse_paths = {
    # "reinf_nnmodel": "/content/gdrive/MyDrive/RLF000/vse-003-from-reinforcement/exports/2023-05-24-final/nn_reinforcement_Shanghai_2019.tflite",
    # "reinf_preprocessor": "/content/gdrive/MyDrive/RLF000/vse-003-from-realstrat/exports/2023-05-24-final/preprocessor_reinforcement_Shanghai_2019.pkl",
    "supervised_nnmodel_cc": "/content/racesim/input/vse/nn_supervised_compoundchoice.tflite",
    "supervised_nnmodel_tc": "/content/racesim/input/vse/nn_supervised_tirechange.tflite",
    "supervised_preprocessor_cc": "/content/racesim/input/vse/preprocessor_supervised_compoundchoice.pkl",
    "supervised_preprocessor_tc": "/content/racesim/input/vse/preprocessor_supervised_tirechange.pkl"
}

Mount Google Drive.

In [3]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; WORKSPACE_DIR lives below this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


Check Colab settings.

In [4]:
from psutil import virtual_memory

gpu_info = !nvidia-smi
gpu_info = "\n".join(gpu_info)
if gpu_info.find("failed") >= 0:
  print("Not connected to a GPU")
else:
  print(gpu_info)

ram_gb = virtual_memory().total / 1e9
print(f"Your runtime has {ram_gb:.1f} gigabytes of available RAM\n")

if ram_gb < 20:
  print("Not using a high-RAM runtime")
else:
  print("You are using a high-RAM runtime!")

/bin/bash: nvidia-smi: command not found
Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


## Installation

Install code repository

In [5]:
# Remove any previous clone so that re-running this cell does not fail on an existing directory.
!rm -rf msca-race-simulation/
# Shallow clone (latest commit only) of the race-simulation repository.
!git clone --depth 1 https://github.com/pezon/msca-race-simulation 
# Copy the repository contents into the current working directory
# (presumably so the project packages imported further below resolve -- TODO confirm).
!cp -R msca-race-simulation/* .

Cloning into 'msca-race-simulation'...
remote: Enumerating objects: 365, done.[K
remote: Counting objects: 100% (365/365), done.[K
remote: Compressing objects: 100% (304/304), done.[K
remote: Total 365 (delta 175), reused 209 (delta 52), pack-reused 0[K
Receiving objects: 100% (365/365), 4.14 MiB | 18.85 MiB/s, done.
Resolving deltas: 100% (175/175), done.


Install dependencies.

In [6]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-agents (from -r requirements.txt (line 15))
  Downloading tf_agents-0.16.0-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting gym<=0.23.0,>=0.17.0 (from tf-agents->-r requirements.txt (line 15))
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 kB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pygame==2.1.3 (from tf-agents->-r requirements.txt (line 15))
  Downloading pygame-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0

## Initialization

Import libraries

In [7]:
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.metrics.tf_metrics import AverageReturnMetric
from tf_agents.networks.q_network import QNetwork
from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer \
  import TFUniformReplayBuffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tqdm import trange

from helper_funcs.src.io import save_preprocessor, save_policy_tflite
from machine_learning_rl_training.src.rl_environment_multi_agent \
  import RaceSimulation as MultiAgentRaceSimulation
from machine_learning_rl_training.src.rl_environment_single_agent \
  import RaceSimulation as SingleAgentRaceSimulation
from racesim.src.import_pars import import_pars
 
# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Workspace layout: one sub-folder each for checkpoints, saved policies and exports.
workspace_dir = Path(WORKSPACE_DIR)
checkpoint_dir, policy_dir, export_dir = (
    workspace_dir / subfolder for subfolder in ("checkpoint", "policy", "exports"))

# Make sure every output folder exists (including missing parents).
for output_dir in (checkpoint_dir, policy_dir, export_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Date stamp (YYYY-MM-DD) used to name the export folders.
today = f"{datetime.today():%Y-%m-%d}"

### Check parameters 
Check training input.

In [8]:
# Display the resolved race parameter file path as a sanity check of the settings.
race_pars_file

'/content/racesim/input/parameters/pars_Shanghai_2019.ini'

In [11]:
# Post-training evaluation is not available for the multi-agent environment,
# so the flag is switched off (with a warning) instead of failing later.
if vse_others == "multi_agent" and calculate_final_positions:
    print("WARNING: Evaluation of trained strategy is currently not implemented for the multi-agent environment!"
          " Setting calculate_final_positions = False!")
    calculate_final_positions = False

# ----------------------------------------------------------------------------------------------------------------------
# CHECK FOR WET RACE ---------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

# load parameter file (only the first element of the returned sequence is needed here)
pars_in = import_pars(
    use_print=False,
    use_vse=False,
    race_pars_file=race_pars_file,
    mcs_pars_file=mcs_pars_file)[0]

# loop through drivers and check for intermediate or wet tire compounds in real race
# (the second entry of each strategy_info item is compared against "I" / "W")
for driver in pars_in["driver_pars"]:
    strategy_info = pars_in["driver_pars"][driver]["strategy_info"]
    if any(strat[1] in ("I", "W") for strat in strategy_info):
        raise RuntimeError(f"Cannot train for current race {race} because it was a (partly) wet race!")

### Setup environment

In [12]:
def _make_race_env():
    """Build one race-simulation environment from the notebook settings.

    Returns a multi-agent environment when ``vse_others == "multi_agent"``,
    otherwise a single-agent environment in which the other drivers use the
    ``vse_others`` strategy. All remaining arguments are identical for both.
    """
    shared_kwargs = dict(
        race_pars_file=race_pars_file,
        mcs_pars_file=mcs_pars_file,
        use_prob_infl=True,
        create_rand_events=True,
        vse_paths=vse_paths)
    if vse_others == "multi_agent":
        return MultiAgentRaceSimulation(**shared_kwargs)
    return SingleAgentRaceSimulation(vse_type=vse_others, **shared_kwargs)


# Separate (but identically configured) environments for training and evaluation.
train_py_env = _make_race_env()
eval_py_env = _make_race_env()

train_tf_env = TFPyEnvironment(environment=train_py_env)
eval_tf_env = TFPyEnvironment(environment=eval_py_env)

print(f"INFO: Race: {race}, strategy of other drivers: {vse_others}")
# `batched` is read as an attribute here, so it must not be called in the message below
# (calling it would raise a TypeError whenever it is True).
if train_py_env.batched:
    print(f"INFO: Batched environment: {train_py_env.batched}, batch size: {train_py_env.batch_size}")
print(f"INFO: Observation spec: {train_py_env.time_step_spec().observation}")
print(f"INFO: Action spec: {train_py_env.action_spec()}")

INFO: Race: Shanghai_2019, strategy of other drivers: basestrategy
INFO: Observation spec: BoundedArraySpec(shape=(40,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=1.0)
INFO: Action spec: BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=3)


### Setup DQN Agent

In [14]:
# Q-network built from the training environment's observation/action specs,
# with fully connected hidden layers sized by `fc_layer_params`.
q_net = QNetwork(
    input_tensor_spec=train_tf_env.observation_spec(),
    action_spec=train_tf_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
# The global step is handed to the agent as its train-step counter (see below).
global_step = tf.compat.v1.train.get_or_create_global_step()

# Polynomial decay schedule from 1.0 down to 0.01 over `num_iterations` steps.
# NOTE(review): `boltzmann_fn` is not passed to `DqnAgent` below and is not referenced
# anywhere else in the visible notebook -- confirm whether it was meant to drive exploration.
boltzmann_fn = PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=num_iterations,
    end_learning_rate=0.01)

# DQN agent trained with an element-wise squared TD-error loss.
agent = DqnAgent(
    time_step_spec=train_tf_env.time_step_spec(),
    action_spec=train_tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=n_step_update,
    target_update_period=target_update_period,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=global_step)

agent.initialize()

### Setup policy

In [15]:
# Random policy over the training environment's action spec.
# NOTE(review): neither `random_policy` nor `eager_policy` is referenced again in the
# visible notebook (data collection uses `agent.collect_policy` directly) -- confirm whether they are still needed.
random_policy = RandomTFPolicy(
    time_step_spec=train_tf_env.time_step_spec(),
    action_spec=train_tf_env.action_spec())

# Eager wrapper around the agent's collect policy.
eager_policy = PyTFEagerPolicy(
    agent.collect_policy,
    use_tf_function=True)

### Collect data

We use a Driver to collect experience in an environment. To use a Driver, we specify an observer `replay_buffer.add_batch` that instructs the driver to add trajectory elements to the replay buffer when it receives a trajectory. 

Then we run the experience collecting loop using the driver.

Source: [DynamicStepDriver | TensorFlow Documentation](https://www.tensorflow.org/agents/api_docs/python/tf_agents/drivers/dynamic_step_driver/DynamicStepDriver)

In [16]:
# Replay buffer for the trajectories produced while collecting experience;
# its capacity is bounded by `replay_buffer_max_length`.
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_tf_env.batch_size,
    max_length=replay_buffer_max_length)

# Return metric updated by the collection driver (training environment).
avg_return_metric = AverageReturnMetric()

# Each `driver.run()` steps the training environment `collect_steps_per_iteration` times
# with the agent's collect policy and passes every trajectory to the observers.
driver = DynamicStepDriver(
    train_tf_env,
    agent.collect_policy,
    observers=[
        replay_buffer.add_batch,
        avg_return_metric,
    ],
    num_steps=collect_steps_per_iteration)

# Initial data collection:
# initial driver.run will reset the environment and initialize the policy
# NOTE(review): the warm-up uses `agent.collect_policy` (via `driver`), not a random policy -- confirm this is intended.
for _ in range(initial_collect_steps):
    final_time_step, policy_state = driver.run()
# Show the last time step / policy state and the average return collected so far.
print(final_time_step, policy_state)
print(avg_return_metric.result().numpy())

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 40), dtype=float32, numpy=
array([[0.64285713, 0.94736844, 0.        , 0.10714286, 1.        ,
        0.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]],
      dtype=float32)>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.20745972], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>}) ()
-61.376194


Reading data for a train step

After adding trajectory elements to the replay buffer, we can read batches of trajectories from the replay buffer to use as input for a train step.

In [17]:
# Dataset generates trajectories with shape [BxTx...] where
# T = n_step_update + 1.
# The window length is derived from `n_step_update` (instead of a hard-coded 2) so that
# the sampled trajectories stay consistent with the agent when n-step updates are enabled.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=n_step_update + 1
).prefetch(3)

# Iterator consumed by the training loop (one sampled batch per training step).
dataset_iterator = iter(dataset)

Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


### Setup checkpointing and saving

In [18]:
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.utils.common import Checkpointer

# Checkpointer covering the agent, its policy, the replay buffer and the global step;
# `max_to_keep=20` limits how many checkpoints are kept in `checkpoint_dir`.
train_checkpointer = Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=20,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step
)

# Saver used to export `agent.policy` (written to `policy_dir` later in the notebook).
policy_saver = PolicySaver(agent.policy)

If there is a checkpoint saved in the working directory, it will be restored.

In [19]:
# Note: the message is printed unconditionally, whether or not a checkpoint exists in the directory.
print(f"Restoring checkpoint: {checkpoint_dir}")
train_checkpointer.initialize_or_restore()
# Re-read the global step after the (possible) restore.
global_step = tf.compat.v1.train.get_global_step()

Restoring checkpoint: /content/gdrive/MyDrive/RLF002/vse-004-from-basestrat/checkpoint


### Evaluation metrics

The agent earns `+5` reward per `+1` position change and `-5` per `-1` position change.

In [20]:
# NOTE(review): `partial` is not used anywhere in the visible notebook.
from functools import partial
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# Return metric for evaluation runs (kept separate from the collection metric).
eval_avg_return_metric = AverageReturnMetric()

# Runs `num_eval_episodes` episodes of `agent.policy` on the evaluation
# environment per `run()` call and feeds the metric above.
eval_driver = DynamicEpisodeDriver(
    eval_tf_env,
    agent.policy,
    observers=[
        eval_avg_return_metric,
    ],
    num_episodes=num_eval_episodes)

In [21]:
from helper_funcs.src.io import evaluate_policy

# Quick check before training: run 3 episodes with `agent.policy`.
# The value returned by `evaluate_policy` is shown as the cell output.
evaluate_policy(
    eval_tf_env,
    eval_py_env,
    agent.policy,
    num_episodes=3
    # num_eval_episodes
)

race 1: driver = 4, lap = 1, action = 2
race 1: driver = 4, lap = 2, action = 0
race 1: driver = 4, lap = 3, action = 0
race 1: driver = 4, lap = 4, action = 0
race 1: driver = 4, lap = 5, action = 0
race 1: driver = 4, lap = 6, action = 0
race 1: driver = 4, lap = 7, action = 0
race 1: driver = 4, lap = 8, action = 0
race 1: driver = 4, lap = 9, action = 0
race 1: driver = 4, lap = 10, action = 0
race 1: driver = 4, lap = 11, action = 0
race 1: driver = 4, lap = 12, action = 0
race 1: driver = 4, lap = 13, action = 0
race 1: driver = 4, lap = 14, action = 0
race 1: driver = 4, lap = 15, action = 0
race 1: driver = 4, lap = 16, action = 0
race 1: driver = 4, lap = 17, action = 0
race 1: driver = 4, lap = 18, action = 0
race 1: driver = 4, lap = 19, action = 0
race 1: driver = 4, lap = 20, action = 0
race 1: driver = 4, lap = 21, action = 0
race 1: driver = 4, lap = 22, action = 0
race 1: driver = 4, lap = 23, action = 0
race 1: driver = 4, lap = 24, action = 0
race 1: driver = 4, lap =

-8.425848180040097

In [22]:
# Same evaluation over `num_eval_episodes` episodes, with per-lap decision printing switched off.
evaluate_policy(
    eval_tf_env,
    eval_py_env,
    agent.policy,
    num_episodes=num_eval_episodes,
    print_lap_decisions=False
)

-6.033732782646257

## Train the agent

Two things must happen during the training loop:

1. collect data from the environment
2. use that data to train the agent's neural network

Periodically, we evaluate the policy and print the current score.

In [19]:
%%time

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# reset training step
agent.train_step_counter.assign(0)

# evaluate the agent's policy once before training
eval_tf_env.reset()
eval_driver.run()
rewards = [eval_avg_return_metric.result()]
results = [evaluate_policy(
    eval_tf_env,
    eval_py_env,
    agent.policy,
    num_episodes=3
    # num_eval_episodes
)]

# reset the environment
time_step = train_tf_env.reset()

for _ in (pbar := trange(num_iterations)):
    # Collect a few steps using collect_policy and save to the replay buffer.
    time_step, policy_state = driver.run()

    # Sample a batch of data from the buffer and update the agent's network.
    experience, _ = next(dataset_iterator)
    train_loss = agent.train(experience).loss
    step = int(agent.train_step_counter.numpy())

    # Update progress bar status
    if step % log_interval == 0:
        pbar.set_description(f"{step=}, {train_loss=:.3f}")

    # Evaluate
    if step % eval_interval == 0:
        pbar.set_description(f"Evaluating. {step=}")
        eval_tf_env.reset()
        eval_driver.run()
        rewards.append(eval_avg_return_metric.result())
        results.append(evaluate_policy(
            eval_tf_env,
            eval_py_env,
            agent.policy,
            num_episodes=3
            # num_eval_episodes
        ))
        pbar.set_description(
            f"{step=}, average return={rewards[-1]:.3f}")

    # Checkpoint / save models
    if step % checkpoint_interval == 0:
        train_checkpointer.save(global_step)
        policy_saver.save(policy_dir)
        save_preprocessor(train_py_env, export_dir / f"{today}-{step}", race=race)
        save_policy_tflite(policy_dir, export_dir / f"{today}-{step}", race=race)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
step=5000, average return=-7.486: 100%|██████████| 5000/5000 [04:58<00:00, 16.73it/s]

CPU times: user 5min 55s, sys: 3.66 s, total: 5min 59s
Wall time: 6min 35s





## Evaluate the agent

Rewards at evaluation points:

In [20]:
# Average returns recorded before training and at every evaluation point of the training loop.
rewards

[<tf.Tensor: shape=(), dtype=float32, numpy=-10.646892>,
 <tf.Tensor: shape=(), dtype=float32, numpy=-7.485831>]

Run a few episodes using the learned agent policy (equivalent to the `eval_driver` metrics).

In [25]:
from helper_funcs.src.io import evaluate_policy

# Run 3 episodes with the trained `agent.policy`; the value returned by
# `evaluate_policy` is shown as the cell output.
evaluate_policy(
    eval_tf_env,
    eval_py_env,
    agent.policy,
    num_episodes=3
    # num_eval_episodes
)

race 1: driver = 9, lap = 1, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 2, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 3, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 4, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 5, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 6, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), info=())
race 1: driver = 9, lap = 7, action = PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, state=(), i

-8.930442159723802

More precise estimate.

In [26]:
# Evaluation over `num_eval_episodes` episodes with per-lap decision printing switched off.
# NOTE(review): the saved output of this cell is a TypeError; its cause is not visible
# in this notebook -- re-run and check against the installed `evaluate_policy` helper.
evaluate_policy(
    eval_tf_env,
    eval_py_env,
    agent.policy,
    num_episodes=num_eval_episodes,
    print_lap_decisions=False,
)

TypeError: ignored

## Save policy

Checkpoint model at the end of training

In [48]:
# Write a final checkpoint (agent, policy, replay buffer, global step) to `checkpoint_dir`.
train_checkpointer.save(global_step)
print(f"Saved checkpoint: {checkpoint_dir}")

Saved checkpoint: /content/gdrive/MyDrive/RLF002/vse-004-from-basestrat/checkpoint


Save preprocessor

In [49]:
# Export the preprocessor of the training environment into the dated "-final" export folder;
# the returned path is kept for reference and printed.
saved_preprocessor = save_preprocessor(train_py_env, export_dir / f"{today}-final", race=race)
print(f"{saved_preprocessor=}")

saved_preprocessor=PosixPath('/content/gdrive/MyDrive/RLF002/vse-004-from-basestrat/exports/2023-05-22-final/preprocessor_reinforcement_Shanghai_2019.pkl')


Save the policy

Converts the Q-network to TFLite. See [TensorFlow Lite converter](https://www.tensorflow.org/lite/convert) for more details.

In [23]:
# Save the policy to `policy_dir` first; the TFLite export below is given that directory as its input.
policy_saver.save(policy_dir)
print(f"Saved policy: {policy_dir}")

# `saved_tflite` (path of the exported model) is reused by the TFLite evaluation cells further down.
saved_tflite = save_policy_tflite(policy_dir, export_dir / f"{today}-final", race=race)
print(f"{saved_tflite=}")



Saved policy: /content/gdrive/MyDrive/RLF002/vse-004-from-basestrat/policy
saved_tflite=PosixPath('/content/gdrive/MyDrive/RLF002/vse-004-from-basestrat/exports/2023-05-22-final/nn_reinforcement_Shanghai_2019.tflite')


Download the checkpoint and policy zip files.

In [None]:
# download_archive(exported_checkpoint)
# download_archive(exported_policy)

At this point, you can either (1) continue training iterations, or (2) generate an artifact to check the performance of the loaded policy, or (3) save the policy.

When you save the policy and restore it, you cannot continue with the training, but you can deploy the model.

## Evaluate TFLite model

Create environment and run a few races.

In [25]:
from machine_learning_rl_training.src.rl_evaluate_policy import print_returns_positions

# Fresh single-agent environment for postprocessing. The other drivers use the
# postprocessing strategy (`vse_others_postproc`) -- the same value that is passed to
# `print_returns_positions` below -- so the environment and the reported opponent
# strategy cannot diverge from each other (or pick up the training-only 'multi_agent' value).
py_env = SingleAgentRaceSimulation(
    race_pars_file=race_pars_file,
    mcs_pars_file=mcs_pars_file,
    vse_type=vse_others_postproc,
    use_prob_infl=True,
    create_rand_events=True,
    vse_paths=vse_paths)


# Short run (3 races) of the exported TFLite policy with per-lap decisions printed.
print_returns_positions(
    py_env=py_env,
    num_races=3,
    tf_lite_path=str(saved_tflite),
    vse_others=vse_others_postproc,
    print_lap_decisions=True,
)

INFO: Evaluating reinforcement VSE by average returns and positions over 3 races against basestrategy VSE...
race 1: driver = 3, lap = 1, action = 0
race 1: driver = 3, lap = 2, action = 0
race 1: driver = 3, lap = 3, action = 0
race 1: driver = 3, lap = 4, action = 0
race 1: driver = 3, lap = 5, action = 0
race 1: driver = 3, lap = 6, action = 0
race 1: driver = 3, lap = 7, action = 0
race 1: driver = 3, lap = 8, action = 0
race 1: driver = 3, lap = 9, action = 0
race 1: driver = 3, lap = 10, action = 0
race 1: driver = 3, lap = 11, action = 0
race 1: driver = 3, lap = 12, action = 0
race 1: driver = 3, lap = 13, action = 0
race 1: driver = 3, lap = 14, action = 0
race 1: driver = 3, lap = 15, action = 0
race 1: driver = 3, lap = 16, action = 0
race 1: driver = 3, lap = 17, action = 0
race 1: driver = 3, lap = 18, action = 0
race 1: driver = 3, lap = 19, action = 1
race 1: driver = 3, lap = 20, action = 0
race 1: driver = 3, lap = 21, action = 0
race 1: driver = 3, lap = 22, action = 

Get a more precise estimate

In [28]:
# Number of races used for the longer postprocessing evaluation below.
num_races_postproc

10000

In [29]:
# Evaluation of the exported TFLite policy over `num_races_postproc` races,
# with per-lap decision printing switched off.
print_returns_positions(
    py_env=py_env,
    num_races=num_races_postproc,
    tf_lite_path=str(saved_tflite),
    vse_others=vse_others_postproc,
    print_lap_decisions=False,
)

INFO: Evaluating reinforcement VSE by average returns and positions over 100 races against basestrategy VSE...
INFO: Progress: |██████████████████████████████████████████████████| 100.0% 
RESULT: Average return (total): -4.210 (FCY: -3.736, no FCY: -5.560), average position (total): 9.6 (FCY: 9.2, no FCY: 10.8), FCY races: 74, no FCY races: 26
