# Reinforcement Learning- and FEM-based Inverse Design

## Experiment Logger

In [1]:
import os
import neptune.new as neptune
from neptune.new.types import File
from neptune.new.utils import stringify_unsupported

# Neptune-related environment variables: target project plus the id/path of this notebook.
os.environ.update({
    'NEPTUNE_PROJECT': "pil-clemson/metamtl-rl-test",
    'NEPTUNE_NOTEBOOK_ID': "45d03d69-6ac7-41ca-8af8-80caaa73aad5",
    'NEPTUNE_NOTEBOOK_PATH': "metamaterial-rl/RemoteFEM-DQN.ipynb",
})

# Global experiment-logger handle; it is still None after this cell runs.
exp = None

In [2]:
# Repeat counter for this experiment — presumably consumed by the run-setup cells further down; TODO confirm.
experiment_repeat = 1

In [3]:
# Labels describing this run — presumably attached to the logged experiment when it is created; TODO confirm.
tags = ['Cloak', 'DeltaT-Only', 'Dev']

## Import

In [4]:
from __future__ import annotations
from typing import Union, Optional, Callable, Any
from typing import Tuple, List, Set, Dict
from typing import NamedTuple
from typing import Generator

In [5]:
from collections import defaultdict, deque
from types import SimpleNamespace
import queue
from queue import PriorityQueue
from enum import Enum

In [6]:
from dataclasses import dataclass, field

In [7]:
import traceback
import tracemalloc

In [8]:
import ipywidgets as widgets
from IPython.display import clear_output

In [9]:
import os
import sys
import copy
import time
from datetime import datetime, timedelta
from pprint import pformat, pprint
import multiprocessing
import random
import math
import itertools
import uuid

In [10]:
from tqdm.notebook import trange, tqdm

In [11]:
import matplotlib.pyplot as plt

In [12]:
import plotly.express as px

In [13]:
import torch
from torch import nn

from torch import Tensor, BoolTensor

from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer

from torchvision.transforms import PILToTensor

# Keep the PyTorch version in the notebook output so a saved run records what it was executed with.
print('PyTorch version:', torch.__version__)

PyTorch version: 1.13.0


In [14]:
import torchinfo

In [15]:
import numpy as np

In [16]:
from skimage.measure import block_reduce

In [17]:
from SimHubClient import SimHubClient

In [18]:
# Use float64 as the default floating-point dtype for tensors and modules created from here on.
torch.set_default_dtype(torch.double)

## Computing Devices

In [19]:
# Number of logical CPUs reported by the operating system.
print('CPU Cores:', multiprocessing.cpu_count())

CPU Cores: 56


In [20]:
# Total physical memory = page size * number of physical pages, queried through os.sysconf
# (Unix-only; os.popen is not involved).
mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
mem_gib = mem_bytes/(1024.**3)
print('Memory size:', int(mem_gib), 'GiB')

Memory size: 376 GiB


In [21]:
# Names of all CUDA devices visible to PyTorch; the list is empty when device_count() is 0.
available_gpus = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
print('GPUs:', available_gpus)

GPUs: ['Tesla V100S-PCIE-32GB', 'Tesla V100S-PCIE-32GB']


In [22]:
# NOTE(review): both branches of this conditional select the CPU, so the CUDA
# availability check has no effect and everything created with `device=cuda`
# lives on the CPU even when GPUs are present. If CPU-only execution is
# deliberate, this can be reduced to torch.device('cpu'); otherwise the first
# branch was presumably meant to be torch.device('cuda') — confirm before changing.
cuda = torch.device('cpu') if torch.cuda.is_available() else torch.device('cpu')
print('Current computing device:', cuda)

Current computing device: cpu


## DEBUG FLAG

In [23]:
# NOTE(review): this boolean is rebound by the `class DEBUG` defined in the next cell,
# so the value assigned here is never what later `DEBUG.<flag>` lookups see.
DEBUG = False

In [24]:
class DEBUG:
    """Namespace of class-level debug and logging switches, read as ``DEBUG.<flag>``.

    The class is never instantiated; attributes are read (and may be rebound)
    directly on the class. Flags without a comment below have no reader in the
    surrounding cells, so their exact effect should be confirmed where they
    are used.
    """
    result_generation=True
    result_visualization=False

    transition_log=True
    transition_log_buffer=''
    transition_log_buffer_gen=''

    action_log=False
    prediction_log=False
    state_log=True  # FEMReward logs the per-state simulation values when set
    epsilon_log=True
    reward_log=True  # FEMReward logs the final reward when set (and not in generation mode)
    state_visualization=False
    state_target_diff_visualization=False

    visualization_sampling_rate=.000

    optimizer_sample_log=False

    start_from_goal=False

    trace_memory=False  # when set, the following cell starts tracemalloc

    in_generation_mode=False  # FEMReward switches state-value logging to the '*_gen_*' series and skips reward logging

    transition_checked=None

In [25]:
if DEBUG.trace_memory:
    # PYTHONTRACEMALLOC is only read at interpreter start-up, so setting it here
    # cannot change the frame depth of the already-running kernel; it is kept so
    # that child processes inherit the setting.
    os.environ['PYTHONTRACEMALLOC'] = '3'
    # Pass the frame depth explicitly; start() would otherwise default to 1 frame.
    tracemalloc.start(3)

## Helper Functions

In [26]:
def clip(x, l, u):
    """Clamp ``x`` to the closed interval [``l``, ``u``].

    Returns ``l`` when ``x`` is below it, ``u`` when ``x`` is above it, and
    ``x`` itself otherwise. The lower bound is checked first.
    """
    if x < l:
        return l
    if x > u:
        return u
    return x

## Hyperparameters

In [27]:
# Settings describing the simulated geometry and the shape of the simulator output.
environment_config = {
    'rings': 3,  # stored on HarvRingEnvironment as `self.rings`

    'result_size': (400, 400),  # 2-D shape the simulated temperature field is reshaped to in FEMReward
    'result_range': (293.15, 353.15),  # presumably the temperature range of the result, in kelvin — TODO confirm
}

# Training and reward settings shared by Agent and FEMReward.
hyperparameters = {
    # --- DQN settings, read by Agent.__init__ ---
    'target_update_interval': 10,
    'optimization_iterations': 1,
    'experience_replay_capacity': 10000,  # maxlen of the replay-memory deque
    'replay_batch_size': 32,
    'lr': .001,  # NOTE(review): confirm this value is actually passed to the optimizer built in QNet
    'discount_factor': .9,
    'epsilon_initial': 1.,
    'epsilon_minimal': .1,
    'epsilon_halflife': 2000,
    'epsilon_boost_preterminal': .3,
    'epsilon_generate': .1,

    'max_episode': 300,
    'max_step_per_episode': 500,  # read by both Agent and FEMReward

    # --- Reward settings, read by FEMReward ---
    'goal_reward': 10000.,  # reward assigned when a state satisfies both terminal thresholds
    'terminal_threshold': [.1, .5],  # upper bounds for the two state values [temperature-difference ratio, deviation from reference]
    'invalid_state_penalty': 0.,  # reward used when the state has no simulation value
    'failed_episode_penalty': -10.,  # reward used when an episode ends through an alternative terminal condition

    # Early stop: from step `min_step` on, count steps whose two state values both exceed
    # `threshold`; the episode is ended once `step_count` such steps occur in a row.
    'low_value_earlier_stop_min_step': 150,
    'low_value_earlier_stop_threshold': [0.3, 1.5],
    'low_value_earlier_stop_step_count': 10,


    'reward_func': ['-logx-0.5', '-logx'],  # keys into `reward_funcs`, one per state value (same order as above)

}




## Reinforcement Learning Environment

## Interfaces and Dataclasses

In [28]:
class State(dict):
    """Dictionary-backed description of an environment state.

    Subclasses fill in the mapping and implement :meth:`to_tensor`.
    """

    def __init__(self) -> None:
        # A state always starts out as an empty mapping.
        super().__init__()

    def step(self, action: 'Action') -> 'State':
        """Apply ``action`` to a deep copy of this state and return the result.

        The receiver itself is left untouched.
        """
        successor = copy.deepcopy(self)
        return action(successor)

    def to_tensor(self) -> Tensor:
        """Encode the state as a tensor; must be implemented by subclasses."""
        raise NotImplementedError

In [29]:
class Action:
    """A named, callable state transformation.

    ``name`` is used as the printable representation; ``action`` is the
    callable that receives a state and returns the resulting state.
    """

    def __init__(self, name: str, action: Callable[[State], State]) -> None:
        self.name, self.action = name, action

    def __repr__(self) -> str:
        # The representation is just the action's name.
        return self.name

    def __call__(self, state: State) -> State:
        """Run the wrapped callable on ``state`` and return whatever it returns."""
        transform = self.action
        return transform(state)
# Action = Callable[[State], State]

In [30]:
class Environment:
    """Base class for environments: holds the current state and the list of actions.

    Subclasses populate ``_action_space`` and implement :meth:`reset` and
    :meth:`step`.
    """

    def __init__(self) -> None:
        self._state: State = None
        self._action_space: List[Action] = []
        self._valid_actions: BoolTensor = None

    def __repr__(self) -> str:
        return f'''{self.__class__.__name__}(
    Action space size: {self.action_count()}
    Current state: {self.state}
)'''

    @property
    def state(self) -> State:
        """The current state (``None`` until a subclass sets it)."""
        return self._state

    @property
    def action_space(self) -> List[Action]:
        """The list of available actions."""
        return self._action_space

    def action_count(self) -> int:
        """Number of actions in the action space."""
        return len(self._action_space)

    def reset(self) -> None:
        """Put the environment back into its initial state; implemented by subclasses."""
        raise NotImplementedError

    def step(self, action_index: int) -> None:
        """Apply the action at ``action_index``; implemented by subclasses."""
        raise NotImplementedError


In [31]:
class ReplayTransition(NamedTuple):
    """One experience record as stored in ReplayMemory (built from the arguments of ``push``)."""
    state: State  # state before the action
    action_index: int  # index of the action in the environment's action space
    reward: float  # reward obtained for the transition
    next_state: State  # state after the action
    note: str  # free-form annotation; its use is not visible here — TODO confirm

In [32]:
@dataclass
class SimulationTransition:
    """A single environment transition together with its simulation and reward data.

    Only the stepping fields are required at construction time; the remaining
    fields default to ``None`` and are filled in later (by the FEM simulator
    and by the reward function).
    """
    # From stepping
    episode: int
    step: int

    state: State
    action_index: int
    next_state: State


    action_name: Optional[str] = None
    action_type: Optional[str] = None

    # From FEM simulator
    state_id: Optional[str] = None
    next_state_id: Optional[str] = None

    state_sim: Optional[Dict[str, Any]] = None
    next_state_sim: Optional[Dict[str, Any]] = None

    # From reward function
    # (FEMReward stores a two-element list here, hence the Union)
    state_sim_value: Union[float, List[float], None] = None
    is_state_terminal: Optional[bool] = None
    state_terminal_type: Optional[int] = None

    next_state_sim_value: Union[float, List[float], None] = None
    is_next_state_terminal: Optional[bool] = None
    next_state_terminal_type: Optional[int] = None

    reward: Optional[float] = None

    def __repr__(self) -> str:
        """Return a one-line ``state ==action==> next_state`` summary for logs."""
        # A loop can only be claimed once the simulator has assigned ids;
        # two unset (None) ids must not be reported as a loop.
        is_loop = self.state_id is not None and self.state_id == self.next_state_id
        return (
            f'[{self.episode}-{self.step}] '
            f'{self.state}({self.state_sim_value}, {self.is_state_terminal})'
            f' =={self.action_name}({self.action_type})==> '
            # The right-hand side reports the *next* state's value and terminal flag.
            f'{self.next_state}({self.next_state_sim_value}, {self.is_next_state_terminal})'
            f'  R:{self.reward} {"Loop!" if is_loop else ""}'
        )

In [33]:
# A reward function receives a transition and returns a transition (FEMReward returns the same object, updated in place).
RewardFunc = Callable[[SimulationTransition], SimulationTransition]

## State and Environment

In [34]:
class HarvestRingState(State):
    """State made of two parameter lists: ``'r'`` (5 values) and ``'k'`` (6 values)."""

    def __init__(self) -> None:
        super().__init__()
        # Every state starts from this same fixed configuration; nothing is randomised here.
        self.update(
            r=[50, 60., 70., 80., 100.],
            k=[0., 30., 30., 30., 30., 10.],
        )

    def to_tensor(self) -> torch.Tensor:
        """Return the ``'r'`` values followed by the ``'k'`` values as one double tensor."""
        pieces = [torch.tensor(self[key]) for key in ('r', 'k')]
        return torch.cat(pieces).double()

In [35]:
# Quick check of the tensor encoding: the 5 'r' values followed by the 6 'k' values.
HarvestRingState().to_tensor()

tensor([ 50.,  60.,  70.,  80., 100.,   0.,  30.,  30.,  30.,  30.,  10.])

In [36]:
class HarvRingEnvironment(Environment):
    """Environment whose actions nudge single entries of a HarvestRingState.

    The action space holds 14 actions, in this order: for rings 1-3 a +5 and a
    -5 change of ``'r'`` (clipped to [55, 95]), then for rings 1-4 a +5 and a
    -5 change of ``'k'`` (clipped to [0, 60]).
    """

    def __init__(self) -> None:
        super().__init__()

        self.rings = environment_config['rings']

        self.reset()

        def adjust_ring_r(ring: int, r_mod: float, lower_bound: float, upper_bound: float):
            """Build the action that shifts ``state['r'][ring]`` by ``r_mod``."""
            def action(state: State):
                radii = state['r']
                previous = radii[ring]
                candidate = clip(previous + r_mod, lower_bound, upper_bound)

                # A move that would land exactly on a neighbouring entry is rejected.
                blocked = candidate == radii[ring - 1] or candidate == radii[ring + 1]
                radii[ring] = previous if blocked else candidate
                return state
            return Action(f'{ring}:r{r_mod}', action)

        def adjust_ring_k(ring, k_mod: float, lower_bound: float, upper_bound: float):
            """Build the action that shifts ``state['k'][ring]`` by ``k_mod``."""
            def action(state: State):
                values = state['k']
                values[ring] = clip(values[ring] + k_mod, lower_bound, upper_bound)
                return state
            return Action(f'{ring}:k{k_mod}', action)

        # Increase first, then decrease, for every ring in turn.
        for ring in (1, 2, 3):
            for delta in (+5., -5.):
                self._action_space.append(adjust_ring_r(ring, delta, 55., 95.))

        for ring in (1, 2, 3, 4):
            for delta in (+5., -5.):
                self._action_space.append(adjust_ring_k(ring, delta, 0., 60.))

        # TBD: adjust k of the board (region 3)

    def reset(self) -> None:
        """Replace the current state with a fresh HarvestRingState."""
        self._state = HarvestRingState()

    def step(self, action_index: int) -> None:
        """Apply the action at ``action_index`` and store the resulting state."""
        chosen = self._action_space[action_index]
        self._state = self._state.step(chosen)

## DQN

### FEM-based Reward & Terminal Function

In [37]:
def badloe(reward_thresholds: List[float]):
    """Build a piecewise reward function from three thresholds ``(low, mid, high)``.

    The returned function maps ``eta`` to ``-10`` below ``low``, to ``10000``
    above ``high``, and to ``(eta / mid) ** 9 - 1`` in between.
    """
    low, mid, high = reward_thresholds

    def func(eta):
        if eta < low:
            return -10
        return 10000 if eta > high else (eta / mid) ** 9 - 1

    return func

# Registry of reward-shaping functions, looked up by the names listed in
# hyperparameters['reward_func'].
reward_funcs = {
    # identity
    'linear': lambda value: value,
    # negative natural logarithm: positive for values below 1, negative above
    '-logx': lambda value: -np.log(value),
    # same curve, shifted down by 0.5
    '-logx-0.5': lambda value: -np.log(value) - 0.5,
    # 'badloe': badloe(hyperparameters['reward_thresholds']),
}

class FEMReward():
    """Turn FEM simulation results into state values, rewards and terminal flags.

    Every finished simulation is scored with two values:

    * index 0 -- the absolute temperature difference between the two sample
      points located ``state['r'][0]`` cells before and after the centre along
      the second axis of the temperature field, divided by ``0.3 * r[0]``;
    * index 1 -- the mean absolute deviation of the whole temperature field
      from the reference field stored in ``ref400.npy``.

    Instances are stateful: they count consecutive low-value steps within the
    current episode, and they write metrics to the module-level ``exp`` logger.
    """

    def __init__(self,
                 hyperparameters: Dict[str, Any]) -> None:
        # One shaping function per state value, in the same order as the values.
        self.reward_func = [reward_funcs[func] for func in hyperparameters['reward_func']]

        self.max_step_per_episode = hyperparameters['max_step_per_episode']

        self.goal_reward = hyperparameters['goal_reward']
        self.terminal_threshold = hyperparameters['terminal_threshold']
        self.invalid_state_penalty = hyperparameters['invalid_state_penalty']
        self.failed_episode_penalty = hyperparameters['failed_episode_penalty']

        self.current_episode = 0

        self.low_value_min_step = hyperparameters['low_value_earlier_stop_min_step']
        self.low_value_threshold = hyperparameters['low_value_earlier_stop_threshold']
        self.low_value_step_threshold = hyperparameters['low_value_earlier_stop_step_count']
        self.low_value_step_count = 0

        # Reference temperature field; loaded from disk on first use.
        self._t_dist_ref = None

    def _reference_distribution(self):
        """Return the reference field, reading 'ref400.npy' only the first time."""
        if getattr(self, '_t_dist_ref', None) is None:
            self._t_dist_ref = np.load('ref400.npy')
        return self._t_dist_ref

    def _evaluate_simulation(self, sim: Dict[str, Any], cloaked_radius):
        """Score one finished simulation; return ``([value_0, value_1], is_terminal)``."""
        size = environment_config['result_size']
        center = (int(size[0] / 2), int(size[1] / 2))
        # Array indices must be integers even if the radius is stored as a float.
        offset = int(round(cloaked_radius))
        t_delta_ref = .3 * cloaked_radius

        # NOTE(review): element [2] of 'temperature_distribution' is taken as the
        # temperature values — confirm against the simulator's output layout.
        t_dist = sim['output']['temperature_distribution'][2].reshape(size)
        t_a = t_dist[center[0], center[1] - offset]
        t_b = t_dist[center[0], center[1] + offset]
        t_delta = np.abs(t_a - t_b)

        # Normalised by the full pixel count. NOTE(review): the original code built
        # a circular mask of radius r[0] but only ever used its total size; if the
        # deviation was meant to be restricted to that region, this needs revisiting.
        t_neutrality = np.sum(np.abs(t_dist - self._reference_distribution())) / (size[0] * size[1])

        value = [t_delta / t_delta_ref, t_neutrality]
        # Terminal only when BOTH values are within their own thresholds.
        is_terminal = bool(value[0] <= self.terminal_threshold[0]
                           and value[1] <= self.terminal_threshold[1])
        return value, is_terminal

    def __call__(self, transition: SimulationTransition) -> SimulationTransition:
        """
        Score a transition in place and decide whether its state is terminal.

        Parameters
        ----------
        transition : SimulationTransition
            A transition whose ``state_sim`` / ``next_state_sim`` hold simulator
            results; only results with ``status == 'done'`` are scored.

        Returns
        -------
        SimulationTransition
            The same object, with ``state_sim_value``, ``is_state_terminal``,
            ``state_terminal_type``, ``next_state_sim_value``,
            ``is_next_state_terminal`` and ``reward`` filled in where they
            could be determined.
        """

        # The low-value counter is tracked per episode.
        if transition.episode != self.current_episode:
            self.low_value_step_count = 0
            self.current_episode = transition.episode

        # The radius of the current state is used for scoring both states.
        cloaked_radius = transition.state['r'][0]

        if transition.state_sim and transition.state_sim['status'] == 'done':
            transition.state_sim_value, transition.is_state_terminal = \
                self._evaluate_simulation(transition.state_sim, cloaked_radius)

        if transition.next_state_sim and transition.next_state_sim['status'] == 'done':
            # Uses the next state's own values for its terminal flag.
            transition.next_state_sim_value, transition.is_next_state_terminal = \
                self._evaluate_simulation(transition.next_state_sim, cloaked_radius)

        if transition.state_sim_value:
            reward_dT = self.reward_func[0](transition.state_sim_value[0])
            reward_Mv = self.reward_func[1](transition.state_sim_value[1])
            transition.reward = reward_dT + reward_Mv
            exp['reward_dT'].append(reward_dT)
            exp['reward_Mv'].append(reward_Mv)
            exp['reward_total'].append(transition.reward)
            exp[f'reward_total_episode/{transition.episode}'].append(transition.reward)
        else:
            transition.reward = self.invalid_state_penalty

        # Reward only for primary terminal condition
        if transition.is_state_terminal:
            transition.reward = self.goal_reward
            transition.state_terminal_type = 0  # 0 for success episode
        else:
            # Determine alternative terminal
            # Alt-term 1 (after X step): compares the step, not the episode, with the step budget
            if transition.step >= self.max_step_per_episode:
                transition.is_state_terminal = True
                transition.state_terminal_type = 1
                transition.reward = self.failed_episode_penalty
            # Alt-term 2 (trapped in low-val area); skipped when the state has no
            # simulation value, leaving the counter unchanged
            elif transition.step >= self.low_value_min_step and transition.state_sim_value:
                if transition.state_sim_value[0] > self.low_value_threshold[0] \
                    and transition.state_sim_value[1] > self.low_value_threshold[1]:
                    self.low_value_step_count += 1
                    exp['low_val'].append(f'E{transition.episode} Low value step {self.low_value_step_count} {transition.state_sim_value}')
                    if self.low_value_step_count >= self.low_value_step_threshold:
                        transition.is_state_terminal = True
                        transition.state_terminal_type = 2
                        transition.reward = self.failed_episode_penalty
                        exp['low_val'].append(f'E{transition.episode} Low value terminal {self.low_value_step_count}')
                else:
                    if self.low_value_step_count > 0:
                        exp['low_val'].append(f'E{transition.episode} Low value count reset')
                    self.low_value_step_count = 0


        # LOGGING
        if transition.state_sim and transition.state_sim['status'] == 'done':
            if DEBUG.state_log:
                if not DEBUG.in_generation_mode:
                    exp['state_values_dT'].append(transition.state_sim_value[0])
                    exp['state_values_Mv'].append(transition.state_sim_value[1])
                else:
                    exp['state_values_gen_dT'].append(transition.state_sim_value[0])
                    exp['state_values_gen_Mv'].append(transition.state_sim_value[1])

        if DEBUG.reward_log and not DEBUG.in_generation_mode:
            exp['reward'].append(transition.reward)

        return transition

### Network Definition

In [38]:
# Network Container
class Model():
    """Bundle of a network with the loss function and optimizer used to train it.

    Calling the container forwards the input to the wrapped network.
    """

    def __init__(self, network: nn.Module, loss_func: _Loss, optimizer: Optimizer):
        self.network, self.loss_func, self.optimizer = network, loss_func, optimizer

    def __call__(self, network_input: Tensor) -> Tensor:
        """Return the network's output for ``network_input``."""
        forward = self.network
        return forward(network_input)

In [39]:
def QNet(state_size: int = 11, action_number: int = 14, target_network: bool = False,
         lr: float = 0.001):
    """Build the Q-network (a 3-layer MLP) wrapped in a ``Model`` container.

    Parameters
    ----------
    state_size : int
        Width of the input layer; the default 11 matches the tensor produced by
        ``HarvestRingState.to_tensor()`` (5 'r' values + 6 'k' values).
    action_number : int
        Width of the output layer, one Q-value per action; the default 14
        matches the action space built by ``HarvRingEnvironment``.
    target_network : bool
        When True the container has no loss function and no optimizer.
    lr : float
        Learning rate handed to Adam for a trainable network. Defaults to the
        previously hard-coded 0.001 so existing calls behave the same; pass
        ``hyperparameters['lr']`` to make the config value take effect.

    Returns
    -------
    Model
        The network on device ``cuda`` in double precision, plus
        ``HuberLoss``/``Adam`` unless ``target_network`` is set.
    """
    net = nn.Sequential(
        nn.Linear(state_size, 128, device=cuda, dtype=torch.double),
        nn.ReLU(),
        nn.Linear(128, 256, device=cuda, dtype=torch.double),
        nn.ReLU(),
        nn.Linear(256, action_number, device=cuda, dtype=torch.double),
    )
    if target_network:
        return Model(network=net, loss_func=None, optimizer=None)

    # exp['Network'] = str(torchinfo.summary(net, input_size=(32, state_size),
    #                                        device=cuda, verbose=0))
    return Model(network=net, loss_func=nn.HuberLoss(), optimizer=torch.optim.Adam(net.parameters(), lr))

### Replay Memory Class

In [40]:
class ReplayMemory():
    """Fixed-capacity experience buffer; once full, the oldest entries are dropped."""

    def __init__(self, capacity):
        # A bounded deque discards items from the left when it overflows.
        self.memory: deque = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from the ReplayTransition fields in ``args``."""
        transition = ReplayTransition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises ValueError (from ``random.sample``) when fewer are stored.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)

### Agent Class

In [41]:
class Agent():
    def __init__(self, environment: Environment, simulator: SimHubClient, reward_func: RewardFunc, 
                 policy_network: nn.Module, target_network: nn.Module, hyperparameters: Dict[str, Any]) -> None:
        self.environment: Environment = environment
        self.fem_simulator: SimHubClient = simulator
        self.fem_reward_func: RewardFunc = reward_func
        
        self.policy_network: Model = policy_network
        self.target_network: Model = target_network  
        
        self.target_update_interval: int = hyperparameters['target_update_interval']

        self.optimization_iterations: int = hyperparameters['optimization_iterations']
        self.max_step_per_episode: int = hyperparameters['max_step_per_episode']
        self.experience_replay: ReplayMemory = ReplayMemory(hyperparameters['experience_replay_capacity'])
        self.replay_batch_size: int = hyperparameters['replay_batch_size']

        self.discount_factor: float = hyperparameters['discount_factor']
        self.epsilon_initial: float = hyperparameters['epsilon_initial']
        self.epsilon_minimal: float = hyperparameters['epsilon_minimal']
        self.epsilon_halflife: float = hyperparameters['epsilon_halflife']
        self.epsilon_generate: float = hyperparameters['epsilon_generate']
        
        self.epsilon_boost_preterminal: float = hyperparameters['epsilon_boost_preterminal']
        
        
        
        self.pending_transitions: List[SimulationTransition] = list()
        
        # Set to true when generating result
        self.generation_mode: bool = False
        self.explored_step: int = 0
        
        self.total_steps: int = 0
        
        self.episode: int = 0
        self.step_num: int = 0
        
        # Logging result of episode and boost epsilon when needed
        self.previous_episode_terminal: List[bool] = list()
        self.terminal_reached: bool = False
        
        self.convergence_episode: int = 0
        self.convergence_step: int = 100000
        self.convergence_episode_gen: int = 0
        self.convergence_step_gen: int = 100000

        
    def updaet_action_mask(self) -> None:
        ...
        
    
    def select_action(self) -> Tuple[State, int, str]:
        """
        Decide an action based on epsilon greedy algorithm

        Returns
        -------
        State
            Current state instance
        int
            Index number of an action in the action space
        
        str
            Action type, literal string of "Prediction" or "Random"
        """
        state = self.environment.state
        # epsilon = self.epsilon_minimal + (self.epsilon_initial  - self.epsilon_minimal) * \
        #             math.exp(-1. * self.total_steps / self.epsilon_decay)
        
        # Determining epsilon
        
        epsilon = max(self.epsilon_initial 
                             * (0.5 ** (self.total_steps / self.epsilon_halflife)), 
                             self.epsilon_minimal)
        
        if not self.terminal_reached: epsilon += self.epsilon_boost_preterminal
        
        if self.generation_mode: epsilon = self.epsilon_generate
        
        # Epsilon determined
        
        if DEBUG.epsilon_log:
            if self.generation_mode:
                exp['epsilon_gen'].append(epsilon, step=self.episode + self.step_num / self.max_step_per_episode)
            else:
                exp['epsilon'].append(epsilon, step=self.episode + self.step_num / self.max_step_per_episode)
        
        
        
        if random.random() > epsilon:
            prediction = self.policy_network(state.to_tensor().flatten()).flatten()
            
            if DEBUG.prediction_log:
                if not self.generation_mode:
                    log_target = 'prediction'
                else:
                    log_target = 'prediction_gen'
                exp[f'{log_target}/{self.episode}'].append(f'Step {self.step_num}', step=self.step_num)
                exp[f'{log_target}/{self.episode}'].append(str(state), step=self.step_num+0.1)
                preds = []
                for i in range(len(prediction)):
                    preds.append((prediction[i].item(), str(self.environment.action_space[i])))
                preds.sort(reverse=True)
                exp[f'{log_target}/{self.episode}'].append(pformat(preds), step=self.step_num+0.2)
                
            action_index = prediction.argmax().item()
            action_type = 'Prediction'
            
        else:
            action_index = random.randrange(len(self.environment.action_space))
            action_type = 'Random'
            self.explored_step += 1
            
        if not self.generation_mode:
            self.total_steps += 1
        return state, action_index, action_type
    
    def step(self) -> SimulationTransition:
        """
        Perform an action in the in the environment and submit the transition as FEM task to simulator

        Returns
        -------
        SimulationTransition
            Return the transition
        """
        state, action_index, action_type = self.select_action()
        self.environment.step(action_index)
        next_state = self.environment.state
        
        
        transition = SimulationTransition(self.episode, self.step_num, state, action_index, next_state)
        transition.state_id, state_result = self.fem_simulator.submit_task(state)
        transition.next_state_id, next_state_result = self.fem_simulator.submit_task(next_state)
        transition.action_type = action_type
        transition.action_name = self.environment.action_space[action_index].name
        
        self.pending_transitions.append(transition)
        return transition
    
    def compute_reward(self, transition: SimulationTransition) -> None:
        """
        Compute reward value and terminal status for a COMPLETED transition. 
        The states, action and reward will be pushed into experience replay
        
        If the current state is terminal, transition.next_state will be set to None

        """
        self.fem_reward_func(transition)
        
        if DEBUG.transition_log:
            if not self.generation_mode:
                DEBUG.transition_log_buffer += str(transition) + '\n'
                # exp[f'transitions/{transition.episode}'].append(str(transition))
            else:
                DEBUG.transition_log_buffer_gen += str(transition) + '\n'
                # exp[f'transitions_gen/{transition.episode}'].append(str(transition))

        if not self.generation_mode:
            self.experience_replay.push(transition.state.to_tensor(), 
                                        transition.action_index, 
                                        transition.reward, 
                                        None if transition.is_state_terminal else transition.next_state.to_tensor(),
                                        f'{transition.episode}-{transition.step}')

        
    def compute_pending_rewards(self) -> Tuple[SimulationTransition, float]:
        episode_return: float = 0.
        for transition in tqdm(self.pending_transitions):
            terminal_transition = transition
            
            transition.state_sim = self.fem_simulator.wait_for_task(transition.state_id)
            transition.next_state_sim = self.fem_simulator.wait_for_task(transition.next_state_id)
        
            self.compute_reward(transition)
            
            episode_return = episode_return * self.discount_factor + transition.reward;
            
            if transition.is_state_terminal:
                self.terminal_reached = True
                if not self.generation_mode:
                    exp['terminal_type'].append(transition.state_terminal_type, step=transition.episode)
                else:
                    exp['terminal_type_gen'].append(transition.state_terminal_type, step=transition.episode)
                break
                
        self.pending_transitions.clear()
        return transition, episode_return
        
    def optimize(self) -> None:
        if len(self.experience_replay) < self.replay_batch_size: return

        for i in range(self.optimization_iterations):
            samples = self.experience_replay.sample(self.replay_batch_size)
            batch = ReplayTransition(*zip(*samples))
            
            if DEBUG.optimizer_sample_log:
                filename = f'logs/sampled_transition-{self.total_steps + i / self.optimization_iterations}.log'
                with open(filename, 'w') as fp:
                    pprint(samples, stream=fp)
                exp['sampled_transition'].upload_files(filename)

            non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                                      batch.next_state)), device=cuda, dtype=torch.bool)
            # If none of the transition has a valid next_step, skip the round
            if not non_final_mask.any():
                return
            non_final_next_states = torch.stack([s.flatten() for s in batch.next_state
                                                            if s is not None])

            state_batch = torch.stack([s.flatten() for s in batch.state])
            action_batch = torch.tensor(batch.action_index, device=cuda).unsqueeze(1)
            reward_batch = torch.tensor(batch.reward, device=cuda)

            state_action_values = self.policy_network(state_batch).gather(1, action_batch)

            next_state_values = torch.zeros(self.replay_batch_size, device=cuda)
            next_state_values[non_final_mask] = self.target_network(non_final_next_states).max(1)[0].detach()

            expected_state_action_values = (next_state_values * self.discount_factor) + reward_batch

            loss = self.policy_network.loss_func(state_action_values, expected_state_action_values.unsqueeze(1))
            optimization_loss = float(loss)
            self.policy_network.optimizer.zero_grad()
            loss.backward()
            for param in self.policy_network.network.parameters():
                param.grad.data.clamp_(-1, 1)
            self.policy_network.optimizer.step()
            
            exp['optimization_loss'].append(optimization_loss, step=self.total_steps + i / self.optimization_iterations)
        
    def update_target_network(self) -> None:
        self.target_network.network.load_state_dict(self.policy_network.network.state_dict())
        
    def train(self, episodes: int) -> None:
        """
        Train the agent for ``episodes`` episodes.

        Each episode resets the environment and takes ``max_step_per_episode``
        steps; after every step the policy network is optimized and, whenever
        ``total_steps`` is a multiple of ``target_update_interval``, the target
        network is synchronised. Rewards are computed only after all steps of
        the episode have been submitted. Episode statistics are logged to
        ``exp`` and the convergence trackers are updated.

        Depending on the ``DEBUG`` flags, each episode may also run
        ``generate()``, visualise the generated result, flush the transition
        log buffers to files under ``logs/`` and write a memory snapshot there.

        Parameters
        ----------
        episodes : int
            Number of episodes to run
        """
        for episode in range(episodes):
            print('')
            print(f'Episode: {episode}')
            self.episode = episode

            self.environment.reset()

            print('Stepping...')
            for self.step_num in trange(self.max_step_per_episode):
                self.step()

                self.optimize()

                if self.total_steps % self.target_update_interval == 0:
                    self.update_target_network()

            exp['total_explored'].append(self.explored_step, step=self.episode)

            print('Processing rewards...')
            transition, episode_return = self.compute_pending_rewards()

            if transition.is_state_terminal and transition.state_terminal_type == 0:
                print(f'Terminal state found in episode {transition.episode} step {transition.step} deltaT {transition.state_sim_value}:')
                print(transition.state)
                exp['goal_reached'].append(f'{transition.state}-{transition.state_sim_value}', step=self.episode)

            print(f'Episode return: {episode_return}')
            exp['episode_return'].append(episode_return, step=self.episode)

            exp['terminal_step'].append(transition.step, step=self.episode)

            # Remember the episode with the lowest final step so far; a later
            # episode that ends on a higher step resets the tracker.
            if transition.step < self.convergence_step:
                self.convergence_step = transition.step
                self.convergence_episode = transition.episode
            elif transition.step > self.convergence_step:
                self.convergence_step = 100000
                self.convergence_episode = 0

            self.fem_simulator.clear_tasks()

            if DEBUG.result_generation:
                generated_result = self.generate()

                exp['generated_result'].append(str(generated_result), step=self.episode)

                if DEBUG.result_visualization:
                    print('Visualizing result...')
                    log_vis_sim(generated_result.state_sim['output']['temperature_distribution'][2], 'generated_state_vis',
                                append=True, step=self.episode, vrange=(293.15, 353.15))

            if DEBUG.transition_log:
                # The log directory may not exist on a fresh checkout
                os.makedirs('logs', exist_ok=True)
                # Flush the training buffer first, then the generation buffer
                for buffer_name, log_file in (
                        ('transition_log_buffer', f'logs/transition-{self.episode}.log'),
                        ('transition_log_buffer_gen', f'logs/transition-gen-{self.episode}.log')):
                    with open(log_file, 'w') as fp:
                        fp.write(getattr(DEBUG, buffer_name))
                    setattr(DEBUG, buffer_name, '')
                    exp['transition_log'].upload_files(log_file)

            if DEBUG.trace_memory:
                os.makedirs('logs', exist_ok=True)
                snapshot = tracemalloc.take_snapshot()
                with open(f'logs/mem{self.episode}.log', 'w') as fp:
                    # Keep only the 30 largest entries of the snapshot
                    for line in snapshot.statistics('lineno')[:30]:
                        print(line, file=fp)
            
        
    def generate(self) -> State:
        self.generation_mode = True
        if DEBUG:
            DEBUG.in_generation_mode=True
            
        print('Generating...')
            
        self.environment.reset()
            
        for self.step_num in trange(self.max_step_per_episode):
            self.step()

            
        print('Evaluating states...')
        transition, episode_return = self.compute_pending_rewards()
        
        if transition.is_state_terminal and transition.state_terminal_type == 0: 
            print(f'Terminal state reached in step {transition.step} deltaT {transition.state_sim_value}:')
            print(transition.state)
            exp['goal_reached_gen'].append(f'{transition.state}-{transition.state_sim_value}', step=self.episode) 
        
        print(f'Episode return: {episode_return}')
        exp['episode_return_gen'].append(episode_return, step=self.episode)
            
        exp['terminal_step_gen'].append(transition.step, step=self.episode)
        
        if transition.step < self.convergence_step_gen:
            self.convergence_step_gen = transition.step
            self.convergence_episode_gen = transition.episode
        elif transition.step > self.convergence_step:
            self.convergence_step_gen = 100000
            self.convergence_episode_gen = 0         
        
        self.generation_mode = False
        if DEBUG:
            DEBUG.in_generation_mode=False
        return transition
        

In [42]:
# Display the address stored in SimHub's db_ip.log; `with` closes the file
# instead of leaving the handle open for the lifetime of the kernel.
with open('/home/nwen/simhub/database/db_ip.log') as db_ip_file:
    db_ip = db_ip_file.read().strip()
db_ip

'10.125.9.30'

## Training

In [43]:
for i in range(experiment_repeat):
    # Start a fresh Neptune run for this repetition. `exp` is the module-level
    # handle that the agent's logging calls write to.
    exp = neptune.init_run(project="pil-clemson/metamtl-rl-test",
                           capture_hardware_metrics=True,
                           capture_stderr=True,
                           capture_stdout=True,
                           source_files=['RemoteFEM-DQN-Harv.ipynb'],
                          )
    fem = None

    try:
        exp['sys/tags'].add(tags)

        # Mark every repetition after the first one
        if i > 0:
            exp['sys/tags'].add(['Rerun'])

        # Record the configuration used for this run
        exp['EnvConfig'] = stringify_unsupported(environment_config)
        exp['Hyperparameters'] = stringify_unsupported(hyperparameters)

        # Build environment, simulator client, reward function and agent
        env = HarvRingEnvironment()

        fem = SimHubClient('10.128.97.115')
        fem.set_experiment('./elmer_thermal_cloak_ring/elmer_task.yml')

        reward_func = FEMReward(hyperparameters)

        agent = Agent(env, fem, reward_func, QNet(), QNet(target_network=True), hyperparameters)

        # Train
        agent.train(hyperparameters['max_episode'])

        # Summarise convergence as "episode(step)" for training / generation
        exp['convergence'] = f'{agent.convergence_episode}({agent.convergence_step})' \
                            + f'/{agent.convergence_episode_gen}({agent.convergence_step_gen})'

        # Final generation pass, logged under the episode index after the last one
        agent.episode += 1
        generated_result = agent.generate()

        exp['generated_result_final'] = str(generated_result)

        exp['sys/tags'].add(['Done'])
        # NOTE(review): state_sim_value is indexed ([0], [1]) elsewhere in this
        # notebook; confirm it is a scalar here so this comparison is valid.
        # NOTE(review): the tag is spelled 'Sucessful'; it is kept unchanged
        # because existing runs may already be filtered by this spelling.
        if generated_result.state_sim_value <= hyperparameters['terminal_threshold']:
            exp['sys/tags'].add(['Sucessful'])

        print('Done')
    finally:
        # Release the simulator connection and close the Neptune run even when
        # training fails; otherwise the run stays open until the kernel dies.
        if fem is not None:
            fem.close()
        exp.stop()

    clear_output(wait=True)

https://app.neptune.ai/pil-clemson/metamtl-rl-test/e/RLTEST-74
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Waiting for database connection..


  exp['EnvConfig'] = stringify_unsupported(environment_config)


Connected to database

Episode: 0
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -23.05514672588891
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -21.391548001725766

Episode: 1
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -22.79095860387317
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.657260473246092

Episode: 2
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -14.865325044354
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.426644660163323

Episode: 3
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -14.825675180035478
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -3.8275699712901394

Episode: 4
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.9756230767302023
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.1632728275946667

Episode: 5
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.11459094631113
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -17.541878823380127

Episode: 6
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.9606025068905195
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.1137108893773906

Episode: 7
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.588913021097065
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.93631075542918

Episode: 8
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.595267156981247
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.061200811512954

Episode: 9
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.491098797847988
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.873923628676337

Episode: 10
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.361667217886035
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.292307181450784

Episode: 11
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -22.87243013820234
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.44056927484118

Episode: 12
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.696542553913295
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.2504428394543

Episode: 13
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.38183708420553
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.943847836885137

Episode: 14
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.368941299112556
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.664351355781708

Episode: 15
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.57188617797308
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.751773249820449

Episode: 16
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.66408412673296
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.524855863120248

Episode: 17
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.46216560249096
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.231003359863678

Episode: 18
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.11450622332541
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.115322351043453

Episode: 19
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.95371153841422
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.298685287677316

Episode: 20
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.776475880725513
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.701824287645415

Episode: 21
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.980818910220076
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.687139596966396

Episode: 22
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.305074048181293
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -17.744922359929355

Episode: 23
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.2231569467791
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.0093095033721

Episode: 24
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 2.155172814047463
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -22.273011269280886

Episode: 25
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.792667394535133
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -11.015416170892546

Episode: 26
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -24.55719431354146
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 0.5311902234979704

Episode: 27
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.610856658960657
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.790120934007032

Episode: 28
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.41085208388486
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.107504434385212

Episode: 29
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.9501880131747
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.35338622952122

Episode: 30
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.79889167695019
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.45410803458011

Episode: 31
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.36724090884091
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.983967608687767

Episode: 32
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.5709573632212
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.883015919904945

Episode: 33
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.14865805574706
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.293594473839086

Episode: 34
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.450711698539939
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.448271926802315

Episode: 35
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.965959007828998
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.106438369034068

Episode: 36
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.870782655986467
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.955049601081242

Episode: 37
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.8515454569843
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.59476429959653

Episode: 38
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.94133168435884
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -9.220365056767351

Episode: 39
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.653351425399814
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.195019322079195

Episode: 40
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.272052995818742
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.49157978289023

Episode: 41
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.675806835522707
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.038100456305393

Episode: 42
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -9.041685307934982
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.7280821556156

Episode: 43
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.41256168734333
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.336992621093527

Episode: 44
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.159743735975802
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.01135423426234

Episode: 45
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.057108184324335
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.345414382768418

Episode: 46
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.814647541520802
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.703221111533235

Episode: 47
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.60601133245641
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.790998258276023

Episode: 48
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.609330366628907
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.367342233280196

Episode: 49
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.31279590279307
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.471236130095702

Episode: 50
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.550475750113627
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.25749151309712

Episode: 51
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.08518304017464
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.967694557052784

Episode: 52
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.905926777686666
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.23662270227976

Episode: 53
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.68407192580159
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.173849761493456

Episode: 54
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.580573643179196
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.074787446866567

Episode: 55
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.5797581750572
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.143449978769478

Episode: 56
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.79391095204149
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.28098756235425

Episode: 57
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.375355986314254
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.338019244942487

Episode: 58
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.901488076672212
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.639978178943018

Episode: 59
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.213170070773742
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.40492847274479

Episode: 60
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.749358473406808
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.11983015243988

Episode: 61
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.345320259866767
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.066586280983257

Episode: 62
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.85069408542583
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.681033795556466

Episode: 63
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.55738859055189
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.103819492307494

Episode: 64
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.424493990642418
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.903851972540963

Episode: 65
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.546872729536325
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.413880734849633

Episode: 66
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.781932887455518
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.556187021938825

Episode: 67
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.809743687958235
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.850951637723277

Episode: 68
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.183976488419678
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.540990192324053

Episode: 69
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.318989127694195
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.174688953977697

Episode: 70
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.84598108019587
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.84134914810169

Episode: 71
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.614739152366404
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.790218692157541

Episode: 72
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.78619335227436
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 2.946090667318253

Episode: 73
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.833020734418113
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.598208565151694

Episode: 74
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.867537893031514
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.883688303208395

Episode: 75
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.915610886767908
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -5.7056434677863725

Episode: 76
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.327227290013404
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.08260429194364

Episode: 77
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.603912787733456
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -2.1950426069550453

Episode: 78
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.4726687161866
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.09569710837885

Episode: 79
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.070790494938564
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.6680967359516865

Episode: 80
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.407865325733667
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -21.933261231515996

Episode: 81
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -22.607079408721003
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -21.367636019118006

Episode: 82
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.31445365683463
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.79242615269996

Episode: 83
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.657188591245628
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.956034249005144

Episode: 84
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.71121007099918
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.02491335886016

Episode: 85
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.01983426927542
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.470931694215363

Episode: 86
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.99427164157776
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.827038755375433

Episode: 87
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.61838501736647
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.279058876801834

Episode: 88
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.441889906741146
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.656182266212202

Episode: 89
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.90180899145264
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.167976540273504

Episode: 90
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.712661286467945
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -0.017306489434897945

Episode: 91
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.696879337155277
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -1.0532963384800456

Episode: 92
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.321120243854523
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.68692266882145

Episode: 93
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.67980593244119
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.700616170305448

Episode: 94
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.70813688858003
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.28368534697947

Episode: 95
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.28436844421209
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.656678932471955

Episode: 96
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.61389155012217
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.402309118704373

Episode: 97
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.633616673650337
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.51235167647506

Episode: 98
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.724385287992355
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -1.8601302699212272

Episode: 99
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.243442089730632
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.116191753332624

Episode: 100
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.93604075162242
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.441499362634381

Episode: 101
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -9.004417017154083
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.795936425448275

Episode: 102
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.563914333334278
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.977297238892586

Episode: 103
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.921749782323367
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.990823207884425

Episode: 104
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.40012232709999
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.52594758398824

Episode: 105
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.594933685619523
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.982830267104674

Episode: 106
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.86082032069701
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.60870056920679

Episode: 107
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.984746818904064
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.11613187956569

Episode: 108
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.33449678062136
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.034433343180519

Episode: 109
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.258428113303356
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.17899895991614

Episode: 110
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.412323702262192
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.95226181699195

Episode: 111
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.819170985769517
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.122791905917623

Episode: 112
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.131521423821294
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.27406158340646

Episode: 113
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.09060519045417
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.397002613297797

Episode: 114
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.4815785412685
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.55087589311579

Episode: 115
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.141580683213373
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.027931352496662

Episode: 116
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.707995203844362
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.67183209126854

Episode: 117
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.43030079638044
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.312533549954285

Episode: 118
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.50866795656221
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.22580688698167

Episode: 119
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.37111383050115
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.14335891033165

Episode: 120
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.5241444568094
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.863472622188016

Episode: 121
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.894950062204387
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.520721152238423

Episode: 122
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.471270967971858
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.348046542762969

Episode: 123
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -15.474159497907216
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.7671220457360697

Episode: 124
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -0.47955671348782847
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.42475873637816

Episode: 125
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.18741985867898
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.497461705279143

Episode: 126
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.411391539185203
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.331292907698042

Episode: 127
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.205747474720376
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.721738029516064

Episode: 128
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.806917295986297
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.800113083749613

Episode: 129
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.64786123339536
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.73825080656664

Episode: 130
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.50935752002672
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.74653522378209

Episode: 131
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.662803899849486
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.84663331062959

Episode: 132
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.097275688771923
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.327741291855089

Episode: 133
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.246513766612267
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.842957388542764

Episode: 134
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.06831943168383
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.23983231920505

Episode: 135
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.770154644143595
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.637819078107144

Episode: 136
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.821927085600763
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.017254975377293

Episode: 137
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -0.2803608252185307
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.991623455079658

Episode: 138
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.436654112679281
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.194113006805255

Episode: 139
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.28967146889931
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.926188455135776

Episode: 140
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.146099610869417
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.40322113364434

Episode: 141
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.962886092242933
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.980838163982953

Episode: 142
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.876184542987204
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.683974971499513

Episode: 143
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.113398245566746
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.872610944696596

Episode: 144
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.996660262397075
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.08903800184249

Episode: 145
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.083386765032806
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.783298429241473

Episode: 146
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.143404441334045
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.843372701502155

Episode: 147
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.07924506084104
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.880806271608552

Episode: 148
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.626813248875774
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.0067365601854128

Episode: 149
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.802478699283903
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.441220397392215

Episode: 150
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.705344074557267
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.732520845340964

Episode: 151
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.32600174794032
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.89416970907509

Episode: 152
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.891009789999392
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.799903648720521

Episode: 153
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.1463343089189424
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.776093830119123

Episode: 154
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.056410709480154
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.20741563295318

Episode: 155
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.708069683383357
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.041167879275838

Episode: 156
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.836925785147303
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.429236951873598

Episode: 157
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.979738826277632
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.075838457034502

Episode: 158
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.200417071674533
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.325094560655643

Episode: 159
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.44140446150657
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.00764478300642

Episode: 160
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.89645566157352
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.313857252034124

Episode: 161
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.125507782856978
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.4275798429326585

Episode: 162
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.457992875803797
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 2.0100270754383316

Episode: 163
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.800049356495617
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.743846755834376

Episode: 164
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.800652333802766
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.33358453037226

Episode: 165
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.37123993402768
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.490516191931431

Episode: 166
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.578628269786698
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.086299625880123

Episode: 167
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.180913989111254
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.78373414601135

Episode: 168
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.229984718972823
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.791476877363579

Episode: 169
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.841190529333858
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.128492614959002

Episode: 170
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.269987403063773
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.709137266354652

Episode: 171
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.130165564721437
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.762062675956628

Episode: 172
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.178740665968252
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.41233035714547

Episode: 173
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.221009035410706
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.242775305191827

Episode: 174
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.138954798880835
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.018530156955602

Episode: 175
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.60430182650554
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.150639181578374

Episode: 176
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.917752245643303
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.713400796067496

Episode: 177
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.278949101711945
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.157070159673335

Episode: 178
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.175557679970687
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.386498060167893

Episode: 179
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.808283401786166
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.623705808219203

Episode: 180
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.36063817280927
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.283375620584787

Episode: 181
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.0596134347507
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.084558934272735

Episode: 182
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.85354199833592
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.90358119540768

Episode: 183
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.371368581522983
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -11.647828238407355

Episode: 184
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.65282700184207
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.29499480239195

Episode: 185
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.65132154503692
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.651049964298537

Episode: 186
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.557043263366992
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.076451105986857

Episode: 187
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.49741731107958
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.660331191680136

Episode: 188
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.374032004208115
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.66860102136293

Episode: 189
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.152181767560005
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.781719262023941

Episode: 190
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.54690613332107
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.7063577777537

Episode: 191
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -9.874015538575685
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -9.274260634712595

Episode: 192
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.554318607438535
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.227570653416443

Episode: 193
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.160373095054455
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.498323774877134

Episode: 194
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.290143947483635
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.92687468620133

Episode: 195
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.293676728708107
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.329294479619005

Episode: 196
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.69132226067102
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.723107148806324

Episode: 197
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.572222884250664
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.628034802909445

Episode: 198
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.667363856127622
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.01141756467051

Episode: 199
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.935744296434162
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.9121692203418625

Episode: 200
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.469280197040256
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.539318455956334

Episode: 201
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -6.631342864183037
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -1.1041052006631817

Episode: 202
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.867767970200913
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.574367912157577

Episode: 203
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.71076054537998
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.441564979256112

Episode: 204
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.537055695714265
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.051652654159874

Episode: 205
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.472413393206605
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.1528114698319913

Episode: 206
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.85813585796397
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.055672212767802

Episode: 207
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 2.2754000612331344
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.754231824117529

Episode: 208
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.72572915384443
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.25024368205161

Episode: 209
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -15.737647873531252
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.991218222355828

Episode: 210
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.8482155336794
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.829432037123746

Episode: 211
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.273085776409703
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.566136840917089

Episode: 212
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.541053153868551
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.428363339028698

Episode: 213
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.533164395482112
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.83112139980829

Episode: 214
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.677460809390517
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.952319005309686

Episode: 215
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.09999136956315
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.765889197728855

Episode: 216
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.393184708321481
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.440909946668524

Episode: 217
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.56481061885463
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.716265000194767

Episode: 218
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.43458961537057
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.559940248322874

Episode: 219
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.550516949446203
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.95914174700342

Episode: 220
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.288985662431005
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.175140613950038

Episode: 221
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.258384568430323
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.660110906611294

Episode: 222
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.458226609227326
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.90756444397265

Episode: 223
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.998261864137396
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -13.473384038074208

Episode: 224
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.687842264333321
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.95433408386384

Episode: 225
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.005450725943156
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.740738803253935

Episode: 226
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.796963157886648
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.833450000899777

Episode: 227
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.370318639044395
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.19084579480168

Episode: 228
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.053254295153524
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.91732403963228

Episode: 229
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.03944537665624
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.806452234134227

Episode: 230
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.593279883105733
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.296776547518505

Episode: 231
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.22119141374626
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.456290502503226

Episode: 232
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.772069355658275
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.62512976683559

Episode: 233
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.454188275996017
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.784429549754644

Episode: 234
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.91790955087797
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -20.978467012061607

Episode: 235
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -15.336852931230915
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.171989392297714

Episode: 236
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.945536611081238
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 0.07054546009673346

Episode: 237
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.39286336072682
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.7315260050350725

Episode: 238
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.30363731071969
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 0.10052963172916746

Episode: 239
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.94310583015898
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.65138033336936

Episode: 240
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.348193788272436
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.828427528480168

Episode: 241
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.020038473584657
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.597731037759221

Episode: 242
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.590782891199346
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.269717666915824

Episode: 243
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.45832385936129
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.610318119074122

Episode: 244
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.361818634885964
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.649390476771373

Episode: 245
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.544680709613814
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.8050048112543315

Episode: 246
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.113316310068718
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.33601868997979

Episode: 247
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.672166598116355
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -11.033966796269585

Episode: 248
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.49885159825976
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.542611020681704

Episode: 249
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.123815975308368
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.340540266559426

Episode: 250
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.864313668700113
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.97120271253559

Episode: 251
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.841696921725845
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.841769130245346

Episode: 252
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.030403869755165
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.457114830690365

Episode: 253
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.316956788026019
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.664503438000242

Episode: 254
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.581687590062952
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.408045608862896

Episode: 255
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.391165189178576
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.974840016230525

Episode: 256
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.562324326515093
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.344798822284808

Episode: 257
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.984991316778558
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.492352887237009

Episode: 258
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.2405667294115625
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.450647156347622

Episode: 259
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.77917132243804
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.662824486742252

Episode: 260
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.333072429950555
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.874042971564364

Episode: 261
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.66551545976778
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.535104964283292

Episode: 262
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 22.167332121218994
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.95652617041088

Episode: 263
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.654743839291877
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.996050190503583

Episode: 264
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.63902577368187
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.131995676325854

Episode: 265
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.222478472277977
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.797618450574944

Episode: 266
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.24579644920868
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.825587458131054

Episode: 267
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.137059864004115
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.523274603897432

Episode: 268
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.682504784785575
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.71582724495861

Episode: 269
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.209534536609617
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.242325844108263

Episode: 270
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.562039871585217
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.18599010405104

Episode: 271
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.250414720890262
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.201831306485325

Episode: 272
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 21.693093180722034
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.697762726197936

Episode: 273
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.900512508419048
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -11.742780833717628

Episode: 274
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 20.248384245490968
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.0838984988410878

Episode: 275
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.105105774299751
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.280194905507672

Episode: 276
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.634818578524679
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -12.617900282639058

Episode: 277
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 12.949319709362031
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.27439369764415

Episode: 278
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.38467320693471
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -7.440673964728315

Episode: 279
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.621600895025743
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.546915900349642

Episode: 280
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 23.618815021618115
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.771212536590804

Episode: 281
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.56224684967927
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 11.749517531020455

Episode: 282
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.710739719754265
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.634348729107564

Episode: 283
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.532331979907003
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.7253254111964877

Episode: 284
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 3.38045101372206
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 14.278648787343608

Episode: 285
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 0.01482146170519378
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.753390596940057

Episode: 286
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.237938179563763
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 7.378603608188481

Episode: 287
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.98579702323067
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 17.21013969374121

Episode: 288
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 16.24262649643252
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.450163903040933

Episode: 289
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.852678707815121
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.554598794414154

Episode: 290
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.2921084008521095
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 10.210471814137957

Episode: 291
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.514086759259909
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -0.7271185945954843

Episode: 292
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 1.983195218929698
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.2721528537348314

Episode: 293
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 18.131882881915338
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 8.04126091733158

Episode: 294
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.641529868144708
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 13.824739821345585

Episode: 295
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 6.219258695387855
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -6.95274338440932

Episode: 296
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 19.70424834866546
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 5.072081918971506

Episode: 297
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 15.877489808440775
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -11.41367957793497

Episode: 298
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 9.859326503636089
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: 4.803743917088497

Episode: 299
Stepping...


  0%|          | 0/500 [00:00<?, ?it/s]

Processing rewards...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -3.460004881317202
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -8.740428259671395
Generating...


  0%|          | 0/500 [00:00<?, ?it/s]

Evaluating states...


  0%|          | 0/500 [00:00<?, ?it/s]

Episode return: -10.599469116834422
Done
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 633 operations to synchronize with Neptune. Do not kill this process.
All 633 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/pil-clemson/metamtl-rl-test/e/RLTEST-74


In [44]:
# Unconditionally raises, so executing this cell always fails with
# NotImplementedError (empty message).
# NOTE(review): presumably a deliberate stop so that "Run All" halts here
# instead of continuing into the cells below -- confirm the intent, and
# consider a descriptive message if so.
raise NotImplementedError

NotImplementedError: 

In [None]:
# Calls close() on `fem`, which is defined earlier in the notebook.
# NOTE(review): `fem` is presumably the SimHubClient handle used for the
# remote FEM runs -- confirm against the cell that creates it.
# NOTE(review): this cell has no execution count and the preceding cell
# always raises, so a "Run All" pass likely never reaches this line;
# verify that the handle is closed manually after a run.
fem.close()

## 