<a href="https://colab.research.google.com/github/mikohuhu/q-learning-taxi-v3-MMAI845/blob/JAY/finalized_taxiV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import gym
import numpy as np
import time
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [15]:
# Initialize Taxi-v3 environment
env = gym.make('Taxi-v3')

In [17]:
#The first code implements a sampling method where actions are randomly selected at each timestep without any learning involved.
#This is akin to a brute-force approach, where actions are chosen without regard to the state or any learned policies.

def evaluate_sampling_method(num_episodes=100):
    penalties = []
    timesteps = []
    rewards = []

    for _ in range(num_episodes):
        state = env.reset()
        episode_penalties = 0
        episode_timesteps = 0
        episode_rewards = 0

        done = False
        while not done:
            # Sample a random action
            action = env.action_space.sample()

            # Perform the action
            next_state, reward, done, _ = env.step(action)

            # Update episode statistics
            episode_penalties += 1 if reward == -10 else 0
            episode_timesteps += 1
            episode_rewards += reward

            state = next_state

        penalties.append(episode_penalties)
        timesteps.append(episode_timesteps)
        rewards.append(episode_rewards)

    # Calculate metrics
    avg_penalties = np.mean(penalties)
    avg_timesteps = np.mean(timesteps)
    avg_rewards = np.mean(rewards)

    return avg_penalties, avg_timesteps, avg_rewards

# Perform the experiment and measure the execution time
start_time = time.time()
avg_penalties, avg_timesteps, avg_rewards = evaluate_sampling_method()
end_time = time.time()
execution_time = end_time - start_time

# Print metrics
print("Experiment 1: Sampling Method")
print("Average number of penalties per episode:", avg_penalties)
print("Average number of timesteps per trip:", avg_timesteps)
print("Average rewards per move:", avg_rewards)
print("Total time to execute:", execution_time, "seconds")

Experiment 1: Sampling Method
Average number of penalties per episode: 64.2
Average number of timesteps per trip: 195.88
Average rewards per move: -772.42
Total time to execute: 0.8409316539764404 seconds


In [18]:
env = gym.make('Taxi-v3')

def q_learning(env, num_episodes=100000, alpha=0.1, gamma=0.6, epsilon=0.1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    start_time = time.time()  # Start time

    for episode in range(num_episodes):
        state = env.reset()
        episode_penalties = 0
        episode_timesteps = 0
        episode_rewards = 0

        done = False
        while not done:
            # Epsilon-greedy action selection
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Exploration
            else:
                action = np.argmax(Q[state])  # Exploitation

            next_state, reward, done, _ = env.step(action)

            # Q-Learning update
            best_next_action = np.argmax(Q[next_state])
            Q[state, action] += alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])

            # Update episode statistics
            episode_penalties += 1 if reward == -10 else 0
            episode_timesteps += 1
            episode_rewards += reward

            state = next_state

        total_penalties += episode_penalties
        total_timesteps += episode_timesteps
        total_rewards += episode_rewards

    # Calculate metrics
    avg_penalties = total_penalties / num_episodes
    avg_timesteps = total_timesteps / num_episodes
    avg_rewards = total_rewards / (total_timesteps - total_penalties)  # Average rewards per move

    end_time = time.time()  # End time
    execution_time = end_time - start_time  # Calculate execution time

    return avg_penalties, avg_timesteps, avg_rewards, execution_time

# Perform Q-Learning experiment
avg_penalties, avg_timesteps, avg_rewards, execution_time = q_learning(env)

# Print metrics
print("Experiment 2: Q-Learning")
print("Average number of penalties per episode:", avg_penalties)
print("Average number of timesteps per trip:", avg_timesteps)
print("Average rewards per move:", avg_rewards)
print("Total time to train:", execution_time, "seconds")

Experiment 2: Q-Learning
Average number of penalties per episode: 0.48125
Average number of timesteps per trip: 15.89639
Average rewards per move: 0.046793606804738715
Total time to train: 69.6101667881012 seconds


In [19]:
def sarsa(env, num_episodes=100000, alpha=0.2, gamma=0.8, epsilon=0.1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    start_time = time.time()  # Start time

    for episode in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy_policy(Q, state, epsilon)

        episode_penalties = 0
        episode_timesteps = 0
        episode_rewards = 0

        done = False
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = epsilon_greedy_policy(Q, next_state, epsilon)

            # SARSA update
            Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])

            # Update episode statistics
            episode_penalties += 1 if reward == -10 else 0
            episode_timesteps += 1
            episode_rewards += reward

            state = next_state
            action = next_action

        total_penalties += episode_penalties
        total_timesteps += episode_timesteps
        total_rewards += episode_rewards

    # Calculate metrics
    avg_penalties = total_penalties / num_episodes
    avg_timesteps = total_timesteps / num_episodes
    avg_rewards = total_rewards / (total_timesteps - total_penalties)  # Average rewards per move

    end_time = time.time()  # End time
    execution_time = end_time - start_time  # Calculate execution time

    return avg_penalties, avg_timesteps, avg_rewards, execution_time

def epsilon_greedy_policy(Q, state, epsilon):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(env.action_space.n)  # Exploration
    else:
        return np.argmax(Q[state])  # Exploitation

# Perform SARSA experiment
avg_penalties, avg_timesteps, avg_rewards, execution_time = sarsa(env)

# Print metrics
print("Experiment 3: SARSA")
print("Average number of penalties per episode:", avg_penalties)
print("Average number of timesteps per trip:", avg_timesteps)
print("Average rewards per move:", avg_rewards)
print("Total time to train:", execution_time, "seconds")

Experiment 3: SARSA
Average number of penalties per episode: 0.90008
Average number of timesteps per trip: 28.31204
Average rewards per move: -0.5942176334709375
Total time to train: 120.70162725448608 seconds


In [28]:
import numpy as np
import time
from keras.optimizers import Adam
import gym
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from collections import deque
import random
from tqdm import tqdm  # Importing tqdm for progress bar

# Initialize the environment with the new step API
env_taxi = gym.make("Taxi-v3", new_step_api=True).env

class TaxiAgent:
    def __init__(self, env, optimizer):
        self._state_size = env.observation_space.n
        self._action_size = env.action_space.n
        self._optimizer = optimizer
        self.experience_replay_memory = deque(maxlen=2000)
        self.discount = 0.6
        self.exploration = 0.1
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_both_model()

    def gather(self, state, action, reward, next_state, terminated):
        self.experience_replay_memory.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        model = Sequential()
        model.add(Embedding(self._state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def align_both_model(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, state):
        if np.random.rand() <= self.exploration:
            return env_taxi.action_space.sample()
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        minibatch = random.sample(self.experience_replay_memory, batch_size)
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_network.predict(state, verbose=0)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.discount * np.amax(t)
            self.q_network.fit(state, target, epochs=1, verbose=0)  # Reduce the epochs

def deep_q_learning(env, num_training_episodes=100, num_evaluation_episodes=100, alpha=0.01, gamma=0.6, epsilon=0.1, batch_size=32, timesteps_per_episode=40, epochs=4):
    # Training phase
    start_time = time.time()  # Record start time for training
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    for e in tqdm(range(num_training_episodes), desc="Training Episodes"):
        state = env_taxi.reset()
        state = np.reshape(state, [1, 1])
        reward = 0
        terminated = False
        timesteps = 0
        penalties = 0

        while not terminated and timesteps < timesteps_per_episode:
            action = taxi_agent.act(state)
            next_state, reward, terminated, truncated, info = env_taxi.step(action)
            next_state = np.reshape(next_state, [1, 1])
            taxi_agent.gather(state, action, reward, next_state, terminated)
            state = next_state
            timesteps += 1
            if reward == -10:
                penalties += 1

            # Debugging print statements
            print(f"Episode {e + 1}, Timestep {timesteps}: action={action}, reward={reward}, terminated={terminated}, penalties={penalties}")

            if len(taxi_agent.experience_replay_memory) > batch_size and timesteps % 10 == 0:  # Retrain less frequently
                print(f"Retraining on mini-batch at episode {e + 1}")
                taxi_agent.retrain(batch_size)

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += reward

    end_time = time.time()  # Record end time for training
    execution_time = end_time - start_time  # Calculate time to train

    # Evaluation phase
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0
    for e in tqdm(range(num_evaluation_episodes), desc="Evaluation Episodes"):
        # Evaluation episode loop
        state = env_taxi.reset()
        state = np.reshape(state, [1, 1])
        reward = 0
        terminated = False
        timesteps = 0
        penalties = 0

        while not terminated and timesteps < timesteps_per_episode:
            action = taxi_agent.act(state)
            next_state, reward, terminated, truncated, info = env_taxi.step(action)
            next_state = np.reshape(next_state, [1, 1])
            state = next_state
            timesteps += 1
            if reward == -10:
                penalties += 1

            # Debugging print statements
            print(f"Evaluation Episode {e + 1}, Timestep {timesteps}: action={action}, reward={reward}, terminated={terminated}, penalties={penalties}")

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += reward

    # Calculate averages for evaluation metrics
    average_penalties = total_penalties / num_evaluation_episodes
    average_timesteps = total_timesteps / num_evaluation_episodes
    average_rewards_per_move = total_rewards / (total_timesteps - total_penalties)

    # Print metrics
    print("Deep Q-Learning Method:")
    print("Average number of penalties per episode:", average_penalties)
    print("Average number of timesteps per trip:", average_timesteps)
    print("Average rewards per move:", average_rewards_per_move)
    print("Time to train:", execution_time, "seconds")

# Creating the optimizer
optimizer = Adam(learning_rate=0.01)

# Creating the TaxiAgent instance
taxi_agent = TaxiAgent(env_taxi, optimizer)

# Defining parameters for the experiment
num_training_episodes = 100
num_evaluation_episodes = 100
batch_size = 32
timesteps_per_episode = 40

# Running the Deep Q-Learning experiment
deep_q_learning(env_taxi, num_training_episodes, num_evaluation_episodes, batch_size=batch_size, timesteps_per_episode=timesteps_per_episode)


Training Episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Episode 1, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 1, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 1, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 1, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 1, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 1, Timestep 6: action=4, reward=-10, terminated=False, penalties=1
Episode 1, Timestep 7: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 8: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 9: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 13: action=3, reward=-1, terminated=False, penalties=1
Episode 1, Timestep 14: action=3,

Training Episodes:   1%|          | 1/100 [00:11<19:02, 11.54s/it]

Episode 2, Timestep 1: action=5, reward=-10, terminated=False, penalties=1
Episode 2, Timestep 2: action=5, reward=-10, terminated=False, penalties=2
Episode 2, Timestep 3: action=5, reward=-10, terminated=False, penalties=3
Episode 2, Timestep 4: action=5, reward=-10, terminated=False, penalties=4
Episode 2, Timestep 5: action=5, reward=-10, terminated=False, penalties=5
Episode 2, Timestep 6: action=5, reward=-10, terminated=False, penalties=6
Episode 2, Timestep 7: action=5, reward=-10, terminated=False, penalties=7
Episode 2, Timestep 8: action=5, reward=-10, terminated=False, penalties=8
Episode 2, Timestep 9: action=5, reward=-10, terminated=False, penalties=9
Episode 2, Timestep 10: action=5, reward=-10, terminated=False, penalties=10
Retraining on mini-batch at episode 2
Episode 2, Timestep 11: action=2, reward=-1, terminated=False, penalties=10
Episode 2, Timestep 12: action=2, reward=-1, terminated=False, penalties=10
Episode 2, Timestep 13: action=2, reward=-1, terminated=Fa

Training Episodes:   2%|▏         | 2/100 [00:48<43:05, 26.38s/it]

Episode 3, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 3
Episode 3, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 3, Timestep 13: action=0, reward=-1, terminated=False, penaltie

Training Episodes:   3%|▎         | 3/100 [01:22<48:46, 30.17s/it]

Episode 4, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 4, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 4
Episode 4, Timestep 11: action=5, reward=-10, terminated=False, penalties=1
Episode 4, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 4, Timestep 13: action=3, reward=-1, terminated=False, penalti

Training Episodes:   4%|▍         | 4/100 [01:59<52:25, 32.76s/it]

Episode 5, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 5
Episode 5, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 5, Timestep 13: action=2, reward=-1, terminated=False, penaltie

Training Episodes:   5%|▌         | 5/100 [02:35<53:49, 33.99s/it]

Episode 6, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 6, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 6
Episode 6, Timestep 11: action=5, reward=-10, terminated=False, penalties=1
Episode 6, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 6, Timestep 13: action=1, reward=-1, terminated=False, penalti

Training Episodes:   6%|▌         | 6/100 [03:09<53:13, 33.98s/it]

Episode 7, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 7
Episode 7, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 7, Timestep 13: action=3, reward=-1, terminated=False, penaltie

Training Episodes:   7%|▋         | 7/100 [03:47<54:41, 35.28s/it]

Episode 8, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 4: action=4, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 8
Episode 8, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 8, Timestep 13: action=2, reward=-1, terminated=False, penaltie

Training Episodes:   8%|▊         | 8/100 [04:21<53:11, 34.69s/it]

Episode 9, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 9
Episode 9, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 9, Timestep 13: action=0, reward=-1, terminated=False, penaltie

Training Episodes:   9%|▉         | 9/100 [04:58<53:44, 35.43s/it]

Episode 10, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 10
Episode 10, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 10, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  10%|█         | 10/100 [05:33<52:58, 35.32s/it]

Episode 11, Timestep 1: action=4, reward=-10, terminated=False, penalties=1
Episode 11, Timestep 2: action=1, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 3: action=0, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 5: action=0, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 7: action=0, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 9: action=0, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 11
Episode 11, Timestep 11: action=0, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 11, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  11%|█         | 11/100 [06:09<52:39, 35.51s/it]

Episode 12, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 12
Episode 12, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 12, Timestep 13: action=5, reward=-10, terminated=

Training Episodes:  12%|█▏        | 12/100 [06:45<52:16, 35.64s/it]

Episode 13, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 13
Episode 13, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 13, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  13%|█▎        | 13/100 [07:20<51:20, 35.40s/it]

Episode 14, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 14
Episode 14, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 14, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  14%|█▍        | 14/100 [07:58<51:49, 36.16s/it]

Episode 15, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 15
Episode 15, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 15, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  15%|█▌        | 15/100 [08:31<50:09, 35.41s/it]

Episode 16, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 16
Episode 16, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 16, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  16%|█▌        | 16/100 [09:09<50:47, 36.28s/it]

Episode 17, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 17, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 17, Timestep 3: action=5, reward=-10, terminated=False, penalties=1
Episode 17, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 17
Episode 17, Timestep 11: action=0, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 12: action=0, reward=-1, terminated=False, penalties=1
Episode 17, Timestep 13: action=0, reward=-1, terminated=

Training Episodes:  17%|█▋        | 17/100 [09:44<49:35, 35.84s/it]

Episode 18, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 18
Episode 18, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 18, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  18%|█▊        | 18/100 [10:21<49:20, 36.11s/it]

Episode 19, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 19
Episode 19, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 19, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  19%|█▉        | 19/100 [10:57<48:47, 36.14s/it]

Episode 20, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 20
Episode 20, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 20, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  20%|██        | 20/100 [11:33<47:54, 35.94s/it]

Episode 21, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 8: action=4, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 21
Episode 21, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 21, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  21%|██        | 21/100 [12:11<48:12, 36.61s/it]

Episode 22, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 22
Episode 22, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 22, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  22%|██▏       | 22/100 [12:43<46:00, 35.39s/it]

Episode 23, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 23
Episode 23, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 23, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  23%|██▎       | 23/100 [13:22<46:34, 36.29s/it]

Episode 24, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 24
Episode 24, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 24, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  24%|██▍       | 24/100 [13:56<45:20, 35.80s/it]

Episode 25, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 25
Episode 25, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 25, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  25%|██▌       | 25/100 [14:34<45:14, 36.20s/it]

Episode 26, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 26
Episode 26, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 26, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  26%|██▌       | 26/100 [15:10<44:41, 36.24s/it]

Episode 27, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 27
Episode 27, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 27, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  27%|██▋       | 27/100 [15:45<43:29, 35.75s/it]

Episode 28, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 28, Timestep 10: action=4, reward=-10, terminated=False, penalties=1
Retraining on mini-batch at episode 28
Episode 28, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 28, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 28, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  28%|██▊       | 28/100 [16:20<42:52, 35.73s/it]

Episode 29, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 29
Episode 29, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 29, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  29%|██▉       | 29/100 [16:52<41:00, 34.65s/it]

Episode 30, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 30
Episode 30, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 30, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  30%|███       | 30/100 [17:30<41:24, 35.50s/it]

Episode 31, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 31
Episode 31, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 31, Timestep 12: action=4, reward=-10, terminated=False, penalties=1
Episode 31, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  31%|███       | 31/100 [18:04<40:12, 34.97s/it]

Episode 32, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 32
Episode 32, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 32, Timestep 13: action=4, reward=-10, terminated=

Training Episodes:  32%|███▏      | 32/100 [18:40<40:05, 35.37s/it]

Episode 33, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 33
Episode 33, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 33, Timestep 13: action=5, reward=-10, terminated=

Training Episodes:  33%|███▎      | 33/100 [19:14<39:03, 34.98s/it]

Episode 34, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 34, Timestep 2: action=4, reward=-10, terminated=False, penalties=1
Episode 34, Timestep 3: action=3, reward=-1, terminated=False, penalties=1
Episode 34, Timestep 4: action=4, reward=-10, terminated=False, penalties=2
Episode 34, Timestep 5: action=4, reward=-10, terminated=False, penalties=3
Episode 34, Timestep 6: action=3, reward=-1, terminated=False, penalties=3
Episode 34, Timestep 7: action=3, reward=-1, terminated=False, penalties=3
Episode 34, Timestep 8: action=3, reward=-1, terminated=False, penalties=3
Episode 34, Timestep 9: action=3, reward=-1, terminated=False, penalties=3
Episode 34, Timestep 10: action=3, reward=-1, terminated=False, penalties=3
Retraining on mini-batch at episode 34
Episode 34, Timestep 11: action=4, reward=-10, terminated=False, penalties=4
Episode 34, Timestep 12: action=3, reward=-1, terminated=False, penalties=4
Episode 34, Timestep 13: action=3, reward=-1, terminat

Training Episodes:  34%|███▍      | 34/100 [19:45<37:18, 33.92s/it]

Episode 35, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 35
Episode 35, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 35, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  35%|███▌      | 35/100 [20:23<37:57, 35.04s/it]

Episode 36, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 36
Episode 36, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 36, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  36%|███▌      | 36/100 [20:56<36:50, 34.54s/it]

Episode 37, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 37
Episode 37, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 37, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  37%|███▋      | 37/100 [21:32<36:44, 34.99s/it]

Episode 38, Timestep 1: action=5, reward=-10, terminated=False, penalties=1
Episode 38, Timestep 2: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 38
Episode 38, Timestep 11: action=2, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 38, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  38%|███▊      | 38/100 [22:09<36:29, 35.31s/it]

Episode 39, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 39
Episode 39, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 39, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  39%|███▉      | 39/100 [22:43<35:39, 35.07s/it]

Episode 40, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 40
Episode 40, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 40, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  40%|████      | 40/100 [23:20<35:36, 35.61s/it]

Episode 41, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 41
Episode 41, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 41, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  41%|████      | 41/100 [23:53<34:15, 34.84s/it]

Episode 42, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 42, Timestep 8: action=4, reward=-10, terminated=False, penalties=1
Episode 42, Timestep 9: action=3, reward=-1, terminated=False, penalties=1
Episode 42, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 42
Episode 42, Timestep 11: action=0, reward=-1, terminated=False, penalties=1
Episode 42, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 42, Timestep 13: action=0, reward=-1, terminated=

Training Episodes:  42%|████▏     | 42/100 [24:30<34:21, 35.55s/it]

Episode 43, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 43, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Episode 43, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 43, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 43
Episode 43, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 43, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 43, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  43%|████▎     | 43/100 [25:02<32:47, 34.52s/it]

Episode 44, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 44
Episode 44, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 44, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  44%|████▍     | 44/100 [25:39<32:45, 35.09s/it]

Episode 45, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 45, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 45, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 45, Timestep 4: action=4, reward=-10, terminated=False, penalties=1
Episode 45, Timestep 5: action=0, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 6: action=0, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 7: action=0, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 8: action=0, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 9: action=0, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 10: action=0, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 45
Episode 45, Timestep 11: action=2, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 45, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  45%|████▌     | 45/100 [26:15<32:31, 35.47s/it]

Episode 46, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 46
Episode 46, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 46, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  46%|████▌     | 46/100 [26:50<31:54, 35.45s/it]

Episode 47, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 47
Episode 47, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 47, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  47%|████▋     | 47/100 [27:27<31:28, 35.63s/it]

Episode 48, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 48, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 48
Episode 48, Timestep 11: action=4, reward=-10, terminated=False, penalties=1
Episode 48, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 48, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  48%|████▊     | 48/100 [27:58<29:52, 34.47s/it]

Episode 49, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 49
Episode 49, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 49, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  49%|████▉     | 49/100 [28:36<30:12, 35.54s/it]

Episode 50, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 50
Episode 50, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 50, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  50%|█████     | 50/100 [29:11<29:17, 35.14s/it]

Episode 51, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 51, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 51, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 51, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 51, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 51, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Episode 51, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Episode 51, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Episode 51, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Episode 51, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 51
Episode 51, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 51, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 51, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  51%|█████     | 51/100 [29:47<29:01, 35.54s/it]

Episode 52, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 52
Episode 52, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 52, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  52%|█████▏    | 52/100 [30:22<28:14, 35.31s/it]

Episode 53, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 53, Timestep 9: action=5, reward=-10, terminated=False, penalties=1
Episode 53, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 53
Episode 53, Timestep 11: action=4, reward=-10, terminated=False, penalties=2
Episode 53, Timestep 12: action=3, reward=-1, terminated=False, penalties=2
Episode 53, Timestep 13: action=1, reward=-1, terminated

Training Episodes:  53%|█████▎    | 53/100 [30:55<27:09, 34.66s/it]

Episode 54, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 54
Episode 54, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 54, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  54%|█████▍    | 54/100 [31:34<27:29, 35.85s/it]

Episode 55, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 55, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 55, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 55, Timestep 4: action=5, reward=-10, terminated=False, penalties=1
Episode 55, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 55
Episode 55, Timestep 11: action=2, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 55, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  55%|█████▌    | 55/100 [32:07<26:21, 35.13s/it]

Episode 56, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 56, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 56
Episode 56, Timestep 11: action=5, reward=-10, terminated=False, penalties=1
Episode 56, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 56, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  56%|█████▌    | 56/100 [32:44<26:08, 35.65s/it]

Episode 57, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 57
Episode 57, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 57, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  57%|█████▋    | 57/100 [33:18<25:11, 35.14s/it]

Episode 58, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 58
Episode 58, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 58, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  58%|█████▊    | 58/100 [33:53<24:31, 35.03s/it]

Episode 59, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 59
Episode 59, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 59, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  59%|█████▉    | 59/100 [34:30<24:23, 35.70s/it]

Episode 60, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 60, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 60
Episode 60, Timestep 11: action=5, reward=-10, terminated=False, penalties=1
Episode 60, Timestep 12: action=0, reward=-1, terminated=False, penalties=1
Episode 60, Timestep 13: action=0, reward=-1, terminated=

Training Episodes:  60%|██████    | 60/100 [35:05<23:36, 35.40s/it]

Episode 61, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 61, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 61, Timestep 3: action=4, reward=-10, terminated=False, penalties=1
Episode 61, Timestep 4: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 5: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 6: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 7: action=0, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 8: action=0, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 9: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 61
Episode 61, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 61, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  61%|██████    | 61/100 [35:43<23:35, 36.30s/it]

Episode 62, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 62
Episode 62, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 62, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  62%|██████▏   | 62/100 [36:15<22:16, 35.16s/it]

Episode 63, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 63, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Episode 63, Timestep 9: action=0, reward=-1, terminated=False, penalties=1
Episode 63, Timestep 10: action=0, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 63
Episode 63, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 63, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 63, Timestep 13: action=0, reward=-1, terminated=

Training Episodes:  63%|██████▎   | 63/100 [36:52<21:51, 35.44s/it]

Episode 64, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 64, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 64, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 64, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 64, Timestep 5: action=4, reward=-10, terminated=False, penalties=1
Episode 64, Timestep 6: action=2, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 7: action=2, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 8: action=2, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 64
Episode 64, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 64, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  64%|██████▍   | 64/100 [37:28<21:23, 35.64s/it]

Episode 65, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 65
Episode 65, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 65, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  65%|██████▌   | 65/100 [37:59<20:02, 34.36s/it]

Episode 66, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 66
Episode 66, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 66, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  66%|██████▌   | 66/100 [38:37<20:07, 35.53s/it]

Episode 67, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 67
Episode 67, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 67, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  67%|██████▋   | 67/100 [39:12<19:25, 35.32s/it]

Episode 68, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 68, Timestep 2: action=4, reward=-10, terminated=False, penalties=1
Episode 68, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Episode 68, Timestep 4: action=5, reward=-10, terminated=False, penalties=2
Episode 68, Timestep 5: action=1, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 6: action=1, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 7: action=1, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 8: action=1, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 9: action=1, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 10: action=1, reward=-1, terminated=False, penalties=2
Retraining on mini-batch at episode 68
Episode 68, Timestep 11: action=0, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 12: action=0, reward=-1, terminated=False, penalties=2
Episode 68, Timestep 13: action=5, reward=-10, terminate

Training Episodes:  68%|██████▊   | 68/100 [39:51<19:22, 36.32s/it]

Episode 69, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 69
Episode 69, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 69, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  69%|██████▉   | 69/100 [40:26<18:34, 35.94s/it]

Episode 70, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 70
Episode 70, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 70, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  70%|███████   | 70/100 [41:02<17:56, 35.89s/it]

Episode 71, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 71
Episode 71, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 71, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  71%|███████   | 71/100 [41:37<17:20, 35.88s/it]

Episode 72, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 72, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 72, Timestep 3: action=4, reward=-10, terminated=False, penalties=1
Episode 72, Timestep 4: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 5: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 6: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 7: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 8: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 72
Episode 72, Timestep 11: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 72, Timestep 13: action=2, reward=-1, terminated=

Training Episodes:  72%|███████▏  | 72/100 [42:11<16:25, 35.21s/it]

Episode 73, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 73
Episode 73, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 73, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  73%|███████▎  | 73/100 [42:50<16:22, 36.38s/it]

Episode 74, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 74
Episode 74, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 74, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  74%|███████▍  | 74/100 [43:23<15:18, 35.34s/it]

Episode 75, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 75, Timestep 2: action=5, reward=-10, terminated=False, penalties=1
Episode 75, Timestep 3: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 4: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 5: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 6: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 7: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 8: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 9: action=3, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 75
Episode 75, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 75, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  75%|███████▌  | 75/100 [44:00<14:56, 35.88s/it]

Episode 76, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 76
Episode 76, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 76, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  76%|███████▌  | 76/100 [44:36<14:17, 35.71s/it]

Episode 77, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 77
Episode 77, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 77, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  77%|███████▋  | 77/100 [45:09<13:25, 35.01s/it]

Episode 78, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 78
Episode 78, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 78, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  78%|███████▊  | 78/100 [45:48<13:14, 36.12s/it]

Episode 79, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 79, Timestep 10: action=4, reward=-10, terminated=False, penalties=1
Retraining on mini-batch at episode 79
Episode 79, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 79, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 79, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  79%|███████▉  | 79/100 [46:21<12:19, 35.22s/it]

Episode 80, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 80
Episode 80, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 80, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  80%|████████  | 80/100 [46:58<11:53, 35.68s/it]

Episode 81, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 81, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 81, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 81, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 81, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 81, Timestep 6: action=4, reward=-10, terminated=False, penalties=1
Episode 81, Timestep 7: action=2, reward=-1, terminated=False, penalties=1
Episode 81, Timestep 8: action=2, reward=-1, terminated=False, penalties=1
Episode 81, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 81, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 81
Episode 81, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 81, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 81, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  81%|████████  | 81/100 [47:32<11:08, 35.20s/it]

Episode 82, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 82
Episode 82, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 82, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  82%|████████▏ | 82/100 [48:06<10:29, 34.99s/it]

Episode 83, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 83, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 83
Episode 83, Timestep 11: action=4, reward=-10, terminated=False, penalties=1
Episode 83, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 83, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  83%|████████▎ | 83/100 [48:44<10:09, 35.85s/it]

Episode 84, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 84
Episode 84, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 84, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  84%|████████▍ | 84/100 [49:17<09:20, 35.02s/it]

Episode 85, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 85
Episode 85, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 85, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  85%|████████▌ | 85/100 [49:53<08:47, 35.16s/it]

Episode 86, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 86
Episode 86, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 86, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  86%|████████▌ | 86/100 [50:28<08:12, 35.19s/it]

Episode 87, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 87, Timestep 2: action=5, reward=-10, terminated=False, penalties=1
Episode 87, Timestep 3: action=4, reward=-10, terminated=False, penalties=2
Episode 87, Timestep 4: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 5: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 6: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 7: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 8: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 9: action=3, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 10: action=3, reward=-1, terminated=False, penalties=2
Retraining on mini-batch at episode 87
Episode 87, Timestep 11: action=1, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 12: action=2, reward=-1, terminated=False, penalties=2
Episode 87, Timestep 13: action=2, reward=-1, terminated

Training Episodes:  87%|████████▋ | 87/100 [51:04<07:41, 35.49s/it]

Episode 88, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 88, Timestep 7: action=5, reward=-10, terminated=False, penalties=1
Episode 88, Timestep 8: action=3, reward=-1, terminated=False, penalties=1
Episode 88, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 88, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 88
Episode 88, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 88, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 88, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  88%|████████▊ | 88/100 [51:41<07:11, 35.95s/it]

Episode 89, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 89
Episode 89, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 12: action=0, reward=-1, terminated=False, penalties=0
Episode 89, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  89%|████████▉ | 89/100 [52:14<06:26, 35.13s/it]

Episode 90, Timestep 1: action=4, reward=-10, terminated=False, penalties=1
Episode 90, Timestep 2: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 3: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 5: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 6: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 7: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 8: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 10: action=2, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 90
Episode 90, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 12: action=1, reward=-1, terminated=False, penalties=1
Episode 90, Timestep 13: action=1, reward=-1, terminated=

Training Episodes:  90%|█████████ | 90/100 [52:52<05:59, 35.96s/it]

Episode 91, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 91
Episode 91, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 91, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  91%|█████████ | 91/100 [53:26<05:17, 35.33s/it]

Episode 92, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 10: action=2, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 92
Episode 92, Timestep 11: action=3, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 92, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  92%|█████████▏| 92/100 [54:02<04:44, 35.51s/it]

Episode 93, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 10: action=3, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 93
Episode 93, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 12: action=2, reward=-1, terminated=False, penalties=0
Episode 93, Timestep 13: action=2, reward=-1, terminated=F

Training Episodes:  93%|█████████▎| 93/100 [54:39<04:11, 35.90s/it]

Episode 94, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 94
Episode 94, Timestep 11: action=2, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 94, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  94%|█████████▍| 94/100 [55:11<03:28, 34.75s/it]

Episode 95, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 6: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 7: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Episode 95, Timestep 9: action=4, reward=-10, terminated=False, penalties=1
Episode 95, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 95
Episode 95, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 95, Timestep 12: action=3, reward=-1, terminated=False, penalties=1
Episode 95, Timestep 13: action=3, reward=-1, terminated=

Training Episodes:  95%|█████████▌| 95/100 [55:49<02:58, 35.67s/it]

Episode 96, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 96
Episode 96, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 96, Timestep 13: action=0, reward=-1, terminated=F

Training Episodes:  96%|█████████▌| 96/100 [56:23<02:21, 35.29s/it]

Episode 97, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 97
Episode 97, Timestep 11: action=0, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 12: action=3, reward=-1, terminated=False, penalties=0
Episode 97, Timestep 13: action=3, reward=-1, terminated=F

Training Episodes:  97%|█████████▋| 97/100 [56:59<01:46, 35.36s/it]

Episode 98, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 98
Episode 98, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 98, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  98%|█████████▊| 98/100 [57:36<01:12, 36.07s/it]

Episode 99, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Retraining on mini-batch at episode 99
Episode 99, Timestep 11: action=1, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 12: action=1, reward=-1, terminated=False, penalties=0
Episode 99, Timestep 13: action=1, reward=-1, terminated=F

Training Episodes:  99%|█████████▉| 99/100 [58:10<00:35, 35.46s/it]

Episode 100, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Episode 100, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Episode 100, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Episode 100, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Episode 100, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Episode 100, Timestep 6: action=4, reward=-10, terminated=False, penalties=1
Episode 100, Timestep 7: action=0, reward=-1, terminated=False, penalties=1
Episode 100, Timestep 8: action=0, reward=-1, terminated=False, penalties=1
Episode 100, Timestep 9: action=0, reward=-1, terminated=False, penalties=1
Episode 100, Timestep 10: action=0, reward=-1, terminated=False, penalties=1
Retraining on mini-batch at episode 100
Episode 100, Timestep 11: action=3, reward=-1, terminated=False, penalties=1
Episode 100, Timestep 12: action=2, reward=-1, terminated=False, penalties=1
Episode 100, Timestep 13: action=3, reward=-

Training Episodes: 100%|██████████| 100/100 [58:48<00:00, 35.29s/it]
Evaluation Episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation Episode 1, Timestep 1: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 1, Timestep 2: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 11: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 1, Timestep 12: action=5, reward=-10, termi

Evaluation Episodes:   1%|          | 1/100 [00:02<04:06,  2.49s/it]

Evaluation Episode 1, Timestep 38: action=1, reward=-1, terminated=False, penalties=5
Evaluation Episode 1, Timestep 39: action=1, reward=-1, terminated=False, penalties=5
Evaluation Episode 1, Timestep 40: action=1, reward=-1, terminated=False, penalties=5
Evaluation Episode 2, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 2, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 2, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 2, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 2, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 2, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 2, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 2, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 2, Timestep 9: action=1, reward=-1, termin

Evaluation Episodes:   2%|▏         | 2/100 [00:05<04:10,  2.56s/it]

Evaluation Episode 2, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 3, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 3, Timestep 11: action=4, reward=-10, termin

Evaluation Episodes:   3%|▎         | 3/100 [00:07<04:13,  2.61s/it]

Evaluation Episode 3, Timestep 38: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 3, Timestep 39: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 3, Timestep 40: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 4, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 4, Timestep 9: action=1, reward=-1, termina

Evaluation Episodes:   4%|▍         | 4/100 [00:10<04:25,  2.77s/it]

Evaluation Episode 4, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 5, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 5, Timestep 10: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 5, Timestep 11: action=1, reward=-1, termin

Evaluation Episodes:   5%|▌         | 5/100 [00:14<04:47,  3.03s/it]

Evaluation Episode 5, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 5, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 6, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 6, Timestep 10: action=1, reward=-1, termina

Evaluation Episodes:   6%|▌         | 6/100 [00:17<04:37,  2.95s/it]

Evaluation Episode 6, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 6, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 6, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 7, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 7, Timestep 9: action=1, reward=-1, termina

Evaluation Episodes:   7%|▋         | 7/100 [00:19<04:31,  2.92s/it]

Evaluation Episode 7, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 8, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 8, Timestep 11: action=1, reward=-1, termina

Evaluation Episodes:   8%|▊         | 8/100 [00:22<04:31,  2.96s/it]

Evaluation Episode 8, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 9, Timestep 10: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 9, Timestep 11: action=1, reward=-1, termin

Evaluation Episodes:   9%|▉         | 9/100 [00:26<04:47,  3.16s/it]

Evaluation Episode 9, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 9, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 10, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 10, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 10, Timestep 3: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 10, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 10, Timestep 10: action=1, reward=

Evaluation Episodes:  10%|█         | 10/100 [00:29<04:29,  2.99s/it]

Evaluation Episode 10, Timestep 37: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 10, Timestep 38: action=4, reward=-10, terminated=False, penalties=3
Evaluation Episode 10, Timestep 39: action=2, reward=-1, terminated=False, penalties=3
Evaluation Episode 10, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 11, Timestep 1: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 11, Timestep 2: action=5, reward=-10, terminated=False, penalties=2
Evaluation Episode 11, Timestep 3: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 4: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 5: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 6: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 7: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 8: action=1, re

Evaluation Episodes:  11%|█         | 11/100 [00:31<04:16,  2.88s/it]

Evaluation Episode 11, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 11, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 12, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 12, Timestep 10: action=1, reward

Evaluation Episodes:  12%|█▏        | 12/100 [00:34<04:08,  2.82s/it]

Evaluation Episode 12, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 12, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 13, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 13, Timestep 10: action=1, reward

Evaluation Episodes:  13%|█▎        | 13/100 [00:37<04:21,  3.01s/it]

Evaluation Episode 13, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 14, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 14, Timestep 11: action=1, reward

Evaluation Episodes:  14%|█▍        | 14/100 [00:40<04:18,  3.00s/it]

Evaluation Episode 14, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 15, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 15, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 15, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 15, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 15, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 15, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 15, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 15, Timestep 8: action=4, reward=-10, terminated=False, penalties=2
Evaluation Episode 15, Timestep 9: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 15, Timestep 10: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 15, Timestep 11: action=2, rewa

Evaluation Episodes:  15%|█▌        | 15/100 [00:43<03:57,  2.80s/it]

Evaluation Episode 15, Timestep 39: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 15, Timestep 40: action=2, reward=-1, terminated=False, penalties=3
Evaluation Episode 16, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 2: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 10: action=1, reward

Evaluation Episodes:  16%|█▌        | 16/100 [00:45<03:52,  2.76s/it]

Evaluation Episode 16, Timestep 37: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 16, Timestep 38: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 16, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 16, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 17, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 17, Timestep 8: action=1, rewa

Evaluation Episodes:  17%|█▋        | 17/100 [00:50<04:22,  3.16s/it]

Evaluation Episode 17, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 17, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 18, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 8: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 18, Timestep 10: action=1, reward

Evaluation Episodes:  18%|█▊        | 18/100 [00:53<04:26,  3.26s/it]

Evaluation Episode 18, Timestep 37: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 18, Timestep 38: action=5, reward=-10, terminated=False, penalties=2
Evaluation Episode 18, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 18, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 19, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 19, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 19, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 19, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 19, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 19, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 19, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 19, Timestep 8: action=1, rew

Evaluation Episodes:  19%|█▉        | 19/100 [00:56<04:18,  3.19s/it]

Evaluation Episode 19, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 19, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 20, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 10: action=1, reward

Evaluation Episodes:  20%|██        | 20/100 [00:59<04:16,  3.21s/it]

Evaluation Episode 20, Timestep 38: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 20, Timestep 40: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 21, Timestep 9: action=1, reward

Evaluation Episodes:  21%|██        | 21/100 [01:03<04:31,  3.44s/it]

Evaluation Episode 21, Timestep 39: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 21, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 22, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 10: action=1, reward

Evaluation Episodes:  22%|██▏       | 22/100 [01:08<04:49,  3.71s/it]

Evaluation Episode 22, Timestep 38: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 22, Timestep 39: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 22, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 23, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 23, Timestep 9: action=1, rewar

Evaluation Episodes:  23%|██▎       | 23/100 [01:11<04:31,  3.52s/it]

Evaluation Episode 23, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 24, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 11: action=1, reward

Evaluation Episodes:  24%|██▍       | 24/100 [01:14<04:20,  3.42s/it]

Evaluation Episode 24, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 24, Timestep 40: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 25, Timestep 10: action=1, reward

Evaluation Episodes:  25%|██▌       | 25/100 [01:18<04:33,  3.65s/it]

Evaluation Episode 25, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 25, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 26, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 26, Timestep 10: action=1, reward

Evaluation Episodes:  26%|██▌       | 26/100 [01:21<04:10,  3.38s/it]

Evaluation Episode 26, Timestep 38: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 26, Timestep 39: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 26, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 27, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 27, Timestep 9: action=1, reward

Evaluation Episodes:  27%|██▋       | 27/100 [01:24<04:02,  3.32s/it]

Evaluation Episode 27, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 28, Timestep 11: action=1, reward

Evaluation Episodes:  28%|██▊       | 28/100 [01:27<03:52,  3.23s/it]

Evaluation Episode 28, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 29, Timestep 2: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 29, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 5: action=0, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 29, Timestep 11: action=1, rewar

Evaluation Episodes:  29%|██▉       | 29/100 [01:31<03:58,  3.36s/it]

Evaluation Episode 29, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 30, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 30, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 30, Timestep 3: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 30, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 30, Timestep 11: action=1, rewar

Evaluation Episodes:  30%|███       | 30/100 [01:33<03:40,  3.16s/it]

Evaluation Episode 30, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 31, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 31, Timestep 11: action=1, reward

Evaluation Episodes:  31%|███       | 31/100 [01:36<03:32,  3.09s/it]

Evaluation Episode 31, Timestep 37: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 31, Timestep 38: action=5, reward=-10, terminated=False, penalties=2
Evaluation Episode 31, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 31, Timestep 40: action=4, reward=-10, terminated=False, penalties=3
Evaluation Episode 32, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 32, Timestep 8: action=1, rew

Evaluation Episodes:  32%|███▏      | 32/100 [01:39<03:23,  2.99s/it]

Evaluation Episode 32, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 32, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 33, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 33, Timestep 7: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 33, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 33, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 33, Timestep 10: action=1, rewar

Evaluation Episodes:  33%|███▎      | 33/100 [01:43<03:35,  3.22s/it]

Evaluation Episode 33, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 34, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 34, Timestep 11: action=1, reward

Evaluation Episodes:  34%|███▍      | 34/100 [01:46<03:23,  3.08s/it]

Evaluation Episode 34, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 34, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 35, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 10: action=1, reward

Evaluation Episodes:  35%|███▌      | 35/100 [01:49<03:17,  3.04s/it]

Evaluation Episode 35, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 35, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 9: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 36, Timestep 10: action=1, reward

Evaluation Episodes:  36%|███▌      | 36/100 [01:51<03:09,  2.96s/it]

Evaluation Episode 36, Timestep 37: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 36, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 36, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 36, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 37, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 37, Timestep 8: action=1, rewar

Evaluation Episodes:  37%|███▋      | 37/100 [01:54<03:02,  2.90s/it]

Evaluation Episode 37, Timestep 37: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 37, Timestep 38: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 37, Timestep 39: action=5, reward=-10, terminated=False, penalties=3
Evaluation Episode 37, Timestep 40: action=2, reward=-1, terminated=False, penalties=3
Evaluation Episode 38, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 38, Timestep 8: action=1, rewa

Evaluation Episodes:  38%|███▊      | 38/100 [01:58<03:14,  3.13s/it]

Evaluation Episode 38, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 38, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 39, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 39, Timestep 3: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 39, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 39, Timestep 10: action=1, rewar

Evaluation Episodes:  39%|███▉      | 39/100 [02:00<03:02,  2.99s/it]

Evaluation Episode 39, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 40, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 40, Timestep 11: action=1, reward

Evaluation Episodes:  40%|████      | 40/100 [02:03<02:59,  3.00s/it]

Evaluation Episode 40, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 11: action=1, reward

Evaluation Episodes:  41%|████      | 41/100 [02:06<02:53,  2.94s/it]

Evaluation Episode 41, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 41, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 10: action=1, reward

Evaluation Episodes:  42%|████▏     | 42/100 [02:10<03:05,  3.19s/it]

Evaluation Episode 42, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 42, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 43, Timestep 10: action=5, reward

Evaluation Episodes:  43%|████▎     | 43/100 [02:13<02:54,  3.06s/it]

Evaluation Episode 43, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 43, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 44, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 44, Timestep 10: action=1, reward

Evaluation Episodes:  44%|████▍     | 44/100 [02:15<02:44,  2.94s/it]

Evaluation Episode 44, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 44, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 44, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 45, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 45, Timestep 9: action=1, reward

Evaluation Episodes:  45%|████▌     | 45/100 [02:18<02:38,  2.88s/it]

Evaluation Episode 45, Timestep 37: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 45, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 45, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 45, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 46, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 46, Timestep 8: action=1, rewar

Evaluation Episodes:  46%|████▌     | 46/100 [02:21<02:41,  2.99s/it]

Evaluation Episode 46, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 47, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 47, Timestep 3: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 47, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 9: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 10: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 47, Timestep 11: action=1, rewar

Evaluation Episodes:  47%|████▋     | 47/100 [02:25<02:50,  3.22s/it]

Evaluation Episode 47, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 48, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 48, Timestep 11: action=1, reward

Evaluation Episodes:  48%|████▊     | 48/100 [02:29<02:50,  3.27s/it]

Evaluation Episode 48, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 1: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 3: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 7: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 8: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 9: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 10: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 11: action=0, reward

Evaluation Episodes:  49%|████▉     | 49/100 [02:32<02:47,  3.29s/it]

Evaluation Episode 49, Timestep 37: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 38: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 49, Timestep 40: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 50, Timestep 8: action=1, rewar

Evaluation Episodes:  50%|█████     | 50/100 [02:36<02:54,  3.48s/it]

Evaluation Episode 50, Timestep 36: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 50, Timestep 37: action=5, reward=-10, terminated=False, penalties=3
Evaluation Episode 50, Timestep 38: action=3, reward=-1, terminated=False, penalties=3
Evaluation Episode 50, Timestep 39: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 50, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 51, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 5: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 51, Timestep 7: action=4, rew

Evaluation Episodes:  51%|█████     | 51/100 [02:39<02:44,  3.36s/it]

Evaluation Episode 51, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 51, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 52, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 52, Timestep 10: action=1, reward

Evaluation Episodes:  52%|█████▏    | 52/100 [02:42<02:37,  3.28s/it]

Evaluation Episode 52, Timestep 38: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 52, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 52, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 53, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 53, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 53, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 53, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 53, Timestep 5: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 53, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 53, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 53, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 53, Timestep 9: action=1, rewar

Evaluation Episodes:  53%|█████▎    | 53/100 [02:45<02:34,  3.28s/it]

Evaluation Episode 53, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 53, Timestep 40: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 54, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 54, Timestep 10: action=1, reward

Evaluation Episodes:  54%|█████▍    | 54/100 [02:49<02:40,  3.49s/it]

Evaluation Episode 54, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 55, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 55, Timestep 11: action=1, reward

Evaluation Episodes:  55%|█████▌    | 55/100 [02:53<02:35,  3.46s/it]

Evaluation Episode 55, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 5: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 56, Timestep 11: action=1, reward

Evaluation Episodes:  56%|█████▌    | 56/100 [02:56<02:29,  3.40s/it]

Evaluation Episode 56, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 56, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 57, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 57, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 57, Timestep 9: action=0, reward=-1, terminated=False, penalties=1
Evaluation Episode 57, Timestep 10: action=1, rewar

Evaluation Episodes:  57%|█████▋    | 57/100 [02:59<02:23,  3.34s/it]

Evaluation Episode 57, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 57, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 58, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 58, Timestep 10: action=1, reward

Evaluation Episodes:  58%|█████▊    | 58/100 [03:03<02:23,  3.42s/it]

Evaluation Episode 58, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 58, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 58, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 59, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 59, Timestep 9: action=1, reward

Evaluation Episodes:  59%|█████▉    | 59/100 [03:06<02:15,  3.31s/it]

Evaluation Episode 59, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 60, Timestep 11: action=4, reward

Evaluation Episodes:  60%|██████    | 60/100 [03:09<02:09,  3.23s/it]

Evaluation Episode 60, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 60, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 60, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 61, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 61, Timestep 9: action=1, reward

Evaluation Episodes:  61%|██████    | 61/100 [03:12<02:07,  3.26s/it]

Evaluation Episode 61, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 61, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 61, Timestep 40: action=0, reward=-1, terminated=False, penalties=1
Evaluation Episode 62, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 62, Timestep 9: action=1, reward

Evaluation Episodes:  62%|██████▏   | 62/100 [03:15<02:05,  3.30s/it]

Evaluation Episode 62, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 62, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 63, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 63, Timestep 8: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 63, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 63, Timestep 10: action=4, rewar

Evaluation Episodes:  63%|██████▎   | 63/100 [03:18<01:56,  3.14s/it]

Evaluation Episode 63, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 64, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 64, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 64, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 64, Timestep 10: action=5, reward=-10, terminated=False, penalties=2
Evaluation Episode 64, Timestep 11: action=1, rewa

Evaluation Episodes:  64%|██████▍   | 64/100 [03:21<01:47,  2.97s/it]

Evaluation Episode 64, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 65, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 65, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 65, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 65, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 65, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 65, Timestep 6: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 65, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 65, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 65, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 65, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 65, Timestep 11: action=4, rewar

Evaluation Episodes:  65%|██████▌   | 65/100 [03:24<01:41,  2.90s/it]

Evaluation Episode 65, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 66, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 66, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 66, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 66, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 66, Timestep 11: action=0, rewar

Evaluation Episodes:  66%|██████▌   | 66/100 [03:27<01:46,  3.14s/it]

Evaluation Episode 66, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 66, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 67, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 2: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 3: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 4: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 5: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 7: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 8: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 9: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 67, Timestep 10: action=2, reward

Evaluation Episodes:  67%|██████▋   | 67/100 [03:30<01:39,  3.02s/it]

Evaluation Episode 67, Timestep 39: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 67, Timestep 40: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 68, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 10: action=1, reward

Evaluation Episodes:  68%|██████▊   | 68/100 [03:33<01:33,  2.94s/it]

Evaluation Episode 68, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 68, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 69, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 69, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 69, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 69, Timestep 4: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 69, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 10: action=1, rewar

Evaluation Episodes:  69%|██████▉   | 69/100 [03:36<01:30,  2.93s/it]

Evaluation Episode 69, Timestep 37: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 69, Timestep 38: action=4, reward=-10, terminated=False, penalties=2
Evaluation Episode 69, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 69, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 70, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 8: action=1, rewa

Evaluation Episodes:  70%|███████   | 70/100 [03:39<01:31,  3.04s/it]

Evaluation Episode 70, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 70, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 1: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 71, Timestep 10: action=1, reward

Evaluation Episodes:  71%|███████   | 71/100 [03:42<01:27,  3.03s/it]

Evaluation Episode 71, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 72, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 72, Timestep 11: action=1, reward

Evaluation Episodes:  72%|███████▏  | 72/100 [03:45<01:21,  2.92s/it]

Evaluation Episode 72, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 73, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 73, Timestep 11: action=1, reward

Evaluation Episodes:  73%|███████▎  | 73/100 [03:47<01:15,  2.80s/it]

Evaluation Episode 73, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 73, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 74, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 6: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 74, Timestep 10: action=1, reward

Evaluation Episodes:  74%|███████▍  | 74/100 [03:50<01:09,  2.69s/it]

Evaluation Episode 74, Timestep 38: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 74, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 74, Timestep 40: action=3, reward=-1, terminated=False, penalties=2
Evaluation Episode 75, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 75, Timestep 9: action=1, reward

Evaluation Episodes:  75%|███████▌  | 75/100 [03:53<01:14,  2.98s/it]

Evaluation Episode 75, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 76, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 76, Timestep 11: action=1, reward

Evaluation Episodes:  76%|███████▌  | 76/100 [03:56<01:08,  2.86s/it]

Evaluation Episode 76, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 77, Timestep 2: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 77, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 5: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 77, Timestep 11: action=1, rewar

Evaluation Episodes:  77%|███████▋  | 77/100 [03:59<01:08,  2.97s/it]

Evaluation Episode 77, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 77, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 78, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 78, Timestep 10: action=1, reward

Evaluation Episodes:  78%|███████▊  | 78/100 [04:03<01:11,  3.23s/it]

Evaluation Episode 78, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 78, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 79, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 79, Timestep 3: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 79, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 79, Timestep 10: action=1, rewar

Evaluation Episodes:  79%|███████▉  | 79/100 [04:07<01:15,  3.61s/it]

Evaluation Episode 79, Timestep 40: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 80, Timestep 2: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 80, Timestep 3: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 80, Timestep 11: action=1, rewar

Evaluation Episodes:  80%|████████  | 80/100 [04:11<01:09,  3.47s/it]

Evaluation Episode 80, Timestep 40: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 81, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 81, Timestep 11: action=1, reward

Evaluation Episodes:  81%|████████  | 81/100 [04:14<01:03,  3.33s/it]

Evaluation Episode 81, Timestep 38: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 81, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 81, Timestep 40: action=5, reward=-10, terminated=False, penalties=3
Evaluation Episode 82, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 82, Timestep 9: action=3, rewar

Evaluation Episodes:  82%|████████▏ | 82/100 [04:17<00:59,  3.31s/it]

Evaluation Episode 82, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 82, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 83, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 83, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 83, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 83, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 83, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 83, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 83, Timestep 7: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 83, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 83, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 83, Timestep 10: action=1, rewar

Evaluation Episodes:  83%|████████▎ | 83/100 [04:21<00:59,  3.52s/it]

Evaluation Episode 83, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 83, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 84, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 84, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 84, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 84, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 84, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 84, Timestep 6: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 84, Timestep 7: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 84, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 84, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 84, Timestep 10: action=1, rewar

Evaluation Episodes:  84%|████████▍ | 84/100 [04:24<00:54,  3.43s/it]

Evaluation Episode 84, Timestep 38: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 84, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 84, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 85, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 9: action=1, reward

Evaluation Episodes:  85%|████████▌ | 85/100 [04:27<00:51,  3.42s/it]

Evaluation Episode 85, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 85, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 86, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 86, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 86, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 86, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 86, Timestep 5: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 86, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 86, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 86, Timestep 8: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 86, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 86, Timestep 10: action=1, rewar

Evaluation Episodes:  86%|████████▌ | 86/100 [04:31<00:47,  3.38s/it]

Evaluation Episode 86, Timestep 40: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 87, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 4: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 87, Timestep 11: action=5, reward

Evaluation Episodes:  87%|████████▋ | 87/100 [04:34<00:43,  3.34s/it]

Evaluation Episode 87, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 87, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 88, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 2: action=0, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 10: action=1, reward

Evaluation Episodes:  88%|████████▊ | 88/100 [04:37<00:38,  3.22s/it]

Evaluation Episode 88, Timestep 38: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 39: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 88, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 89, Timestep 1: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 89, Timestep 2: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 3: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 4: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 5: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 6: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 7: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 8: action=0, reward=-1, terminated=False, penalties=1
Evaluation Episode 89, Timestep 9: action=1, rewar

Evaluation Episodes:  89%|████████▉ | 89/100 [04:40<00:34,  3.10s/it]

Evaluation Episode 89, Timestep 37: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 89, Timestep 38: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 89, Timestep 39: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 89, Timestep 40: action=5, reward=-10, terminated=False, penalties=4
Evaluation Episode 90, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 90, Timestep 8: action=1, rewa

Evaluation Episodes:  90%|█████████ | 90/100 [04:43<00:32,  3.27s/it]

Evaluation Episode 90, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 90, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 91, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 91, Timestep 10: action=3, reward

Evaluation Episodes:  91%|█████████ | 91/100 [04:47<00:30,  3.34s/it]

Evaluation Episode 91, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 4: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 92, Timestep 11: action=3, reward

Evaluation Episodes:  92%|█████████▏| 92/100 [04:50<00:25,  3.20s/it]

Evaluation Episode 92, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 93, Timestep 8: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 93, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 93, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 93, Timestep 11: action=1, rewar

Evaluation Episodes:  93%|█████████▎| 93/100 [04:52<00:21,  3.01s/it]

Evaluation Episode 93, Timestep 37: action=1, reward=-1, terminated=False, penalties=3
Evaluation Episode 93, Timestep 38: action=4, reward=-10, terminated=False, penalties=4
Evaluation Episode 93, Timestep 39: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 93, Timestep 40: action=1, reward=-1, terminated=False, penalties=4
Evaluation Episode 94, Timestep 1: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 94, Timestep 2: action=5, reward=-10, terminated=False, penalties=1
Evaluation Episode 94, Timestep 3: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 94, Timestep 4: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 94, Timestep 5: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 94, Timestep 6: action=2, reward=-1, terminated=False, penalties=1
Evaluation Episode 94, Timestep 7: action=3, reward=-1, terminated=False, penalties=1
Evaluation Episode 94, Timestep 8: action=2, rew

Evaluation Episodes:  94%|█████████▍| 94/100 [04:55<00:17,  2.94s/it]

Evaluation Episode 94, Timestep 40: action=2, reward=-1, terminated=False, penalties=2
Evaluation Episode 95, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 6: action=2, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 95, Timestep 11: action=1, reward

Evaluation Episodes:  95%|█████████▌| 95/100 [04:59<00:16,  3.22s/it]

Evaluation Episode 95, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 95, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 96, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 3: action=3, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 96, Timestep 10: action=1, reward

Evaluation Episodes:  96%|█████████▌| 96/100 [05:02<00:12,  3.09s/it]

Evaluation Episode 96, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 97, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 97, Timestep 11: action=1, reward

Evaluation Episodes:  97%|█████████▋| 97/100 [05:05<00:09,  3.05s/it]

Evaluation Episode 97, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 98, Timestep 7: action=4, reward=-10, terminated=False, penalties=1
Evaluation Episode 98, Timestep 8: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 98, Timestep 9: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 98, Timestep 10: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 98, Timestep 11: action=1, rewar

Evaluation Episodes:  98%|█████████▊| 98/100 [05:08<00:05,  3.00s/it]

Evaluation Episode 98, Timestep 37: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 98, Timestep 38: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 98, Timestep 39: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 98, Timestep 40: action=1, reward=-1, terminated=False, penalties=2
Evaluation Episode 99, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 99, Timestep 8: action=1, rewar

Evaluation Episodes:  99%|█████████▉| 99/100 [05:11<00:03,  3.21s/it]

Evaluation Episode 99, Timestep 40: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 1: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 2: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 3: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 4: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 5: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 6: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 7: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 8: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 9: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 10: action=1, reward=-1, terminated=False, penalties=0
Evaluation Episode 100, Timestep 11: actio

Evaluation Episodes: 100%|██████████| 100/100 [05:14<00:00,  3.15s/it]

Evaluation Episode 100, Timestep 39: action=1, reward=-1, terminated=False, penalties=1
Evaluation Episode 100, Timestep 40: action=1, reward=-1, terminated=False, penalties=1
Deep Q-Learning Method:
Average number of penalties per episode: 1.4
Average number of timesteps per trip: 40.0
Average rewards per move: -0.03290155440414508
Time to train: 3528.5490431785583 seconds





In [36]:
sm_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Sample Method": [64.2, 195.88, -772.42, 0.8409316539764404]
}

ql_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Q-Learning": [0.48125, 15.89639, 0.046793606804738715, 69.6101667881012]
}

sar_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "SARSA": [0.90008, 28.31204, -0.5942176334709375, 120.70162725448608]
}

dql_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Deep Q-Learning": [1.4, 40.0, -0.03290155440414508, 3528.5490431785583]
}

# Create DataFrames
sm_df = pd.DataFrame(sm_data)
ql_df = pd.DataFrame(ql_data)
sarsa_df = pd.DataFrame(sar_data)
dql_df = pd.DataFrame(dql_data)

# Merge DataFrames
summary_df = pd.merge(sm_df, ql_df, on="Metric")
summary_df = pd.merge(summary_df, sarsa_df, on="Metric")
summary_df = pd.merge(summary_df, dql_df, on="Metric")
summary_df = summary_df.round(2)

print("Summary DataFrame:")
summary_df

Summary DataFrame:


Unnamed: 0,Metric,Sample Method,Q-Learning,SARSA,Deep Q-Learning
0,Average Penalties,64.2,0.48,0.9,1.4
1,Average Timesteps,195.88,15.9,28.31,40.0
2,Average Rewards per Move,-772.42,0.05,-0.59,-0.03
3,Training Execution Time (seconds),0.84,69.61,120.7,3528.55
