In [14]:
!pip3 install torch torchvision torchaudio
!pip3 install ale_py gymnasium
!pip3 install opencv-python
!pip3 install pympler
!pip3 install logging



In [23]:
import os
from collections import defaultdict, deque
import random
import sys
from pympler import asizeof
import gc
import time  # Add for timing

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
import numpy as np
import cv2

import ale_py
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv  # Correct import for gymnasium

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader

import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from tqdm import tqdm
import logging

In [24]:
gym.register_envs(ale_py)

# ===== CONFIGURATION =====
# Add parallel environment settings
N_ENVS = 8  # Number of parallel environments
N_STEPS = 100  # Collect 100 steps before training
N_UPDATES = 4  # Number of training updates after collecting steps

# Image cropping parameters
CROP_TOP = 34
CROP_BOTTOM = 16
CROP_LEFT = 0
CROP_RIGHT = 0
RESIZE_WIDTH = 84
RESIZE_HEIGHT = 84

# Training hyperparameters
LEARNING_RATE = 0.1
N_EPISODES = 100
START_EPSILON = 0.7
FINAL_EPSILON = 0.1
EPSILON_DECAY = (START_EPSILON - FINAL_EPSILON) / (N_EPISODES * 1000)  # Decay per step
DISCOUNT_FACTOR = 0.99
MEMORY_CAPACITY = 1000000
MEMORY_FILL_SIZE = 50000
MINIBATCH_SIZE = 8192
TARGET_UPDATE_FREQ = 1000
MAX_STEPS_PER_EPISODE = 5000
REWARD_CLIP = True
MODEL_FILE = "dqn_pong"
LOAD_MODEL = "dqn_pong_best.pth"  # or False
# =========================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create vectorized environment
def make_env():
    env = gym.make("ALE/Pong-v5")
    return env

env_fns = [make_env for _ in range(N_ENVS)]
vec_env = SyncVectorEnv(env_fns)
print(f"Created vectorized environment with {N_ENVS} parallel instances")

Using device: cpu
Created vectorized environment with 8 parallel instances


In [25]:
# preprocessing image


# preprocess image demonstration
"""
def preprocess_and_show_steps(obs):  # input: 210x160x3 RGB
    fig, axs = plt.subplots(1, 4, figsize=(18, 5))

    # Step 1: Show original RGB image
    axs[0].imshow(obs)
    axs[0].set_title("Original RGB (210x160)")
    axs[0].axis('off')

    # Step 2: Convert to grayscale
    greyscaled = np.dot(obs[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    axs[1].imshow(greyscaled, cmap='gray')
    axs[1].set_title("Grayscale")
    axs[1].axis('off')

    # Step 3: Crop image vertically (rows 95 to 195)
    cropped = greyscaled[95:195, :]
    axs[2].imshow(cropped, cmap='gray')
    axs[2].set_title("Cropped (95:195)")
    axs[2].axis('off')

    # Step 4: Resize to 100x50 using nearest neighbor
    resized = cv2.resize(cropped, dsize=(50, 25), interpolation=cv2.INTER_NEAREST)
    axs[3].imshow(resized, cmap='gray')
    axs[3].set_title("Resized to 100x50")
    axs[3].axis('off')

    plt.tight_layout()
    plt.show()

    return resized

# Run and visualize preprocessing steps
obs, info = env.reset()
processed = preprocess_and_show_steps(obs)
"""

def preprocess(obs,
               crop_top=CROP_TOP,
               crop_bottom=CROP_BOTTOM,
               crop_left=CROP_LEFT,
               crop_right=CROP_RIGHT,
               resize_width=RESIZE_WIDTH,
               resize_height=RESIZE_HEIGHT):
    # Convert to grayscale
    if len(obs.shape) == 3 and obs.shape[2] == 3:
        greyscaled = np.dot(obs[..., :3], [0.299, 0.587, 0.114])
    else:
        greyscaled = obs

    # Get image dimensions
    height, width = greyscaled.shape

    # Calculate crop boundaries
    top_bound = crop_top
    bottom_bound = height - crop_bottom
    left_bound = crop_left
    right_bound = width - crop_right

    # Perform cropping
    cropped = greyscaled[top_bound:bottom_bound, left_bound:right_bound]

    # Resize
    resized = cv2.resize(cropped, (resize_width, resize_height),
                         interpolation=cv2.INTER_NEAREST)

    return resized

In [26]:
# defining neural network
# based on this architecture: https://arxiv.org/pdf/1312.5602
# code written by Claude Sonnet 4

class DQN(nn.Module):
    def __init__(self, num_actions, in_channels=4):
        super(DQN, self).__init__()

        # Convolutional layers (standard DQN architecture)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Dynamically calculate linear layer size
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, RESIZE_HEIGHT, RESIZE_WIDTH)
            dummy = F.relu(self.conv1(dummy))
            dummy = F.relu(self.conv2(dummy))
            dummy = F.relu(self.conv3(dummy))
            self.linear_size = dummy.view(1, -1).size(1)

        self.fc1 = nn.Linear(self.linear_size, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# example input: batch of 32 preprocessed frames
# batch_size = 32
# input_tensor = torch.randn(batch_size, 4, 25, 50) # Corrected input shape

# forward pass
# model = DQN(env.action_space.n)
# q_values = model(input_tensor)
# print(f"Input shape: {input_tensor.shape}")
# print(f"Output Q-values shape: {q_values.shape}")

In [27]:
class ReplayMemory:
    def __init__(self, capacity, state_shape):
        self.capacity = capacity
        self.state_shape = state_shape
        # Pre-allocate contiguous arrays
        self.states = np.zeros((capacity, *state_shape), dtype=np.uint8)
        self.next_states = np.zeros((capacity, *state_shape), dtype=np.uint8)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float16)  # Reduce precision
        self.dones = np.zeros(capacity, dtype=np.bool_)
        self.position = 0
        self.size = 0

    def push(self, state, action, reward, next_state, done):
        self.states[self.position] = state
        self.next_states[self.position] = next_state
        self.actions[self.position] = action
        self.rewards[self.position] = reward
        self.dones[self.position] = done

        self.position = (self.position + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        indices = np.random.choice(self.size, batch_size, replace=False)
        return (
            self.states[indices],
            self.actions[indices],
            self.rewards[indices],
            self.next_states[indices],
            self.dones[indices]
        )

    def __len__(self):
        return self.size

    def memory_usage_gb(self):
        total_bytes = (self.states.nbytes + self.next_states.nbytes +
                       self.actions.nbytes + self.rewards.nbytes +
                       self.dones.nbytes)
        return total_bytes / (1024 ** 3)

In [28]:
# Initialize replay memory
state_shape = (4, RESIZE_HEIGHT, RESIZE_WIDTH)
replay_memory = ReplayMemory(MEMORY_CAPACITY, state_shape)

# Initialize networks
online_net = DQN(gym.make("ALE/Pong-v5").action_space.n).to(device)
target_net = DQN(gym.make("ALE/Pong-v5").action_space.n).to(device)
target_net.load_state_dict(online_net.state_dict())
target_net.eval()

# === Load saved weights to continue training ===
if LOAD_MODEL:
    online_net.load_state_dict(torch.load(LOAD_MODEL, map_location=device))
    target_net.load_state_dict(online_net.state_dict())

# Optimizer
optimizer = optim.RMSprop(
    online_net.parameters(),
    lr=LEARNING_RATE,
    alpha=0.95,
    momentum=0.95,
    eps=0.01
)

In [29]:
class VectorAgent:
    def __init__(self, num_envs, initial_epsilon, epsilon_decay, final_epsilon, discount_factor):
        self.num_envs = num_envs
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor

    def get_actions(self, states, model):
        """Get actions for all environments in a batch"""
        # Convert states to tensor
        states_tensor = torch.tensor(states, dtype=torch.float32, device=device) / 255.0

        with torch.no_grad():
            q_values = model(states_tensor)

        # Epsilon-greedy for each environment
        actions = q_values.argmax(1).cpu().numpy()
        rand_mask = np.random.rand(self.num_envs) < self.epsilon
        actions[rand_mask] = np.array([vec_env.single_action_space.sample() for _ in range(self.num_envs)])[rand_mask]

        return actions

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [30]:
# Initialize agent
agent = VectorAgent(
    num_envs=N_ENVS,
    initial_epsilon=START_EPSILON,
    epsilon_decay=EPSILON_DECAY,
    final_epsilon=FINAL_EPSILON,
    discount_factor=DISCOUNT_FACTOR,
)

# Training setup
total_steps = 0
episode_rewards = []
episode_losses = []
episode_steps = []
best_reward = -float('inf')

# Initialize frame buffers for each environment
frame_buffers = [deque(maxlen=4) for _ in range(N_ENVS)]
current_states = np.zeros((N_ENVS, 4, RESIZE_HEIGHT, RESIZE_WIDTH))

# Create individual environments instead of vectorized env
print("Creating individual environments...")
envs = [make_env() for _ in range(N_ENVS)]

def reset_env(i):
    """Reset a single environment"""
    obs, _ = envs[i].reset()
    frame = preprocess(obs)
    frame_buffers[i].clear()
    for _ in range(4):
        frame_buffers[i].append(frame)
    return np.stack(frame_buffers[i], axis=0)

# Initialize all environments
print("Initializing parallel environments...")
for i in range(N_ENVS):
    current_states[i] = reset_env(i)

# Pre-fill replay memory 
print(f"Pre-filling replay memory to {MEMORY_FILL_SIZE} transitions...")
pbar = tqdm(total=MEMORY_FILL_SIZE, desc="Pre-filling")
episodes_completed = 0

# Track which environments are done
done_flags = [False] * N_ENVS

while len(replay_memory) < MEMORY_FILL_SIZE:
    # Prepare actions
    actions = []
    for i in range(N_ENVS):
        if done_flags[i]:
            # Environment is done, we'll reset it below
            actions.append(0)  # Dummy action, will be replaced
        else:
            # Get random action
            actions.append(envs[i].action_space.sample())
    
    # Step all environments
    next_obs_list = []
    rewards_list = []
    dones_list = []
    truncateds_list = []
    
    for i in range(N_ENVS):
        if done_flags[i]:
            # Reset done environments
            obs, _ = envs[i].reset()
            frame = preprocess(obs)
            frame_buffers[i].clear()
            for _ in range(4):
                frame_buffers[i].append(frame)
            next_state = np.stack(frame_buffers[i], axis=0)
            next_obs_list.append(obs)
            rewards_list.append(0)
            dones_list.append(False)
            truncateds_list.append(False)
            done_flags[i] = False
        else:
            # Step active environments
            next_obs, reward, done, truncated, info = envs[i].step(actions[i])
            next_obs_list.append(next_obs)
            rewards_list.append(reward)
            dones_list.append(done)
            truncateds_list.append(truncated)
    
    # Process each environment
    next_states = np.zeros_like(current_states)
    for i in range(N_ENVS):
        next_frame = preprocess(next_obs_list[i])
        frame_buffers[i].append(next_frame)
        next_states[i] = np.stack(frame_buffers[i], axis=0)
        
        if REWARD_CLIP:
            rewards_list[i] = np.clip(rewards_list[i], -1, 1)
        
        # Only store transitions from active environments (not those that were just reset)
        if not done_flags[i]:
            replay_memory.push(
                current_states[i].copy(),
                actions[i],
                rewards_list[i],
                next_states[i].copy(),
                dones_list[i] or truncateds_list[i]
            )
        
        # Check if environment is done
        if dones_list[i] or truncateds_list[i]:
            done_flags[i] = True
            episodes_completed += 1
    
    current_states = next_states
    pbar.update(N_ENVS)
    pbar.set_postfix({"Memory": len(replay_memory), "Episodes": episodes_completed})

pbar.close()
print(f"Replay memory filled with {len(replay_memory)} transitions")

Creating individual environments...
Initializing parallel environments...
Pre-filling replay memory to 50000 transitions...


Pre-filling: 100%|█| 50000/50000 [01:20<00:00, 619.48it/s, Memory=5e+4, Episodes

Replay memory filled with 50000 transitions





In [None]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Training loop with parallel environments - LOGGING VERSION
logger.info("Starting training with parallel environments...")
total_updates = 0
start_time = time.time()
episode_rewards = []      # Track all completed episode rewards
episode_steps_list = []   # Track all completed episode steps
episode_losses = []       # Track losses per training update
best_reward = -float('inf')

# Keep the episode progress bar but remove nested bars
episode_pbar = tqdm(range(N_EPISODES), desc="Training Episodes")

for episode in episode_pbar:
    # Log episode start
    logger.info(f"Starting episode {episode}/{N_EPISODES}")
    
    # Reset all environments at the start of each episode
    obs, _ = vec_env.reset()
    # Reinitialize frame buffers and current states
    frame_buffers = [deque(maxlen=4) for _ in range(N_ENVS)]
    current_states = np.zeros((N_ENVS, 4, RESIZE_HEIGHT, RESIZE_WIDTH))
    for i in range(N_ENVS):
        frame = preprocess(obs[i])
        for _ in range(4):
            frame_buffers[i].append(frame)
        current_states[i] = np.stack(frame_buffers[i], axis=0)

    # Per-environment tracking
    episode_rewards_current = np.zeros(N_ENVS)   # Rewards for current episode in each env
    episode_steps_current = np.zeros(N_ENVS)     # Steps for current episode in each env

    # Collect N_STEPS from all environments - no nested progress bar
    episode_start_time = time.time()
    step_losses = []
    
    for step in range(N_STEPS):
        # Get actions for all environments
        actions = agent.get_actions(current_states, online_net)

        # Step all environments
        next_obs, rewards, dones, truncateds, infos = vec_env.step(actions)

        # Process each environment
        next_states = np.zeros_like(current_states)
        for i in range(N_ENVS):
            # Preprocess frame
            next_frame = preprocess(next_obs[i])

            # Update frame buffer
            frame_buffers[i].append(next_frame)
            next_states[i] = np.stack(frame_buffers[i], axis=0)

            # Clip reward
            if REWARD_CLIP:
                rewards[i] = np.clip(rewards[i], -10, 10)

            # Update tracking
            episode_rewards_current[i] += rewards[i]
            episode_steps_current[i] += 1

            # Store transition
            replay_memory.push(
                current_states[i].copy(),
                actions[i],
                rewards[i],
                next_states[i].copy(),
                dones[i] or truncateds[i]
            )

            # Reset if done
            if dones[i] or truncateds[i]:
                # Record completed episode
                episode_rewards.append(episode_rewards_current[i])
                episode_steps_list.append(episode_steps_current[i])

                # Log completed episode
                logger.info(f"Env {i} completed episode with reward: {episode_rewards_current[i]:.2f}")

                # Reset tracking
                episode_rewards_current[i] = 0
                episode_steps_current[i] = 0

                # Reset environment
                vec_env.reset_async(indices=[i])
                reset_obs, _ = vec_env.reset_wait(indices=[i])
                frame = preprocess(reset_obs[0])
                frame_buffers[i].clear()
                for _ in range(4):
                    frame_buffers[i].append(frame)
                next_states[i] = np.stack(frame_buffers[i], axis=0)

        current_states = next_states
        total_steps += N_ENVS

        # Log every 100 steps
        if step % 100 == 0:
            avg_reward = np.mean(episode_rewards[-10:]) if episode_rewards else 0
            logger.info(f"Step {step}/{N_STEPS} - Total Steps: {total_steps} - Avg Reward: {avg_reward:.2f}")

    # Training updates - no nested progress bar
    if len(replay_memory) > MINIBATCH_SIZE:
        step_losses = []
        for update in range(N_UPDATES):
            batch_data = replay_memory.sample(MINIBATCH_SIZE)
            states, actions, rewards, next_states, dones = batch_data

            # Convert to tensors
            states_tensor = torch.as_tensor(states, device=device, dtype=torch.float32) / 255.0
            next_states_tensor = torch.as_tensor(next_states, device=device, dtype=torch.float32) / 255.0
            actions_tensor = torch.as_tensor(actions, device=device, dtype=torch.long)
            rewards_tensor = torch.as_tensor(rewards, device=device, dtype=torch.float32)
            dones_tensor = torch.as_tensor(dones, device=device, dtype=torch.float32)

            # Compute Q-values
            current_q = online_net(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)

            # Compute target Q-values
            with torch.no_grad():
                next_q = target_net(next_states_tensor).max(1)[0]
                target_q = rewards_tensor + DISCOUNT_FACTOR * next_q * (1 - dones_tensor)

            # Compute loss
            loss = F.mse_loss(current_q, target_q)
            step_losses.append(loss.item())

            # Optimize
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            for param in online_net.parameters():
                if param.grad is not None:
                    param.grad.data.clamp_(-1, 1)

            optimizer.step()

            total_updates += 1

            # Log every 5 updates
            if update % 5 == 0:
                logger.info(f"Update {update}/{N_UPDATES} - Loss: {loss.item():.4f}")

        avg_loss = sum(step_losses) / len(step_losses) if step_losses else 0
        episode_losses.append(avg_loss)
        logger.info(f"Training updates completed - Avg Loss: {avg_loss:.4f}")
    else:
        episode_losses.append(0)
        logger.info("Skipping training updates (insufficient samples)")

    # Update target network
    if total_updates % (TARGET_UPDATE_FREQ // N_UPDATES) == 0:
        target_net.load_state_dict(online_net.state_dict())
        logger.info("Target network updated")

    # Decay epsilon
    for _ in range(N_STEPS * N_ENVS):
        agent.decay_epsilon()

    # Calculate metrics
    avg_reward = np.mean(episode_rewards[-50:]) if episode_rewards else 0
    avg_steps = np.mean(episode_steps_list[-50:]) if episode_steps_list else 0
    elapsed = time.time() - start_time
    steps_per_sec = total_steps / elapsed if elapsed > 0 else 0
    episode_time = time.time() - episode_start_time

    # Update episode progress bar
    episode_pbar.set_postfix({
        "Avg Reward": f"{avg_reward:.2f}",
        "Epsilon": f"{agent.epsilon:.4f}",
        "Avg Loss": f"{episode_losses[-1]:.4f}",
        "Speed": f"{steps_per_sec:.1f} steps/s"
    })

    # Log episode completion
    logger.info(f"Episode {episode} completed in {episode_time:.2f} seconds")
    logger.info(f"  Total Steps: {total_steps}")
    logger.info(f"  Avg Reward (50): {avg_reward:.2f}")
    logger.info(f"  Epsilon: {agent.epsilon:.4f}")
    logger.info(f"  Avg Loss: {episode_losses[-1]:.4f}")
    logger.info(f"  Speed: {steps_per_sec:.1f} steps/s")

    # Save best model
    if avg_reward > best_reward:
        best_reward = avg_reward
        torch.save(online_net.state_dict(), f"{MODEL_FILE}_best.pth")
        logger.info(f"New best model saved with reward: {avg_reward:.2f}")
        episode_pbar.write(f"New best model saved with reward: {avg_reward:.2f}")

# Final save and cleanup
torch.save(online_net.state_dict(), f"{MODEL_FILE}_final.pth")
episode_pbar.close()
logger.info("Training complete. Models saved.")

2025-07-16 00:09:11,876 - INFO - Starting training with parallel environments...


Training Episodes:   0%|                                | 0/100 [00:00<?, ?it/s]

2025-07-16 00:09:11,881 - INFO - Starting episode 0/100
2025-07-16 00:09:11,955 - INFO - Step 0/100 - Total Steps: 8 - Avg Reward: 0.00
2025-07-16 00:09:23,761 - INFO - Update 0/4 - Loss: 287141.1875
2025-07-16 00:09:56,845 - INFO - Training updates completed - Avg Loss: 706816619124841.2500


Training Episodes:   0%| | 0/100 [00:44<?, ?it/s, Avg Reward=0.00, Epsilon=0.695

2025-07-16 00:09:56,848 - INFO - Episode 0 completed in 44.92 seconds
2025-07-16 00:09:56,848 - INFO -   Total Steps: 800
2025-07-16 00:09:56,849 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:09:56,849 - INFO -   Epsilon: 0.6952
2025-07-16 00:09:56,849 - INFO -   Avg Loss: 706816619124841.2500
2025-07-16 00:09:56,850 - INFO -   Speed: 17.8 steps/s
2025-07-16 00:09:56,883 - INFO - New best model saved with reward: 0.00


Training Episodes:   1%| | 1/100 [00:45<1:14:15, 45.01s/it, Avg Reward=0.00, Eps

New best model saved with reward: 0.00
2025-07-16 00:09:56,885 - INFO - Starting episode 1/100
2025-07-16 00:09:56,928 - INFO - Step 0/100 - Total Steps: 808 - Avg Reward: 0.00
2025-07-16 00:10:09,062 - INFO - Update 0/4 - Loss: 2898689534197760.0000
2025-07-16 00:11:07,606 - INFO - Training updates completed - Avg Loss: 262190622188765184.0000


Training Episodes:   1%| | 1/100 [01:55<1:14:15, 45.01s/it, Avg Reward=0.00, Eps

2025-07-16 00:11:07,610 - INFO - Episode 1 completed in 70.69 seconds
2025-07-16 00:11:07,611 - INFO -   Total Steps: 1600
2025-07-16 00:11:07,611 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:11:07,611 - INFO -   Epsilon: 0.6904
2025-07-16 00:11:07,612 - INFO -   Avg Loss: 262190622188765184.0000
2025-07-16 00:11:07,612 - INFO -   Speed: 13.8 steps/s


Training Episodes:   2%| | 2/100 [01:55<1:38:13, 60.14s/it, Avg Reward=0.00, Eps

2025-07-16 00:11:07,613 - INFO - Starting episode 2/100
2025-07-16 00:11:07,667 - INFO - Step 0/100 - Total Steps: 1608 - Avg Reward: 0.00
2025-07-16 00:11:24,937 - INFO - Update 0/4 - Loss: 6097268717387776.0000
2025-07-16 00:12:29,465 - INFO - Training updates completed - Avg Loss: 1524317415534400.0000


Training Episodes:   2%| | 2/100 [03:17<1:38:13, 60.14s/it, Avg Reward=0.00, Eps

2025-07-16 00:12:29,468 - INFO - Episode 2 completed in 81.81 seconds
2025-07-16 00:12:29,468 - INFO -   Total Steps: 2400
2025-07-16 00:12:29,469 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:12:29,469 - INFO -   Epsilon: 0.6856
2025-07-16 00:12:29,470 - INFO -   Avg Loss: 1524317415534400.0000
2025-07-16 00:12:29,470 - INFO -   Speed: 12.1 steps/s


Training Episodes:   3%| | 3/100 [03:17<1:53:15, 70.05s/it, Avg Reward=0.00, Eps

2025-07-16 00:12:29,470 - INFO - Starting episode 3/100
2025-07-16 00:12:29,516 - INFO - Step 0/100 - Total Steps: 2408 - Avg Reward: 0.00
2025-07-16 00:12:42,593 - INFO - Update 0/4 - Loss: 683301312.0000
2025-07-16 00:13:27,374 - INFO - Training updates completed - Avg Loss: 316860056176.0000


Training Episodes:   3%| | 3/100 [04:15<1:53:15, 70.05s/it, Avg Reward=0.00, Eps

2025-07-16 00:13:27,378 - INFO - Episode 3 completed in 57.87 seconds
2025-07-16 00:13:27,378 - INFO -   Total Steps: 3200
2025-07-16 00:13:27,378 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:13:27,379 - INFO -   Epsilon: 0.6808
2025-07-16 00:13:27,379 - INFO -   Avg Loss: 316860056176.0000
2025-07-16 00:13:27,380 - INFO -   Speed: 12.5 steps/s


Training Episodes:   4%| | 4/100 [04:15<1:44:24, 65.26s/it, Avg Reward=0.00, Eps

2025-07-16 00:13:27,380 - INFO - Starting episode 4/100
2025-07-16 00:13:27,425 - INFO - Step 0/100 - Total Steps: 3208 - Avg Reward: 0.00
2025-07-16 00:13:37,636 - INFO - Update 0/4 - Loss: 34141917184.0000
2025-07-16 00:15:02,719 - INFO - Training updates completed - Avg Loss: 18053498880.0000


Training Episodes:   4%| | 4/100 [05:50<1:44:24, 65.26s/it, Avg Reward=0.00, Eps

2025-07-16 00:15:02,727 - INFO - Episode 4 completed in 95.31 seconds
2025-07-16 00:15:02,729 - INFO -   Total Steps: 4000
2025-07-16 00:15:02,730 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:15:02,730 - INFO -   Epsilon: 0.6760
2025-07-16 00:15:02,732 - INFO -   Avg Loss: 18053498880.0000
2025-07-16 00:15:02,733 - INFO -   Speed: 11.4 steps/s


Training Episodes:   5%| | 5/100 [05:50<2:00:30, 76.11s/it, Avg Reward=0.00, Eps

2025-07-16 00:15:02,735 - INFO - Starting episode 5/100
2025-07-16 00:15:02,920 - INFO - Step 0/100 - Total Steps: 4008 - Avg Reward: 0.00
2025-07-16 00:15:26,556 - INFO - Update 0/4 - Loss: 4063986432.0000
2025-07-16 00:16:56,892 - INFO - Training updates completed - Avg Loss: 3590181720.0000


Training Episodes:   5%| | 5/100 [07:45<2:00:30, 76.11s/it, Avg Reward=0.00, Eps

2025-07-16 00:16:56,898 - INFO - Episode 5 completed in 114.01 seconds
2025-07-16 00:16:56,899 - INFO -   Total Steps: 4800
2025-07-16 00:16:56,900 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:16:56,901 - INFO -   Epsilon: 0.6712
2025-07-16 00:16:56,901 - INFO -   Avg Loss: 3590181720.0000
2025-07-16 00:16:56,902 - INFO -   Speed: 10.3 steps/s


Training Episodes:   6%| | 6/100 [07:45<2:19:30, 89.05s/it, Avg Reward=0.00, Eps

2025-07-16 00:16:56,904 - INFO - Starting episode 6/100
2025-07-16 00:16:57,077 - INFO - Step 0/100 - Total Steps: 4808 - Avg Reward: 0.00
2025-07-16 00:17:33,930 - INFO - Update 0/4 - Loss: 13744301056.0000
2025-07-16 00:18:55,184 - INFO - Training updates completed - Avg Loss: 13719755776.0000


Training Episodes:   6%| | 6/100 [09:43<2:19:30, 89.05s/it, Avg Reward=0.00, Eps

2025-07-16 00:18:55,193 - INFO - Episode 6 completed in 118.15 seconds
2025-07-16 00:18:55,195 - INFO -   Total Steps: 5600
2025-07-16 00:18:55,196 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:18:55,196 - INFO -   Epsilon: 0.6664
2025-07-16 00:18:55,197 - INFO -   Avg Loss: 13719755776.0000
2025-07-16 00:18:55,199 - INFO -   Speed: 9.6 steps/s


Training Episodes:   7%| | 7/100 [09:43<2:32:50, 98.61s/it, Avg Reward=0.00, Eps

2025-07-16 00:18:55,201 - INFO - Starting episode 7/100
2025-07-16 00:18:55,405 - INFO - Step 0/100 - Total Steps: 5608 - Avg Reward: 0.00
2025-07-16 00:19:23,199 - INFO - Update 0/4 - Loss: 2437627392.0000
2025-07-16 00:20:59,019 - INFO - Training updates completed - Avg Loss: 5822458816.0000


Training Episodes:   7%| | 7/100 [11:47<2:32:50, 98.61s/it, Avg Reward=0.00, Eps

2025-07-16 00:20:59,026 - INFO - Episode 7 completed in 123.66 seconds
2025-07-16 00:20:59,027 - INFO -   Total Steps: 6400
2025-07-16 00:20:59,029 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:20:59,030 - INFO -   Epsilon: 0.6616
2025-07-16 00:20:59,031 - INFO -   Avg Loss: 5822458816.0000
2025-07-16 00:20:59,032 - INFO -   Speed: 9.1 steps/s


Training Episodes:   8%| | 8/100 [11:47<2:43:31, 106.64s/it, Avg Reward=0.00, Ep

2025-07-16 00:20:59,035 - INFO - Starting episode 8/100
2025-07-16 00:20:59,235 - INFO - Step 0/100 - Total Steps: 6408 - Avg Reward: 0.00
2025-07-16 00:21:36,529 - INFO - Update 0/4 - Loss: 15623474176.0000
2025-07-16 00:22:39,935 - INFO - Training updates completed - Avg Loss: 7863283251.8750


Training Episodes:   8%| | 8/100 [13:28<2:43:31, 106.64s/it, Avg Reward=0.00, Ep

2025-07-16 00:22:39,939 - INFO - Episode 8 completed in 100.74 seconds
2025-07-16 00:22:39,939 - INFO -   Total Steps: 7200
2025-07-16 00:22:39,939 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:22:39,939 - INFO -   Epsilon: 0.6568
2025-07-16 00:22:39,940 - INFO -   Avg Loss: 7863283251.8750
2025-07-16 00:22:39,940 - INFO -   Speed: 8.9 steps/s


Training Episodes:   9%| | 9/100 [13:28<2:39:01, 104.85s/it, Avg Reward=0.00, Ep

2025-07-16 00:22:39,940 - INFO - Starting episode 9/100
2025-07-16 00:22:39,997 - INFO - Step 0/100 - Total Steps: 7208 - Avg Reward: 0.00
2025-07-16 00:22:56,241 - INFO - Update 0/4 - Loss: 1913644800.0000
2025-07-16 00:24:05,352 - INFO - Training updates completed - Avg Loss: 2477647376.0000


Training Episodes:   9%| | 9/100 [14:53<2:39:01, 104.85s/it, Avg Reward=0.00, Ep

2025-07-16 00:24:05,359 - INFO - Episode 9 completed in 85.37 seconds
2025-07-16 00:24:05,361 - INFO -   Total Steps: 8000
2025-07-16 00:24:05,362 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:24:05,364 - INFO -   Epsilon: 0.6520
2025-07-16 00:24:05,364 - INFO -   Avg Loss: 2477647376.0000
2025-07-16 00:24:05,365 - INFO -   Speed: 9.0 steps/s


Training Episodes:  10%| | 10/100 [14:53<2:28:16, 98.85s/it, Avg Reward=0.00, Ep

2025-07-16 00:24:05,367 - INFO - Starting episode 10/100
2025-07-16 00:24:05,557 - INFO - Step 0/100 - Total Steps: 8008 - Avg Reward: 0.00
2025-07-16 00:24:25,401 - INFO - Update 0/4 - Loss: 2356485632.0000
2025-07-16 00:25:13,537 - INFO - Training updates completed - Avg Loss: 5517529728.0000


Training Episodes:  10%| | 10/100 [16:01<2:28:16, 98.85s/it, Avg Reward=0.00, Ep

2025-07-16 00:25:13,540 - INFO - Episode 10 completed in 68.02 seconds
2025-07-16 00:25:13,541 - INFO -   Total Steps: 8800
2025-07-16 00:25:13,541 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:25:13,541 - INFO -   Epsilon: 0.6472
2025-07-16 00:25:13,541 - INFO -   Avg Loss: 5517529728.0000
2025-07-16 00:25:13,542 - INFO -   Speed: 9.2 steps/s


Training Episodes:  11%| | 11/100 [16:01<2:12:42, 89.46s/it, Avg Reward=0.00, Ep

2025-07-16 00:25:13,542 - INFO - Starting episode 11/100
2025-07-16 00:25:13,593 - INFO - Step 0/100 - Total Steps: 8808 - Avg Reward: 0.00
2025-07-16 00:25:26,050 - INFO - Update 0/4 - Loss: 1114471680.0000
2025-07-16 00:25:58,337 - INFO - Training updates completed - Avg Loss: 4451197632.0000


Training Episodes:  11%| | 11/100 [16:46<2:12:42, 89.46s/it, Avg Reward=0.00, Ep

2025-07-16 00:25:58,340 - INFO - Episode 11 completed in 44.76 seconds
2025-07-16 00:25:58,341 - INFO -   Total Steps: 9600
2025-07-16 00:25:58,341 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:25:58,341 - INFO -   Epsilon: 0.6424
2025-07-16 00:25:58,341 - INFO -   Avg Loss: 4451197632.0000
2025-07-16 00:25:58,342 - INFO -   Speed: 9.5 steps/s


Training Episodes:  12%| | 12/100 [16:46<1:51:17, 75.88s/it, Avg Reward=0.00, Ep

2025-07-16 00:25:58,342 - INFO - Starting episode 12/100
2025-07-16 00:25:58,387 - INFO - Step 0/100 - Total Steps: 9608 - Avg Reward: 0.00
2025-07-16 00:26:09,195 - INFO - Update 0/4 - Loss: 4754222080.0000
2025-07-16 00:26:48,169 - INFO - Training updates completed - Avg Loss: 4671325184.0000


Training Episodes:  12%| | 12/100 [17:36<1:51:17, 75.88s/it, Avg Reward=0.00, Ep

2025-07-16 00:26:48,172 - INFO - Episode 12 completed in 49.79 seconds
2025-07-16 00:26:48,173 - INFO -   Total Steps: 10400
2025-07-16 00:26:48,173 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:26:48,173 - INFO -   Epsilon: 0.6376
2025-07-16 00:26:48,174 - INFO -   Avg Loss: 4671325184.0000
2025-07-16 00:26:48,174 - INFO -   Speed: 9.8 steps/s


Training Episodes:  13%|▏| 13/100 [17:36<1:38:34, 67.99s/it, Avg Reward=0.00, Ep

2025-07-16 00:26:48,176 - INFO - Starting episode 13/100
2025-07-16 00:26:48,220 - INFO - Step 0/100 - Total Steps: 10408 - Avg Reward: 0.00
2025-07-16 00:26:59,723 - INFO - Update 0/4 - Loss: 13728991232.0000
2025-07-16 00:27:36,536 - INFO - Training updates completed - Avg Loss: 7892563392.0000


Training Episodes:  13%|▏| 13/100 [18:24<1:38:34, 67.99s/it, Avg Reward=0.00, Ep

2025-07-16 00:27:36,540 - INFO - Episode 13 completed in 48.33 seconds
2025-07-16 00:27:36,541 - INFO -   Total Steps: 11200
2025-07-16 00:27:36,541 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:27:36,541 - INFO -   Epsilon: 0.6328
2025-07-16 00:27:36,541 - INFO -   Avg Loss: 7892563392.0000
2025-07-16 00:27:36,542 - INFO -   Speed: 10.1 steps/s


Training Episodes:  14%|▏| 14/100 [18:24<1:28:57, 62.06s/it, Avg Reward=0.00, Ep

2025-07-16 00:27:36,542 - INFO - Starting episode 14/100
2025-07-16 00:27:36,592 - INFO - Step 0/100 - Total Steps: 11208 - Avg Reward: 0.00
2025-07-16 00:27:47,960 - INFO - Update 0/4 - Loss: 6425724928.0000
2025-07-16 00:28:21,419 - INFO - Training updates completed - Avg Loss: 5678792096.0000


Training Episodes:  14%|▏| 14/100 [19:09<1:28:57, 62.06s/it, Avg Reward=0.00, Ep

2025-07-16 00:28:21,422 - INFO - Episode 14 completed in 44.84 seconds
2025-07-16 00:28:21,423 - INFO -   Total Steps: 12000
2025-07-16 00:28:21,423 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:28:21,424 - INFO -   Epsilon: 0.6280
2025-07-16 00:28:21,424 - INFO -   Avg Loss: 5678792096.0000
2025-07-16 00:28:21,424 - INFO -   Speed: 10.4 steps/s


Training Episodes:  15%|▏| 15/100 [19:09<1:20:35, 56.88s/it, Avg Reward=0.00, Ep

2025-07-16 00:28:21,425 - INFO - Starting episode 15/100
2025-07-16 00:28:21,465 - INFO - Step 0/100 - Total Steps: 12008 - Avg Reward: 0.00
2025-07-16 00:28:34,452 - INFO - Update 0/4 - Loss: 3198530560.0000
2025-07-16 00:29:07,962 - INFO - Training updates completed - Avg Loss: 5374661504.0000


Training Episodes:  15%|▏| 15/100 [19:56<1:20:35, 56.88s/it, Avg Reward=0.00, Ep

2025-07-16 00:29:07,965 - INFO - Episode 15 completed in 46.51 seconds
2025-07-16 00:29:07,966 - INFO -   Total Steps: 12800
2025-07-16 00:29:07,966 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:29:07,966 - INFO -   Epsilon: 0.6232
2025-07-16 00:29:07,966 - INFO -   Avg Loss: 5374661504.0000
2025-07-16 00:29:07,967 - INFO -   Speed: 10.7 steps/s


Training Episodes:  16%|▏| 16/100 [19:56<1:15:16, 53.77s/it, Avg Reward=0.00, Ep

2025-07-16 00:29:07,967 - INFO - Starting episode 16/100
2025-07-16 00:29:08,012 - INFO - Step 0/100 - Total Steps: 12808 - Avg Reward: 0.00
2025-07-16 00:29:20,928 - INFO - Update 0/4 - Loss: 9771344.0000
2025-07-16 00:29:56,929 - INFO - Training updates completed - Avg Loss: 8694062932.0000


Training Episodes:  16%|▏| 16/100 [20:45<1:15:16, 53.77s/it, Avg Reward=0.00, Ep

2025-07-16 00:29:56,933 - INFO - Episode 16 completed in 48.93 seconds
2025-07-16 00:29:56,933 - INFO -   Total Steps: 13600
2025-07-16 00:29:56,934 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:29:56,934 - INFO -   Epsilon: 0.6184
2025-07-16 00:29:56,934 - INFO -   Avg Loss: 8694062932.0000
2025-07-16 00:29:56,935 - INFO -   Speed: 10.9 steps/s


Training Episodes:  17%|▏| 17/100 [20:45<1:12:23, 52.33s/it, Avg Reward=0.00, Ep

2025-07-16 00:29:56,935 - INFO - Starting episode 17/100
2025-07-16 00:29:56,978 - INFO - Step 0/100 - Total Steps: 13608 - Avg Reward: 0.00
2025-07-16 00:30:09,977 - INFO - Update 0/4 - Loss: 11923236864.0000
2025-07-16 00:30:45,626 - INFO - Training updates completed - Avg Loss: 4777559124.0000


Training Episodes:  17%|▏| 17/100 [21:33<1:12:23, 52.33s/it, Avg Reward=0.00, Ep

2025-07-16 00:30:45,629 - INFO - Episode 17 completed in 48.66 seconds
2025-07-16 00:30:45,630 - INFO -   Total Steps: 14400
2025-07-16 00:30:45,630 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:30:45,630 - INFO -   Epsilon: 0.6136
2025-07-16 00:30:45,631 - INFO -   Avg Loss: 4777559124.0000
2025-07-16 00:30:45,631 - INFO -   Speed: 11.1 steps/s


Training Episodes:  18%|▏| 18/100 [21:33<1:10:01, 51.24s/it, Avg Reward=0.00, Ep

2025-07-16 00:30:45,631 - INFO - Starting episode 18/100
2025-07-16 00:30:45,711 - INFO - Step 0/100 - Total Steps: 14408 - Avg Reward: 0.00
2025-07-16 00:30:57,867 - INFO - Update 0/4 - Loss: 4297549824.0000
2025-07-16 00:31:28,069 - INFO - Training updates completed - Avg Loss: 1716429232.0000


Training Episodes:  18%|▏| 18/100 [22:16<1:10:01, 51.24s/it, Avg Reward=0.00, Ep

2025-07-16 00:31:28,072 - INFO - Episode 18 completed in 42.37 seconds
2025-07-16 00:31:28,073 - INFO -   Total Steps: 15200
2025-07-16 00:31:28,073 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:31:28,073 - INFO -   Epsilon: 0.6088
2025-07-16 00:31:28,073 - INFO -   Avg Loss: 1716429232.0000
2025-07-16 00:31:28,074 - INFO -   Speed: 11.4 steps/s


Training Episodes:  19%|▏| 19/100 [22:16<1:05:36, 48.59s/it, Avg Reward=0.00, Ep

2025-07-16 00:31:28,074 - INFO - Starting episode 19/100
2025-07-16 00:31:28,117 - INFO - Step 0/100 - Total Steps: 15208 - Avg Reward: 0.00
2025-07-16 00:31:38,620 - INFO - Update 0/4 - Loss: 97611816.0000
2025-07-16 00:32:07,655 - INFO - Training updates completed - Avg Loss: 1012784058.0000


Training Episodes:  19%|▏| 19/100 [22:55<1:05:36, 48.59s/it, Avg Reward=0.00, Ep

2025-07-16 00:32:07,658 - INFO - Episode 19 completed in 39.55 seconds
2025-07-16 00:32:07,658 - INFO -   Total Steps: 16000
2025-07-16 00:32:07,658 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:32:07,659 - INFO -   Epsilon: 0.6040
2025-07-16 00:32:07,659 - INFO -   Avg Loss: 1012784058.0000
2025-07-16 00:32:07,660 - INFO -   Speed: 11.6 steps/s


Training Episodes:  20%|▏| 20/100 [22:55<1:01:11, 45.89s/it, Avg Reward=0.00, Ep

2025-07-16 00:32:07,660 - INFO - Starting episode 20/100
2025-07-16 00:32:07,700 - INFO - Step 0/100 - Total Steps: 16008 - Avg Reward: 0.00
2025-07-16 00:32:17,622 - INFO - Update 0/4 - Loss: 585231616.0000
2025-07-16 00:32:46,135 - INFO - Training updates completed - Avg Loss: 818660464.0000


Training Episodes:  20%|▏| 20/100 [23:34<1:01:11, 45.89s/it, Avg Reward=0.00, Ep

2025-07-16 00:32:46,138 - INFO - Episode 20 completed in 38.45 seconds
2025-07-16 00:32:46,139 - INFO -   Total Steps: 16800
2025-07-16 00:32:46,139 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:32:46,139 - INFO -   Epsilon: 0.5992
2025-07-16 00:32:46,139 - INFO -   Avg Loss: 818660464.0000
2025-07-16 00:32:46,140 - INFO -   Speed: 11.9 steps/s


Training Episodes:  21%|▏| 21/100 [23:34<57:29, 43.67s/it, Avg Reward=0.00, Epsi

2025-07-16 00:32:46,140 - INFO - Starting episode 21/100
2025-07-16 00:32:46,183 - INFO - Step 0/100 - Total Steps: 16808 - Avg Reward: 0.00
2025-07-16 00:32:58,530 - INFO - Update 0/4 - Loss: 976996096.0000
2025-07-16 00:33:27,562 - INFO - Training updates completed - Avg Loss: 1341346056.0000


Training Episodes:  21%|▏| 21/100 [24:15<57:29, 43.67s/it, Avg Reward=0.00, Epsi

2025-07-16 00:33:27,566 - INFO - Episode 21 completed in 41.39 seconds
2025-07-16 00:33:27,566 - INFO -   Total Steps: 17600
2025-07-16 00:33:27,566 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:33:27,567 - INFO -   Epsilon: 0.5944
2025-07-16 00:33:27,567 - INFO -   Avg Loss: 1341346056.0000
2025-07-16 00:33:27,568 - INFO -   Speed: 12.1 steps/s


Training Episodes:  22%|▏| 22/100 [24:15<55:53, 42.99s/it, Avg Reward=0.00, Epsi

2025-07-16 00:33:27,568 - INFO - Starting episode 22/100
2025-07-16 00:33:27,613 - INFO - Step 0/100 - Total Steps: 17608 - Avg Reward: 0.00
2025-07-16 00:33:37,941 - INFO - Update 0/4 - Loss: 1447781632.0000
2025-07-16 00:34:06,472 - INFO - Training updates completed - Avg Loss: 480458304.1250


Training Episodes:  22%|▏| 22/100 [24:54<55:53, 42.99s/it, Avg Reward=0.00, Epsi

2025-07-16 00:34:06,476 - INFO - Episode 22 completed in 38.87 seconds
2025-07-16 00:34:06,476 - INFO -   Total Steps: 18400
2025-07-16 00:34:06,476 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:34:06,477 - INFO -   Epsilon: 0.5896
2025-07-16 00:34:06,477 - INFO -   Avg Loss: 480458304.1250
2025-07-16 00:34:06,478 - INFO -   Speed: 12.3 steps/s


Training Episodes:  23%|▏| 23/100 [24:54<53:36, 41.77s/it, Avg Reward=0.00, Epsi

2025-07-16 00:34:06,478 - INFO - Starting episode 23/100
2025-07-16 00:34:06,518 - INFO - Step 0/100 - Total Steps: 18408 - Avg Reward: 0.00
2025-07-16 00:34:16,734 - INFO - Update 0/4 - Loss: 1954122368.0000
2025-07-16 00:34:45,333 - INFO - Training updates completed - Avg Loss: 1932066192.0000


Training Episodes:  23%|▏| 23/100 [25:33<53:36, 41.77s/it, Avg Reward=0.00, Epsi

2025-07-16 00:34:45,337 - INFO - Episode 23 completed in 38.83 seconds
2025-07-16 00:34:45,337 - INFO -   Total Steps: 19200
2025-07-16 00:34:45,338 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:34:45,338 - INFO -   Epsilon: 0.5848
2025-07-16 00:34:45,338 - INFO -   Avg Loss: 1932066192.0000
2025-07-16 00:34:45,338 - INFO -   Speed: 12.5 steps/s


Training Episodes:  24%|▏| 24/100 [25:33<51:48, 40.90s/it, Avg Reward=0.00, Epsi

2025-07-16 00:34:45,339 - INFO - Starting episode 24/100
2025-07-16 00:34:45,379 - INFO - Step 0/100 - Total Steps: 19208 - Avg Reward: 0.00
2025-07-16 00:34:55,403 - INFO - Update 0/4 - Loss: 2172078080.0000
2025-07-16 00:35:22,752 - INFO - Training updates completed - Avg Loss: 1364828672.0000


Training Episodes:  24%|▏| 24/100 [26:10<51:48, 40.90s/it, Avg Reward=0.00, Epsi

2025-07-16 00:35:22,755 - INFO - Episode 24 completed in 37.38 seconds
2025-07-16 00:35:22,755 - INFO -   Total Steps: 20000
2025-07-16 00:35:22,756 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:35:22,756 - INFO -   Epsilon: 0.5800
2025-07-16 00:35:22,756 - INFO -   Avg Loss: 1364828672.0000
2025-07-16 00:35:22,757 - INFO -   Speed: 12.7 steps/s


Training Episodes:  25%|▎| 25/100 [26:10<49:48, 39.85s/it, Avg Reward=0.00, Epsi

2025-07-16 00:35:22,757 - INFO - Starting episode 25/100
2025-07-16 00:35:22,796 - INFO - Step 0/100 - Total Steps: 20008 - Avg Reward: 0.00
2025-07-16 00:35:34,031 - INFO - Update 0/4 - Loss: 348447488.0000
2025-07-16 00:36:03,269 - INFO - Training updates completed - Avg Loss: 750607888.0000


Training Episodes:  25%|▎| 25/100 [26:51<49:48, 39.85s/it, Avg Reward=0.00, Epsi

2025-07-16 00:36:03,272 - INFO - Episode 25 completed in 40.48 seconds
2025-07-16 00:36:03,272 - INFO -   Total Steps: 20800
2025-07-16 00:36:03,273 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:36:03,273 - INFO -   Epsilon: 0.5752
2025-07-16 00:36:03,273 - INFO -   Avg Loss: 750607888.0000
2025-07-16 00:36:03,273 - INFO -   Speed: 12.9 steps/s


Training Episodes:  26%|▎| 26/100 [26:51<49:23, 40.05s/it, Avg Reward=0.00, Epsi

2025-07-16 00:36:03,274 - INFO - Starting episode 26/100
2025-07-16 00:36:03,319 - INFO - Step 0/100 - Total Steps: 20808 - Avg Reward: 0.00
2025-07-16 00:36:14,168 - INFO - Update 0/4 - Loss: 1401958528.0000
2025-07-16 00:36:42,195 - INFO - Training updates completed - Avg Loss: 1241363136.0000


Training Episodes:  26%|▎| 26/100 [27:30<49:23, 40.05s/it, Avg Reward=0.00, Epsi

2025-07-16 00:36:42,198 - INFO - Episode 26 completed in 38.89 seconds
2025-07-16 00:36:42,198 - INFO -   Total Steps: 21600
2025-07-16 00:36:42,198 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:36:42,199 - INFO -   Epsilon: 0.5704
2025-07-16 00:36:42,199 - INFO -   Avg Loss: 1241363136.0000
2025-07-16 00:36:42,199 - INFO -   Speed: 13.1 steps/s


Training Episodes:  27%|▎| 27/100 [27:30<48:19, 39.71s/it, Avg Reward=0.00, Epsi

2025-07-16 00:36:42,200 - INFO - Starting episode 27/100
2025-07-16 00:36:42,241 - INFO - Step 0/100 - Total Steps: 21608 - Avg Reward: 0.00
2025-07-16 00:36:52,217 - INFO - Update 0/4 - Loss: 262411184.0000
2025-07-16 00:37:20,020 - INFO - Training updates completed - Avg Loss: 432450892.0000


Training Episodes:  27%|▎| 27/100 [28:08<48:19, 39.71s/it, Avg Reward=0.00, Epsi

2025-07-16 00:37:20,024 - INFO - Episode 27 completed in 37.79 seconds
2025-07-16 00:37:20,024 - INFO -   Total Steps: 22400
2025-07-16 00:37:20,024 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:37:20,025 - INFO -   Epsilon: 0.5656
2025-07-16 00:37:20,025 - INFO -   Avg Loss: 432450892.0000
2025-07-16 00:37:20,025 - INFO -   Speed: 13.3 steps/s


Training Episodes:  28%|▎| 28/100 [28:08<46:58, 39.15s/it, Avg Reward=0.00, Epsi

2025-07-16 00:37:20,026 - INFO - Starting episode 28/100
2025-07-16 00:37:20,067 - INFO - Step 0/100 - Total Steps: 22408 - Avg Reward: 0.00
2025-07-16 00:37:29,852 - INFO - Update 0/4 - Loss: 785689664.0000
2025-07-16 00:37:58,768 - INFO - Training updates completed - Avg Loss: 509625720.0000


Training Episodes:  28%|▎| 28/100 [28:46<46:58, 39.15s/it, Avg Reward=0.00, Epsi

2025-07-16 00:37:58,772 - INFO - Episode 28 completed in 38.71 seconds
2025-07-16 00:37:58,772 - INFO -   Total Steps: 23200
2025-07-16 00:37:58,773 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:37:58,774 - INFO -   Epsilon: 0.5608
2025-07-16 00:37:58,774 - INFO -   Avg Loss: 509625720.0000
2025-07-16 00:37:58,774 - INFO -   Speed: 13.4 steps/s


Training Episodes:  29%|▎| 29/100 [28:46<46:10, 39.03s/it, Avg Reward=0.00, Epsi

2025-07-16 00:37:58,775 - INFO - Starting episode 29/100
2025-07-16 00:37:58,816 - INFO - Step 0/100 - Total Steps: 23208 - Avg Reward: 0.00
2025-07-16 00:38:09,804 - INFO - Update 0/4 - Loss: 715210304.0000
2025-07-16 00:38:38,816 - INFO - Training updates completed - Avg Loss: 966049008.0000


Training Episodes:  29%|▎| 29/100 [29:26<46:10, 39.03s/it, Avg Reward=0.00, Epsi

2025-07-16 00:38:38,818 - INFO - Episode 29 completed in 40.01 seconds
2025-07-16 00:38:38,819 - INFO -   Total Steps: 24000
2025-07-16 00:38:38,819 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:38:38,819 - INFO -   Epsilon: 0.5560
2025-07-16 00:38:38,820 - INFO -   Avg Loss: 966049008.0000
2025-07-16 00:38:38,820 - INFO -   Speed: 13.6 steps/s


Training Episodes:  30%|▎| 30/100 [29:26<45:53, 39.33s/it, Avg Reward=0.00, Epsi

2025-07-16 00:38:38,821 - INFO - Starting episode 30/100
2025-07-16 00:38:38,860 - INFO - Step 0/100 - Total Steps: 24008 - Avg Reward: 0.00
2025-07-16 00:38:48,675 - INFO - Update 0/4 - Loss: 625072448.0000
2025-07-16 00:39:17,347 - INFO - Training updates completed - Avg Loss: 679233308.0000


Training Episodes:  30%|▎| 30/100 [30:05<45:53, 39.33s/it, Avg Reward=0.00, Epsi

2025-07-16 00:39:17,351 - INFO - Episode 30 completed in 38.50 seconds
2025-07-16 00:39:17,351 - INFO -   Total Steps: 24800
2025-07-16 00:39:17,352 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:39:17,352 - INFO -   Epsilon: 0.5512
2025-07-16 00:39:17,352 - INFO -   Avg Loss: 679233308.0000
2025-07-16 00:39:17,352 - INFO -   Speed: 13.7 steps/s


Training Episodes:  31%|▎| 31/100 [30:05<44:57, 39.09s/it, Avg Reward=0.00, Epsi

2025-07-16 00:39:17,353 - INFO - Starting episode 31/100
2025-07-16 00:39:17,396 - INFO - Step 0/100 - Total Steps: 24808 - Avg Reward: 0.00
2025-07-16 00:39:27,873 - INFO - Update 0/4 - Loss: 1775599744.0000
2025-07-16 00:40:00,031 - INFO - Training updates completed - Avg Loss: 1685687584.0000


Training Episodes:  31%|▎| 31/100 [30:48<44:57, 39.09s/it, Avg Reward=0.00, Epsi

2025-07-16 00:40:00,033 - INFO - Episode 31 completed in 42.65 seconds
2025-07-16 00:40:00,034 - INFO -   Total Steps: 25600
2025-07-16 00:40:00,034 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:40:00,034 - INFO -   Epsilon: 0.5464
2025-07-16 00:40:00,035 - INFO -   Avg Loss: 1685687584.0000
2025-07-16 00:40:00,035 - INFO -   Speed: 13.9 steps/s


Training Episodes:  32%|▎| 32/100 [30:48<45:31, 40.17s/it, Avg Reward=0.00, Epsi

2025-07-16 00:40:00,036 - INFO - Starting episode 32/100
2025-07-16 00:40:00,078 - INFO - Step 0/100 - Total Steps: 25608 - Avg Reward: 0.00
2025-07-16 00:40:10,461 - INFO - Update 0/4 - Loss: 803220480.0000
2025-07-16 00:40:40,014 - INFO - Training updates completed - Avg Loss: 495850000.0000


Training Episodes:  32%|▎| 32/100 [31:28<45:31, 40.17s/it, Avg Reward=0.00, Epsi

2025-07-16 00:40:40,017 - INFO - Episode 32 completed in 39.95 seconds
2025-07-16 00:40:40,018 - INFO -   Total Steps: 26400
2025-07-16 00:40:40,019 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:40:40,019 - INFO -   Epsilon: 0.5416
2025-07-16 00:40:40,019 - INFO -   Avg Loss: 495850000.0000
2025-07-16 00:40:40,020 - INFO -   Speed: 14.0 steps/s


Training Episodes:  33%|▎| 33/100 [31:28<44:47, 40.11s/it, Avg Reward=0.00, Epsi

2025-07-16 00:40:40,020 - INFO - Starting episode 33/100
2025-07-16 00:40:40,060 - INFO - Step 0/100 - Total Steps: 26408 - Avg Reward: 0.00
2025-07-16 00:40:52,856 - INFO - Update 0/4 - Loss: 788525312.0000
2025-07-16 00:41:20,480 - INFO - Training updates completed - Avg Loss: 640280736.0000


Training Episodes:  33%|▎| 33/100 [32:08<44:47, 40.11s/it, Avg Reward=0.00, Epsi

2025-07-16 00:41:20,484 - INFO - Episode 33 completed in 40.43 seconds
2025-07-16 00:41:20,484 - INFO -   Total Steps: 27200
2025-07-16 00:41:20,485 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:41:20,485 - INFO -   Epsilon: 0.5368
2025-07-16 00:41:20,485 - INFO -   Avg Loss: 640280736.0000
2025-07-16 00:41:20,485 - INFO -   Speed: 14.1 steps/s


Training Episodes:  34%|▎| 34/100 [32:08<44:14, 40.22s/it, Avg Reward=0.00, Epsi

2025-07-16 00:41:20,486 - INFO - Starting episode 34/100
2025-07-16 00:41:20,527 - INFO - Step 0/100 - Total Steps: 27208 - Avg Reward: 0.00
2025-07-16 00:41:30,680 - INFO - Update 0/4 - Loss: 356389952.0000
2025-07-16 00:41:58,578 - INFO - Training updates completed - Avg Loss: 290114476.0000


Training Episodes:  34%|▎| 34/100 [32:46<44:14, 40.22s/it, Avg Reward=0.00, Epsi

2025-07-16 00:41:58,581 - INFO - Episode 34 completed in 38.06 seconds
2025-07-16 00:41:58,581 - INFO -   Total Steps: 28000
2025-07-16 00:41:58,581 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:41:58,582 - INFO -   Epsilon: 0.5320
2025-07-16 00:41:58,582 - INFO -   Avg Loss: 290114476.0000
2025-07-16 00:41:58,582 - INFO -   Speed: 14.2 steps/s


Training Episodes:  35%|▎| 35/100 [32:46<42:52, 39.58s/it, Avg Reward=0.00, Epsi

2025-07-16 00:41:58,583 - INFO - Starting episode 35/100
2025-07-16 00:41:58,627 - INFO - Step 0/100 - Total Steps: 28008 - Avg Reward: 0.00
2025-07-16 00:42:09,912 - INFO - Update 0/4 - Loss: 492300448.0000
2025-07-16 00:42:40,988 - INFO - Training updates completed - Avg Loss: 560649288.0000


Training Episodes:  35%|▎| 35/100 [33:29<42:52, 39.58s/it, Avg Reward=0.00, Epsi

2025-07-16 00:42:40,991 - INFO - Episode 35 completed in 42.37 seconds
2025-07-16 00:42:40,991 - INFO -   Total Steps: 28800
2025-07-16 00:42:40,991 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:42:40,992 - INFO -   Epsilon: 0.5272
2025-07-16 00:42:40,992 - INFO -   Avg Loss: 560649288.0000
2025-07-16 00:42:40,992 - INFO -   Speed: 14.3 steps/s


Training Episodes:  36%|▎| 36/100 [33:29<43:07, 40.43s/it, Avg Reward=0.00, Epsi

2025-07-16 00:42:40,992 - INFO - Starting episode 36/100
2025-07-16 00:42:41,038 - INFO - Step 0/100 - Total Steps: 28808 - Avg Reward: 0.00
2025-07-16 00:42:52,854 - INFO - Update 0/4 - Loss: 174789856.0000
2025-07-16 00:43:24,709 - INFO - Training updates completed - Avg Loss: 468662492.0000


Training Episodes:  36%|▎| 36/100 [34:12<43:07, 40.43s/it, Avg Reward=0.00, Epsi

2025-07-16 00:43:24,713 - INFO - Episode 36 completed in 43.69 seconds
2025-07-16 00:43:24,713 - INFO -   Total Steps: 29600
2025-07-16 00:43:24,714 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:43:24,714 - INFO -   Epsilon: 0.5224
2025-07-16 00:43:24,714 - INFO -   Avg Loss: 468662492.0000
2025-07-16 00:43:24,715 - INFO -   Speed: 14.4 steps/s


Training Episodes:  37%|▎| 37/100 [34:12<43:29, 41.42s/it, Avg Reward=0.00, Epsi

2025-07-16 00:43:24,717 - INFO - Starting episode 37/100
2025-07-16 00:43:24,761 - INFO - Step 0/100 - Total Steps: 29608 - Avg Reward: 0.00
2025-07-16 00:43:35,541 - INFO - Update 0/4 - Loss: 415032640.0000
2025-07-16 00:44:10,840 - INFO - Training updates completed - Avg Loss: 509302016.0000


Training Episodes:  37%|▎| 37/100 [34:58<43:29, 41.42s/it, Avg Reward=0.00, Epsi

2025-07-16 00:44:10,844 - INFO - Episode 37 completed in 46.09 seconds
2025-07-16 00:44:10,844 - INFO -   Total Steps: 30400
2025-07-16 00:44:10,845 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:44:10,845 - INFO -   Epsilon: 0.5176
2025-07-16 00:44:10,845 - INFO -   Avg Loss: 509302016.0000
2025-07-16 00:44:10,845 - INFO -   Speed: 14.5 steps/s


Training Episodes:  38%|▍| 38/100 [34:58<44:15, 42.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:44:10,846 - INFO - Starting episode 38/100
2025-07-16 00:44:10,887 - INFO - Step 0/100 - Total Steps: 30408 - Avg Reward: 0.00
2025-07-16 00:44:22,702 - INFO - Update 0/4 - Loss: 1078049408.0000
2025-07-16 00:44:55,264 - INFO - Training updates completed - Avg Loss: 1134542544.0000


Training Episodes:  38%|▍| 38/100 [35:43<44:15, 42.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:44:55,270 - INFO - Episode 38 completed in 44.39 seconds
2025-07-16 00:44:55,270 - INFO -   Total Steps: 31200
2025-07-16 00:44:55,271 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:44:55,271 - INFO -   Epsilon: 0.5128
2025-07-16 00:44:55,271 - INFO -   Avg Loss: 1134542544.0000
2025-07-16 00:44:55,272 - INFO -   Speed: 14.6 steps/s


Training Episodes:  39%|▍| 39/100 [35:43<44:01, 43.31s/it, Avg Reward=0.00, Epsi

2025-07-16 00:44:55,273 - INFO - Starting episode 39/100
2025-07-16 00:44:55,320 - INFO - Step 0/100 - Total Steps: 31208 - Avg Reward: 0.00
2025-07-16 00:45:09,250 - INFO - Update 0/4 - Loss: 1527110656.0000
2025-07-16 00:45:40,293 - INFO - Training updates completed - Avg Loss: 1336693808.0000


Training Episodes:  39%|▍| 39/100 [36:28<44:01, 43.31s/it, Avg Reward=0.00, Epsi

2025-07-16 00:45:40,296 - INFO - Episode 39 completed in 44.99 seconds
2025-07-16 00:45:40,297 - INFO -   Total Steps: 32000
2025-07-16 00:45:40,297 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:45:40,298 - INFO -   Epsilon: 0.5080
2025-07-16 00:45:40,298 - INFO -   Avg Loss: 1336693808.0000
2025-07-16 00:45:40,298 - INFO -   Speed: 14.6 steps/s


Training Episodes:  40%|▍| 40/100 [36:28<43:49, 43.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:45:40,299 - INFO - Starting episode 40/100
2025-07-16 00:45:40,343 - INFO - Step 0/100 - Total Steps: 32008 - Avg Reward: 0.00
2025-07-16 00:45:51,335 - INFO - Update 0/4 - Loss: 647788352.0000
2025-07-16 00:46:19,254 - INFO - Training updates completed - Avg Loss: 553929352.0000


Training Episodes:  40%|▍| 40/100 [37:07<43:49, 43.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:46:19,257 - INFO - Episode 40 completed in 38.92 seconds
2025-07-16 00:46:19,257 - INFO -   Total Steps: 32800
2025-07-16 00:46:19,257 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:46:19,258 - INFO -   Epsilon: 0.5032
2025-07-16 00:46:19,258 - INFO -   Avg Loss: 553929352.0000
2025-07-16 00:46:19,258 - INFO -   Speed: 14.7 steps/s


Training Episodes:  41%|▍| 41/100 [37:07<41:39, 42.37s/it, Avg Reward=0.00, Epsi

2025-07-16 00:46:19,259 - INFO - Starting episode 41/100
2025-07-16 00:46:19,299 - INFO - Step 0/100 - Total Steps: 32808 - Avg Reward: 0.00
2025-07-16 00:46:29,119 - INFO - Update 0/4 - Loss: 308877728.0000
2025-07-16 00:46:56,189 - INFO - Training updates completed - Avg Loss: 525079272.0000


Training Episodes:  41%|▍| 41/100 [37:44<41:39, 42.37s/it, Avg Reward=0.00, Epsi

2025-07-16 00:46:56,192 - INFO - Episode 41 completed in 36.90 seconds
2025-07-16 00:46:56,192 - INFO -   Total Steps: 33600
2025-07-16 00:46:56,193 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:46:56,193 - INFO -   Epsilon: 0.4984
2025-07-16 00:46:56,193 - INFO -   Avg Loss: 525079272.0000
2025-07-16 00:46:56,193 - INFO -   Speed: 14.8 steps/s


Training Episodes:  42%|▍| 42/100 [37:44<39:22, 40.74s/it, Avg Reward=0.00, Epsi

2025-07-16 00:46:56,194 - INFO - Starting episode 42/100
2025-07-16 00:46:56,237 - INFO - Step 0/100 - Total Steps: 33608 - Avg Reward: 0.00
2025-07-16 00:47:06,121 - INFO - Update 0/4 - Loss: 97194248.0000
2025-07-16 00:47:34,668 - INFO - Training updates completed - Avg Loss: 424271538.0000


Training Episodes:  42%|▍| 42/100 [38:22<39:22, 40.74s/it, Avg Reward=0.00, Epsi

2025-07-16 00:47:34,672 - INFO - Episode 42 completed in 38.44 seconds
2025-07-16 00:47:34,672 - INFO -   Total Steps: 34400
2025-07-16 00:47:34,672 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:47:34,673 - INFO -   Epsilon: 0.4936
2025-07-16 00:47:34,673 - INFO -   Avg Loss: 424271538.0000
2025-07-16 00:47:34,673 - INFO -   Speed: 14.9 steps/s


Training Episodes:  43%|▍| 43/100 [38:22<38:03, 40.06s/it, Avg Reward=0.00, Epsi

2025-07-16 00:47:34,674 - INFO - Starting episode 43/100
2025-07-16 00:47:34,716 - INFO - Step 0/100 - Total Steps: 34408 - Avg Reward: 0.00
2025-07-16 00:47:44,986 - INFO - Update 0/4 - Loss: 654800960.0000
2025-07-16 00:48:13,948 - INFO - Training updates completed - Avg Loss: 589836080.0000


Training Episodes:  43%|▍| 43/100 [39:02<38:03, 40.06s/it, Avg Reward=0.00, Epsi

2025-07-16 00:48:13,951 - INFO - Episode 43 completed in 39.25 seconds
2025-07-16 00:48:13,952 - INFO -   Total Steps: 35200
2025-07-16 00:48:13,952 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:48:13,953 - INFO -   Epsilon: 0.4888
2025-07-16 00:48:13,953 - INFO -   Avg Loss: 589836080.0000
2025-07-16 00:48:13,953 - INFO -   Speed: 15.0 steps/s


Training Episodes:  44%|▍| 44/100 [39:02<37:10, 39.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:48:13,954 - INFO - Starting episode 44/100
2025-07-16 00:48:13,994 - INFO - Step 0/100 - Total Steps: 35208 - Avg Reward: 0.00
2025-07-16 00:48:24,383 - INFO - Update 0/4 - Loss: 616673984.0000
2025-07-16 00:48:52,920 - INFO - Training updates completed - Avg Loss: 318643920.0000


Training Episodes:  44%|▍| 44/100 [39:41<37:10, 39.83s/it, Avg Reward=0.00, Epsi

2025-07-16 00:48:52,923 - INFO - Episode 44 completed in 38.94 seconds
2025-07-16 00:48:52,923 - INFO -   Total Steps: 36000
2025-07-16 00:48:52,924 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:48:52,924 - INFO -   Epsilon: 0.4840
2025-07-16 00:48:52,925 - INFO -   Avg Loss: 318643920.0000
2025-07-16 00:48:52,925 - INFO -   Speed: 15.1 steps/s


Training Episodes:  45%|▍| 45/100 [39:41<36:16, 39.57s/it, Avg Reward=0.00, Epsi

2025-07-16 00:48:52,925 - INFO - Starting episode 45/100
2025-07-16 00:48:52,965 - INFO - Step 0/100 - Total Steps: 36008 - Avg Reward: 0.00
2025-07-16 00:49:03,276 - INFO - Update 0/4 - Loss: 243920704.0000
2025-07-16 00:49:32,386 - INFO - Training updates completed - Avg Loss: 354248944.0000


Training Episodes:  45%|▍| 45/100 [40:20<36:16, 39.57s/it, Avg Reward=0.00, Epsi

2025-07-16 00:49:32,389 - INFO - Episode 45 completed in 39.43 seconds
2025-07-16 00:49:32,389 - INFO -   Total Steps: 36800
2025-07-16 00:49:32,390 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:49:32,390 - INFO -   Epsilon: 0.4792
2025-07-16 00:49:32,390 - INFO -   Avg Loss: 354248944.0000
2025-07-16 00:49:32,391 - INFO -   Speed: 15.2 steps/s


Training Episodes:  46%|▍| 46/100 [40:20<35:35, 39.54s/it, Avg Reward=0.00, Epsi

2025-07-16 00:49:32,392 - INFO - Starting episode 46/100
2025-07-16 00:49:32,433 - INFO - Step 0/100 - Total Steps: 36808 - Avg Reward: 0.00
2025-07-16 00:49:43,010 - INFO - Update 0/4 - Loss: 185063520.0000
2025-07-16 00:50:14,519 - INFO - Training updates completed - Avg Loss: 399165760.0000


Training Episodes:  46%|▍| 46/100 [41:02<35:35, 39.54s/it, Avg Reward=0.00, Epsi

2025-07-16 00:50:14,530 - INFO - Episode 46 completed in 42.11 seconds
2025-07-16 00:50:14,530 - INFO -   Total Steps: 37600
2025-07-16 00:50:14,531 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:50:14,531 - INFO -   Epsilon: 0.4744
2025-07-16 00:50:14,531 - INFO -   Avg Loss: 399165760.0000
2025-07-16 00:50:14,531 - INFO -   Speed: 15.3 steps/s


Training Episodes:  47%|▍| 47/100 [41:02<35:36, 40.32s/it, Avg Reward=0.00, Epsi

2025-07-16 00:50:14,532 - INFO - Starting episode 47/100
2025-07-16 00:50:14,591 - INFO - Step 0/100 - Total Steps: 37608 - Avg Reward: 0.00
2025-07-16 00:50:26,036 - INFO - Update 0/4 - Loss: 319334400.0000
2025-07-16 00:50:55,344 - INFO - Training updates completed - Avg Loss: 305014840.0000


Training Episodes:  47%|▍| 47/100 [41:43<35:36, 40.32s/it, Avg Reward=0.00, Epsi

2025-07-16 00:50:55,349 - INFO - Episode 47 completed in 40.77 seconds
2025-07-16 00:50:55,349 - INFO -   Total Steps: 38400
2025-07-16 00:50:55,349 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:50:55,350 - INFO -   Epsilon: 0.4696
2025-07-16 00:50:55,350 - INFO -   Avg Loss: 305014840.0000
2025-07-16 00:50:55,350 - INFO -   Speed: 15.3 steps/s


Training Episodes:  48%|▍| 48/100 [41:43<35:04, 40.47s/it, Avg Reward=0.00, Epsi

2025-07-16 00:50:55,351 - INFO - Starting episode 48/100
2025-07-16 00:50:55,408 - INFO - Step 0/100 - Total Steps: 38408 - Avg Reward: 0.00
2025-07-16 00:51:06,390 - INFO - Update 0/4 - Loss: 304284928.0000
2025-07-16 00:51:35,619 - INFO - Training updates completed - Avg Loss: 285331872.0000


Training Episodes:  48%|▍| 48/100 [42:23<35:04, 40.47s/it, Avg Reward=0.00, Epsi

2025-07-16 00:51:35,625 - INFO - Episode 48 completed in 40.23 seconds
2025-07-16 00:51:35,626 - INFO -   Total Steps: 39200
2025-07-16 00:51:35,626 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:51:35,626 - INFO -   Epsilon: 0.4648
2025-07-16 00:51:35,627 - INFO -   Avg Loss: 285331872.0000
2025-07-16 00:51:35,627 - INFO -   Speed: 15.4 steps/s


Training Episodes:  49%|▍| 49/100 [42:23<34:20, 40.41s/it, Avg Reward=0.00, Epsi

2025-07-16 00:51:35,627 - INFO - Starting episode 49/100
2025-07-16 00:51:35,688 - INFO - Step 0/100 - Total Steps: 39208 - Avg Reward: 0.00
2025-07-16 00:51:45,889 - INFO - Update 0/4 - Loss: 291942240.0000
2025-07-16 00:52:15,297 - INFO - Training updates completed - Avg Loss: 324915976.0000


Training Episodes:  49%|▍| 49/100 [43:03<34:20, 40.41s/it, Avg Reward=0.00, Epsi

2025-07-16 00:52:15,301 - INFO - Episode 49 completed in 39.63 seconds
2025-07-16 00:52:15,301 - INFO -   Total Steps: 40000
2025-07-16 00:52:15,301 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:52:15,302 - INFO -   Epsilon: 0.4600
2025-07-16 00:52:15,302 - INFO -   Avg Loss: 324915976.0000
2025-07-16 00:52:15,303 - INFO -   Speed: 15.5 steps/s


Training Episodes:  50%|▌| 50/100 [43:03<33:29, 40.19s/it, Avg Reward=0.00, Epsi

2025-07-16 00:52:15,304 - INFO - Starting episode 50/100
2025-07-16 00:52:15,361 - INFO - Step 0/100 - Total Steps: 40008 - Avg Reward: 0.00
2025-07-16 00:52:25,639 - INFO - Update 0/4 - Loss: 336293568.0000
2025-07-16 00:52:54,108 - INFO - Training updates completed - Avg Loss: 292424212.0000


Training Episodes:  50%|▌| 50/100 [43:42<33:29, 40.19s/it, Avg Reward=0.00, Epsi

2025-07-16 00:52:54,112 - INFO - Episode 50 completed in 38.77 seconds
2025-07-16 00:52:54,113 - INFO -   Total Steps: 40800
2025-07-16 00:52:54,113 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:52:54,113 - INFO -   Epsilon: 0.4552
2025-07-16 00:52:54,114 - INFO -   Avg Loss: 292424212.0000
2025-07-16 00:52:54,114 - INFO -   Speed: 15.6 steps/s


Training Episodes:  51%|▌| 51/100 [43:42<32:29, 39.78s/it, Avg Reward=0.00, Epsi

2025-07-16 00:52:54,114 - INFO - Starting episode 51/100
2025-07-16 00:52:54,156 - INFO - Step 0/100 - Total Steps: 40808 - Avg Reward: 0.00
2025-07-16 00:53:04,634 - INFO - Update 0/4 - Loss: 407096448.0000
2025-07-16 00:53:31,949 - INFO - Training updates completed - Avg Loss: 289419474.0000


Training Episodes:  51%|▌| 51/100 [44:20<32:29, 39.78s/it, Avg Reward=0.00, Epsi

2025-07-16 00:53:31,952 - INFO - Episode 51 completed in 37.81 seconds
2025-07-16 00:53:31,952 - INFO -   Total Steps: 41600
2025-07-16 00:53:31,953 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:53:31,953 - INFO -   Epsilon: 0.4504
2025-07-16 00:53:31,953 - INFO -   Avg Loss: 289419474.0000
2025-07-16 00:53:31,954 - INFO -   Speed: 15.6 steps/s


Training Episodes:  52%|▌| 52/100 [44:20<31:21, 39.20s/it, Avg Reward=0.00, Epsi

2025-07-16 00:53:31,965 - INFO - Starting episode 52/100
2025-07-16 00:53:32,005 - INFO - Step 0/100 - Total Steps: 41608 - Avg Reward: 0.00
2025-07-16 00:53:41,749 - INFO - Update 0/4 - Loss: 476065504.0000
2025-07-16 00:54:09,048 - INFO - Training updates completed - Avg Loss: 299322798.0000


Training Episodes:  52%|▌| 52/100 [44:57<31:21, 39.20s/it, Avg Reward=0.00, Epsi

2025-07-16 00:54:09,052 - INFO - Episode 52 completed in 37.05 seconds
2025-07-16 00:54:09,052 - INFO -   Total Steps: 42400
2025-07-16 00:54:09,052 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:54:09,053 - INFO -   Epsilon: 0.4456
2025-07-16 00:54:09,053 - INFO -   Avg Loss: 299322798.0000
2025-07-16 00:54:09,053 - INFO -   Speed: 15.7 steps/s


Training Episodes:  53%|▌| 53/100 [44:57<30:12, 38.57s/it, Avg Reward=0.00, Epsi

2025-07-16 00:54:09,053 - INFO - Starting episode 53/100
2025-07-16 00:54:09,095 - INFO - Step 0/100 - Total Steps: 42408 - Avg Reward: 0.00
2025-07-16 00:54:19,056 - INFO - Update 0/4 - Loss: 532209152.0000
2025-07-16 00:54:46,482 - INFO - Training updates completed - Avg Loss: 311491065.0000


Training Episodes:  53%|▌| 53/100 [45:34<30:12, 38.57s/it, Avg Reward=0.00, Epsi

2025-07-16 00:54:46,485 - INFO - Episode 53 completed in 37.40 seconds
2025-07-16 00:54:46,486 - INFO -   Total Steps: 43200
2025-07-16 00:54:46,486 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:54:46,487 - INFO -   Epsilon: 0.4408
2025-07-16 00:54:46,488 - INFO -   Avg Loss: 311491065.0000
2025-07-16 00:54:46,488 - INFO -   Speed: 15.8 steps/s


Training Episodes:  54%|▌| 54/100 [45:34<29:18, 38.23s/it, Avg Reward=0.00, Epsi

2025-07-16 00:54:46,488 - INFO - Starting episode 54/100
2025-07-16 00:54:46,528 - INFO - Step 0/100 - Total Steps: 43208 - Avg Reward: 0.00
2025-07-16 00:54:57,143 - INFO - Update 0/4 - Loss: 588351744.0000
2025-07-16 00:55:24,820 - INFO - Training updates completed - Avg Loss: 327713817.5000


Training Episodes:  54%|▌| 54/100 [46:12<29:18, 38.23s/it, Avg Reward=0.00, Epsi

2025-07-16 00:55:24,823 - INFO - Episode 54 completed in 38.30 seconds
2025-07-16 00:55:24,823 - INFO -   Total Steps: 44000
2025-07-16 00:55:24,824 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:55:24,824 - INFO -   Epsilon: 0.4360
2025-07-16 00:55:24,824 - INFO -   Avg Loss: 327713817.5000
2025-07-16 00:55:24,825 - INFO -   Speed: 15.9 steps/s


Training Episodes:  55%|▌| 55/100 [46:12<28:41, 38.26s/it, Avg Reward=0.00, Epsi

2025-07-16 00:55:24,825 - INFO - Starting episode 55/100
2025-07-16 00:55:24,870 - INFO - Step 0/100 - Total Steps: 44008 - Avg Reward: 0.00
2025-07-16 00:55:35,870 - INFO - Update 0/4 - Loss: 630734272.0000
2025-07-16 00:56:12,099 - INFO - Training updates completed - Avg Loss: 343357208.5000


Training Episodes:  55%|▌| 55/100 [47:00<28:41, 38.26s/it, Avg Reward=0.00, Epsi

2025-07-16 00:56:12,104 - INFO - Episode 55 completed in 47.24 seconds
2025-07-16 00:56:12,105 - INFO -   Total Steps: 44800
2025-07-16 00:56:12,105 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:56:12,105 - INFO -   Epsilon: 0.4312
2025-07-16 00:56:12,106 - INFO -   Avg Loss: 343357208.5000
2025-07-16 00:56:12,106 - INFO -   Speed: 15.9 steps/s


Training Episodes:  56%|▌| 56/100 [47:00<30:02, 40.97s/it, Avg Reward=0.00, Epsi

2025-07-16 00:56:12,107 - INFO - Starting episode 56/100
2025-07-16 00:56:12,156 - INFO - Step 0/100 - Total Steps: 44808 - Avg Reward: 0.00
2025-07-16 00:56:27,333 - INFO - Update 0/4 - Loss: 670806016.0000
2025-07-16 00:57:06,503 - INFO - Training updates completed - Avg Loss: 358179820.2500


Training Episodes:  56%|▌| 56/100 [47:54<30:02, 40.97s/it, Avg Reward=0.00, Epsi

2025-07-16 00:57:06,507 - INFO - Episode 56 completed in 54.36 seconds
2025-07-16 00:57:06,507 - INFO -   Total Steps: 45600
2025-07-16 00:57:06,507 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:57:06,507 - INFO -   Epsilon: 0.4264
2025-07-16 00:57:06,508 - INFO -   Avg Loss: 358179820.2500
2025-07-16 00:57:06,508 - INFO -   Speed: 15.9 steps/s


Training Episodes:  57%|▌| 57/100 [47:54<32:14, 45.00s/it, Avg Reward=0.00, Epsi

2025-07-16 00:57:06,508 - INFO - Starting episode 57/100
2025-07-16 00:57:06,552 - INFO - Step 0/100 - Total Steps: 45608 - Avg Reward: 0.00
2025-07-16 00:57:20,671 - INFO - Update 0/4 - Loss: 705486912.0000
2025-07-16 00:57:55,169 - INFO - Training updates completed - Avg Loss: 372420194.2500


Training Episodes:  57%|▌| 57/100 [48:43<32:14, 45.00s/it, Avg Reward=0.00, Epsi

2025-07-16 00:57:55,172 - INFO - Episode 57 completed in 48.63 seconds
2025-07-16 00:57:55,173 - INFO -   Total Steps: 46400
2025-07-16 00:57:55,173 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:57:55,173 - INFO -   Epsilon: 0.4216
2025-07-16 00:57:55,173 - INFO -   Avg Loss: 372420194.2500
2025-07-16 00:57:55,173 - INFO -   Speed: 15.9 steps/s


Training Episodes:  58%|▌| 58/100 [48:43<32:16, 46.10s/it, Avg Reward=0.00, Epsi

2025-07-16 00:57:55,174 - INFO - Starting episode 58/100
2025-07-16 00:57:55,216 - INFO - Step 0/100 - Total Steps: 46408 - Avg Reward: 0.00
2025-07-16 00:58:07,417 - INFO - Update 0/4 - Loss: 732552832.0000
2025-07-16 00:58:41,236 - INFO - Training updates completed - Avg Loss: 383514467.9375


Training Episodes:  58%|▌| 58/100 [49:29<32:16, 46.10s/it, Avg Reward=0.00, Epsi

2025-07-16 00:58:41,240 - INFO - Episode 58 completed in 46.03 seconds
2025-07-16 00:58:41,240 - INFO -   Total Steps: 47200
2025-07-16 00:58:41,241 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:58:41,241 - INFO -   Epsilon: 0.4168
2025-07-16 00:58:41,241 - INFO -   Avg Loss: 383514467.9375
2025-07-16 00:58:41,242 - INFO -   Speed: 15.9 steps/s


Training Episodes:  59%|▌| 59/100 [49:29<31:29, 46.09s/it, Avg Reward=0.00, Epsi

2025-07-16 00:58:41,242 - INFO - Starting episode 59/100
2025-07-16 00:58:41,289 - INFO - Step 0/100 - Total Steps: 47208 - Avg Reward: 0.00
2025-07-16 00:58:52,575 - INFO - Update 0/4 - Loss: 756326912.0000
2025-07-16 00:59:25,975 - INFO - Training updates completed - Avg Loss: 377888915.4062


Training Episodes:  59%|▌| 59/100 [50:14<31:29, 46.09s/it, Avg Reward=0.00, Epsi

2025-07-16 00:59:25,979 - INFO - Episode 59 completed in 44.70 seconds
2025-07-16 00:59:25,979 - INFO -   Total Steps: 48000
2025-07-16 00:59:25,980 - INFO -   Avg Reward (50): 0.00
2025-07-16 00:59:25,980 - INFO -   Epsilon: 0.4120
2025-07-16 00:59:25,980 - INFO -   Avg Loss: 377888915.4062
2025-07-16 00:59:25,980 - INFO -   Speed: 15.9 steps/s


Training Episodes:  60%|▌| 60/100 [50:14<30:27, 45.68s/it, Avg Reward=0.00, Epsi

2025-07-16 00:59:25,981 - INFO - Starting episode 60/100
2025-07-16 00:59:26,023 - INFO - Step 0/100 - Total Steps: 48008 - Avg Reward: 0.00
2025-07-16 00:59:38,396 - INFO - Update 0/4 - Loss: 669853376.0000
2025-07-16 01:00:12,844 - INFO - Training updates completed - Avg Loss: 361424176.0000


Training Episodes:  60%|▌| 60/100 [51:00<30:27, 45.68s/it, Avg Reward=0.00, Epsi

2025-07-16 01:00:12,860 - INFO - Episode 60 completed in 46.83 seconds
2025-07-16 01:00:12,860 - INFO -   Total Steps: 48800
2025-07-16 01:00:12,860 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:00:12,861 - INFO -   Epsilon: 0.4072
2025-07-16 01:00:12,861 - INFO -   Avg Loss: 361424176.0000
2025-07-16 01:00:12,861 - INFO -   Speed: 15.9 steps/s


Training Episodes:  61%|▌| 61/100 [51:00<29:55, 46.04s/it, Avg Reward=0.00, Epsi

2025-07-16 01:00:12,862 - INFO - Starting episode 61/100
2025-07-16 01:00:12,910 - INFO - Step 0/100 - Total Steps: 48808 - Avg Reward: 0.00
2025-07-16 01:00:24,481 - INFO - Update 0/4 - Loss: 551332608.0000
2025-07-16 01:00:59,223 - INFO - Training updates completed - Avg Loss: 383868980.0000


Training Episodes:  61%|▌| 61/100 [51:47<29:55, 46.04s/it, Avg Reward=0.00, Epsi

2025-07-16 01:00:59,233 - INFO - Episode 61 completed in 46.33 seconds
2025-07-16 01:00:59,233 - INFO -   Total Steps: 49600
2025-07-16 01:00:59,233 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:00:59,234 - INFO -   Epsilon: 0.4024
2025-07-16 01:00:59,234 - INFO -   Avg Loss: 383868980.0000
2025-07-16 01:00:59,234 - INFO -   Speed: 16.0 steps/s


Training Episodes:  62%|▌| 62/100 [51:47<29:13, 46.14s/it, Avg Reward=0.00, Epsi

2025-07-16 01:00:59,234 - INFO - Starting episode 62/100
2025-07-16 01:00:59,292 - INFO - Step 0/100 - Total Steps: 49608 - Avg Reward: 0.00
2025-07-16 01:01:09,937 - INFO - Update 0/4 - Loss: 525027968.0000
2025-07-16 01:01:42,297 - INFO - Training updates completed - Avg Loss: 384468808.0000


Training Episodes:  62%|▌| 62/100 [52:30<29:13, 46.14s/it, Avg Reward=0.00, Epsi

2025-07-16 01:01:42,301 - INFO - Episode 62 completed in 43.02 seconds
2025-07-16 01:01:42,301 - INFO -   Total Steps: 50400
2025-07-16 01:01:42,301 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:01:42,302 - INFO -   Epsilon: 0.3976
2025-07-16 01:01:42,302 - INFO -   Avg Loss: 384468808.0000
2025-07-16 01:01:42,302 - INFO -   Speed: 16.0 steps/s


Training Episodes:  63%|▋| 63/100 [52:30<27:53, 45.22s/it, Avg Reward=0.00, Epsi

2025-07-16 01:01:42,303 - INFO - Starting episode 63/100
2025-07-16 01:01:42,343 - INFO - Step 0/100 - Total Steps: 50408 - Avg Reward: 0.00
2025-07-16 01:01:54,788 - INFO - Update 0/4 - Loss: 551399232.0000
2025-07-16 01:02:27,235 - INFO - Training updates completed - Avg Loss: 350560508.0000


Training Episodes:  63%|▋| 63/100 [53:15<27:53, 45.22s/it, Avg Reward=0.00, Epsi

2025-07-16 01:02:27,239 - INFO - Episode 63 completed in 44.90 seconds
2025-07-16 01:02:27,239 - INFO -   Total Steps: 51200
2025-07-16 01:02:27,240 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:02:27,240 - INFO -   Epsilon: 0.3928
2025-07-16 01:02:27,240 - INFO -   Avg Loss: 350560508.0000
2025-07-16 01:02:27,241 - INFO -   Speed: 16.0 steps/s


Training Episodes:  64%|▋| 64/100 [53:15<27:04, 45.14s/it, Avg Reward=0.00, Epsi

2025-07-16 01:02:27,241 - INFO - Starting episode 64/100
2025-07-16 01:02:27,282 - INFO - Step 0/100 - Total Steps: 51208 - Avg Reward: 0.00
2025-07-16 01:02:38,023 - INFO - Update 0/4 - Loss: 509509376.0000
2025-07-16 01:03:09,637 - INFO - Training updates completed - Avg Loss: 317527048.0000


Training Episodes:  64%|▋| 64/100 [53:57<27:04, 45.14s/it, Avg Reward=0.00, Epsi

2025-07-16 01:03:09,641 - INFO - Episode 64 completed in 42.37 seconds
2025-07-16 01:03:09,642 - INFO -   Total Steps: 52000
2025-07-16 01:03:09,643 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:03:09,644 - INFO -   Epsilon: 0.3880
2025-07-16 01:03:09,644 - INFO -   Avg Loss: 317527048.0000
2025-07-16 01:03:09,644 - INFO -   Speed: 16.1 steps/s


Training Episodes:  65%|▋| 65/100 [53:57<25:51, 44.32s/it, Avg Reward=0.00, Epsi

2025-07-16 01:03:09,645 - INFO - Starting episode 65/100
2025-07-16 01:03:09,686 - INFO - Step 0/100 - Total Steps: 52008 - Avg Reward: 0.00
2025-07-16 01:03:23,608 - INFO - Update 0/4 - Loss: 378545504.0000
2025-07-16 01:03:52,962 - INFO - Training updates completed - Avg Loss: 323417272.0000


Training Episodes:  65%|▋| 65/100 [54:41<25:51, 44.32s/it, Avg Reward=0.00, Epsi

2025-07-16 01:03:52,965 - INFO - Episode 65 completed in 43.29 seconds
2025-07-16 01:03:52,966 - INFO -   Total Steps: 52800
2025-07-16 01:03:52,966 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:03:52,966 - INFO -   Epsilon: 0.3832
2025-07-16 01:03:52,966 - INFO -   Avg Loss: 323417272.0000
2025-07-16 01:03:52,967 - INFO -   Speed: 16.1 steps/s


Training Episodes:  66%|▋| 66/100 [54:41<24:56, 44.02s/it, Avg Reward=0.00, Epsi

2025-07-16 01:03:52,967 - INFO - Starting episode 66/100
2025-07-16 01:03:53,008 - INFO - Step 0/100 - Total Steps: 52808 - Avg Reward: 0.00
2025-07-16 01:04:03,585 - INFO - Update 0/4 - Loss: 354654464.0000
2025-07-16 01:04:35,178 - INFO - Training updates completed - Avg Loss: 377607880.0000


Training Episodes:  66%|▋| 66/100 [55:23<24:56, 44.02s/it, Avg Reward=0.00, Epsi

2025-07-16 01:04:35,180 - INFO - Episode 66 completed in 42.18 seconds
2025-07-16 01:04:35,181 - INFO -   Total Steps: 53600
2025-07-16 01:04:35,181 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:04:35,181 - INFO -   Epsilon: 0.3784
2025-07-16 01:04:35,182 - INFO -   Avg Loss: 377607880.0000
2025-07-16 01:04:35,182 - INFO -   Speed: 16.1 steps/s


Training Episodes:  67%|▋| 67/100 [55:23<23:54, 43.48s/it, Avg Reward=0.00, Epsi

2025-07-16 01:04:35,182 - INFO - Starting episode 67/100
2025-07-16 01:04:35,223 - INFO - Step 0/100 - Total Steps: 53608 - Avg Reward: 0.00
2025-07-16 01:04:45,668 - INFO - Update 0/4 - Loss: 265423424.0000
2025-07-16 01:05:20,266 - INFO - Training updates completed - Avg Loss: 306808768.0000


Training Episodes:  67%|▋| 67/100 [56:08<23:54, 43.48s/it, Avg Reward=0.00, Epsi

2025-07-16 01:05:20,270 - INFO - Episode 67 completed in 45.06 seconds
2025-07-16 01:05:20,271 - INFO -   Total Steps: 54400
2025-07-16 01:05:20,272 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:05:20,272 - INFO -   Epsilon: 0.3736
2025-07-16 01:05:20,273 - INFO -   Avg Loss: 306808768.0000
2025-07-16 01:05:20,273 - INFO -   Speed: 16.2 steps/s


Training Episodes:  68%|▋| 68/100 [56:08<23:26, 43.96s/it, Avg Reward=0.00, Epsi

2025-07-16 01:05:20,274 - INFO - Starting episode 68/100
2025-07-16 01:05:20,315 - INFO - Step 0/100 - Total Steps: 54408 - Avg Reward: 0.00
2025-07-16 01:05:30,858 - INFO - Update 0/4 - Loss: 260614432.0000
2025-07-16 01:06:02,469 - INFO - Training updates completed - Avg Loss: 358636264.0000


Training Episodes:  68%|▋| 68/100 [56:50<23:26, 43.96s/it, Avg Reward=0.00, Epsi

2025-07-16 01:06:02,473 - INFO - Episode 68 completed in 42.17 seconds
2025-07-16 01:06:02,474 - INFO -   Total Steps: 55200
2025-07-16 01:06:02,474 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:06:02,474 - INFO -   Epsilon: 0.3688
2025-07-16 01:06:02,475 - INFO -   Avg Loss: 358636264.0000
2025-07-16 01:06:02,475 - INFO -   Speed: 16.2 steps/s


Training Episodes:  69%|▋| 69/100 [56:50<22:26, 43.43s/it, Avg Reward=0.00, Epsi

2025-07-16 01:06:02,475 - INFO - Starting episode 69/100
2025-07-16 01:06:02,517 - INFO - Step 0/100 - Total Steps: 55208 - Avg Reward: 0.00
2025-07-16 01:06:13,374 - INFO - Update 0/4 - Loss: 434240320.0000
2025-07-16 01:06:44,474 - INFO - Training updates completed - Avg Loss: 320918224.0000


Training Episodes:  69%|▋| 69/100 [57:32<22:26, 43.43s/it, Avg Reward=0.00, Epsi

2025-07-16 01:06:44,477 - INFO - Episode 69 completed in 41.97 seconds
2025-07-16 01:06:44,477 - INFO -   Total Steps: 56000
2025-07-16 01:06:44,478 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:06:44,478 - INFO -   Epsilon: 0.3640
2025-07-16 01:06:44,478 - INFO -   Avg Loss: 320918224.0000
2025-07-16 01:06:44,478 - INFO -   Speed: 16.2 steps/s


Training Episodes:  70%|▋| 70/100 [57:32<21:30, 43.00s/it, Avg Reward=0.00, Epsi

2025-07-16 01:06:44,479 - INFO - Starting episode 70/100
2025-07-16 01:06:44,521 - INFO - Step 0/100 - Total Steps: 56008 - Avg Reward: 0.00
2025-07-16 01:06:54,965 - INFO - Update 0/4 - Loss: 363145152.0000
2025-07-16 01:07:39,203 - INFO - Training updates completed - Avg Loss: 296277116.0000


Training Episodes:  70%|▋| 70/100 [58:27<21:30, 43.00s/it, Avg Reward=0.00, Epsi

2025-07-16 01:07:39,215 - INFO - Episode 70 completed in 54.70 seconds
2025-07-16 01:07:39,215 - INFO -   Total Steps: 56800
2025-07-16 01:07:39,215 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:07:39,216 - INFO -   Epsilon: 0.3592
2025-07-16 01:07:39,216 - INFO -   Avg Loss: 296277116.0000
2025-07-16 01:07:39,216 - INFO -   Speed: 16.2 steps/s


Training Episodes:  71%|▋| 71/100 [58:27<22:29, 46.52s/it, Avg Reward=0.00, Epsi

2025-07-16 01:07:39,217 - INFO - Starting episode 71/100
2025-07-16 01:07:39,281 - INFO - Step 0/100 - Total Steps: 56808 - Avg Reward: 0.00
2025-07-16 01:07:53,467 - INFO - Update 0/4 - Loss: 436509952.0000
2025-07-16 01:08:34,013 - INFO - Training updates completed - Avg Loss: 340612444.0000


Training Episodes:  71%|▋| 71/100 [59:22<22:29, 46.52s/it, Avg Reward=0.00, Epsi

2025-07-16 01:08:34,018 - INFO - Episode 71 completed in 54.76 seconds
2025-07-16 01:08:34,018 - INFO -   Total Steps: 57600
2025-07-16 01:08:34,019 - INFO -   Avg Reward (50): 0.00
2025-07-16 01:08:34,019 - INFO -   Epsilon: 0.3544
2025-07-16 01:08:34,019 - INFO -   Avg Loss: 340612444.0000
2025-07-16 01:08:34,020 - INFO -   Speed: 16.2 steps/s


Training Episodes:  72%|▋| 72/100 [59:22<22:52, 49.01s/it, Avg Reward=0.00, Epsi

2025-07-16 01:08:34,020 - INFO - Starting episode 72/100
2025-07-16 01:08:34,064 - INFO - Step 0/100 - Total Steps: 57608 - Avg Reward: 0.00
2025-07-16 01:08:46,648 - INFO - Update 0/4 - Loss: 374189376.0000


In [None]:
# Plot results
plt.figure(figsize=(18, 5))

# Plot rewards
plt.subplot(1, 3, 1)
plt.plot(episode_rewards)
plt.title("Episode Rewards")
plt.xlabel("Episode")
plt.ylabel("Total Reward")

# Plot loss
plt.subplot(1, 3, 2)
plt.plot(episode_losses)
plt.title("Training Loss")
plt.xlabel("Episode")
plt.ylabel("Average Loss")

# Plot steps
plt.subplot(1, 3, 3)
plt.plot(episode_steps)
plt.title("Episode Steps")
plt.xlabel("Episode")
plt.ylabel("Steps Taken")

plt.tight_layout()
plt.savefig("training_results.png")
plt.show()
