In [37]:
%pip install tianshou
%pip install pygame

  pid, fd = os.forkpty()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter

import tianshou as ts
from tianshou.policy import DQNPolicy

In [39]:
class CBPLayer(nn.Module):
    def __init__(self, in_features, out_features, activation=nn.ReLU(), cbp_params=None):
        """
        in_features: number of inputs.
        out_features: number of neurons (features) in this layer.
        activation: activation function.
        cbp_params: dict with keys:
            - eta: decay rate (e.g. 0.99)
            - rho: replacement rate (e.g. 1e-4)
            - m: maturity threshold (e.g. 100)
        """
        super().__init__()
        # Initialized the weight for each layer (Initialize: Initialize the weights w0, ..., wL. Let, wl be sampled from a distribution dl)
        self.linear = nn.Linear(in_features, out_features)
        self.activation = activation
        
        # Set default CBP parameters if none provided.
        if cbp_params is None:
            cbp_params = {'eta': 0.99, 'rho': 1e-4, 'm': 100}
        self.eta = cbp_params.get('eta', 0.99)
        self.rho = cbp_params.get('rho', 1e-4)
        self.m = cbp_params.get('m', 100)
        
        # Initialize: Utilities u1, ..., uL, average feature activation f1, ..., fl
        # Then, register_buffer is the PyTorch func to ensure these tensor are moved to the proper CPU or GPU we plan to use
        self.register_buffer('age', torch.zeros(out_features))
        self.register_buffer('u', torch.zeros(out_features))      # overall utility
        self.register_buffer('z', torch.zeros(out_features))      # mean-corrected contribution utility
        self.register_buffer('f_hat', torch.zeros(out_features))  # running average activation
        
        self.last_activation = None  # will be set in forward()

    # this is part of the the forward pass (put here and will be called in CBPNetwork forward pass to avoid redudnacy and readability)
    def forward(self, x):
        z = self.linear(x)
        a = self.activation(z)
        # Function 4 (in the curent paper it's function 1 but in the YT video it's func 4) in the paper discuss about 
        # tracking feature activations to compute utilities. However, it does not specify how to store activations in an implementation.
        # last_activation will be used below to track feature activations to compute utilities
        # detach() tell PyTorch to not track gradients for last_activation (since we only need the activation values, not their gradients) 
        # to avoid using unnecessary memory
        self.last_activation = a.detach()
        return a

    def update_cbp(self, next_layer_weight=None):
        """
        This function updates the CBP utilities for this layer and reinitialize a fraction of neurons if needed.
        Paper: for each input x_t do
        """
        eta = self.eta
        rho = self.rho
        m = self.m
        
        # If no forward pass has been done, skip the update.
        if self.last_activation is None:
            return None

        # Increase the age (Paper: Update age: al += 1).
        self.age += 1

        # Compute the batch-average activation per neuron.
        h = self.last_activation.mean(dim=0)
        
        # Paper: Update feature utility: Using Function 4, 5, and 6
        # Update the running average (f_hat) using an exponential moving average.
        beta = 0.01
        # Paper: Function 2
        self.f_hat = (1 - beta) * self.f_hat + beta * h

        # Compute the absolute difference |h - f_hat| part in function 4 below
        diff = (h - self.f_hat).abs()

        # Sum over the absolute weights connecting this neuron to the next layer.
        if next_layer_weight is not None:
            # next_layer_weight: shape (n_next, out_features) — sum over next-layer neurons.
            sum_abs_out = next_layer_weight.abs().sum(dim=0)
        else:
            sum_abs_out = torch.zeros_like(self.u)
        
        # Sum over the absolute input weights for each neuron in this layer.
        # self.linear.weight is shape (out_features, in_features)
        sum_abs_in = self.linear.weight.abs().sum(dim=1)  # one value per neuron

        # --- Function 4: Mean-corrected Contribution Utility ---
        self.z = (1 - eta) * diff * sum_abs_out + eta * self.z

        # --- Function 5: Adaptation Utility ---
        y = (diff * sum_abs_out) / (sum_abs_in + 1e-8)  # Here, we add epsilon (1e-8) to avoid division by zero

        # --- Function 6: Overall Feature Utility ---
        self.u = (1 - eta) * y + eta * self.u

        # Find eligible features: Features with age more than m (age > m)
        # This createes a boolean tensor (True for neurons older than m and False otherwise).
        eligible = self.age > m

        # If no neuron are eligible for replacement, then skip this computation to avoid error
        if eligible.sum() > 0:
            eligible_indices = torch.nonzero(eligible).squeeze()
            # Determine the number of neurons to replace.
            n_eligible = eligible.sum().item()
            n_replace = max(1, int(n_eligible * rho))
            # Paper: Features to replace: nl∗ρ of eligible features with smallest utility, let their indices be r
            # Among the eligible neurons, select those with the smallest utility.
            # The argsort function call will return indices that would sort the tensor in ascending order.
            utilities = self.u[eligible]
            sorted_order = utilities.argsort() # Sort by utility
            # Map sorted indices back to global indices.
            if eligible_indices.dim() == 0:
                eligible_indices = eligible_indices.unsqueeze(0)
            replace_indices = eligible_indices[sorted_order[:n_replace]] # Here, we select lowest utility
            # For each neuron to replace, we reinitialize its input weights and reset its buffers.
            for idx in replace_indices:
                fan_in = self.linear.in_features
                bound = 1 / math.sqrt(fan_in)

                # Paper: Initialize input weights: Reset the input weights wl−1[r] using samples from dl
                # Reinitialize the weights for this neuron (i.e. one row of self.linear.weight)
                with torch.no_grad():
                    self.linear.weight[idx].uniform_(-bound, bound)
                    if self.linear.bias is not None:
                        self.linear.bias[idx].uniform_(-bound, bound)
                # Paper: Initialize utility, feature activation, and age
                self.u[idx] = 0
                self.z[idx] = 0
                self.f_hat[idx] = 0
                self.age[idx] = 0
            # Return the indices of the replaced neurons so that the next layer can zero out its input weights.
            return replace_indices
        return None

In [40]:
class CBPNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_sizes, cbp_params=None):
        """
        input_dim: dimensionality of the input.
        output_dim: number of actions (output neurons).
        hidden_sizes: list of hidden layer sizes.
        cbp_params: dictionary of CBP parameters (eta, rho, m).
        """
        # Iniitialize the layers
        super().__init__()
        layers = []
        in_dim = input_dim
        for hidden_dim in hidden_sizes:
            layers.append(CBPLayer(in_dim, hidden_dim, activation=nn.ReLU(), cbp_params=cbp_params))
            in_dim = hidden_dim
        self.hidden_layers = nn.ModuleList(layers)
        self.out_layer = nn.Linear(in_dim, output_dim)

    # Forward pass
    def forward(self, obs, state=None, info=None):
        # This will ensure we're working with a tensor
        if isinstance(obs, np.ndarray):
            obs = torch.from_numpy(obs).float()
        
        # Handle batch vs single observation
        if obs.dim() == 1:
            obs = obs.unsqueeze(0)  # Add batch dimension
            
        x = obs
        for layer in self.hidden_layers:
            x = layer(x)
        logits = self.out_layer(x)
        
        # Finally, return both logits and None for the state
        # This will match what Tianshou expects for a stateless network
        return logits, state

    def update_cbp(self):
        """
        Call update_cbp() on each CBP layer. For each layer, if some neurons were replaced,
        zero out the corresponding incoming weights in the next layer.
        """
        # Iterate through each layer
        for i, layer in enumerate(self.hidden_layers):
            if i < len(self.hidden_layers) - 1:
                next_weight = self.hidden_layers[i + 1].linear.weight
            else:
                next_weight = self.out_layer.weight  # for the last hidden layer, use output layer weights.
            # Call update_cbp func of each layer to update the paper's tasks (comment in the update_cbp func above already)
            replace_indices = layer.update_cbp(next_layer_weight=next_weight)
            if replace_indices is not None:
                with torch.no_grad():
                    # For next layer, set weights corresponding to replaced features to zero.
                    # Paper: "Initialize output weights: Set wl[r] to zero"
                    next_weight[:, replace_indices] = 0

In [41]:
class CBPDQNPolicy(DQNPolicy):
    def learn(self, batch, **kwargs):
        # This learn function will call DQNPolicy.learn() from Tianshou, this will:
        # Run the Runs the forward pass using CBPNetwork.forward().
        # Then, it will computes loss.
        # Finally, it updates model weights (SGD).
        result = super().learn(batch, **kwargs)
        # Then update the CBP utilities then (if needed) replace features.
        if hasattr(self.model, 'update_cbp'):
            self.model.update_cbp()
        return result

In [42]:
import numpy as np

# --- Set up logging ---
logger = ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_dqn"))

# --- Create the environment ---
env_name = "CartPole-v1"
train_env = gym.make(env_name)
test_env = gym.make(env_name)

# --- Get state and action info ---
obs_shape = train_env.observation_space.shape  
n_actions = train_env.action_space.n           
input_dim = np.prod(obs_shape)

# --- Define CBP parameters ---
cbp_params = {
    'eta': 0.99,   # decay rate 
    'rho': 1e-4,   # replacement rate 
    'm': 100       # maturity threshold 
}

# --- Build the CBP network ---
hidden_sizes = [128, 128, 128]
model = CBPNetwork(input_dim=input_dim, output_dim=n_actions, hidden_sizes=hidden_sizes, cbp_params=cbp_params)

# --- Set up optimizer ---
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# --- Create the CBP-DQN policy ---
policy = CBPDQNPolicy(
    model=model,
    optim=optim,
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
    action_space=train_env.action_space
)

# --- Set up data collectors ---
train_collector = ts.data.Collector(
    policy, 
    train_env, 
    ts.data.VectorReplayBuffer(20000, 1),
    exploration_noise=True
)
test_collector = ts.data.Collector(policy, test_env, exploration_noise=True)

# --- Train the agent ---
result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=5,
    step_per_epoch=10000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=logger,
).run()

print(f"Finished training in {result.timing.total_time} seconds")
torch.save(policy.state_dict(), 'models/cbp_dqn.pth')

# --- Evaluate the trained agent ---
policy.load_state_dict(torch.load('models/cbp_dqn.pth'))
policy.eval()
policy.set_eps(0.05)

# Create an environment with rendering enabled.
eval_env = gym.make(env_name, render_mode="human")
eval_env.reset()
collector = ts.data.Collector(policy, eval_env, exploration_noise=True)
collector.collect(n_episode=1, render=1/35, reset_before_collect=True)
eval_env.close()

Epoch #1: 10001it [00:08, 1127.31it/s, env_step=10000, gradient_step=1000, len=10, n/ep=1, n/st=10, rew=10.00]                           


Epoch #1: test_reward: 9.880000 ± 1.107068, best_reward: 9.880000 ± 1.107068 in #1


Epoch #2: 10001it [00:10, 911.97it/s, env_step=20000, gradient_step=2000, len=22, n/ep=0, n/st=10, rew=22.00]                             


Epoch #2: test_reward: 34.170000 ± 5.770711, best_reward: 34.170000 ± 5.770711 in #2


Epoch #3: 10001it [00:09, 1015.40it/s, env_step=30000, gradient_step=3000, len=102, n/ep=0, n/st=10, rew=102.00]                           


Epoch #3: test_reward: 27.580000 ± 4.164565, best_reward: 34.170000 ± 5.770711 in #2


Epoch #4: 10001it [00:16, 600.20it/s, env_step=40000, gradient_step=4000, len=144, n/ep=0, n/st=10, rew=144.00]                            


Epoch #4: test_reward: 407.370000 ± 42.719938, best_reward: 407.370000 ± 42.719938 in #4


Epoch #5: 10001it [00:10, 924.49it/s, env_step=50000, gradient_step=5000, len=154, n/ep=0, n/st=10, rew=154.00]                           


Epoch #5: test_reward: 29.930000 ± 8.510294, best_reward: 407.370000 ± 42.719938 in #4
Finished training in 71.10224294662476 seconds


In [43]:
import numpy as np

# --- Set up logging ---
logger = ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_dqn"))

# --- Create the environment ---
env_name = "CartPole-v1"
train_env = gym.make(env_name)
test_env = gym.make(env_name)

# --- Get state and action info ---
obs_shape = train_env.observation_space.shape  
n_actions = train_env.action_space.n           
input_dim = np.prod(obs_shape)

# --- Define CBP parameters ---
cbp_params = {
    'eta': 0.99,   # decay rate 
    'rho': 1e-4,   # replacement rate 
    'm': 100       # maturity threshold 
}

# --- Build the CBP network ---
hidden_sizes = [128, 128, 128]
model = CBPNetwork(input_dim=input_dim, output_dim=n_actions, hidden_sizes=hidden_sizes, cbp_params=cbp_params)

# --- Set up optimizer ---
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# --- Create the CBP-DQN policy ---
policy = CBPDQNPolicy(
    model=model,
    optim=optim,
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
    action_space=train_env.action_space
)

# --- Set up data collectors ---
train_collector = ts.data.Collector(
    policy, 
    train_env, 
    ts.data.VectorReplayBuffer(20000, 1),
    exploration_noise=True
)
test_collector = ts.data.Collector(policy, test_env, exploration_noise=True)

# --- Train the agent ---
result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=5,
    step_per_epoch=10000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=logger,
).run()

print(f"Finished training in {result.timing.total_time} seconds")
torch.save(policy.state_dict(), 'models/cbp_dqn.pth')

# --- Evaluate the trained agent ---
policy.load_state_dict(torch.load('models/cbp_dqn.pth'))
policy.eval()
policy.set_eps(0.05)

# Create an environment with rendering enabled.
eval_env = gym.make(env_name, render_mode="human")
eval_env.reset()
collector = ts.data.Collector(policy, eval_env, exploration_noise=True)
collector.collect(n_episode=1, render=1/35, reset_before_collect=True)
eval_env.close()

Epoch #1: 10001it [00:08, 1147.30it/s, env_step=10000, gradient_step=1000, len=10, n/ep=1, n/st=10, rew=10.00]                           


Epoch #1: test_reward: 9.570000 ± 0.886059, best_reward: 9.690000 ± 1.074197 in #0


Epoch #2: 10001it [00:09, 1023.77it/s, env_step=20000, gradient_step=2000, len=10, n/ep=0, n/st=10, rew=10.50]                           


Epoch #2: test_reward: 9.920000 ± 1.110675, best_reward: 9.920000 ± 1.110675 in #2


Epoch #3: 10001it [00:09, 1055.48it/s, env_step=30000, gradient_step=3000, len=12, n/ep=0, n/st=10, rew=12.00]                           


Epoch #3: test_reward: 36.390000 ± 8.984314, best_reward: 36.390000 ± 8.984314 in #3


Epoch #4: 10001it [00:09, 1053.41it/s, env_step=40000, gradient_step=4000, len=13, n/ep=1, n/st=10, rew=13.00]                            


Epoch #4: test_reward: 14.260000 ± 1.752826, best_reward: 36.390000 ± 8.984314 in #3


Epoch #5: 10001it [00:09, 1019.15it/s, env_step=50000, gradient_step=5000, len=16, n/ep=1, n/st=10, rew=16.00]                            


Epoch #5: test_reward: 18.720000 ± 2.474187, best_reward: 36.390000 ± 8.984314 in #3
Finished training in 50.10233783721924 seconds


In [44]:
import numpy as np

# --- Set up logging ---
logger = ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_dqn"))

# --- Create the environment ---
env_name = "CartPole-v1"
train_env = gym.make(env_name)
test_env = gym.make(env_name)

# --- Get state and action info ---
obs_shape = train_env.observation_space.shape  
n_actions = train_env.action_space.n           
input_dim = np.prod(obs_shape)

# --- Define CBP parameters ---
cbp_params = {
    'eta': 0.99,   # decay rate 
    'rho': 1e-4,   # replacement rate 
    'm': 100       # maturity threshold 
}

# --- Build the CBP network ---
hidden_sizes = [128, 128, 128]
model = CBPNetwork(input_dim=input_dim, output_dim=n_actions, hidden_sizes=hidden_sizes, cbp_params=cbp_params)

# --- Set up optimizer ---
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# --- Create the CBP-DQN policy ---
policy = CBPDQNPolicy(
    model=model,
    optim=optim,
    discount_factor=0.975,
    estimation_step=3,
    target_update_freq=1000,
    action_space=train_env.action_space
)

# --- Set up data collectors ---
train_collector = ts.data.Collector(
    policy, 
    train_env, 
    ts.data.VectorReplayBuffer(20000, 1),
    exploration_noise=True
)
test_collector = ts.data.Collector(policy, test_env, exploration_noise=True)

# --- Train the agent ---
best_mean_reward = -np.inf 

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(policy.state_dict(), 'models/cbp_dqn.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=15,
    step_per_epoch=10000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=logger,
).run()

save_best_model(result.best_reward)

print(f"Finished training in {result.timing.total_time} seconds")

# --- Evaluate the trained agent ---
policy.load_state_dict(torch.load('models/cbp_dqn.pth'))
policy.eval()
policy.set_eps(0.05)

# Create an environment with rendering enabled.
eval_env = gym.make(env_name, render_mode="human")
eval_env.reset()
collector = ts.data.Collector(policy, eval_env, exploration_noise=True)
collector.collect(n_episode=1, render=1/35, reset_before_collect=True)
eval_env.close()

Epoch #1: 10001it [00:09, 1020.82it/s, env_step=10000, gradient_step=1000, len=9, n/ep=1, n/st=10, rew=9.00]                           


Epoch #1: test_reward: 9.610000 ± 0.893252, best_reward: 9.610000 ± 0.893252 in #1


Epoch #2: 10001it [00:09, 1003.13it/s, env_step=20000, gradient_step=2000, len=10, n/ep=1, n/st=10, rew=10.00]                           


Epoch #2: test_reward: 9.590000 ± 0.917551, best_reward: 9.610000 ± 0.893252 in #1


Epoch #3: 10001it [00:11, 892.39it/s, env_step=30000, gradient_step=3000, len=11, n/ep=1, n/st=10, rew=11.00]                            


Epoch #3: test_reward: 9.700000 ± 1.109054, best_reward: 9.700000 ± 1.109054 in #3


Epoch #4: 10001it [00:09, 1025.90it/s, env_step=40000, gradient_step=4000, len=41, n/ep=1, n/st=10, rew=41.00]                            


Epoch #4: test_reward: 53.450000 ± 25.396210, best_reward: 53.450000 ± 25.396210 in #4


Epoch #5: 10001it [00:09, 1050.52it/s, env_step=50000, gradient_step=5000, len=17, n/ep=1, n/st=10, rew=17.00]                            


Epoch #5: test_reward: 40.850000 ± 11.163669, best_reward: 53.450000 ± 25.396210 in #4


Epoch #6: 10001it [00:09, 1000.18it/s, env_step=60000, gradient_step=6000, len=150, n/ep=0, n/st=10, rew=150.00]                          


Epoch #6: test_reward: 167.440000 ± 36.331342, best_reward: 167.440000 ± 36.331342 in #6


Epoch #7: 10001it [00:09, 1093.03it/s, env_step=70000, gradient_step=7000, len=70, n/ep=0, n/st=10, rew=70.00]                            


Epoch #7: test_reward: 145.830000 ± 12.124401, best_reward: 167.440000 ± 36.331342 in #6


Epoch #8: 10001it [00:10, 958.29it/s, env_step=80000, gradient_step=8000, len=72, n/ep=0, n/st=10, rew=72.00]                             


Epoch #8: test_reward: 117.440000 ± 6.986158, best_reward: 167.440000 ± 36.331342 in #6


Epoch #9: 10001it [00:09, 1040.71it/s, env_step=90000, gradient_step=9000, len=112, n/ep=0, n/st=10, rew=112.00]                          


Epoch #9: test_reward: 108.320000 ± 5.594426, best_reward: 167.440000 ± 36.331342 in #6


Epoch #10: 10001it [00:13, 749.47it/s, env_step=100000, gradient_step=10000, len=121, n/ep=0, n/st=10, rew=121.00]                            


Epoch #10: test_reward: 253.650000 ± 75.944766, best_reward: 253.650000 ± 75.944766 in #10


Epoch #11: 10001it [00:15, 659.02it/s, env_step=110000, gradient_step=11000, len=322, n/ep=0, n/st=10, rew=322.00]                           


Epoch #11: test_reward: 107.010000 ± 22.207429, best_reward: 253.650000 ± 75.944766 in #10


Epoch #12: 10001it [00:09, 1009.72it/s, env_step=120000, gradient_step=12000, len=169, n/ep=0, n/st=10, rew=169.00]                          


Epoch #12: test_reward: 182.990000 ± 24.044748, best_reward: 253.650000 ± 75.944766 in #10


Epoch #13: 10001it [00:13, 752.56it/s, env_step=130000, gradient_step=13000, len=189, n/ep=0, n/st=10, rew=189.00]                           


Epoch #13: test_reward: 496.460000 ± 18.066223, best_reward: 496.460000 ± 18.066223 in #13
New best model saved with mean reward: 496.46
Finished training in 192.78231406211853 seconds


# Mod 1: Change to cartpole-v0

In [45]:
from tianshou.env import DummyVectorEnv
# Prepare the new environment
new_env_name = "CartPole-v0"
# train_env_new = gym.make(new_env_name)
# test_env_new = gym.make(new_env_name)
train_env_new = DummyVectorEnv([lambda: gym.make(new_env_name)])
test_env_new = DummyVectorEnv([lambda: gym.make(new_env_name)])

# Update the collectors by setting their env attribute directly
# train_collector.env = train_env_new
train_collector = ts.data.Collector(
    policy, train_env_new, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True
)
test_collector.env = test_env_new

# Reset the new environments
train_collector.reset_env()
test_collector.reset_env()

# Ensure the policy is set to training mode
policy.load_state_dict(torch.load('models/cbp_dqn.pth'))
policy.train()
policy.set_eps(0.1)  # Exploration rate

# Update logger
logger = ts.utils.TensorboardLogger(SummaryWriter(f"logs/cbp_dqn_{new_env_name}"))

# Define a new train function if desired
def train_fn_new(epoch, env_step):
    eps = max(0.1, 0.5 - epoch * 0.05)
    policy.set_eps(eps)
    policy.model.update_cbp()

env_spec = gym.make(new_env_name).spec.reward_threshold

# Run the training loop
result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=10,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=train_fn_new,
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= env_spec,
    logger=logger,
).run()

print(f"Finished training in {result.timing.total_time} seconds")
torch.save(policy.state_dict(), 'models/cbp_dqn_retest_v0.pth')

# --- Evaluate the trained agent ---
policy.load_state_dict(torch.load('models/cbp_dqn_retest_v0.pth'))
policy.eval()
policy.set_eps(0.05)

# Create an environment with rendering enabled.
eval_env = gym.make(env_name, render_mode="human")
eval_env.reset()
collector = ts.data.Collector(policy, eval_env, exploration_noise=True)
collector.collect(n_episode=1, render=1/35, reset_before_collect=True)
eval_env.close()

  logger.deprecation(
Epoch #1: 5001it [00:08, 599.72it/s, env_step=5000, gradient_step=500, len=152, n/ep=0, n/st=10, rew=152.00]                          


Epoch #1: test_reward: 102.600000 ± 39.593434, best_reward: 200.000000 ± 0.000000 in #0
Finished training in 9.72939920425415 seconds


# Mod 2: Change cartpole-v1's metrics

1. Incremental Variations of CartPole-v1 
Modify CartPole-v1's parameters gradually over time to observe how well CBP adapts:  

Gravity shift: Change gravity from 9.8 m/s² (Earth) to a lower value (e.g., Moon: 1.6 m/s²) or a higher value (e.g., Jupiter: 24.8 m/s²).  
Pole length variations: Increase or decrease length of the pole.  
Mass changes: Modify masspole or masscart dynamically every few episodes.  
Friction changes: Adjust track friction dynamically to simulate different surfaces.  
👉 Goal: See if the model continues learning and does not degrade in performance as physics conditions change.  

In [46]:
import numpy as np

class DynamicCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, gravity_range=(5.0, 25.0), length_range=(0.2, 1.0), mass_range=(0.05, 1.0), friction_range=(0.0, 0.5)):
        super().__init__(env)
        self.gravity_range = gravity_range
        self.length_range = length_range
        self.mass_range = mass_range
        self.friction_range = friction_range
        self.episode_count = 0  # Track episodes

    def reset(self, **kwargs):
        self.episode_count += 1  # Increment episode count

        # Gradually shift physics conditions over time
        new_gravity = np.interp(self.episode_count, [0, 500], self.gravity_range)  # Gradually change gravity
        new_length = np.interp(self.episode_count, [0, 500], self.length_range)  # Change pole length
        new_mass = np.interp(self.episode_count, [0, 500], self.mass_range)  # Change pole mass
        new_friction = np.interp(self.episode_count, [0, 500], self.friction_range)  # Change friction

        # Apply new parameters
        self.env.env.gravity = new_gravity
        self.env.env.length = new_length
        self.env.env.masspole = new_mass
        self.env.env.force_mag = 10.0 * (new_mass / 0.1)  # Scale force with mass to balance difficulty
        self.env.env.friction = new_friction

        print(f"Episode {self.episode_count}: Gravity={new_gravity}, Length={new_length}, Mass={new_mass}, Friction={new_friction}")

        return self.env.reset(**kwargs)

env_name = "CartPole-v1"
base_env = gym.make(env_name)
train_env = DynamicCartPoleWrapper(base_env)

test_env = DynamicCartPoleWrapper(gym.make(env_name))  # Use same variations for testing

train_collector = ts.data.Collector(
    policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True
)
test_collector = ts.data.Collector(policy, test_env, exploration_noise=True)

# Load the pre-trained model from Code Block 1 
policy.load_state_dict(torch.load('models/cbp_dqn.pth')) 
policy.eval()

best_mean_reward = -np.inf 

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(policy.state_dict(), 'models/cbp_dqn_v1_alternate_metrics.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

# Train the model while CBP adapts to the changing environment
result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=30,
    step_per_epoch=10000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_dynamic_cartpole")),
).run()

save_best_model(result.best_reward)

Episode 1: Gravity=5.04, Length=0.2016, Mass=0.0519, Friction=0.001
Episode 1: Gravity=5.04, Length=0.2016, Mass=0.0519, Friction=0.001
Episode 2: Gravity=5.08, Length=0.20320000000000002, Mass=0.0538, Friction=0.002
Episode 3: Gravity=5.12, Length=0.2048, Mass=0.0557, Friction=0.003
Episode 4: Gravity=5.16, Length=0.2064, Mass=0.057600000000000005, Friction=0.004
Episode 5: Gravity=5.2, Length=0.20800000000000002, Mass=0.059500000000000004, Friction=0.005
Episode 6: Gravity=5.24, Length=0.2096, Mass=0.0614, Friction=0.006
Episode 7: Gravity=5.28, Length=0.2112, Mass=0.0633, Friction=0.007
Episode 8: Gravity=5.32, Length=0.21280000000000002, Mass=0.06520000000000001, Friction=0.008
Episode 9: Gravity=5.36, Length=0.2144, Mass=0.0671, Friction=0.009000000000000001
Episode 10: Gravity=5.4, Length=0.21600000000000003, Mass=0.069, Friction=0.01
Episode 11: Gravity=5.44, Length=0.21760000000000002, Mass=0.0709, Friction=0.011
Episode 12: Gravity=5.48, Length=0.2192, Mass=0.0728, Friction=0.

Epoch #1:   1%|1         | 140/10000 [00:00<00:12, 775.21it/s, env_step=140, gradient_step=14, len=12, n/ep=0, n/st=10, rew=12.00]

Episode 3: Gravity=5.12, Length=0.2048, Mass=0.0557, Friction=0.003
Episode 4: Gravity=5.16, Length=0.2064, Mass=0.057600000000000005, Friction=0.004
Episode 5: Gravity=5.2, Length=0.20800000000000002, Mass=0.059500000000000004, Friction=0.005
Episode 6: Gravity=5.24, Length=0.2096, Mass=0.0614, Friction=0.006


Epoch #1:   2%|2         | 220/10000 [00:00<00:13, 720.37it/s, env_step=220, gradient_step=22, len=10, n/ep=1, n/st=10, rew=10.00]

Episode 7: Gravity=5.28, Length=0.2112, Mass=0.0633, Friction=0.007
Episode 8: Gravity=5.32, Length=0.21280000000000002, Mass=0.06520000000000001, Friction=0.008
Episode 9: Gravity=5.36, Length=0.2144, Mass=0.0671, Friction=0.009000000000000001
Episode 10: Gravity=5.4, Length=0.21600000000000003, Mass=0.069, Friction=0.01
Episode 11: Gravity=5.44, Length=0.21760000000000002, Mass=0.0709, Friction=0.011
Episode 12: Gravity=5.48, Length=0.2192, Mass=0.0728, Friction=0.012
Episode 13: Gravity=5.52, Length=0.22080000000000002, Mass=0.0747, Friction=0.013000000000000001


Epoch #1:   3%|2         | 280/10000 [00:00<00:13, 704.47it/s, env_step=280, gradient_step=28, len=10, n/ep=1, n/st=10, rew=10.00]

Episode 14: Gravity=5.5600000000000005, Length=0.22240000000000001, Mass=0.0766, Friction=0.014
Episode 15: Gravity=5.6, Length=0.224, Mass=0.0785, Friction=0.015
Episode 16: Gravity=5.64, Length=0.22560000000000002, Mass=0.0804, Friction=0.016
Episode 17: Gravity=5.68, Length=0.2272, Mass=0.08230000000000001, Friction=0.017
Episode 18: Gravity=5.72, Length=0.2288, Mass=0.0842, Friction=0.018000000000000002
Episode 19: Gravity=5.76, Length=0.23040000000000002, Mass=0.08610000000000001, Friction=0.019
Episode 20: Gravity=5.8, Length=0.232, Mass=0.088, Friction=0.02


Epoch #1:   4%|3         | 360/10000 [00:00<00:13, 693.09it/s, env_step=360, gradient_step=36, len=9, n/ep=1, n/st=10, rew=9.00]  

Episode 21: Gravity=5.84, Length=0.23360000000000003, Mass=0.08990000000000001, Friction=0.021
Episode 22: Gravity=5.88, Length=0.23520000000000002, Mass=0.09179999999999999, Friction=0.022
Episode 23: Gravity=5.92, Length=0.2368, Mass=0.0937, Friction=0.023
Episode 24: Gravity=5.96, Length=0.2384, Mass=0.0956, Friction=0.024
Episode 25: Gravity=6.0, Length=0.24000000000000002, Mass=0.0975, Friction=0.025
Episode 26: Gravity=6.04, Length=0.2416, Mass=0.0994, Friction=0.026000000000000002
Episode 27: Gravity=6.08, Length=0.24320000000000003, Mass=0.1013, Friction=0.027
Episode 28: Gravity=6.12, Length=0.24480000000000002, Mass=0.1032, Friction=0.028


Epoch #1:   4%|4         | 430/10000 [00:00<00:13, 687.86it/s, env_step=430, gradient_step=43, len=9, n/ep=1, n/st=10, rew=9.00]  

Episode 29: Gravity=6.16, Length=0.2464, Mass=0.1051, Friction=0.029
Episode 30: Gravity=6.2, Length=0.248, Mass=0.10700000000000001, Friction=0.03
Episode 31: Gravity=6.24, Length=0.24960000000000002, Mass=0.1089, Friction=0.031
Episode 32: Gravity=6.28, Length=0.25120000000000003, Mass=0.11080000000000001, Friction=0.032
Episode 33: Gravity=6.32, Length=0.2528, Mass=0.11270000000000001, Friction=0.033
Episode 34: Gravity=6.36, Length=0.2544, Mass=0.11460000000000001, Friction=0.034
Episode 35: Gravity=6.4, Length=0.256, Mass=0.1165, Friction=0.035


Epoch #1:   5%|5         | 510/10000 [00:00<00:13, 690.28it/s, env_step=510, gradient_step=51, len=9, n/ep=1, n/st=10, rew=9.00]  

Episode 36: Gravity=6.4399999999999995, Length=0.2576, Mass=0.1184, Friction=0.036000000000000004
Episode 37: Gravity=6.48, Length=0.2592, Mass=0.1203, Friction=0.037
Episode 38: Gravity=6.52, Length=0.26080000000000003, Mass=0.1222, Friction=0.038
Episode 39: Gravity=6.5600000000000005, Length=0.2624, Mass=0.1241, Friction=0.039
Episode 40: Gravity=6.6, Length=0.264, Mass=0.126, Friction=0.04
Episode 41: Gravity=6.640000000000001, Length=0.2656, Mass=0.1279, Friction=0.041
Episode 42: Gravity=6.68, Length=0.2672, Mass=0.1298, Friction=0.042


Epoch #1:   6%|5         | 580/10000 [00:00<00:13, 691.53it/s, env_step=580, gradient_step=58, len=11, n/ep=1, n/st=10, rew=11.00]

Episode 43: Gravity=6.72, Length=0.26880000000000004, Mass=0.13169999999999998, Friction=0.043000000000000003
Episode 44: Gravity=6.76, Length=0.27040000000000003, Mass=0.1336, Friction=0.044
Episode 45: Gravity=6.8, Length=0.272, Mass=0.1355, Friction=0.045
Episode 46: Gravity=6.84, Length=0.2736, Mass=0.13740000000000002, Friction=0.046
Episode 47: Gravity=6.88, Length=0.2752, Mass=0.1393, Friction=0.047
Episode 48: Gravity=6.92, Length=0.27680000000000005, Mass=0.1412, Friction=0.048
Episode 49: Gravity=6.96, Length=0.2784, Mass=0.1431, Friction=0.049


Epoch #1:   6%|6         | 650/10000 [00:00<00:13, 687.06it/s, env_step=650, gradient_step=65, len=13, n/ep=1, n/st=10, rew=13.00]

Episode 50: Gravity=7.0, Length=0.28, Mass=0.14500000000000002, Friction=0.05
Episode 51: Gravity=7.04, Length=0.2816, Mass=0.1469, Friction=0.051000000000000004
Episode 52: Gravity=7.08, Length=0.2832, Mass=0.1488, Friction=0.052000000000000005
Episode 53: Gravity=7.12, Length=0.2848, Mass=0.1507, Friction=0.053
Episode 54: Gravity=7.16, Length=0.2864, Mass=0.1526, Friction=0.054
Episode 55: Gravity=7.2, Length=0.28800000000000003, Mass=0.1545, Friction=0.055


Epoch #1:   7%|7         | 710/10000 [00:01<00:13, 681.08it/s, env_step=710, gradient_step=71, len=19, n/ep=1, n/st=10, rew=19.00]

Episode 56: Gravity=7.24, Length=0.2896, Mass=0.15639999999999998, Friction=0.056
Episode 57: Gravity=7.28, Length=0.2912, Mass=0.1583, Friction=0.057
Episode 58: Gravity=7.32, Length=0.2928, Mass=0.1602, Friction=0.058
Episode 59: Gravity=7.359999999999999, Length=0.2944, Mass=0.16210000000000002, Friction=0.059000000000000004


Epoch #1:   8%|7         | 760/10000 [00:01<00:15, 614.87it/s, env_step=760, gradient_step=76, len=23, n/ep=0, n/st=10, rew=23.00]

Episode 60: Gravity=7.4, Length=0.29600000000000004, Mass=0.164, Friction=0.06


Epoch #1:   8%|8         | 820/10000 [00:01<00:14, 632.89it/s, env_step=820, gradient_step=82, len=59, n/ep=0, n/st=10, rew=59.00]

Episode 61: Gravity=7.4399999999999995, Length=0.29760000000000003, Mass=0.1659, Friction=0.061
Episode 62: Gravity=7.48, Length=0.2992, Mass=0.1678, Friction=0.062


Epoch #1:   9%|8         | 870/10000 [00:01<00:14, 632.89it/s, env_step=870, gradient_step=87, len=30, n/ep=0, n/st=10, rew=30.00]

Episode 63: Gravity=7.52, Length=0.3008, Mass=0.16970000000000002, Friction=0.063


Epoch #1:  10%|9         | 980/10000 [00:01<00:15, 571.82it/s, env_step=980, gradient_step=98, len=10, n/ep=1, n/st=10, rew=10.00]

Episode 64: Gravity=7.5600000000000005, Length=0.3024, Mass=0.1716, Friction=0.064
Episode 65: Gravity=7.6, Length=0.30400000000000005, Mass=0.1735, Friction=0.065
Episode 66: Gravity=7.640000000000001, Length=0.3056, Mass=0.1754, Friction=0.066
Episode 67: Gravity=7.68, Length=0.30720000000000003, Mass=0.1773, Friction=0.067
Episode 68: Gravity=7.720000000000001, Length=0.3088, Mass=0.17920000000000003, Friction=0.068
Episode 69: Gravity=7.76, Length=0.3104, Mass=0.18109999999999998, Friction=0.069


Epoch #1:  10%|#         | 1030/10000 [00:01<00:15, 595.21it/s, env_step=1030, gradient_step=103, len=10, n/ep=1, n/st=10, rew=10.00]

Episode 70: Gravity=7.800000000000001, Length=0.312, Mass=0.183, Friction=0.07
Episode 71: Gravity=7.84, Length=0.3136, Mass=0.1849, Friction=0.07100000000000001
Episode 72: Gravity=7.88, Length=0.31520000000000004, Mass=0.18680000000000002, Friction=0.07200000000000001
Episode 73: Gravity=7.92, Length=0.3168, Mass=0.18869999999999998, Friction=0.073
Episode 74: Gravity=7.96, Length=0.3184, Mass=0.1906, Friction=0.074


Epoch #1:  11%|#1        | 1110/10000 [00:01<00:14, 614.61it/s, env_step=1110, gradient_step=111, len=9, n/ep=0, n/st=10, rew=9.50]  

Episode 75: Gravity=8.0, Length=0.32, Mass=0.1925, Friction=0.075
Episode 76: Gravity=8.04, Length=0.3216, Mass=0.19440000000000002, Friction=0.076
Episode 77: Gravity=8.08, Length=0.32320000000000004, Mass=0.19630000000000003, Friction=0.077
Episode 78: Gravity=8.120000000000001, Length=0.32480000000000003, Mass=0.1982, Friction=0.078


Epoch #1:  13%|#3        | 1340/10000 [00:02<00:12, 682.33it/s, env_step=1340, gradient_step=134, len=133, n/ep=0, n/st=10, rew=133.00]

Episode 79: Gravity=8.16, Length=0.3264, Mass=0.2001, Friction=0.079


Epoch #1:  15%|#5        | 1520/10000 [00:02<00:11, 738.47it/s, env_step=1520, gradient_step=152, len=159, n/ep=0, n/st=10, rew=159.00]

Episode 80: Gravity=8.2, Length=0.328, Mass=0.202, Friction=0.08


Epoch #1:  17%|#7        | 1720/10000 [00:02<00:10, 826.49it/s, env_step=1720, gradient_step=172, len=44, n/ep=0, n/st=10, rew=44.00]  

Episode 81: Gravity=8.24, Length=0.3296, Mass=0.20390000000000003, Friction=0.081
Episode 82: Gravity=8.280000000000001, Length=0.33120000000000005, Mass=0.20579999999999998, Friction=0.082
Episode 83: Gravity=8.32, Length=0.3328, Mass=0.2077, Friction=0.083
Episode 84: Gravity=8.36, Length=0.33440000000000003, Mass=0.2096, Friction=0.084


Epoch #1:  19%|#9        | 1910/10000 [00:02<00:09, 879.20it/s, env_step=1910, gradient_step=191, len=11, n/ep=0, n/st=10, rew=11.00]

Episode 85: Gravity=8.4, Length=0.336, Mass=0.21150000000000002, Friction=0.085
Episode 86: Gravity=8.44, Length=0.3376, Mass=0.21339999999999998, Friction=0.08600000000000001
Episode 87: Gravity=8.48, Length=0.33920000000000006, Mass=0.2153, Friction=0.08700000000000001
Episode 88: Gravity=8.52, Length=0.3408, Mass=0.2172, Friction=0.088
Episode 89: Gravity=8.56, Length=0.34240000000000004, Mass=0.21910000000000002, Friction=0.089
Episode 90: Gravity=8.6, Length=0.34400000000000003, Mass=0.22100000000000003, Friction=0.09
Episode 91: Gravity=8.64, Length=0.3456, Mass=0.2229, Friction=0.091
Episode 92: Gravity=8.68, Length=0.3472, Mass=0.2248, Friction=0.092
Episode 93: Gravity=8.72, Length=0.3488, Mass=0.2267, Friction=0.093
Episode 94: Gravity=8.76, Length=0.35040000000000004, Mass=0.22860000000000003, Friction=0.094
Episode 95: Gravity=8.8, Length=0.352, Mass=0.23049999999999998, Friction=0.095
Episode 96: Gravity=8.84, Length=0.3536, Mass=0.2324, Friction=0.096
Episode 97: Gravity=

Epoch #1:  21%|##1       | 2120/10000 [00:02<00:08, 907.57it/s, env_step=2120, gradient_step=212, len=87, n/ep=0, n/st=10, rew=87.00]

Episode 99: Gravity=8.96, Length=0.35840000000000005, Mass=0.23809999999999998, Friction=0.099


Epoch #1:  23%|##2       | 2290/10000 [00:03<00:08, 874.71it/s, env_step=2290, gradient_step=229, len=11, n/ep=1, n/st=10, rew=11.00]  

Episode 100: Gravity=9.0, Length=0.36, Mass=0.24, Friction=0.1
Episode 101: Gravity=9.04, Length=0.36160000000000003, Mass=0.2419, Friction=0.101
Episode 102: Gravity=9.08, Length=0.3632, Mass=0.24380000000000002, Friction=0.10200000000000001
Episode 103: Gravity=9.120000000000001, Length=0.3648, Mass=0.24570000000000003, Friction=0.10300000000000001
Episode 104: Gravity=9.16, Length=0.36640000000000006, Mass=0.2476, Friction=0.10400000000000001
Episode 105: Gravity=9.2, Length=0.368, Mass=0.2495, Friction=0.105


Epoch #1:  26%|##5       | 2570/10000 [00:03<00:09, 747.45it/s, env_step=2570, gradient_step=257, len=49, n/ep=0, n/st=10, rew=49.00]  

Episode 106: Gravity=9.24, Length=0.36960000000000004, Mass=0.2514, Friction=0.106
Episode 107: Gravity=9.280000000000001, Length=0.37120000000000003, Mass=0.2533, Friction=0.107


Epoch #1:  27%|##7       | 2730/10000 [00:03<00:10, 712.29it/s, env_step=2730, gradient_step=273, len=14, n/ep=0, n/st=10, rew=14.00]  

Episode 108: Gravity=9.32, Length=0.3728, Mass=0.2552, Friction=0.108
Episode 109: Gravity=9.36, Length=0.3744, Mass=0.2571, Friction=0.109
Episode 110: Gravity=9.4, Length=0.376, Mass=0.259, Friction=0.11
Episode 111: Gravity=9.440000000000001, Length=0.37760000000000005, Mass=0.2609, Friction=0.111
Episode 112: Gravity=9.48, Length=0.3792, Mass=0.2628, Friction=0.112
Episode 113: Gravity=9.52, Length=0.3808, Mass=0.2647, Friction=0.113
Episode 114: Gravity=9.56, Length=0.3824, Mass=0.2666, Friction=0.114
Episode 115: Gravity=9.600000000000001, Length=0.384, Mass=0.2685, Friction=0.115


Epoch #1:  31%|###1      | 3120/10000 [00:04<00:09, 718.38it/s, env_step=3120, gradient_step=312, len=310, n/ep=0, n/st=10, rew=310.00]

Episode 116: Gravity=9.64, Length=0.38560000000000005, Mass=0.27040000000000003, Friction=0.116


Epoch #1:  33%|###2      | 3280/10000 [00:04<00:09, 683.85it/s, env_step=3280, gradient_step=328, len=15, n/ep=0, n/st=10, rew=15.00]  

Episode 117: Gravity=9.68, Length=0.3872, Mass=0.2723, Friction=0.117
Episode 118: Gravity=9.719999999999999, Length=0.38880000000000003, Mass=0.2742, Friction=0.11800000000000001
Episode 119: Gravity=9.76, Length=0.3904, Mass=0.2761, Friction=0.11900000000000001
Episode 120: Gravity=9.8, Length=0.392, Mass=0.278, Friction=0.12
Episode 121: Gravity=9.84, Length=0.39360000000000006, Mass=0.2799, Friction=0.121
Episode 122: Gravity=9.879999999999999, Length=0.3952, Mass=0.2818, Friction=0.122
Episode 123: Gravity=9.92, Length=0.39680000000000004, Mass=0.2837, Friction=0.123
Episode 124: Gravity=9.96, Length=0.39840000000000003, Mass=0.2856, Friction=0.124
Episode 125: Gravity=10.0, Length=0.4, Mass=0.2875, Friction=0.125
Episode 126: Gravity=10.04, Length=0.4016, Mass=0.2894, Friction=0.126
Episode 127: Gravity=10.08, Length=0.4032, Mass=0.2913, Friction=0.127


Epoch #1:  36%|###5      | 3570/10000 [00:04<00:09, 695.97it/s, env_step=3570, gradient_step=357, len=126, n/ep=0, n/st=10, rew=126.00]

Episode 128: Gravity=10.120000000000001, Length=0.40480000000000005, Mass=0.2932, Friction=0.128
Episode 129: Gravity=10.16, Length=0.4064, Mass=0.29510000000000003, Friction=0.129


Epoch #1:  39%|###9      | 3910/10000 [00:05<00:07, 821.76it/s, env_step=3910, gradient_step=391, len=116, n/ep=0, n/st=10, rew=116.00]

Episode 130: Gravity=10.2, Length=0.40800000000000003, Mass=0.297, Friction=0.13
Episode 131: Gravity=10.24, Length=0.4096, Mass=0.2989, Friction=0.131


Epoch #1:  42%|####1     | 4170/10000 [00:05<00:06, 884.70it/s, env_step=4170, gradient_step=417, len=110, n/ep=0, n/st=10, rew=110.00]

Episode 132: Gravity=10.280000000000001, Length=0.4112, Mass=0.3008, Friction=0.132
Episode 133: Gravity=10.32, Length=0.41280000000000006, Mass=0.30269999999999997, Friction=0.133


Epoch #1:  44%|####3     | 4370/10000 [00:05<00:06, 891.96it/s, env_step=4370, gradient_step=437, len=13, n/ep=1, n/st=10, rew=13.00]  

Episode 134: Gravity=10.36, Length=0.4144, Mass=0.3046, Friction=0.134
Episode 135: Gravity=10.4, Length=0.41600000000000004, Mass=0.3065, Friction=0.135
Episode 136: Gravity=10.440000000000001, Length=0.4176, Mass=0.3084, Friction=0.136
Episode 137: Gravity=10.48, Length=0.4192, Mass=0.31029999999999996, Friction=0.137
Episode 138: Gravity=10.52, Length=0.42080000000000006, Mass=0.3122, Friction=0.138
Episode 139: Gravity=10.56, Length=0.4224, Mass=0.3141, Friction=0.139
Episode 140: Gravity=10.600000000000001, Length=0.42400000000000004, Mass=0.316, Friction=0.14
Episode 141: Gravity=10.64, Length=0.42560000000000003, Mass=0.3179, Friction=0.14100000000000001
Episode 142: Gravity=10.68, Length=0.4272, Mass=0.3198, Friction=0.14200000000000002
Episode 143: Gravity=10.719999999999999, Length=0.4288, Mass=0.3217, Friction=0.14300000000000002
Episode 144: Gravity=10.76, Length=0.4304, Mass=0.3236, Friction=0.14400000000000002
Episode 145: Gravity=10.8, Length=0.43200000000000005, Mass=0.

Epoch #1:  46%|####5     | 4560/10000 [00:06<00:06, 899.94it/s, env_step=4560, gradient_step=456, len=49, n/ep=0, n/st=10, rew=49.00]

Episode 152: Gravity=11.08, Length=0.44320000000000004, Mass=0.3388, Friction=0.152
Episode 153: Gravity=11.120000000000001, Length=0.44480000000000003, Mass=0.3407, Friction=0.153
Episode 154: Gravity=11.16, Length=0.4464, Mass=0.3426, Friction=0.154
Episode 155: Gravity=11.2, Length=0.448, Mass=0.3445, Friction=0.155
Episode 156: Gravity=11.24, Length=0.4496, Mass=0.3464, Friction=0.156


Epoch #1:  49%|####8     | 4870/10000 [00:06<00:05, 912.99it/s, env_step=4870, gradient_step=487, len=95, n/ep=0, n/st=10, rew=95.00]  

Episode 157: Gravity=11.280000000000001, Length=0.45120000000000005, Mass=0.3483, Friction=0.157
Episode 158: Gravity=11.32, Length=0.45280000000000004, Mass=0.3502, Friction=0.158


Epoch #1:  52%|#####2    | 5210/10000 [00:06<00:05, 931.19it/s, env_step=5210, gradient_step=521, len=16, n/ep=1, n/st=10, rew=16.00]  

Episode 159: Gravity=11.36, Length=0.4544, Mass=0.35209999999999997, Friction=0.159
Episode 160: Gravity=11.4, Length=0.456, Mass=0.354, Friction=0.16
Episode 161: Gravity=11.440000000000001, Length=0.4576, Mass=0.3559, Friction=0.161
Episode 162: Gravity=11.48, Length=0.4592, Mass=0.3578, Friction=0.162
Episode 163: Gravity=11.52, Length=0.46080000000000004, Mass=0.35969999999999996, Friction=0.163
Episode 164: Gravity=11.56, Length=0.46240000000000003, Mass=0.3616, Friction=0.164
Episode 165: Gravity=11.600000000000001, Length=0.464, Mass=0.3635, Friction=0.165


Epoch #1:  54%|#####4    | 5400/10000 [00:06<00:05, 913.27it/s, env_step=5400, gradient_step=540, len=10, n/ep=1, n/st=10, rew=10.00]

Episode 166: Gravity=11.64, Length=0.4656, Mass=0.3654, Friction=0.166
Episode 167: Gravity=11.68, Length=0.4672, Mass=0.3673, Friction=0.167
Episode 168: Gravity=11.719999999999999, Length=0.46880000000000005, Mass=0.3692, Friction=0.168
Episode 169: Gravity=11.76, Length=0.47040000000000004, Mass=0.3711, Friction=0.169
Episode 170: Gravity=11.8, Length=0.47200000000000003, Mass=0.373, Friction=0.17
Episode 171: Gravity=11.84, Length=0.4736, Mass=0.3749, Friction=0.171
Episode 172: Gravity=11.879999999999999, Length=0.4752, Mass=0.37679999999999997, Friction=0.17200000000000001
Episode 173: Gravity=11.92, Length=0.4768, Mass=0.3787, Friction=0.17300000000000001
Episode 174: Gravity=11.96, Length=0.47840000000000005, Mass=0.3806, Friction=0.17400000000000002
Episode 175: Gravity=12.0, Length=0.48000000000000004, Mass=0.3825, Friction=0.17500000000000002
Episode 176: Gravity=12.04, Length=0.48160000000000003, Mass=0.38439999999999996, Friction=0.176
Episode 177: Gravity=12.08, Length=0.

Epoch #1:  56%|#####5    | 5590/10000 [00:07<00:04, 910.29it/s, env_step=5590, gradient_step=559, len=16, n/ep=1, n/st=10, rew=16.00]

Episode 183: Gravity=12.32, Length=0.4928, Mass=0.3977, Friction=0.183
Episode 184: Gravity=12.36, Length=0.4944, Mass=0.3996, Friction=0.184
Episode 185: Gravity=12.4, Length=0.49600000000000005, Mass=0.40149999999999997, Friction=0.185
Episode 186: Gravity=12.440000000000001, Length=0.49760000000000004, Mass=0.4034, Friction=0.186
Episode 187: Gravity=12.48, Length=0.49920000000000003, Mass=0.4053, Friction=0.187
Episode 188: Gravity=12.52, Length=0.5008, Mass=0.4072, Friction=0.188
Episode 189: Gravity=12.56, Length=0.5024, Mass=0.40909999999999996, Friction=0.189
Episode 190: Gravity=12.600000000000001, Length=0.504, Mass=0.411, Friction=0.19
Episode 191: Gravity=12.64, Length=0.5056, Mass=0.4129, Friction=0.191
Episode 192: Gravity=12.68, Length=0.5072000000000001, Mass=0.4148, Friction=0.192
Episode 193: Gravity=12.719999999999999, Length=0.5088, Mass=0.4167, Friction=0.193
Episode 194: Gravity=12.76, Length=0.5104, Mass=0.41859999999999997, Friction=0.194
Episode 195: Gravity=12

Epoch #1:  58%|#####7    | 5790/10000 [00:07<00:04, 908.55it/s, env_step=5790, gradient_step=579, len=16, n/ep=1, n/st=10, rew=16.00]

Episode 198: Gravity=12.92, Length=0.5168, Mass=0.42619999999999997, Friction=0.198
Episode 199: Gravity=12.96, Length=0.5184, Mass=0.4281, Friction=0.199
Episode 200: Gravity=13.0, Length=0.52, Mass=0.43, Friction=0.2
Episode 201: Gravity=13.040000000000001, Length=0.5216000000000001, Mass=0.4319, Friction=0.201
Episode 202: Gravity=13.08, Length=0.5232000000000001, Mass=0.43379999999999996, Friction=0.202
Episode 203: Gravity=13.120000000000001, Length=0.5248, Mass=0.4357, Friction=0.203
Episode 204: Gravity=13.16, Length=0.5264, Mass=0.4376, Friction=0.20400000000000001
Episode 205: Gravity=13.2, Length=0.528, Mass=0.4395, Friction=0.20500000000000002
Episode 206: Gravity=13.24, Length=0.5296000000000001, Mass=0.4414, Friction=0.20600000000000002
Episode 207: Gravity=13.28, Length=0.5312, Mass=0.44329999999999997, Friction=0.20700000000000002


Epoch #1:  60%|#####9    | 5980/10000 [00:07<00:04, 908.18it/s, env_step=5980, gradient_step=598, len=18, n/ep=1, n/st=10, rew=18.00]

Episode 208: Gravity=13.32, Length=0.5328, Mass=0.4452, Friction=0.20800000000000002
Episode 209: Gravity=13.36, Length=0.5344, Mass=0.4471, Friction=0.209
Episode 210: Gravity=13.4, Length=0.536, Mass=0.449, Friction=0.21
Episode 211: Gravity=13.44, Length=0.5376000000000001, Mass=0.45089999999999997, Friction=0.211
Episode 212: Gravity=13.48, Length=0.5392, Mass=0.4528, Friction=0.212
Episode 213: Gravity=13.52, Length=0.5408, Mass=0.4547, Friction=0.213
Episode 214: Gravity=13.56, Length=0.5424, Mass=0.4566, Friction=0.214
Episode 215: Gravity=13.6, Length=0.544, Mass=0.45849999999999996, Friction=0.215
Episode 216: Gravity=13.64, Length=0.5456000000000001, Mass=0.4604, Friction=0.216
Episode 217: Gravity=13.68, Length=0.5472, Mass=0.4623, Friction=0.217
Episode 218: Gravity=13.72, Length=0.5488, Mass=0.4642, Friction=0.218


Epoch #1:  62%|######1   | 6170/10000 [00:07<00:04, 907.17it/s, env_step=6170, gradient_step=617, len=17, n/ep=1, n/st=10, rew=17.00]

Episode 219: Gravity=13.76, Length=0.5504, Mass=0.4661, Friction=0.219
Episode 220: Gravity=13.8, Length=0.552, Mass=0.46799999999999997, Friction=0.22
Episode 221: Gravity=13.84, Length=0.5536000000000001, Mass=0.4699, Friction=0.221
Episode 222: Gravity=13.88, Length=0.5552, Mass=0.4718, Friction=0.222
Episode 223: Gravity=13.92, Length=0.5568, Mass=0.4737, Friction=0.223
Episode 224: Gravity=13.96, Length=0.5584, Mass=0.47559999999999997, Friction=0.224
Episode 225: Gravity=14.0, Length=0.56, Mass=0.4775, Friction=0.225
Episode 226: Gravity=14.040000000000001, Length=0.5616000000000001, Mass=0.4794, Friction=0.226
Episode 227: Gravity=14.08, Length=0.5632, Mass=0.4813, Friction=0.227


Epoch #1:  64%|######3   | 6360/10000 [00:08<00:04, 902.52it/s, env_step=6360, gradient_step=636, len=17, n/ep=1, n/st=10, rew=17.00]

Episode 228: Gravity=14.120000000000001, Length=0.5648, Mass=0.48319999999999996, Friction=0.228
Episode 229: Gravity=14.16, Length=0.5664, Mass=0.4851, Friction=0.229
Episode 230: Gravity=14.200000000000001, Length=0.5680000000000001, Mass=0.487, Friction=0.23
Episode 231: Gravity=14.24, Length=0.5696000000000001, Mass=0.4889, Friction=0.231
Episode 232: Gravity=14.28, Length=0.5712, Mass=0.4908, Friction=0.232
Episode 233: Gravity=14.32, Length=0.5728, Mass=0.49269999999999997, Friction=0.233
Episode 234: Gravity=14.36, Length=0.5744, Mass=0.4946, Friction=0.234
Episode 235: Gravity=14.4, Length=0.5760000000000001, Mass=0.4965, Friction=0.23500000000000001
Episode 236: Gravity=14.44, Length=0.5776, Mass=0.4984, Friction=0.23600000000000002
Episode 237: Gravity=14.48, Length=0.5792, Mass=0.5003, Friction=0.23700000000000002


Epoch #1:  66%|######5   | 6550/10000 [00:08<00:03, 905.80it/s, env_step=6550, gradient_step=655, len=32, n/ep=0, n/st=10, rew=32.00]

Episode 238: Gravity=14.52, Length=0.5808, Mass=0.5022, Friction=0.23800000000000002
Episode 239: Gravity=14.56, Length=0.5824, Mass=0.5041, Friction=0.23900000000000002
Episode 240: Gravity=14.6, Length=0.5840000000000001, Mass=0.506, Friction=0.24
Episode 241: Gravity=14.64, Length=0.5856, Mass=0.5079, Friction=0.241
Episode 242: Gravity=14.68, Length=0.5872, Mass=0.5098, Friction=0.242
Episode 243: Gravity=14.72, Length=0.5888, Mass=0.5117, Friction=0.243
Episode 244: Gravity=14.76, Length=0.5904, Mass=0.5136000000000001, Friction=0.244


Epoch #1:  68%|######7   | 6750/10000 [00:08<00:03, 908.79it/s, env_step=6750, gradient_step=675, len=35, n/ep=0, n/st=10, rew=35.00]

Episode 245: Gravity=14.8, Length=0.5920000000000001, Mass=0.5155000000000001, Friction=0.245
Episode 246: Gravity=14.84, Length=0.5936, Mass=0.5174, Friction=0.246
Episode 247: Gravity=14.88, Length=0.5952, Mass=0.5193, Friction=0.247
Episode 248: Gravity=14.92, Length=0.5968, Mass=0.5212, Friction=0.248
Episode 249: Gravity=14.96, Length=0.5984, Mass=0.5231, Friction=0.249
Episode 250: Gravity=15.0, Length=0.6000000000000001, Mass=0.525, Friction=0.25


Epoch #1:  70%|#######   | 7000/10000 [00:08<00:03, 917.59it/s, env_step=7000, gradient_step=700, len=50, n/ep=1, n/st=10, rew=50.00]  

Episode 251: Gravity=15.040000000000001, Length=0.6016, Mass=0.5269, Friction=0.251
Episode 252: Gravity=15.08, Length=0.6032, Mass=0.5288, Friction=0.252
Episode 253: Gravity=15.120000000000001, Length=0.6048, Mass=0.5307000000000001, Friction=0.253


Epoch #1:  72%|#######1  | 7190/10000 [00:08<00:03, 918.40it/s, env_step=7190, gradient_step=719, len=18, n/ep=0, n/st=10, rew=18.00]

Episode 254: Gravity=15.16, Length=0.6064, Mass=0.5326, Friction=0.254
Episode 255: Gravity=15.200000000000001, Length=0.6080000000000001, Mass=0.5345, Friction=0.255
Episode 256: Gravity=15.24, Length=0.6096, Mass=0.5364, Friction=0.256
Episode 257: Gravity=15.28, Length=0.6112, Mass=0.5383, Friction=0.257
Episode 258: Gravity=15.32, Length=0.6128, Mass=0.5402, Friction=0.258
Episode 259: Gravity=15.36, Length=0.6144000000000001, Mass=0.5421, Friction=0.259
Episode 260: Gravity=15.4, Length=0.6160000000000001, Mass=0.544, Friction=0.26
Episode 261: Gravity=15.44, Length=0.6176, Mass=0.5459, Friction=0.261
Episode 262: Gravity=15.48, Length=0.6192, Mass=0.5478000000000001, Friction=0.262


Epoch #1:  74%|#######3  | 7390/10000 [00:09<00:02, 914.01it/s, env_step=7390, gradient_step=739, len=27, n/ep=0, n/st=10, rew=27.00]

Episode 263: Gravity=15.52, Length=0.6208, Mass=0.5497, Friction=0.263
Episode 264: Gravity=15.56, Length=0.6224000000000001, Mass=0.5516000000000001, Friction=0.264
Episode 265: Gravity=15.6, Length=0.6240000000000001, Mass=0.5535, Friction=0.265
Episode 266: Gravity=15.64, Length=0.6256, Mass=0.5554, Friction=0.266
Episode 267: Gravity=15.68, Length=0.6272, Mass=0.5573, Friction=0.267
Episode 268: Gravity=15.72, Length=0.6288, Mass=0.5592, Friction=0.268


Epoch #1:  76%|#######5  | 7580/10000 [00:09<00:02, 908.47it/s, env_step=7580, gradient_step=758, len=36, n/ep=0, n/st=10, rew=36.00]

Episode 269: Gravity=15.76, Length=0.6304000000000001, Mass=0.5611, Friction=0.269
Episode 270: Gravity=15.8, Length=0.632, Mass=0.5630000000000001, Friction=0.27
Episode 271: Gravity=15.84, Length=0.6336, Mass=0.5649000000000001, Friction=0.271
Episode 272: Gravity=15.88, Length=0.6352, Mass=0.5668000000000001, Friction=0.272
Episode 273: Gravity=15.92, Length=0.6368, Mass=0.5687000000000001, Friction=0.273


Epoch #1:  78%|#######7  | 7760/10000 [00:09<00:02, 906.30it/s, env_step=7760, gradient_step=776, len=80, n/ep=1, n/st=10, rew=80.00]

Episode 274: Gravity=15.96, Length=0.6384000000000001, Mass=0.5706, Friction=0.274
Episode 275: Gravity=16.0, Length=0.64, Mass=0.5725, Friction=0.275
Episode 276: Gravity=16.04, Length=0.6416000000000001, Mass=0.5744, Friction=0.276
Episode 277: Gravity=16.08, Length=0.6432, Mass=0.5763, Friction=0.277
Episode 278: Gravity=16.12, Length=0.6448, Mass=0.5782, Friction=0.278


Epoch #1:  79%|#######9  | 7940/10000 [00:09<00:02, 901.99it/s, env_step=7940, gradient_step=794, len=39, n/ep=0, n/st=10, rew=39.00]

Episode 279: Gravity=16.16, Length=0.6464000000000001, Mass=0.5801000000000001, Friction=0.279
Episode 280: Gravity=16.200000000000003, Length=0.648, Mass=0.5820000000000001, Friction=0.28
Episode 281: Gravity=16.240000000000002, Length=0.6496, Mass=0.5839000000000001, Friction=0.281
Episode 282: Gravity=16.28, Length=0.6512, Mass=0.5858000000000001, Friction=0.28200000000000003
Episode 283: Gravity=16.32, Length=0.6528, Mass=0.5877, Friction=0.28300000000000003
Episode 284: Gravity=16.36, Length=0.6544000000000001, Mass=0.5896, Friction=0.28400000000000003


Epoch #1:  81%|########1 | 8100/10000 [00:10<00:02, 804.45it/s, env_step=8100, gradient_step=810, len=22, n/ep=0, n/st=10, rew=22.00]

Episode 285: Gravity=16.4, Length=0.656, Mass=0.5915, Friction=0.28500000000000003
Episode 286: Gravity=16.439999999999998, Length=0.6576, Mass=0.5934, Friction=0.28600000000000003
Episode 287: Gravity=16.48, Length=0.6592, Mass=0.5953, Friction=0.28700000000000003
Episode 288: Gravity=16.52, Length=0.6608, Mass=0.5972000000000001, Friction=0.28800000000000003


Epoch #1:  82%|########2 | 8250/10000 [00:10<00:02, 728.67it/s, env_step=8250, gradient_step=825, len=19, n/ep=1, n/st=10, rew=19.00]

Episode 289: Gravity=16.560000000000002, Length=0.6624000000000001, Mass=0.5991000000000001, Friction=0.289
Episode 290: Gravity=16.6, Length=0.664, Mass=0.6010000000000001, Friction=0.29
Episode 291: Gravity=16.64, Length=0.6656, Mass=0.6029, Friction=0.291
Episode 292: Gravity=16.68, Length=0.6672, Mass=0.6048, Friction=0.292
Episode 293: Gravity=16.72, Length=0.6688000000000001, Mass=0.6067, Friction=0.293
Episode 294: Gravity=16.759999999999998, Length=0.6704000000000001, Mass=0.6086, Friction=0.294
Episode 295: Gravity=16.8, Length=0.672, Mass=0.6105, Friction=0.295
Episode 296: Gravity=16.84, Length=0.6736, Mass=0.6124, Friction=0.296


Epoch #1:  84%|########3 | 8390/10000 [00:10<00:02, 707.22it/s, env_step=8390, gradient_step=839, len=20, n/ep=0, n/st=10, rew=20.00]

Episode 297: Gravity=16.880000000000003, Length=0.6752, Mass=0.6143000000000001, Friction=0.297
Episode 298: Gravity=16.92, Length=0.6768000000000001, Mass=0.6162000000000001, Friction=0.298
Episode 299: Gravity=16.96, Length=0.6784000000000001, Mass=0.6181000000000001, Friction=0.299
Episode 300: Gravity=17.0, Length=0.68, Mass=0.62, Friction=0.3
Episode 301: Gravity=17.04, Length=0.6816, Mass=0.6219, Friction=0.301


Epoch #1:  86%|########5 | 8570/10000 [00:10<00:02, 710.42it/s, env_step=8570, gradient_step=857, len=108, n/ep=1, n/st=10, rew=108.00]

Episode 302: Gravity=17.08, Length=0.6832, Mass=0.6238, Friction=0.302
Episode 303: Gravity=17.12, Length=0.6848000000000001, Mass=0.6257, Friction=0.303
Episode 304: Gravity=17.16, Length=0.6864, Mass=0.6276, Friction=0.304


Epoch #1:  87%|########7 | 8730/10000 [00:10<00:01, 707.70it/s, env_step=8730, gradient_step=873, len=107, n/ep=0, n/st=10, rew=107.00]

Episode 305: Gravity=17.200000000000003, Length=0.6880000000000001, Mass=0.6295000000000001, Friction=0.305
Episode 306: Gravity=17.240000000000002, Length=0.6896, Mass=0.6314000000000001, Friction=0.306


Epoch #1:  89%|########9 | 8930/10000 [00:11<00:01, 702.87it/s, env_step=8930, gradient_step=893, len=99, n/ep=0, n/st=10, rew=99.00]  

Episode 307: Gravity=17.28, Length=0.6912, Mass=0.6333000000000001, Friction=0.307
Episode 308: Gravity=17.32, Length=0.6928000000000001, Mass=0.6352000000000001, Friction=0.308
Episode 309: Gravity=17.36, Length=0.6944, Mass=0.6371, Friction=0.309


Epoch #1:  92%|#########2| 9200/10000 [00:11<00:01, 714.95it/s, env_step=9200, gradient_step=920, len=67, n/ep=0, n/st=10, rew=67.00]  

Episode 310: Gravity=17.4, Length=0.696, Mass=0.639, Friction=0.31
Episode 311: Gravity=17.439999999999998, Length=0.6976, Mass=0.6409, Friction=0.311


Epoch #1:  94%|#########4| 9420/10000 [00:11<00:00, 816.06it/s, env_step=9420, gradient_step=942, len=142, n/ep=0, n/st=10, rew=142.00]

Episode 312: Gravity=17.48, Length=0.6992, Mass=0.6428, Friction=0.312
Episode 313: Gravity=17.52, Length=0.7008000000000001, Mass=0.6447, Friction=0.313


Epoch #1:  96%|#########6| 9620/10000 [00:12<00:00, 868.58it/s, env_step=9620, gradient_step=962, len=75, n/ep=0, n/st=10, rew=75.00]  

Episode 314: Gravity=17.560000000000002, Length=0.7024000000000001, Mass=0.6466000000000001, Friction=0.314
Episode 315: Gravity=17.6, Length=0.704, Mass=0.6485000000000001, Friction=0.315
Episode 316: Gravity=17.64, Length=0.7056, Mass=0.6504000000000001, Friction=0.316
Episode 317: Gravity=17.68, Length=0.7072, Mass=0.6523, Friction=0.317


Epoch #1:  99%|#########8| 9890/10000 [00:12<00:00, 896.76it/s, env_step=9890, gradient_step=989, len=132, n/ep=0, n/st=10, rew=132.00]

Episode 318: Gravity=17.72, Length=0.7088000000000001, Mass=0.6542, Friction=0.318


Epoch #1: 10001it [00:12, 802.29it/s, env_step=10000, gradient_step=1000, len=207, n/ep=0, n/st=10, rew=207.00]                           


Episode 319: Gravity=17.759999999999998, Length=0.7104000000000001, Mass=0.6561, Friction=0.319
Episode 207: Gravity=13.28, Length=0.5312, Mass=0.44329999999999997, Friction=0.20700000000000002
Episode 208: Gravity=13.32, Length=0.5328, Mass=0.4452, Friction=0.20800000000000002
Episode 209: Gravity=13.36, Length=0.5344, Mass=0.4471, Friction=0.209
Episode 210: Gravity=13.4, Length=0.536, Mass=0.449, Friction=0.21
Episode 211: Gravity=13.44, Length=0.5376000000000001, Mass=0.45089999999999997, Friction=0.211
Episode 212: Gravity=13.48, Length=0.5392, Mass=0.4528, Friction=0.212
Episode 213: Gravity=13.52, Length=0.5408, Mass=0.4547, Friction=0.213
Episode 214: Gravity=13.56, Length=0.5424, Mass=0.4566, Friction=0.214
Episode 215: Gravity=13.6, Length=0.544, Mass=0.45849999999999996, Friction=0.215
Episode 216: Gravity=13.64, Length=0.5456000000000001, Mass=0.4604, Friction=0.216
Episode 217: Gravity=13.68, Length=0.5472, Mass=0.4623, Friction=0.217
Episode 218: Gravity=13.72, Length=0.5

2. More Complex or Noisy CartPole Variants  
To evaluate adaptation beyond gradual parameter shifts:  

Noisy CartPole: Introduce small random perturbations to forces applied by the agent.  
Windy CartPole: Apply lateral forces that vary over time.  
Delayed-Action CartPole: Introduce a small delay between the agent’s action and its effect.  

👉 Goal: Test CBP’s ability to handle unpredictability in dynamic environments.

In [47]:
import numpy as np

class NoisyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, noise_std=0.1):
        super().__init__(env)
        self.noise_std = noise_std  # Standard deviation of noise

    def step(self, action):
        noisy_action = action + np.random.normal(0, self.noise_std)  # Add Gaussian noise
        noisy_action = np.clip(noisy_action, 0, 1)  # Ensure action remains valid
        return self.env.step(int(noisy_action))  # Convert back to integer action


class WindyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, wind_max=5.0, wind_change_prob=0.1):
        super().__init__(env)
        self.wind_max = wind_max  # Max lateral force
        self.wind_force = 0
        self.wind_change_prob = wind_change_prob  # Probability of wind change per step

    def step(self, action):
        # Occasionally change wind force
        if np.random.rand() < self.wind_change_prob:
            self.wind_force = np.random.uniform(-self.wind_max, self.wind_max)

        # Get the step results
        obs, reward, terminated, truncated, info = self.env.step(action)
        
        # Apply wind effect by modifying the observation directly
        # CartPole observation: [position, velocity, pole_angle, pole_angular_velocity]
        obs[1] += self.wind_force * 0.01  # Modify cart velocity in the observation
        
        # Update the environment's state array
        self.env.unwrapped.state = obs.copy()

        return obs, reward, terminated, truncated, info

from collections import deque

class DelayedActionCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, delay_steps=3):
        super().__init__(env)
        self.delay_steps = delay_steps  # How many steps before action takes effect
        self.action_queue = deque([0] * delay_steps, maxlen=delay_steps)  # Store past actions

    def step(self, action):
        self.action_queue.append(action)  # Queue the new action
        delayed_action = self.action_queue.popleft()  # Use delayed action
        return self.env.step(delayed_action)  # Will automatically pass through all 5 values

train_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))
test_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))

# policy.load_state_dict(torch.load('models/cbp_dqn_adapted_v1.pth')) # Should change name to cbp_dqn_adapted_v1.pth, but haven't
policy.load_state_dict(torch.load('models/cbp_dqn_v1_alternate_metrics.pth'))
policy.train()
policy.set_eps(0.1)  # Start with exploration

best_mean_reward = -np.inf

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(policy.state_dict(), 'models/cbp_dqn_final_adapted_v2.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=ts.data.Collector(policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True),
    test_collector=ts.data.Collector(policy, test_env, exploration_noise=True),
    max_epoch=20,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=20,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(max(0.1, 0.5 - epoch * 0.05)),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_windy_noisy_delayed")),
).run()

print(f"Finished training in {result.timing.total_time} seconds")
save_best_model(result.best_reward)

Epoch #1: 5001it [00:07, 644.27it/s, env_step=5000, gradient_step=500, len=14, n/ep=0, n/st=10, rew=14.00]                          


Epoch #1: test_reward: 22.300000 ± 8.933644, best_reward: 22.300000 ± 8.933644 in #1


Epoch #2: 5001it [00:06, 754.27it/s, env_step=10000, gradient_step=1000, len=28, n/ep=1, n/st=10, rew=28.00]                          


Epoch #2: test_reward: 24.600000 ± 14.416657, best_reward: 24.600000 ± 14.416657 in #2


Epoch #3: 5001it [00:07, 659.75it/s, env_step=15000, gradient_step=1500, len=21, n/ep=0, n/st=10, rew=21.00]                          


Epoch #3: test_reward: 23.500000 ± 9.902020, best_reward: 24.600000 ± 14.416657 in #2


Epoch #4: 5001it [00:06, 731.69it/s, env_step=20000, gradient_step=2000, len=45, n/ep=0, n/st=10, rew=45.00]                          


Epoch #4: test_reward: 19.150000 ± 6.799081, best_reward: 24.600000 ± 14.416657 in #2


Epoch #5: 5001it [00:06, 813.84it/s, env_step=25000, gradient_step=2500, len=48, n/ep=1, n/st=10, rew=48.00]                            


Epoch #5: test_reward: 28.650000 ± 18.224366, best_reward: 28.650000 ± 18.224366 in #5


Epoch #6: 5001it [00:06, 720.19it/s, env_step=30000, gradient_step=3000, len=12, n/ep=1, n/st=10, rew=12.00]                          


Epoch #6: test_reward: 23.200000 ± 18.128982, best_reward: 28.650000 ± 18.224366 in #5


Epoch #7: 5001it [00:06, 762.73it/s, env_step=35000, gradient_step=3500, len=17, n/ep=1, n/st=10, rew=17.00]                          


Epoch #7: test_reward: 25.400000 ± 11.204463, best_reward: 28.650000 ± 18.224366 in #5


Epoch #8: 5001it [00:06, 730.77it/s, env_step=40000, gradient_step=4000, len=14, n/ep=0, n/st=10, rew=14.00]                            


Epoch #8: test_reward: 27.050000 ± 15.847634, best_reward: 28.650000 ± 18.224366 in #5


Epoch #9: 5001it [00:07, 667.65it/s, env_step=45000, gradient_step=4500, len=13, n/ep=0, n/st=10, rew=13.00]                          


Epoch #9: test_reward: 35.450000 ± 24.303241, best_reward: 35.450000 ± 24.303241 in #9


Epoch #10: 5001it [00:07, 685.56it/s, env_step=50000, gradient_step=5000, len=27, n/ep=0, n/st=10, rew=27.00]                          


Epoch #10: test_reward: 23.900000 ± 12.356780, best_reward: 35.450000 ± 24.303241 in #9


Epoch #11: 5001it [00:05, 889.43it/s, env_step=55000, gradient_step=5500, len=10, n/ep=0, n/st=10, rew=10.00]                            


Epoch #11: test_reward: 26.950000 ± 11.482487, best_reward: 35.450000 ± 24.303241 in #9


Epoch #12: 5001it [00:05, 949.98it/s, env_step=60000, gradient_step=6000, len=15, n/ep=0, n/st=10, rew=15.00]                           


Epoch #12: test_reward: 32.700000 ± 10.854032, best_reward: 35.450000 ± 24.303241 in #9


Epoch #13: 5001it [00:05, 971.22it/s, env_step=65000, gradient_step=6500, len=16, n/ep=0, n/st=10, rew=16.00]                           


Epoch #13: test_reward: 22.900000 ± 10.024470, best_reward: 35.450000 ± 24.303241 in #9


Epoch #14: 5001it [00:04, 1014.42it/s, env_step=70000, gradient_step=7000, len=32, n/ep=1, n/st=10, rew=32.00]                          


Epoch #14: test_reward: 24.050000 ± 8.980395, best_reward: 35.450000 ± 24.303241 in #9


Epoch #15: 5001it [00:05, 972.98it/s, env_step=75000, gradient_step=7500, len=22, n/ep=1, n/st=10, rew=22.00]                           


Epoch #15: test_reward: 22.950000 ± 11.791840, best_reward: 35.450000 ± 24.303241 in #9


Epoch #16: 5001it [00:05, 895.56it/s, env_step=80000, gradient_step=8000, len=32, n/ep=1, n/st=10, rew=32.00]                           


Epoch #16: test_reward: 31.300000 ± 20.576929, best_reward: 35.450000 ± 24.303241 in #9


Epoch #17: 5001it [00:05, 889.61it/s, env_step=85000, gradient_step=8500, len=26, n/ep=0, n/st=10, rew=26.00]                           


Epoch #17: test_reward: 25.550000 ± 9.129485, best_reward: 35.450000 ± 24.303241 in #9


Epoch #18: 5001it [00:05, 942.85it/s, env_step=90000, gradient_step=9000, len=40, n/ep=1, n/st=10, rew=40.00]                           


Epoch #18: test_reward: 25.200000 ± 11.227644, best_reward: 35.450000 ± 24.303241 in #9


Epoch #19: 5001it [00:05, 933.59it/s, env_step=95000, gradient_step=9500, len=27, n/ep=1, n/st=10, rew=27.00]                           


Epoch #19: test_reward: 27.150000 ± 14.054448, best_reward: 35.450000 ± 24.303241 in #9


Epoch #20: 5001it [00:05, 984.16it/s, env_step=100000, gradient_step=10000, len=17, n/ep=0, n/st=10, rew=17.00]                          


Epoch #20: test_reward: 30.200000 ± 13.969968, best_reward: 35.450000 ± 24.303241 in #9
Finished training in 128.2709560394287 seconds
New best model saved with mean reward: 35.45


# Changes to improve  
1️⃣ NoisyCartPoleWrapper

✅ Correct Approach: Adds Gaussian noise to the action.  
⚠ Issue:  

The action in CartPole-v1 is discrete (0 or 1), but noisy_action becomes continuous due to np.random.normal().  
Clipping (np.clip(noisy_action, 0, 1)) does not correctly map back to {0,1}.  
Fix: Instead of directly adding noise to a discrete action, use probability flipping:  

def step(self, action):  
    if np.random.rand() < self.noise_std:  # With probability `noise_std`, flip action  
        action = 1 - action  
    return self.env.step(action)  
👉 This ensures actions remain 0 or 1 while still introducing randomness.  

2️⃣ WindyCartPoleWrapper

✅ Correct Approach: Applies wind as a force that changes over time.  
⚠ Issue:  

Modifying obs[1] (cart velocity) directly does not actually apply force to the environment; it only changes the observation.  
The true physics of CartPole are managed internally, so directly changing obs[1] does not actually push the cart.  
Fix:  

Instead of modifying obs[1], add force inside the environment's physics using env.unwrapped.state.  
def step(self, action):  
    # Occasionally change wind force  
    if np.random.rand() < self.wind_change_prob:  
        self.wind_force = np.random.uniform(-self.wind_max, self.wind_max)

    # Apply action  
    obs, reward, terminated, truncated, info = self.env.step(action)

    # Apply wind effect by modifying the environment’s internal state (not just observation)  
    self.env.unwrapped.state[1] += self.wind_force  # Modify velocity directly in environment  

    return obs, reward, terminated, truncated, info  
👉 This properly simulates wind affecting the cart’s velocity.  

3️⃣ DelayedActionCartPoleWrapper

✅ Correct Approach: Introduces an action delay using a queue.  
⚠ Issue:  

The queue (deque) is initialized with 0, meaning for the first few steps the agent always takes action 0 regardless of policy.  
This biases the early learning by making the agent effectively ignore initial training steps.  
Fix:  

Instead of defaulting to 0, initialize with random past actions to avoid bias.  
self.action_queue = deque(np.random.choice([0, 1], delay_steps), maxlen=delay_steps)  
👉 This prevents early-action bias.

In [48]:
import numpy as np

class NoisyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, noise_std=0.1):
        super().__init__(env)
        self.noise_std = noise_std  # Standard deviation of noise

    def step(self, action):
        if np.random.rand() < self.noise_std:  # With probability `noise_std`, flip action
            action = 1 - action
        return self.env.step(int(action))


class WindyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, wind_max=5.0, wind_change_prob=0.1):
        super().__init__(env)
        self.wind_max = wind_max  # Max lateral force
        self.wind_force = 0
        self.wind_change_prob = wind_change_prob  # Probability of wind change per step

    def step(self, action):
        # Occasionally change wind force
        if np.random.rand() < self.wind_change_prob:
            self.wind_force = np.random.uniform(-self.wind_max, self.wind_max)

        # Get the step results
        obs, reward, terminated, truncated, info = self.env.step(action)
        
        # Apply wind effect by modifying the observation directly
        # CartPole observation: [position, velocity, pole_angle, pole_angular_velocity]
        obs[1] += self.wind_force * 0.01  # Modify cart velocity in the observation
        
        # Update the environment's state array
        self.env.unwrapped.state = obs.copy()

        return obs, reward, terminated, truncated, info

from collections import deque

class DelayedActionCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, delay_steps=3):
        super().__init__(env)
        self.delay_steps = delay_steps  # How many steps before action takes effect
        # self.action_queue = deque([0] * delay_steps, maxlen=delay_steps)  # Store past actions
        self.action_queue = deque(np.random.choice([0, 1], delay_steps), maxlen=delay_steps)

    def step(self, action):
        self.action_queue.append(action)  # Queue the new action
        delayed_action = self.action_queue.popleft()  # Use delayed action
        return self.env.step(delayed_action)  # Will automatically pass through all 5 values

train_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))
test_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))

# policy.load_state_dict(torch.load('models/cbp_dqn_adapted_v1.pth')) # Should change name to cbp_dqn_adapted_v1.pth, but haven't
policy.load_state_dict(torch.load('models/cbp_dqn_v1_alternate_metrics.pth'))
policy.train()
policy.set_eps(0.1)  # Start with exploration

best_mean_reward = -np.inf

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(policy.state_dict(), 'models/cbp_dqn_final_adapted_v2.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=ts.data.Collector(policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True),
    test_collector=ts.data.Collector(policy, test_env, exploration_noise=True),
    max_epoch=30,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=20,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(max(0.1, 0.5 - epoch * 0.05)),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_windy_noisy_delayed")),
).run()

print(f"Finished training in {result.timing.total_time} seconds")
save_best_model(result.best_reward)

Epoch #1: 5001it [00:04, 1047.77it/s, env_step=5000, gradient_step=500, len=45, n/ep=0, n/st=10, rew=45.00]                            


Epoch #1: test_reward: 41.650000 ± 9.477737, best_reward: 107.550000 ± 69.976049 in #0


Epoch #2: 5001it [00:04, 1086.11it/s, env_step=10000, gradient_step=1000, len=27, n/ep=0, n/st=10, rew=27.00]                          


Epoch #2: test_reward: 20.200000 ± 5.921149, best_reward: 107.550000 ± 69.976049 in #0


Epoch #3: 5001it [00:05, 958.49it/s, env_step=15000, gradient_step=1500, len=29, n/ep=0, n/st=10, rew=29.00]                            


Epoch #3: test_reward: 45.300000 ± 16.796131, best_reward: 107.550000 ± 69.976049 in #0


Epoch #4: 5001it [00:05, 900.99it/s, env_step=20000, gradient_step=2000, len=25, n/ep=0, n/st=10, rew=25.00]                             


Epoch #4: test_reward: 59.050000 ± 20.190282, best_reward: 107.550000 ± 69.976049 in #0


Epoch #5: 5001it [00:04, 1013.13it/s, env_step=25000, gradient_step=2500, len=32, n/ep=0, n/st=10, rew=32.00]                            


Epoch #5: test_reward: 29.450000 ± 6.184456, best_reward: 107.550000 ± 69.976049 in #0


Epoch #6: 5001it [00:05, 950.18it/s, env_step=30000, gradient_step=3000, len=16, n/ep=0, n/st=10, rew=16.00]                           


Epoch #6: test_reward: 20.650000 ± 4.040730, best_reward: 107.550000 ± 69.976049 in #0


Epoch #7: 5001it [00:04, 1000.93it/s, env_step=35000, gradient_step=3500, len=109, n/ep=0, n/st=10, rew=109.00]                          


Epoch #7: test_reward: 97.600000 ± 47.765469, best_reward: 107.550000 ± 69.976049 in #0


Epoch #8: 5001it [00:04, 1164.31it/s, env_step=40000, gradient_step=4000, len=83, n/ep=0, n/st=10, rew=83.00]                            


Epoch #8: test_reward: 113.200000 ± 83.819807, best_reward: 113.200000 ± 83.819807 in #8


Epoch #9: 5001it [00:05, 925.32it/s, env_step=45000, gradient_step=4500, len=41, n/ep=0, n/st=10, rew=41.00]                             


Epoch #9: test_reward: 70.450000 ± 55.121207, best_reward: 113.200000 ± 83.819807 in #8


Epoch #10: 5001it [00:04, 1146.74it/s, env_step=50000, gradient_step=5000, len=15, n/ep=0, n/st=10, rew=15.00]                            


Epoch #10: test_reward: 16.400000 ± 2.973214, best_reward: 113.200000 ± 83.819807 in #8


Epoch #11: 5001it [00:04, 1147.20it/s, env_step=55000, gradient_step=5500, len=33, n/ep=0, n/st=10, rew=33.00]                            


Epoch #11: test_reward: 56.450000 ± 39.388418, best_reward: 113.200000 ± 83.819807 in #8


Epoch #12: 5001it [00:05, 995.52it/s, env_step=60000, gradient_step=6000, len=110, n/ep=0, n/st=10, rew=110.00]                           


Epoch #12: test_reward: 67.300000 ± 26.376315, best_reward: 113.200000 ± 83.819807 in #8


Epoch #13: 5001it [00:05, 941.71it/s, env_step=65000, gradient_step=6500, len=24, n/ep=0, n/st=10, rew=24.00]                             


Epoch #13: test_reward: 34.000000 ± 10.271319, best_reward: 113.200000 ± 83.819807 in #8


Epoch #14: 5001it [00:05, 942.72it/s, env_step=70000, gradient_step=7000, len=45, n/ep=0, n/st=10, rew=45.00]                             


Epoch #14: test_reward: 66.450000 ± 32.703937, best_reward: 113.200000 ± 83.819807 in #8


Epoch #15: 5001it [00:05, 895.86it/s, env_step=75000, gradient_step=7500, len=71, n/ep=1, n/st=10, rew=71.00]                             


Epoch #15: test_reward: 22.400000 ± 5.580323, best_reward: 113.200000 ± 83.819807 in #8


Epoch #16: 5001it [00:05, 866.36it/s, env_step=80000, gradient_step=8000, len=24, n/ep=0, n/st=10, rew=24.00]                            


Epoch #16: test_reward: 49.650000 ± 22.152370, best_reward: 113.200000 ± 83.819807 in #8


Epoch #17: 5001it [00:05, 954.46it/s, env_step=85000, gradient_step=8500, len=32, n/ep=0, n/st=10, rew=32.00]                             


Epoch #17: test_reward: 58.500000 ± 22.017039, best_reward: 113.200000 ± 83.819807 in #8


Epoch #18: 5001it [00:05, 892.81it/s, env_step=90000, gradient_step=9000, len=15, n/ep=0, n/st=10, rew=15.00]                            


Epoch #18: test_reward: 27.750000 ± 7.292976, best_reward: 113.200000 ± 83.819807 in #8


Epoch #19: 5001it [00:04, 1060.61it/s, env_step=95000, gradient_step=9500, len=19, n/ep=0, n/st=10, rew=19.00]                            


Epoch #19: test_reward: 27.300000 ± 6.091798, best_reward: 113.200000 ± 83.819807 in #8


Epoch #20: 5001it [00:04, 1071.09it/s, env_step=100000, gradient_step=10000, len=39, n/ep=0, n/st=10, rew=39.00]                          


Epoch #20: test_reward: 33.450000 ± 11.732327, best_reward: 113.200000 ± 83.819807 in #8


Epoch #21: 5001it [00:05, 997.99it/s, env_step=105000, gradient_step=10500, len=23, n/ep=1, n/st=10, rew=23.00]                             


Epoch #21: test_reward: 22.050000 ± 5.739991, best_reward: 113.200000 ± 83.819807 in #8


Epoch #22: 5001it [00:05, 916.78it/s, env_step=110000, gradient_step=11000, len=88, n/ep=0, n/st=10, rew=88.00]                             


Epoch #22: test_reward: 37.050000 ± 15.790741, best_reward: 113.200000 ± 83.819807 in #8


Epoch #23: 5001it [00:04, 1131.33it/s, env_step=115000, gradient_step=11500, len=25, n/ep=0, n/st=10, rew=25.00]                            


Epoch #23: test_reward: 31.000000 ± 9.418068, best_reward: 113.200000 ± 83.819807 in #8


Epoch #24: 5001it [00:05, 843.60it/s, env_step=120000, gradient_step=12000, len=29, n/ep=1, n/st=10, rew=29.00]                            


Epoch #24: test_reward: 54.650000 ± 39.110453, best_reward: 113.200000 ± 83.819807 in #8


Epoch #25: 5001it [00:05, 844.80it/s, env_step=125000, gradient_step=12500, len=39, n/ep=0, n/st=10, rew=39.00]                            


Epoch #25: test_reward: 79.800000 ± 25.549168, best_reward: 113.200000 ± 83.819807 in #8


Epoch #26: 5001it [00:05, 992.39it/s, env_step=130000, gradient_step=13000, len=25, n/ep=0, n/st=10, rew=25.00]                             


Epoch #26: test_reward: 15.500000 ± 6.726812, best_reward: 113.200000 ± 83.819807 in #8


Epoch #27: 5001it [00:04, 1039.20it/s, env_step=135000, gradient_step=13500, len=29, n/ep=0, n/st=10, rew=29.00]                            


Epoch #27: test_reward: 26.750000 ± 11.562331, best_reward: 113.200000 ± 83.819807 in #8


Epoch #28: 5001it [00:04, 1132.20it/s, env_step=140000, gradient_step=14000, len=29, n/ep=1, n/st=10, rew=29.00]                            


Epoch #28: test_reward: 25.550000 ± 8.120807, best_reward: 113.200000 ± 83.819807 in #8


Epoch #29: 5001it [00:04, 1050.56it/s, env_step=145000, gradient_step=14500, len=71, n/ep=0, n/st=10, rew=71.00]                            


Epoch #29: test_reward: 34.800000 ± 9.130170, best_reward: 113.200000 ± 83.819807 in #8


Epoch #30: 5001it [00:04, 1064.66it/s, env_step=150000, gradient_step=15000, len=25, n/ep=0, n/st=10, rew=25.00]                            


Epoch #30: test_reward: 43.250000 ± 27.356672, best_reward: 113.200000 ± 83.819807 in #8
Finished training in 161.48106002807617 seconds
New best model saved with mean reward: 113.2


In [49]:
import numpy as np

class NoisyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, noise_std=0.1):
        super().__init__(env)
        self.noise_std = noise_std  # Standard deviation of noise

    def step(self, action):
        if np.random.rand() < self.noise_std:  # With probability `noise_std`, flip action
            action = 1 - action
        return self.env.step(int(action))


class WindyCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, wind_max=5.0, wind_change_prob=0.1):
        super().__init__(env)
        self.wind_max = wind_max  # Max lateral force
        self.wind_force = 0
        self.wind_change_prob = wind_change_prob  # Probability of wind change per step

    def step(self, action):
        # Occasionally change wind force
        if np.random.rand() < self.wind_change_prob:
            self.wind_force = np.random.uniform(-self.wind_max, self.wind_max)

        # Get the step results
        obs, reward, terminated, truncated, info = self.env.step(action)
        
        # Apply wind effect by modifying the observation directly
        # CartPole observation: [position, velocity, pole_angle, pole_angular_velocity]
        obs[1] += self.wind_force * 0.01  # Modify cart velocity in the observation
        
        # Update the environment's state array
        self.env.unwrapped.state = obs.copy()

        return obs, reward, terminated, truncated, info

from collections import deque

class DelayedActionCartPoleWrapper(gym.Wrapper):
    def __init__(self, env, delay_steps=3):
        super().__init__(env)
        self.delay_steps = delay_steps  # How many steps before action takes effect
        # self.action_queue = deque([0] * delay_steps, maxlen=delay_steps)  # Store past actions
        self.action_queue = deque(np.random.choice([0, 1], delay_steps), maxlen=delay_steps)

    def step(self, action):
        self.action_queue.append(action)  # Queue the new action
        delayed_action = self.action_queue.popleft()  # Use delayed action
        return self.env.step(delayed_action)  # Will automatically pass through all 5 values

train_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))
test_env = WindyCartPoleWrapper(NoisyCartPoleWrapper(DelayedActionCartPoleWrapper(gym.make("CartPole-v1"))))

# policy.load_state_dict(torch.load('models/cbp_dqn_adapted_v1.pth')) # Should change name to cbp_dqn_adapted_v1.pth, but haven't
policy.load_state_dict(torch.load('models/cbp_dqn_v1_alternate_metrics.pth'))
policy.train()
policy.set_eps(0.1)  # Start with exploration

best_mean_reward = -np.inf

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(policy.state_dict(), 'models/cbp_dqn_final_adapted_v2.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=ts.data.Collector(policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True),
    test_collector=ts.data.Collector(policy, test_env, exploration_noise=True),
    max_epoch=30,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=20,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: policy.set_eps(max(0.1, 0.5 - epoch * 0.05)),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_windy_noisy_delayed")),
).run()

print(f"Finished training in {result.timing.total_time} seconds")
save_best_model(result.best_reward)

Epoch #1: 5001it [00:04, 1014.47it/s, env_step=5000, gradient_step=500, len=27, n/ep=0, n/st=10, rew=27.00]                            


Epoch #1: test_reward: 45.850000 ± 16.429470, best_reward: 120.050000 ± 69.162472 in #0


Epoch #2: 5001it [00:05, 875.09it/s, env_step=10000, gradient_step=1000, len=26, n/ep=0, n/st=10, rew=26.00]                           


Epoch #2: test_reward: 32.250000 ± 8.269674, best_reward: 120.050000 ± 69.162472 in #0


Epoch #3: 5001it [00:07, 674.21it/s, env_step=15000, gradient_step=1500, len=119, n/ep=0, n/st=10, rew=119.00]                          


Epoch #3: test_reward: 58.500000 ± 31.795440, best_reward: 120.050000 ± 69.162472 in #0


Epoch #4: 5001it [00:08, 623.50it/s, env_step=20000, gradient_step=2000, len=12, n/ep=0, n/st=10, rew=12.00]                            


Epoch #4: test_reward: 20.450000 ± 7.453020, best_reward: 120.050000 ± 69.162472 in #0


Epoch #5: 5001it [00:07, 636.73it/s, env_step=25000, gradient_step=2500, len=30, n/ep=0, n/st=10, rew=30.00]                            


Epoch #5: test_reward: 64.400000 ± 31.449006, best_reward: 120.050000 ± 69.162472 in #0


Epoch #6: 5001it [00:07, 654.08it/s, env_step=30000, gradient_step=3000, len=19, n/ep=0, n/st=10, rew=19.00]                          


Epoch #6: test_reward: 35.500000 ± 8.120961, best_reward: 120.050000 ± 69.162472 in #0


Epoch #7: 5001it [00:07, 684.04it/s, env_step=35000, gradient_step=3500, len=48, n/ep=0, n/st=10, rew=48.00]                            


Epoch #7: test_reward: 106.250000 ± 52.747393, best_reward: 120.050000 ± 69.162472 in #0


Epoch #8: 5001it [00:07, 671.76it/s, env_step=40000, gradient_step=4000, len=47, n/ep=0, n/st=10, rew=47.00]                            


Epoch #8: test_reward: 34.550000 ± 12.212596, best_reward: 120.050000 ± 69.162472 in #0


Epoch #9: 5001it [00:07, 690.50it/s, env_step=45000, gradient_step=4500, len=58, n/ep=0, n/st=10, rew=58.00]                            


Epoch #9: test_reward: 63.700000 ± 34.793821, best_reward: 120.050000 ± 69.162472 in #0


Epoch #10: 5001it [00:07, 679.92it/s, env_step=50000, gradient_step=5000, len=40, n/ep=0, n/st=10, rew=40.00]                            


Epoch #10: test_reward: 73.600000 ± 50.191035, best_reward: 120.050000 ± 69.162472 in #0


Epoch #11: 5001it [00:07, 680.19it/s, env_step=55000, gradient_step=5500, len=17, n/ep=0, n/st=10, rew=17.00]                            


Epoch #11: test_reward: 40.700000 ± 17.236879, best_reward: 120.050000 ± 69.162472 in #0


Epoch #12: 5001it [00:07, 643.87it/s, env_step=60000, gradient_step=6000, len=13, n/ep=0, n/st=10, rew=13.00]                          


Epoch #12: test_reward: 24.600000 ± 7.038466, best_reward: 120.050000 ± 69.162472 in #0


Epoch #13: 5001it [00:07, 652.65it/s, env_step=65000, gradient_step=6500, len=23, n/ep=0, n/st=10, rew=23.00]                          


Epoch #13: test_reward: 35.700000 ± 12.219247, best_reward: 120.050000 ± 69.162472 in #0


Epoch #14: 5001it [00:08, 592.04it/s, env_step=70000, gradient_step=7000, len=173, n/ep=1, n/st=10, rew=173.00]                          


Epoch #14: test_reward: 52.650000 ± 30.775437, best_reward: 120.050000 ± 69.162472 in #0


Epoch #15: 5001it [00:07, 686.27it/s, env_step=75000, gradient_step=7500, len=23, n/ep=0, n/st=10, rew=23.00]                            


Epoch #15: test_reward: 40.150000 ± 19.565978, best_reward: 120.050000 ± 69.162472 in #0


Epoch #16: 5001it [00:06, 729.93it/s, env_step=80000, gradient_step=8000, len=51, n/ep=1, n/st=10, rew=51.00]                            


Epoch #16: test_reward: 54.150000 ± 27.664553, best_reward: 120.050000 ± 69.162472 in #0


Epoch #17: 5001it [00:07, 685.46it/s, env_step=85000, gradient_step=8500, len=60, n/ep=0, n/st=10, rew=60.00]                            


Epoch #17: test_reward: 58.350000 ± 22.068700, best_reward: 120.050000 ± 69.162472 in #0


Epoch #18: 5001it [00:07, 707.57it/s, env_step=90000, gradient_step=9000, len=45, n/ep=1, n/st=10, rew=45.00]                            


Epoch #18: test_reward: 68.000000 ± 33.092295, best_reward: 120.050000 ± 69.162472 in #0


Epoch #19: 5001it [00:06, 734.99it/s, env_step=95000, gradient_step=9500, len=24, n/ep=0, n/st=10, rew=24.00]                            


Epoch #19: test_reward: 40.800000 ± 21.148522, best_reward: 120.050000 ± 69.162472 in #0


Epoch #20: 5001it [00:06, 732.78it/s, env_step=100000, gradient_step=10000, len=15, n/ep=1, n/st=10, rew=15.00]                          


Epoch #20: test_reward: 21.150000 ± 4.767337, best_reward: 120.050000 ± 69.162472 in #0


Epoch #21: 5001it [00:06, 728.51it/s, env_step=105000, gradient_step=10500, len=30, n/ep=0, n/st=10, rew=30.00]                          


Epoch #21: test_reward: 37.300000 ± 18.360556, best_reward: 120.050000 ± 69.162472 in #0


Epoch #22: 5001it [00:07, 669.37it/s, env_step=110000, gradient_step=11000, len=43, n/ep=0, n/st=10, rew=43.00]                            


Epoch #22: test_reward: 60.950000 ± 41.392602, best_reward: 120.050000 ± 69.162472 in #0


Epoch #23: 5001it [00:07, 652.22it/s, env_step=115000, gradient_step=11500, len=23, n/ep=0, n/st=10, rew=23.00]                            


Epoch #23: test_reward: 47.050000 ± 26.732892, best_reward: 120.050000 ± 69.162472 in #0


Epoch #24: 5001it [00:06, 813.47it/s, env_step=120000, gradient_step=12000, len=33, n/ep=1, n/st=10, rew=33.00]                            


Epoch #24: test_reward: 56.800000 ± 29.859002, best_reward: 120.050000 ± 69.162472 in #0


Epoch #25: 5001it [00:05, 924.34it/s, env_step=125000, gradient_step=12500, len=49, n/ep=0, n/st=10, rew=49.00]                            


Epoch #25: test_reward: 46.550000 ± 38.991634, best_reward: 120.050000 ± 69.162472 in #0


Epoch #26: 5001it [00:05, 898.03it/s, env_step=130000, gradient_step=13000, len=45, n/ep=0, n/st=10, rew=45.00]                            


Epoch #26: test_reward: 60.950000 ± 47.694313, best_reward: 120.050000 ± 69.162472 in #0


Epoch #27: 5001it [00:05, 923.80it/s, env_step=135000, gradient_step=13500, len=50, n/ep=0, n/st=10, rew=50.00]                            


Epoch #27: test_reward: 34.800000 ± 9.801020, best_reward: 120.050000 ± 69.162472 in #0


Epoch #28: 5001it [00:05, 941.48it/s, env_step=140000, gradient_step=14000, len=29, n/ep=0, n/st=10, rew=29.00]                             


Epoch #28: test_reward: 96.700000 ± 57.258275, best_reward: 120.050000 ± 69.162472 in #0


Epoch #29: 5001it [00:04, 1007.36it/s, env_step=145000, gradient_step=14500, len=136, n/ep=0, n/st=10, rew=136.00]                          


Epoch #29: test_reward: 113.400000 ± 55.955697, best_reward: 120.050000 ± 69.162472 in #0


Epoch #30: 5001it [00:05, 908.73it/s, env_step=150000, gradient_step=15000, len=93, n/ep=0, n/st=10, rew=93.00]                             


Epoch #30: test_reward: 49.200000 ± 41.820569, best_reward: 120.050000 ± 69.162472 in #0
Finished training in 219.03963708877563 seconds
New best model saved with mean reward: 120.05


3. Transfer to a Different Control Problem 
For a drastic test of adaptation, switch to an environment requiring similar skills but with new challenges:  

MountainCarContinuous-v0: Requires continuous control and memory of past actions, unlike CartPole’s immediate balance task.  
Pendulum-v1: Similar physics, but requires torque control instead of discrete left/right actions.  
Acrobot-v1: A more complex version of inverted pendulum, with two linked segments.  
👉 Goal: Test if CBP allows knowledge retention while adapting to a new but related task.  

Notes:  
Question 1:
Theory: Agent w/o backprop 
And compare to Agent with backprop  
Graph of: train in first, 2nd, 3rd, etc  

Question 2: to address, which env work well and which doesn't .   
-> Maze: change end point (up mean down, down mean up)


Think about complexity: how behavior change  
Jump cartpole to other env  

Question 3: Does continual backprop work with larger inputs  

In [50]:
# First create a new policy with correct dimensions for Acrobot
env_name = "Acrobot-v1"
train_env = gym.make(env_name)
test_env = gym.make(env_name)

input_dim = train_env.observation_space.shape[0]  # Should be 6
output_dim = train_env.action_space.n  # Should be 3

# Create a new model with correct dimensions
new_model = CBPNetwork(
    input_dim=input_dim, 
    output_dim=output_dim, 
    hidden_sizes=[128, 128, 128], 
    cbp_params={'eta': 0.99, 'rho': 1e-4, 'm': 100}
)

# Create a new policy
new_policy = CBPDQNPolicy(
    model=new_model,
    optim=torch.optim.Adam(new_model.parameters(), lr=0.001),
    discount_factor=0.975,
    estimation_step=3,
    target_update_freq=1000,
    action_space=train_env.action_space
)

# Load the pretrained weights for matching layers
pretrained_dict = torch.load("models/cbp_dqn_final_adapted_v2.pth")
model_dict = new_policy.state_dict()

# Filter out incompatible layers
filtered_dict = {k: v for k, v in pretrained_dict.items() 
                if k in model_dict and 
                'hidden_layers.0' not in k and  # Skip first layer
                'out_layer' not in k and        # Skip output layer
                model_dict[k].shape == v.shape}

# Update matching layers
model_dict.update(filtered_dict)
new_policy.load_state_dict(model_dict)

# Now you can train with the new policy
new_policy.train()
new_policy.set_eps(0.1)

best_mean_reward = -np.inf

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(new_policy.state_dict(), 'models/cbp_dqn_Acrobot-v1.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=new_policy,
    train_collector=ts.data.Collector(new_policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True),
    test_collector=ts.data.Collector(new_policy, test_env, exploration_noise=True),
    max_epoch=20,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=20,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: new_policy.set_eps(max(0.1, 0.5 - epoch * 0.05)),
    test_fn=lambda epoch, env_step: new_policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_Acrobot-v1")),
).run()

print(f"Finished training in {result.timing.total_time} seconds")
save_best_model(result.best_reward)

Epoch #1: 5001it [00:05, 894.52it/s, env_step=5000, gradient_step=500, len=392, n/ep=0, n/st=10, rew=-391.00]                           


Epoch #1: test_reward: -500.000000 ± 0.000000, best_reward: -500.000000 ± 0.000000 in #0


Epoch #2: 5001it [00:06, 821.09it/s, env_step=10000, gradient_step=1000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #2: test_reward: -301.000000 ± 78.367085, best_reward: -301.000000 ± 78.367085 in #2


Epoch #3: 5001it [00:04, 1007.88it/s, env_step=15000, gradient_step=1500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #3: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #4: 5001it [00:05, 881.38it/s, env_step=20000, gradient_step=2000, len=500, n/ep=0, n/st=10, rew=-500.00]                           


Epoch #4: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #5: 5001it [00:04, 1102.44it/s, env_step=25000, gradient_step=2500, len=377, n/ep=0, n/st=10, rew=-376.00]                          


Epoch #5: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #6: 5001it [00:04, 1074.53it/s, env_step=30000, gradient_step=3000, len=481, n/ep=0, n/st=10, rew=-480.00]                          


Epoch #6: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #7: 5001it [00:04, 1064.31it/s, env_step=35000, gradient_step=3500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #7: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #8: 5001it [00:04, 1069.41it/s, env_step=40000, gradient_step=4000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #8: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #9: 5001it [00:04, 1070.70it/s, env_step=45000, gradient_step=4500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #9: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #10: 5001it [00:04, 1026.40it/s, env_step=50000, gradient_step=5000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #10: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #11: 5001it [00:04, 1066.49it/s, env_step=55000, gradient_step=5500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #11: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #12: 5001it [00:04, 1017.53it/s, env_step=60000, gradient_step=6000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #12: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #13: 5001it [00:04, 1078.58it/s, env_step=65000, gradient_step=6500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #13: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #14: 5001it [00:04, 1086.04it/s, env_step=70000, gradient_step=7000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #14: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #15: 5001it [00:04, 1059.74it/s, env_step=75000, gradient_step=7500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #15: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #16: 5001it [00:04, 1074.18it/s, env_step=80000, gradient_step=8000, len=500, n/ep=1, n/st=10, rew=-500.00]                          


Epoch #16: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #17: 5001it [00:04, 1078.89it/s, env_step=85000, gradient_step=8500, len=500, n/ep=1, n/st=10, rew=-500.00]                          


Epoch #17: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #18: 5001it [00:04, 1082.42it/s, env_step=90000, gradient_step=9000, len=500, n/ep=1, n/st=10, rew=-500.00]                          


Epoch #18: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #19: 5001it [00:04, 1072.03it/s, env_step=95000, gradient_step=9500, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #19: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2


Epoch #20: 5001it [00:04, 1054.89it/s, env_step=100000, gradient_step=10000, len=500, n/ep=0, n/st=10, rew=-500.00]                          


Epoch #20: test_reward: -500.000000 ± 0.000000, best_reward: -301.000000 ± 78.367085 in #2
Finished training in 166.45523834228516 seconds
New best model saved with mean reward: -301.0


In [51]:
print(new_policy)

CBPDQNPolicy(
  (model): CBPNetwork(
    (hidden_layers): ModuleList(
      (0): CBPLayer(
        (linear): Linear(in_features=6, out_features=128, bias=True)
        (activation): ReLU()
      )
      (1-2): 2 x CBPLayer(
        (linear): Linear(in_features=128, out_features=128, bias=True)
        (activation): ReLU()
      )
    )
    (out_layer): Linear(in_features=128, out_features=3, bias=True)
  )
  (model_old): CBPNetwork(
    (hidden_layers): ModuleList(
      (0): CBPLayer(
        (linear): Linear(in_features=6, out_features=128, bias=True)
        (activation): ReLU()
      )
      (1-2): 2 x CBPLayer(
        (linear): Linear(in_features=128, out_features=128, bias=True)
        (activation): ReLU()
      )
    )
    (out_layer): Linear(in_features=128, out_features=3, bias=True)
  )
)


In [53]:
# First create a new policy with correct dimensions for Acrobot
env_name = "MountainCar-v0"
train_env = gym.make(env_name)
test_env = gym.make(env_name)

input_dim = train_env.observation_space.shape[0]  # Should be 6
output_dim = train_env.action_space.n  # Should be 3

# Create a new model with correct dimensions
new_model = CBPNetwork(
    input_dim=input_dim, 
    output_dim=output_dim, 
    hidden_sizes=[128, 128, 128], 
    cbp_params={'eta': 0.99, 'rho': 1e-4, 'm': 100}
)

# Create a new policy
new_policy = CBPDQNPolicy(
    model=new_model,
    optim=torch.optim.Adam(new_model.parameters(), lr=0.001),
    discount_factor=0.975,
    estimation_step=3,
    target_update_freq=1000,
    action_space=train_env.action_space
)

# Load the pretrained weights for matching layers
pretrained_dict = torch.load("models/cbp_dqn_final_adapted_v2.pth")
model_dict = new_policy.state_dict()

# Filter out incompatible layers
filtered_dict = {k: v for k, v in pretrained_dict.items() 
                if k in model_dict and 
                'hidden_layers.0' not in k and  # Skip first layer
                'out_layer' not in k and        # Skip output layer
                model_dict[k].shape == v.shape}

# Update matching layers
model_dict.update(filtered_dict)
new_policy.load_state_dict(model_dict)

# Now you can train with the new policy
new_policy.train()
new_policy.set_eps(0.1)

best_mean_reward = -np.inf

def save_best_model(mean_rewards):
    global best_mean_reward
    if mean_rewards > best_mean_reward:
        best_mean_reward = mean_rewards
        torch.save(new_policy.state_dict(), 'models/cbp_dqn_MountainCar-v0.pth')
        print(f"New best model saved with mean reward: {best_mean_reward}")

result = ts.trainer.OffpolicyTrainer(
    policy=new_policy,
    train_collector=ts.data.Collector(new_policy, train_env, ts.data.VectorReplayBuffer(20000, 1), exploration_noise=True),
    test_collector=ts.data.Collector(new_policy, test_env, exploration_noise=True),
    max_epoch=20,
    step_per_epoch=5000,
    step_per_collect=10,
    episode_per_test=20,
    batch_size=64,
    update_per_step=1 / 10,
    train_fn=lambda epoch, env_step: new_policy.set_eps(max(0.1, 0.5 - epoch * 0.05)),
    test_fn=lambda epoch, env_step: new_policy.set_eps(0.05),
    stop_fn=lambda mean_rewards: mean_rewards >= train_env.spec.reward_threshold,
    logger=ts.utils.TensorboardLogger(SummaryWriter("logs/cbp_MountainCar-v0")),
).run()

print(f"Finished training in {result.timing.total_time} seconds")
save_best_model(result.best_reward)

Epoch #1: 5001it [00:05, 988.06it/s, env_step=5000, gradient_step=500, len=200, n/ep=1, n/st=10, rew=-200.00]                          


Epoch #1: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #2: 5001it [00:04, 1078.50it/s, env_step=10000, gradient_step=1000, len=200, n/ep=1, n/st=10, rew=-200.00]                          


Epoch #2: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #3: 5001it [00:04, 1037.31it/s, env_step=15000, gradient_step=1500, len=200, n/ep=1, n/st=10, rew=-200.00]                          


Epoch #3: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #4: 5001it [00:05, 907.26it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=10, rew=-200.00]                           


Epoch #4: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #5: 5001it [00:05, 932.69it/s, env_step=25000, gradient_step=2500, len=176, n/ep=0, n/st=10, rew=-176.00]                           


Epoch #5: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #6: 5001it [00:05, 941.92it/s, env_step=30000, gradient_step=3000, len=200, n/ep=0, n/st=10, rew=-200.00]                           


Epoch #6: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #7: 5001it [00:04, 1092.12it/s, env_step=35000, gradient_step=3500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #7: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #8: 5001it [00:04, 1082.32it/s, env_step=40000, gradient_step=4000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #8: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #9: 5001it [00:04, 1076.67it/s, env_step=45000, gradient_step=4500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #9: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #10: 5001it [00:04, 1002.30it/s, env_step=50000, gradient_step=5000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #10: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #11: 5001it [00:04, 1006.31it/s, env_step=55000, gradient_step=5500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #11: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #12: 5001it [00:04, 1089.88it/s, env_step=60000, gradient_step=6000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #12: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #13: 5001it [00:04, 1030.27it/s, env_step=65000, gradient_step=6500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #13: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #14: 5001it [00:04, 1060.71it/s, env_step=70000, gradient_step=7000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #14: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #15: 5001it [00:04, 1023.35it/s, env_step=75000, gradient_step=7500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #15: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #16: 5001it [00:05, 998.98it/s, env_step=80000, gradient_step=8000, len=200, n/ep=0, n/st=10, rew=-200.00]                           


Epoch #16: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #17: 5001it [00:04, 1017.79it/s, env_step=85000, gradient_step=8500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #17: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #18: 5001it [00:04, 1100.95it/s, env_step=90000, gradient_step=9000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #18: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #19: 5001it [00:04, 1077.44it/s, env_step=95000, gradient_step=9500, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #19: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0


Epoch #20: 5001it [00:05, 919.22it/s, env_step=100000, gradient_step=10000, len=200, n/ep=0, n/st=10, rew=-200.00]                          


Epoch #20: test_reward: -200.000000 ± 0.000000, best_reward: -200.000000 ± 0.000000 in #0
Finished training in 122.64696025848389 seconds
New best model saved with mean reward: -200.0
