In [2]:
import gymnasium as gym
import torch, numpy as np, torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import tianshou as ts

In [4]:
# --- Experiment configuration: one constant per line so each is easy to tune ---
task = 'CartPole-v1'  # environment id handed to gym.make

# Optimiser learning rate, number of training epochs, and mini-batch size.
lr = 1e-3
epoch = 10
batch_size = 64

# How many environment copies the train / test vector envs hold.
train_num = 10
test_num = 100

# Values forwarded to DQNPolicy (gamma and n_step positionally,
# target_freq as target_update_freq).
gamma = 0.9
n_step = 3
target_freq = 320

# Total capacity given to the VectorReplayBuffer.
buffer_size = 20000

# Epsilon values applied through policy.set_eps during training / testing.
eps_train = 0.1
eps_test = 0.05

# Trainer schedule: environment steps per epoch and per collect call.
step_per_epoch = 10000
step_per_collect = 10

# logger = ts.utils.TensorboardLogger(SummaryWriter('log/dqn'))  # TensorBoard is supported!
# For other loggers: https://tianshou.readthedocs.io/en/master/tutorials/logger.html

In [5]:
# you can also try with SubprocVectorEnv
def _make_env():
    """Create one fresh instance of the configured task."""
    return gym.make(task)


# The vector env calls each factory once, so repeating the same callable
# still yields train_num / test_num independent environment instances.
train_envs = ts.env.DummyVectorEnv([_make_env] * train_num)
test_envs = ts.env.DummyVectorEnv([_make_env] * test_num)

In [8]:
from tianshou.utils.net.common import Net
# you can define other net by following the API:
# https://tianshou.readthedocs.io/en/master/tutorials/dqn.html#build-the-network

# A single (non-vectorised) env, used here only to read the space descriptions;
# later cells reuse it as well.
env = gym.make(task)
obs_space = env.observation_space
act_space = env.action_space

# Fall back to `.n` when a space reports an empty/falsy `.shape`.
state_shape = obs_space.shape or obs_space.n
action_shape = act_space.shape or act_space.n

# Network with three hidden layers of width 256, optimised with Adam.
net = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=[256, 256, 256],
)
optim = torch.optim.Adam(net.parameters(), lr=lr)

In [9]:
# Reset the stand-alone env and show what it returns: an (observation, info) pair.
initial_obs, reset_info = env.reset()
initial_obs, reset_info

(array([-0.04257247,  0.04756985,  0.00133843,  0.02949654], dtype=float32),
 {})

In [24]:
# DQN policy built from the network/optimiser defined above.
policy = ts.policy.DQNPolicy(
    net,
    optim,
    discount_factor=gamma,
    estimation_step=n_step,
    target_update_freq=target_freq,
)

# Training transitions are stored in a vectorised replay buffer with one
# sub-buffer per training environment.
replay_buffer = ts.data.VectorReplayBuffer(buffer_size, train_num)
train_collector = ts.data.Collector(
    policy,
    train_envs,
    replay_buffer,
    exploration_noise=True,
)

# because DQN uses epsilon-greedy method
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [25]:
%%time
result = ts.trainer.offpolicy_trainer(
    policy, train_collector, test_collector, epoch, step_per_epoch, step_per_collect,
    test_num, batch_size, update_per_step=1 / step_per_collect,
    train_fn=lambda epoch, env_step: policy.set_eps(eps_train),
    test_fn=lambda epoch, env_step: policy.set_eps(eps_test),
    stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold,
    )
print(f'Finished training! Use {result["duration"]}')

Epoch #1: 10001it [00:05, 1926.78it/s, env_step=10000, len=430, loss=0.258, n/ep=0, n/st=10, rew=430.00]                           


Epoch #1: test_reward: 359.200000 ± 75.609391, best_reward: 359.200000 ± 75.609391 in #1


Epoch #2: 10001it [00:06, 1530.80it/s, env_step=20000, len=240, loss=0.316, n/ep=0, n/st=10, rew=240.00]                           


Epoch #2: test_reward: 278.090000 ± 60.695650, best_reward: 359.200000 ± 75.609391 in #1


Epoch #3: 10001it [00:06, 1651.35it/s, env_step=30000, len=253, loss=0.081, n/ep=0, n/st=10, rew=253.00]                           


Epoch #3: test_reward: 301.220000 ± 70.034360, best_reward: 359.200000 ± 75.609391 in #1


Epoch #4: 10001it [00:06, 1472.64it/s, env_step=40000, len=328, loss=0.027, n/ep=0, n/st=10, rew=328.00]                           


Epoch #4: test_reward: 304.230000 ± 55.159560, best_reward: 359.200000 ± 75.609391 in #1


Epoch #5: 10001it [00:06, 1626.44it/s, env_step=50000, len=238, loss=0.016, n/ep=0, n/st=10, rew=238.00]                           


Epoch #5: test_reward: 204.920000 ± 25.458075, best_reward: 359.200000 ± 75.609391 in #1


Epoch #6: 10001it [00:06, 1640.95it/s, env_step=60000, len=98, loss=0.032, n/ep=0, n/st=10, rew=98.00]                            


Epoch #6: test_reward: 233.930000 ± 44.857609, best_reward: 359.200000 ± 75.609391 in #1


Epoch #7: 10001it [00:06, 1577.04it/s, env_step=70000, len=315, loss=0.035, n/ep=0, n/st=10, rew=315.00]                           


Epoch #7: test_reward: 330.960000 ± 77.793306, best_reward: 359.200000 ± 75.609391 in #1


Epoch #8: 10001it [00:05, 1881.86it/s, env_step=80000, len=290, loss=0.033, n/ep=0, n/st=10, rew=290.00]                           


Epoch #8: test_reward: 183.420000 ± 10.929026, best_reward: 359.200000 ± 75.609391 in #1


Epoch #9: 10001it [00:05, 1891.53it/s, env_step=90000, len=278, loss=0.069, n/ep=0, n/st=10, rew=278.00]                           


Epoch #9: test_reward: 337.010000 ± 49.587397, best_reward: 359.200000 ± 75.609391 in #1


Epoch #10: 10001it [00:05, 1707.24it/s, env_step=100000, len=214, loss=0.029, n/ep=0, n/st=10, rew=214.00]                           


Epoch #10: test_reward: 184.390000 ± 10.327531, best_reward: 359.200000 ± 75.609391 in #1
Finished training! Use 66.77s
CPU times: user 6min 37s, sys: 4.37 s, total: 6min 41s
Wall time: 1min 6s


In [10]:
# Allocate an uninitialised tensor of shape (10, 15, 20, 25).
# torch.empty states that intent explicitly; the legacy torch.Tensor(*sizes)
# constructor is easy to misread as torch.tensor([10, 15, 20, 25]), which
# would instead build a 4-element tensor holding those values.
# NOTE: the contents are arbitrary memory - fill the tensor before reading it.
t = torch.empty(10, 15, 20, 25)

In [11]:
# Two small integer lists; y holds each element of x shifted up by one.
x = list(range(1, 4))
y = [value + 1 for value in x]

In [20]:
import timm

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Build an EfficientNet-B0 through timm; pretrained=False requests the
# architecture without pretrained weights.
image_model = timm.create_model(
    'efficientnet_b0',
    pretrained=False,
)

In [31]:
# Watch the trained agent play two episodes.
policy.eval()
policy.set_eps(eps_test)

# Build a dedicated environment for rendering instead of reusing the shared
# `env`: that one was created without a render_mode, and re-running this cell
# on an env whose window has already been closed is the likely cause of the
# recorded pygame failure ("display Surface quit") - TODO confirm on your setup.
eval_env = gym.make(task, render_mode="human")
try:
    collector = ts.data.Collector(policy, eval_env, exploration_noise=True)
    eval_result = collector.collect(n_episode=2, render=1 / 35)
finally:
    # Always release the render window, even if collection fails midway.
    eval_env.close()

# Last expression, so the notebook still displays the collect statistics.
eval_result

error: display Surface quit

In [4]:
# Joining two length-3 sequences along the first axis gives one 1-D array of length 6.
np.concatenate(([1, 2, 3], [4, 5, 6])).shape

(6,)