In [3]:
# Print the installed Tianshou version (the recorded run shows 0.4.6).
import tianshou
print(tianshou.__version__)

0.4.6





In [4]:
import gym
import tianshou as ts
# Single CartPole-v0 environment; later cells read its observation/action
# spaces and its spec.reward_threshold.
env = gym.make('CartPole-v0')

  logger.warn(


## Previous approach: plain `gym` environments

In [5]:
# One plain (non-vectorized) gym environment each for training and testing.
# Both names are rebound to vectorized environments in a later cell.
train_envs = gym.make('CartPole-v0')
test_envs = gym.make('CartPole-v0')

## Tianshou supports parallel sampling for every algorithm.
There are four types of **Vectorized Environment Wrapper**:
- DummyVectorEnv
- SubprocVectorEnv
- ShmemVectorEnv
- RayVectorEnv

### DummyVectorEnv
Create 10 training environments and 100 test environments.

In [6]:
def _make_cartpole():
    """Factory returning a fresh CartPole-v0 environment on every call."""
    return gym.make('CartPole-v0')


# Each vector env receives a list of factories, one per wrapped environment.
train_envs = ts.env.DummyVectorEnv([_make_cartpole for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([_make_cartpole for _ in range(100)])

In [7]:
# If you use a custom environment, give it a seed method, for example:
"""
def seed(self, seed):
  np.random.seed(seed)
"""
# Without a seed method, every copy of the environment could produce the same results.

'\ndef seed(self, seed):\n  np.random.seed(seed)\n'

# Build the Network

Tianshou supports any PyTorch network and optimizer, as long as the network's inputs and outputs follow the Tianshou API.

In [8]:
import torch
import numpy as np

from torch import nn

class Net(nn.Module):
  """Fully connected network mapping a flattened observation to one value per action.

  Args:
    state_shape: observation shape (or a single int); its product is the input width.
    action_shape: action shape (or a single int); its product is the output width.
  """

  def __init__(self, state_shape, action_shape):
    super().__init__()
    # np.prod returns a NumPy scalar; cast to plain Python ints so the layer
    # sizes stored on nn.Linear are ordinary ints.
    in_dim = int(np.prod(state_shape))
    out_dim = int(np.prod(action_shape))
    self.model = nn.Sequential(
        nn.Linear(in_dim, 128), nn.ReLU(inplace=True),
        nn.Linear(128, 128), nn.ReLU(inplace=True),
        nn.Linear(128, 128), nn.ReLU(inplace=True),
        nn.Linear(128, out_dim)
    )

  def forward(self, obs, state=None, info=None):
    """Return ``(logits, state)`` for a batch of observations.

    Args:
      obs: batch of observations; anything that is not already a torch.Tensor
        is converted to a float tensor. The first axis is the batch axis.
      state: passed through unchanged.
      info: unused. Defaults to None rather than a shared mutable dict.

    Returns:
      A tuple ``(logits, state)`` where logits has shape (batch, out_dim).
    """
    if not isinstance(obs, torch.Tensor):
      obs = torch.tensor(obs, dtype=torch.float)

    batch = obs.shape[0]
    # reshape (unlike view) also accepts non-contiguous tensors.
    logits = self.model(obs.reshape(batch, -1))
    return logits, state

In [9]:
# Use the space's shape when it is non-empty; otherwise fall back to its `n` attribute.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
net = Net(state_shape, action_shape)
# Adam over all network parameters with learning rate 1e-3.
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

- input: observation `obs`

- logits: the raw output of the network

# Setup Policy
To define the **policy**, we use the variables created above: `net` and `optim`.

In [1]:
# DQN policy built from the network and optimizer defined above.
# NOTE(review): the recorded execution count (In [1]) and the NameError output show this
# cell was last run on a fresh kernel before `import tianshou as ts`; re-run the notebook
# top to bottom so `ts`, `net` and `optim` exist when this line executes.
policy = ts.policy.DQNPolicy(net, optim, discount_factor = 0.9, estimation_step = 3, target_update_freq = 320)

NameError: name 'ts' is not defined

# Setup Collector
The Collector is a key concept in Tianshou. It lets the policy interact with different types of environments conveniently. In each step, the Collector has the policy act for a specified number of steps or episodes and stores the resulting data in the replay buffer.

In [11]:
# Training collector: stores transitions in a VectorReplayBuffer created with (20000, 10).
# NOTE(review): the second argument (10) presumably matches the 10 training environments — confirm.
train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise = True)
# Test collector: no buffer is passed explicitly.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise = True)

# Train Policy with a Trainer

Tianshou has three trainers: **onpolicy_trainer(), offpolicy_trainer(), offline_trainer()**\
The trainer ends training when the policy reaches the stop condition (`stop_fn`) on the test collector. (DQN is an off-policy algorithm, so we use `offpolicy_trainer()`.)

In [12]:
def _set_train_eps(epoch, env_step):
    """Passed as train_fn: sets the exploration rate to 0.1."""
    policy.set_eps(0.1)


def _set_test_eps(epoch, env_step):
    """Passed as test_fn: sets the exploration rate to 0.05."""
    policy.set_eps(0.05)


def _reached_threshold(mean_rewards):
    """Passed as stop_fn: True once the mean reward meets the env's reward threshold."""
    return mean_rewards >= env.spec.reward_threshold


result = ts.trainer.offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    train_fn=_set_train_eps,
    test_fn=_set_test_eps,
    stop_fn=_reached_threshold,
)
print(f'Finished training! Use {result["duration"]}')

Epoch #1:  71%|#######   | 7060/10000 [00:06<00:02, 1121.28it/s, env_step=7060, len=200, n/ep=1, n/st=10, rew=200.00]            

Finished training! Use 6.35s





- max_epoch: the maximum number of training epochs. Training may end before max_epoch is reached.
- step_per_epoch: the number of transitions collected per epoch
- step_per_collect: the number of transitions collected before each network update
- episode_per_test: the number of episodes used to evaluate the policy
- batch_size: the size of the batch sampled from the buffer for training
- train_fn: a function that receives the current epoch and step index and is called for training (here it sets the training epsilon)
- test_fn: the same as train_fn, but called for testing (here it sets the test epsilon)
- stop_fn: a function that receives the mean undiscounted return from testing and returns a bool indicating whether to stop
- logger: records the training data

In [13]:
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import BasicLogger

# TensorBoard writer created with the path 'log/dqn'.
writer = SummaryWriter('log/dqn')
# Wrap the writer in Tianshou's logger.
# NOTE(review): the offpolicy_trainer call in the earlier cell does not pass logger=,
# so this logger is presumably not used by that training run — confirm.
logger = BasicLogger(writer)



# Save/Load Policy

Saving and loading work the same way as for any torch module, because the policy inherits from `torch.nn.Module`.

In [14]:
# Write the policy's state dict to dqn.pth, then load it straight back into the policy.
torch.save(policy.state_dict(), 'dqn.pth')
policy.load_state_dict(torch.load('dqn.pth'))

<All keys matched successfully>

# Watch the Agent's Performance

The Collector supports rendering. (The code below renders at 35 FPS.)

In [15]:
# Put the policy in evaluation mode and set epsilon to 0.07 for this run.
policy.eval()
policy.set_eps(0.07)
# A separate collector on the single (non-vectorized) env created at the top.
collector = ts.data.Collector(policy, env, exploration_noise=True)
# Run one episode with render=1/35; the returned statistics dict is the cell output.
collector.collect(n_episode=1, render=1 / 35)



{'n/ep': 1,
 'n/st': 200,
 'rews': array([200.]),
 'lens': array([200]),
 'idxs': array([0]),
 'rew': 200.0,
 'len': 200.0,
 'rew_std': 0.0,
 'len_std': 0.0}

## Train a Policy with Customized Codes

In [16]:
# Collect 5000 transitions with random actions before training starts.
train_collector.collect(n_step=5000, random=True)

# The threshold does not change during the loop, so read it once.
reward_threshold = env.spec.reward_threshold

policy.set_eps(0.1)
for i in range(int(1e6)):  # total step
    collect_result = train_collector.collect(n_step=10)

    # A 10-step collect may finish no episode; 'rews' is then empty and
    # .mean() would yield NaN together with a NumPy "Mean of empty slice"
    # RuntimeWarning. Only compare against the threshold when at least one
    # episode return is available.
    episode_returns = collect_result['rews']
    train_reached = (
        episode_returns.size > 0
        and episode_returns.mean() >= reward_threshold
    )

    # once if the collected episodes' mean returns reach the threshold,
    # or every 1000 steps, we test it on test_collector
    if train_reached or i % 1000 == 0:
        policy.set_eps(0.05)
        result = test_collector.collect(n_episode=100)
        if result['rews'].mean() >= reward_threshold:
            print(f'Finished training! Test mean returns: {result["rews"].mean()}')
            break
        else:
            # back to training eps
            policy.set_eps(0.1)

    # train policy with a sampled batch data from buffer
    losses = policy.update(64, train_collector.buffer)

Finished training! Test mean returns: 198.92
