ppo.py
import random
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as Fnn
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from torch.autograd import Variable
from utils import gae, cuda_if, mean_std_groups, set_lr


class PPO:
    def __init__(self, policy, venv, test_env, optimizer,
                 lr_func=None, clip_func=None, gamma=.99, lambd=.95,
                 worker_steps=128, sequence_steps=32, minibatch_steps=256,
                 opt_epochs=3, value_coef=1., entropy_coef=.01, max_grad_norm=.5,
                 cuda=False, plot_reward=False, plot_points=20, plot_path='ep_reward.png',
                 test_repeat_max=100):
        """ Proximal Policy Optimization algorithm class

        Evaluates a policy over a vectorized environment and
        optimizes over policy, value, entropy objectives.

        Assumes discrete action space.

        Args:
            policy (nn.Module): the policy to optimize
            venv (vec_env): the vectorized environment to use
            test_env (Env): the environment to use for policy testing
            optimizer (optim.Optimizer): the optimizer to use
            lr_func (callable): maps training progress in [0, 1] to a learning rate
            clip_func (callable): maps training progress in [0, 1] to a probability ratio clipping range
            gamma (float): discount factor
            lambd (float): GAE lambda parameter
            worker_steps (int): steps per worker between optimization rounds
            sequence_steps (int): steps per sequence (for backprop through time)
            minibatch_steps (int): steps per optimization minibatch
        """
        self.policy = policy
        self.policy_old = copy.deepcopy(policy)
        self.venv = venv
        self.test_env = test_env
        self.optimizer = optimizer
        self.lr_func = lr_func
        self.clip_func = clip_func

        self.num_workers = venv.num_envs
        self.worker_steps = worker_steps
        self.sequence_steps = sequence_steps
        self.minibatch_steps = minibatch_steps
        self.opt_epochs = opt_epochs

        self.gamma = gamma
        self.lambd = lambd
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.cuda = cuda

        self.plot_reward = plot_reward
        self.plot_points = plot_points
        self.plot_path = plot_path
        self.ep_reward = np.zeros(self.num_workers)
        self.reward_histr = []
        self.steps_histr = []

        self.objective = PPOObjective()
        self.last_ob = self.venv.reset()
        self.taken_steps = 0
        self.test_repeat_max = test_repeat_max

    def run(self, total_steps):
        """ Runs PPO

        Args:
            total_steps (int): total number of environment steps to run for
        """
        N = self.num_workers
        T = self.worker_steps
        E = self.opt_epochs
        A = self.venv.action_space.n

        while self.taken_steps < total_steps:
            progress = self.taken_steps / total_steps

            obs, rewards, masks, actions, steps = self.interact()
            ob_shape = obs.size()[2:]

            ep_reward = self.test()
            self.reward_histr.append(ep_reward)
            self.steps_histr.append(self.taken_steps)

            # statistic logic
            group_size = len(self.steps_histr) // self.plot_points
            if self.plot_reward and len(self.steps_histr) % (self.plot_points * 10) == 0 and group_size >= 10:
                x_means, _, y_means, y_stds = \
                    mean_std_groups(np.array(self.steps_histr), np.array(self.reward_histr), group_size)
                fig = plt.figure()
                fig.set_size_inches(8, 6)
                plt.ticklabel_format(axis='x', style='sci', scilimits=(-2, 6))
                plt.errorbar(x_means, y_means, yerr=y_stds, ecolor='xkcd:blue', fmt='xkcd:black',
                             capsize=5, elinewidth=1.5, mew=1.5, linewidth=1.5)
                plt.title('Training progress')
                plt.xlabel('Total steps')
                plt.ylabel('Episode reward')
                plt.savefig(self.plot_path, dpi=200)
                plt.clf()
                plt.close()
                plot_timer = 0

            # TEMP upgrade to support recurrence
            # compute advantages, returns with GAE
            obs_ = obs.view(((T + 1) * N,) + ob_shape)
            obs_ = Variable(obs_)
            _, values = self.policy(obs_)
            values = values.view(T + 1, N, 1)
            advantages, returns = gae(rewards, masks, values, self.gamma, self.lambd)

            self.policy_old.load_state_dict(self.policy.state_dict())
            for e in range(E):
                self.policy.zero_grad()

                MB = steps // self.minibatch_steps

                b_obs = Variable(obs[:T].view((steps,) + ob_shape))
                b_rewards = Variable(rewards.view(steps, 1))
                b_masks = Variable(masks.view(steps, 1))
                b_actions = Variable(actions.view(steps, 1))
                b_advantages = Variable(advantages.view(steps, 1))
                b_returns = Variable(returns.view(steps, 1))

                b_inds = np.arange(steps)
                np.random.shuffle(b_inds)

                for start in range(0, steps, self.minibatch_steps):
                    mb_inds = b_inds[start:start + self.minibatch_steps]
                    mb_inds = cuda_if(torch.from_numpy(mb_inds).long(), self.cuda)

                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_advantages, mb_returns = \
                        [arr[mb_inds] for arr in [b_obs, b_rewards, b_masks, b_actions, b_advantages, b_returns]]

                    mb_pis, mb_vs = self.policy(mb_obs)
                    mb_pi_olds, mb_v_olds = self.policy_old(mb_obs)
                    mb_pi_olds, mb_v_olds = mb_pi_olds.detach(), mb_v_olds.detach()

                    losses = self.objective(self.clip_func(progress),
                                            mb_pis, mb_vs, mb_pi_olds, mb_v_olds,
                                            mb_actions, mb_advantages, mb_returns)
                    policy_loss, value_loss, entropy_loss = losses
                    loss = policy_loss + value_loss * self.value_coef + entropy_loss * self.entropy_coef

                    set_lr(self.optimizer, self.lr_func(progress))
                    self.optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm(self.policy.parameters(), self.max_grad_norm)
                    self.optimizer.step()

            self.taken_steps += steps
            print(self.taken_steps)

    def interact(self):
        """ Interacts with the environment

        Returns:
            obs (FloatTensor): observations shaped [T + 1 x N x ...]
            rewards (FloatTensor): rewards shaped [T x N x 1]
            masks (FloatTensor): continuation masks shaped [T x N x 1]
                zero at done timesteps, one otherwise
            actions (LongTensor): discrete actions shaped [T x N x 1]
            steps (int): total number of steps taken
        """
        N = self.num_workers
        T = self.worker_steps

        # TEMP needs to be generalized, does conv-specific transpose for PyTorch
        obs = torch.zeros(T + 1, N, 4, 84, 84)
        obs = cuda_if(obs, self.cuda)
        rewards = torch.zeros(T, N, 1)
        rewards = cuda_if(rewards, self.cuda)
        masks = torch.zeros(T, N, 1)
        masks = cuda_if(masks, self.cuda)
        actions = torch.zeros(T, N, 1).long()
        actions = cuda_if(actions, self.cuda)

        for t in range(T):
            # interaction logic
            ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
            ob = Variable(ob / 255.)
            ob = cuda_if(ob, self.cuda)
            obs[t] = ob.data

            pi, v = self.policy(ob)
            # Gumbel-max sampling: adding -log(-log(u)) noise, u ~ Uniform(0, 1), to the
            # logits and taking the argmax draws an action from the softmax distribution.
            u = cuda_if(torch.rand(pi.size()), self.cuda)
            _, action = torch.max(pi.data - (-u.log()).log(), 1)
            action = action.unsqueeze(1)
            actions[t] = action

            self.last_ob, reward, done, _ = self.venv.step(action.cpu().numpy())
            reward = torch.from_numpy(reward).unsqueeze(1)
            rewards[t] = torch.clamp(reward, min=-1., max=1.)
            masks[t] = mask = torch.from_numpy((1. - done)).unsqueeze(1)

        ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
        ob = Variable(ob / 255.)
        ob = cuda_if(ob, self.cuda)
        obs[T] = ob.data

        steps = N * T

        return obs, rewards, masks, actions, steps

    def test(self):
        ob = self.test_env.reset()
        done = False
        ep_reward = 0
        last_action = np.array([-1])
        action_repeat = 0

        while not done:
            ob = np.array(ob)
            ob = torch.from_numpy(ob.transpose((2, 0, 1))).float().unsqueeze(0)
            ob = Variable(ob / 255., volatile=True)
            ob = cuda_if(ob, self.cuda)

            pi, v = self.policy(ob)
            _, action = torch.max(pi, dim=1)

            # abort after {self.test_repeat_max} discrete action repeats
            if action.data[0] == last_action.data[0]:
                action_repeat += 1
                if action_repeat == self.test_repeat_max:
                    return ep_reward
            else:
                action_repeat = 0
            last_action = action

            ob, reward, done, _ = self.test_env.step(action.data.cpu().numpy())
            ep_reward += reward

        return ep_reward


class PPOObjective(nn.Module):
    def forward(self, clip, pi, v, pi_old, v_old, action, advantage, returns):
        """ Computes PPO objectives

        Assumes discrete action space.

        Args:
            clip (float): probability ratio clipping range
            pi (Variable): discrete action logits, shaped [N x num_actions]
            v (Variable): value predictions, shaped [N x 1]
            pi_old (Variable): old discrete action logits, shaped [N x num_actions]
            v_old (Variable): old value predictions, shaped [N x 1]
            action (Variable): discrete actions, shaped [N x 1]
            advantage (Variable): action advantages, shaped [N x 1]
            returns (Variable): discounted returns, shaped [N x 1]

        Returns:
            policy_loss (Variable): policy surrogate loss, shaped [1]
            value_loss (Variable): value loss, shaped [1]
            entropy_loss (Variable): entropy loss, shaped [1]
        """
        prob = Fnn.softmax(pi, dim=1)
        log_prob = Fnn.log_softmax(pi, dim=1)
        action_prob = prob.gather(1, action)

        prob_old = Fnn.softmax(pi_old, dim=1)
        action_prob_old = prob_old.gather(1, action)

        ratio = action_prob / (action_prob_old + 1e-10)

        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)

        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, min=1. - clip, max=1. + clip) * advantage

        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = (.5 * (v - returns) ** 2.).mean()
        entropy_loss = (prob * log_prob).sum(1).mean()

        return policy_loss, value_loss, entropy_loss
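

# A minimal smoke test of PPOObjective, separate from the training loop above:
# it feeds random logits, values, actions, advantages and returns through the
# objective and prints the three losses. The batch size, action count and clip
# value below are arbitrary example choices, not values used elsewhere in this file.
if __name__ == '__main__':
    N, num_actions = 8, 4
    objective = PPOObjective()

    pi = Variable(torch.randn(N, num_actions))
    pi_old = Variable(pi.data + .01 * torch.randn(N, num_actions))
    v = Variable(torch.randn(N, 1))
    v_old = Variable(v.data.clone())
    action = Variable(torch.multinomial(Fnn.softmax(pi, dim=1).data, 1))
    advantage = Variable(torch.randn(N, 1))
    returns = Variable(torch.randn(N, 1))

    policy_loss, value_loss, entropy_loss = objective(
        .2, pi, v, pi_old, v_old, action, advantage, returns)
    print(float(policy_loss.data), float(value_loss.data), float(entropy_loss.data))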