# -*- coding: utf-8 -*-
"""REINFORCE agent for episodic tasks in OpenAI Gym.

- Author: Curt Park
- Contact: curt.park@medipixel.io
"""

import argparse
import os
from collections import deque
from typing import Deque, Tuple

import gym
import numpy as np
import torch
import torch.nn.functional as F
import wandb

from algorithms.common.abstract.agent import AbstractAgent

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Agent(AbstractAgent):
    """REINFORCE agent interacting with the environment.

    Attributes:
        actor (nn.Module): policy model to select actions
        baseline (nn.Module): baseline model to estimate state values
        hyper_params (dict): hyper-parameters
        actor_optimizer (Optimizer): optimizer for the actor
        baseline_optimizer (Optimizer): optimizer for the baseline
        log_prob_sequence (list): log probabilities of an episode
        predicted_value_sequence (list): predicted values of an episode
        reward_sequence (list): rewards of an episode, used to compute returns

    """
    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        hyper_params: dict,
        models: tuple,
        optims: tuple,
    ):
        """Initialization.

        Args:
            env (gym.Env): openAI Gym environment
            args (argparse.Namespace): arguments including hyperparameters and training settings
            hyper_params (dict): hyper-parameters
            models (tuple): models including actor and baseline
            optims (tuple): optimizers for actor and baseline

        """
        AbstractAgent.__init__(self, env, args)

        self.actor, self.baseline = models
        self.actor_optimizer, self.baseline_optimizer = optims
        self.hyper_params = hyper_params

        self.log_prob_sequence: list = []
        self.predicted_value_sequence: list = []
        self.reward_sequence: list = []

        # load stored parameters
        if args.load_from is not None and os.path.exists(args.load_from):
            self.load_params(args.load_from)
    def select_action(self, state: np.ndarray) -> torch.Tensor:
        """Select an action from the input state."""
        state = torch.FloatTensor(state).to(device)
        selected_action, dist = self.actor(state)
        predicted_value = self.baseline(state)

        # store the log probability and the baseline's value estimate so that
        # update_model can compute the losses at the end of the episode
        self.log_prob_sequence.append(dist.log_prob(selected_action).sum(dim=-1))
        self.predicted_value_sequence.append(predicted_value)

        return selected_action
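    # Note on select_action: for a policy over a multi-dimensional action,
    # dist.log_prob(a) returns one log-probability per action dimension, and
    # .sum(dim=-1) combines them into the log-probability of the joint action.
    # A minimal sketch with illustrative values, assuming a diagonal Normal
    # distribution as the policy (an assumption, not defined in this file):
    #
    #   dist = torch.distributions.Normal(torch.zeros(2), torch.ones(2))
    #   a = dist.sample()                      # shape (2,)
    #   log_pi = dist.log_prob(a).sum(dim=-1)  # scalar log pi(a|s)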
    def step(self, action: torch.Tensor) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        action = action.detach().cpu().numpy()
        next_state, reward, done, _ = self.env.step(action)

        # store rewards to calculate return values
        self.reward_sequence.append(reward)

        return next_state, reward, done
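    # Note on step: the code above assumes the classic gym API, in which
    # env.step returns a 4-tuple (next_state, reward, done, info). Under the
    # newer gymnasium API the call returns five values, so the unpacking
    # would need to look roughly like this (a sketch, not used here):
    #
    #   next_state, reward, terminated, truncated, _ = self.env.step(action)
    #   done = terminated or truncated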
    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Train the model after each episode."""
        return_value = 0  # initial return value
        return_sequence: Deque = deque()

        # calculate the discounted return at each step,
        # iterating backwards through the episode
        for i in range(len(self.reward_sequence) - 1, -1, -1):
            return_value = (
                self.reward_sequence[i] + self.hyper_params["GAMMA"] * return_value
            )
            return_sequence.appendleft(return_value)
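        # Worked example (illustrative values, not executed): with GAMMA = 0.9
        # and reward_sequence = [1.0, 1.0, 1.0], the backward pass gives
        #   G_2 = 1.0
        #   G_1 = 1.0 + 0.9 * 1.0 = 1.9
        #   G_0 = 1.0 + 0.9 * 1.9 = 2.71
        # so return_sequence ends up as deque([2.71, 1.9, 1.0]).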
        # standardize returns for better stability; the small epsilon avoids
        # division by zero when all returns in an episode are identical
        return_sequence_tensor = torch.Tensor(return_sequence).to(device)
        return_sequence_tensor = (
            return_sequence_tensor - return_sequence_tensor.mean()
        ) / (return_sequence_tensor.std() + 1e-7)
        # calculate the policy loss and the value loss at each step
        policy_loss_sequence = []
        value_loss_sequence = []
        for log_prob, return_value, predicted_value in zip(
            self.log_prob_sequence,
            return_sequence_tensor,
            self.predicted_value_sequence,
        ):
            # advantage: actual return minus the baseline's estimate, detached
            # so the policy loss does not backpropagate into the baseline
            delta = return_value - predicted_value.detach()
            policy_loss = -delta * log_prob
            value_loss = F.mse_loss(predicted_value, return_value)
            policy_loss_sequence.append(policy_loss)
            value_loss_sequence.append(value_loss)
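        # In equation form this is REINFORCE with a learned baseline:
        #   L_policy = -(G_t - V(s_t)) * log pi(a_t | s_t)
        #   L_value  = (V(s_t) - G_t)^2
        # The baseline value is detached in the advantage, so the policy loss
        # trains only the actor, while the MSE term trains the baseline.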
        # train the actor on the mean policy loss
        self.actor_optimizer.zero_grad()
        policy_loss = torch.stack(policy_loss_sequence).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # train the baseline on the mean value loss
        self.baseline_optimizer.zero_grad()
        value_loss = torch.stack(value_loss_sequence).mean()
        value_loss.backward()
        self.baseline_optimizer.step()

        # clear the episode buffers
        self.log_prob_sequence.clear()
        self.predicted_value_sequence.clear()
        self.reward_sequence.clear()

        return policy_loss.data, value_loss.data
    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        if not os.path.exists(path):
            print("[ERROR] the input path does not exist. ->", path)
            return

        params = torch.load(path)
        self.actor.load_state_dict(params["actor_state_dict"])
        self.baseline.load_state_dict(params["baseline_state_dict"])
        self.actor_optimizer.load_state_dict(params["actor_optim_state_dict"])
        self.baseline_optimizer.load_state_dict(params["baseline_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)
    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "actor_state_dict": self.actor.state_dict(),
            "baseline_state_dict": self.baseline.state_dict(),
            "actor_optim_state_dict": self.actor_optimizer.state_dict(),
            "baseline_optim_state_dict": self.baseline_optimizer.state_dict(),
        }
        AbstractAgent.save_params(self, params, n_episode)
    def write_log(self, i: int, score: float, policy_loss: float, value_loss: float):
        """Write log about loss and score."""
        total_loss = policy_loss + value_loss

        print(
            "[INFO] episode %d\ttotal score: %d\ttotal loss: %f\n"
            "policy loss: %f\tvalue loss: %f\n"
            % (i, score, total_loss, policy_loss, value_loss)
        )

        if self.args.log:
            wandb.log(
                {
                    "total loss": total_loss,
                    "policy loss": policy_loss,
                    "value loss": value_loss,
                    "score": score,
                }
            )
    def train(self):
        """Run the agent."""
        # logger
        if self.args.log:
            wandb.init()
            wandb.config.update(self.hyper_params)
            wandb.watch([self.actor, self.baseline], log="parameters")

        for i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0

            while not done:
                if self.args.render and i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done = self.step(action)

                state = next_state
                score += reward

            # the model is updated once per episode,
            # after the full trajectory has been collected
            policy_loss, value_loss = self.update_model()

            # logging
            self.write_log(i_episode, score, policy_loss, value_loss)
            if i_episode % self.args.save_period == 0:
                self.save_params(i_episode)

        # termination
        self.env.close()
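
# A minimal usage sketch (illustrative only): the environment choice, the
# model factory, the optimizers, and the argparse namespace below are
# assumptions for demonstration and are not defined in this file.
#
#   env = gym.make("Pendulum-v0")  # any continuous-action episodic task
#   actor, baseline = build_models(env)  # hypothetical model factory
#   actor_optim = torch.optim.Adam(actor.parameters(), lr=1e-3)
#   baseline_optim = torch.optim.Adam(baseline.parameters(), lr=1e-3)
#
#   agent = Agent(
#       env,
#       args,  # parsed argparse.Namespace with log, render, episode_num, ...
#       hyper_params={"GAMMA": 0.99},
#       models=(actor, baseline),
#       optims=(actor_optim, baseline_optim),
#   )
#   agent.train()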