In [7]:
import gym
from actor import Actor
from critic import Critic

In [8]:
# --- Training length -------------------------------------------------------
MAX_EPISODE = 1000    # number of episodes the training loop iterates over
MAX_EP_STEPS = 200    # per-episode step budget checked by the training loop

# --- Optimiser settings ----------------------------------------------------
LR_A = 1e-3           # learning rate handed to the Actor
LR_C = 1e-2           # learning rate handed to the Critic

# --- Visualisation ---------------------------------------------------------
RENDER = False                    # when True the loop calls env.render() each step
DISPLAY_REWARD_THRESHOLD = -100   # smoothed reward above which RENDER is switched on

In [9]:
# Build the Pendulum task and keep its `.unwrapped` environment rather than
# the object gym.make() returns.
env = gym.make('Pendulum-v0').unwrapped

In [10]:
# Problem dimensions read off the environment's spaces.
A_BOUND = env.action_space.high            # upper bound of the action space (as reported by gym)
N_S = env.observation_space.shape[0]       # size of the observation's first axis -> network input width

In [11]:
# A_BOUND comes from `env.action_space.high`, which for this task is a
# one-element array. Index the element explicitly: calling float() directly on
# a non-0-d ndarray is deprecated since NumPy 1.25.
# The action range handed to the actor is symmetric: [-bound, +bound].
actor = Actor(
	n_features=N_S,
	lr=LR_A,
	action_bound=[-float(A_BOUND[0]), float(A_BOUND[0])],
)
# The critic only needs the observation width and its own learning rate.
critic = Critic(n_features=N_S, lr=LR_C)

In [13]:
# Actor-critic training loop.
#
# For every episode: reset the environment, run exactly MAX_EP_STEPS
# interaction steps (the `done` flag returned by env.step is ignored, episodes
# are cut off purely by step count), and update the critic and the actor after
# each step. After the episode an exponentially smoothed reward is printed.

# Smoothed episode reward. It is reset explicitly each time this cell runs so
# that a re-run does not silently continue from a stale value left in the
# kernel (the previous `'running_reward' not in globals()` check did).
running_reward = None

try:
	for i_episode in range(MAX_EPISODE):
		s = env.reset()
		ep_rs = []   # scaled rewards collected during this episode

		# Bounded loop: exactly MAX_EP_STEPS steps per episode (the former
		# `t > MAX_EP_STEPS` test let one extra step through).
		for t in range(MAX_EP_STEPS):
			if RENDER: env.render()
			a = actor.choose_action(s)

			s_, r, done, info = env.step([a])
			r /= 10   # scale the reward down by a factor of 10 before learning

			td_error = critic.learn(s, r, s_)   # gradient = grad[r + gamma * V(s_) - V(s)]
			actor.learn(s, a, td_error)   # gradient = grad[logPi(s, a) * td_error]

			s = s_
			ep_rs.append(r)

		# End-of-episode bookkeeping: first episode seeds the average, later
		# episodes blend in with weight 0.1.
		ep_rs_sum = sum(ep_rs)
		if running_reward is None:
			running_reward = ep_rs_sum
		else:
			running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
		# NOTE: this flips the module-level RENDER flag, so rendering stays on
		# for the rest of the run (and for later re-runs in the same kernel).
		if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True
		print('episode: ', i_episode, '  reward:', int(running_reward))
finally:
	# Always release the environment, even if training is interrupted or raises.
	env.close()

episode:  0   reward: -115
episode:  1   reward: -120
episode:  2   reward: -119
episode:  3   reward: -121
episode:  4   reward: -119
episode:  5   reward: -120
episode:  6   reward: -118
episode:  7   reward: -121
episode:  8   reward: -119
episode:  9   reward: -122
episode:  10   reward: -120
episode:  11   reward: -118
episode:  12   reward: -122
episode:  13   reward: -125
episode:  14   reward: -126
episode:  15   reward: -129
episode:  16   reward: -129
episode:  17   reward: -134
episode:  18   reward: -130
episode:  19   reward: -129
episode:  20   reward: -125
episode:  21   reward: -127
episode:  22   reward: -123
episode:  23   reward: -126
episode:  24   reward: -130
episode:  25   reward: -126
episode:  26   reward: -128
episode:  27   reward: -128
episode:  28   reward: -129
episode:  29   reward: -129
episode:  30   reward: -133
episode:  31   reward: -136
episode:  32   reward: -132
episode:  33   reward: -133
episode:  34   reward: -131
episode:  35   reward: -128
ep