Policy gradients for Classic Control problems #26

Merged
4 commits merged on Dec 28, 2019
4 changes: 2 additions & 2 deletions ddqn/atari/play-atari.py
@@ -13,8 +13,8 @@
 import skvideo.io
 from skvideo.io import FFmpegWriter as vid_writer

-from dqn.atari.environment import AtariEnvironment
-from dqn.atari.agent import AgentOfAtari
+from ddqn.atari.environment import AtariEnvironment
+from ddqn.atari.agent import AgentOfAtari
 from utils.helpers import read_yaml, get_logger

Empty file.
115 changes: 115 additions & 0 deletions policy_gradients/classic_control/agent-of-control.py
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import shutil
import argparse

import tqdm
import torch
import numpy as np

from policy_gradients.classic_control.environment import ClassicControlEnvironment
from policy_gradients.classic_control.agent import AgentOfControl
from utils.helpers import read_yaml, get_logger


logger = get_logger(__file__)


def train_agent_of_control(config_file, device='gpu'):

    cuda_available = torch.cuda.is_available()
    cuda_and_device = cuda_available and device == 'gpu'

    if cuda_and_device:
        logger.info('Running CUDA benchmarks, GPU(s) device available')
    else:
        logger.info('Running on CPU(s)')

    device = torch.device('cuda' if cuda_and_device else 'cpu')

    cfgs = read_yaml(config_file)

    train_cfgs = cfgs['train']
    save_model = train_cfgs['save_model']
    model_dest = train_cfgs['model_dest']
    train_eps = train_cfgs['n_train_episodes']
    max_steps = train_cfgs['max_steps']
    env_solved = train_cfgs['env_solution']

    env = ClassicControlEnvironment(cfgs['env'])
    agent = AgentOfControl(cfgs['agent'], action_size=env.action_size,
                           device=device)

    os.makedirs(model_dest, exist_ok=True)
    shutil.copy(config_file, model_dest)

    assert env.action_size == agent.action_size, \
        "Environment and agent action size should match"

    train_ep = tqdm.tqdm(range(train_eps), ascii=True, unit='ep', leave=True)

    ep_reward = 0
    running_reward = 10

    for ep in train_ep:

        agent.reset()
        state = env.reset()

        agent.append_state(state)

        train_step = tqdm.tqdm(range(max_steps), ascii=True,
                               unit='stp', leave=False)

        for step in train_step:

            state = agent.get_state()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.append_reward(reward)
            agent.append_state(next_state)

            if done:

                # exponential moving average of episode returns (alpha = 0.05)
                running_reward = 0.05 * agent.get_episode_rewards() + \
                    (1 - 0.05) * running_reward

                agent.append_episode_reward(running_reward)

                agent.discount_episode()
                agent.flash_episode()
                state = env.reset()
                agent.append_state(state)

        loss = agent.optimize()

        mean_rewards = np.mean(agent.batch_rewards)
        train_ep.set_description('Average reward: {:.3f}'.format(mean_rewards))

        if ep % save_model == 0:
            agent.save_model('{0:09d}'.format(ep * max_steps), model_dest)

        best_reward = np.max(agent.batch_rewards)
        if best_reward >= env_solved:
            logger.info('Solved! At episode {}'
                        ' reward {:.3f} >= {:.3f}'.format(ep, best_reward,
                                                          env_solved))
            break

    agent.save_model('final', model_dest)


if __name__ == '__main__':

    parser = argparse.ArgumentParser('Train an RL Agent to solve Classic '
                                     'Control problems (with PG)')
    parser.add_argument('-x', dest='config_file', type=str,
                        help='Config for the Classic Control env/agent',
                        required=True)
    parser.add_argument('-d', dest='device', choices=['gpu', 'cpu'],
                        help='Device to run the train/test', default='gpu')

    args = parser.parse_args()

    train_agent_of_control(args.config_file, device=args.device)
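
For reference, a minimal usage sketch of the training entry point added above. The paths are assumed from this PR's layout; run it from the repository root so that the policy_gradients and utils packages are importable.

import subprocess

# launch a short CPU training run with the config added in this PR
subprocess.run([
    'python', 'policy_gradients/classic_control/agent-of-control.py',
    '-x', 'policy_gradients/classic_control/configs/control.yaml',
    '-d', 'cpu',
], check=True)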
184 changes: 184 additions & 0 deletions policy_gradients/classic_control/agent.py
@@ -0,0 +1,184 @@
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#dqn-algorithm
import os
import sys
import math
from collections import deque, namedtuple

import torch
import random
import numpy as np
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torchvision.transforms import Compose, CenterCrop, \
    Grayscale, Resize, ToPILImage, ToTensor


from utils.helpers import get_logger

logger = get_logger(__file__)


class ControlNet(torch.nn.Module):

    def __init__(self, input_shape, state_size, action_size, lr, device):

        super(ControlNet, self).__init__()

        self.lr = lr
        self.device = device
        self.input_shape = input_shape
        self.state_size = state_size
        self.action_size = action_size

        self.fc1 = nn.Linear(input_shape[0] * state_size, 128)
        self.action = nn.Linear(128, action_size)
        self.value = nn.Linear(128, 1)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):

        x = x.to(self.device).float()

        x = F.relu(self.fc1(x))
        q = self.action(x)
        v = self.value(x)
        aprob = self.softmax(q)

        # softmax action probabilities and a state-value estimate
        return aprob, v


class AgentOfControl():

    def __init__(self, cfgs, action_size=None, device=None, model_file=None):

        self.history = None
        self.rewards = None
        self.values = None
        self.aprobs = None
        self.log_probs = None
        self.batch_loss = []
        self.batch_rewards = []
        self.input_shape = cfgs['input_shape']
        self.lr = cfgs['lr']
        self.gamma = cfgs['gamma']
        self.state_size = cfgs['state_size']
        self.action_size = action_size
        self.device = device

        assert self.input_shape is not None, 'Input shape has to be not None'
        assert self.action_size is not None, 'Action size has to be not None'
        assert self.device is not None, 'Device has to be CPU/GPU'

        self.zero_state = torch.zeros([1] + self.input_shape, dtype=torch.float)

        self.policy = ControlNet(self.input_shape, self.state_size,
                                 self.action_size, self.lr,
                                 self.device).to(self.device)

        self.optimizer = optim.Adam(self.policy.parameters(), lr=self.lr)

        if model_file:
            self.load_model(model_file)

    def reset(self):

        self.batch_loss = []
        self.batch_rewards = []

        self.flash_episode()

    def flash_episode(self):

        self.rewards = []
        self.log_probs = []
        self.values = []
        self.aprobs = []

        no_history = [self.zero_state for _ in range(self.state_size)]
        self.history = deque(no_history, maxlen=self.state_size)

    def load_model(self, model_file):

        logger.info('Loading agent weights from {}'.format(model_file))

        self.policy.load_state_dict(torch.load(model_file))
        self.policy.eval()

    def get_action(self, state):

        aprob, value = self.policy(state)

        c = Categorical(aprob)
        a = c.sample()

        log_prob = c.log_prob(a)

        self.log_probs.append(log_prob)
        self.values.append(value[0])
        self.aprobs.append(aprob[0])

        return a.detach().cpu().numpy()[0]

    def append_state(self, state):

        state = torch.from_numpy(state)

        self.history.append(state)

    def get_state(self):

        return torch.cat(list(self.history)).unsqueeze(0)

    def append_reward(self, r):

        self.rewards.append(r)

    def get_episode_rewards(self):

        return np.sum(self.rewards)

    def discount_episode(self):

        ep_length = len(self.rewards)
        gamma = torch.tensor(self.gamma)
        ep_rewards = torch.tensor(self.rewards, dtype=torch.float32)
        ep_discounts = torch.tensor([gamma ** t for t in range(ep_length)],
                                    dtype=torch.float32)

        # discounted return for each step t: G_t = sum_k gamma^k * r_{t+k}
        rewards = [ep_rewards[idx:] * ep_discounts[:ep_length - idx]
                   for idx in range(ep_length)]

        rewards = list(map(torch.sum, rewards))
        rewards = torch.stack(rewards).to(self.device)

        # normalise returns to zero mean and unit variance
        mean, std = rewards.mean(), rewards.std()
        rewards = (rewards - mean) / (std + np.finfo(np.float32).eps.item())

        # REINFORCE loss terms: -log pi(a_t | s_t) * G_t
        neg_log_probs = torch.cat(self.log_probs).mul(-1)
        policy_loss = neg_log_probs * rewards

        self.batch_loss.append(policy_loss)

    def append_episode_reward(self, ep_reward):

        self.batch_rewards.append(ep_reward)

    def optimize(self):

        self.optimizer.zero_grad()
        batch_policy_loss = torch.cat(self.batch_loss).mean()
        batch_policy_loss.backward()
        self.optimizer.step()

        return batch_policy_loss.detach().cpu().numpy()

    def save_model(self, step, dest):

        model_savefile = '{0}/control-agent-{1}.pth'.format(dest, step)
        logger.debug("Saving agent weights to {}".format(model_savefile))

        torch.save(self.policy.state_dict(), model_savefile)
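
For reference, discount_episode() above computes a discounted return for every step of the episode and normalises the returns before weighting the negative log-probabilities (a REINFORCE-style loss). A standalone sketch with a toy four-step episode; the rewards and gamma below are illustrative, not taken from this PR.

import torch

gamma = 0.99
rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])
discounts = torch.tensor([gamma ** t for t in range(len(rewards))])

# per-step discounted return: G_t = sum_k gamma^k * r_{t+k}
returns = torch.stack([
    (rewards[t:] * discounts[:len(rewards) - t]).sum()
    for t in range(len(rewards))
])
print(returns)  # tensor([3.9404, 2.9701, 1.9900, 1.0000])

# normalisation applied before forming -log pi(a_t | s_t) * G_t
returns = (returns - returns.mean()) / (returns.std() + 1e-8)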
37 changes: 37 additions & 0 deletions policy_gradients/classic_control/configs/control.yaml
@@ -0,0 +1,37 @@
# Environment config
env:
  # Classic control env name
  env_name : 'CartPole-v0'
  # seed
  seed: 543

# Agent config
agent:
  # Learning rate for the agent
  lr : 0.01
  # Bellman equation reward discount
  gamma : 0.99
  # input shape
  input_shape : [4]
  # state size
  state_size: 1

train:
  # Number of training episodes
  n_train_episodes : 1000
  # Max steps in each episode
  max_steps : 1000
  # model location
  model_dest: /data/experiments/agent-of-control/25-12-2019-CartPole-v0-pg
  # save the model every save_model episodes
  save_model: 20
  # reward threshold at which the environment counts as solved
  env_solution: 195.0

test:
  # Number of testing episodes
  n_test_episodes : 1
  # Max steps in each episode
  max_steps : 10000
  # path where to save played video
  state_dest: /data/experiments/agent-of-control/25-12-2019-CartPole-v0-pg/states
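
A short sketch of how the training script consumes this config; read_yaml is the existing helper in utils.helpers, and the path below is the config file added in this PR.

from utils.helpers import read_yaml

cfgs = read_yaml('policy_gradients/classic_control/configs/control.yaml')

# the env/agent/train sections map directly onto ClassicControlEnvironment,
# AgentOfControl and the training loop in agent-of-control.py
env_cfg, agent_cfg, train_cfg = cfgs['env'], cfgs['agent'], cfgs['train']
print(train_cfg['env_solution'])  # 195.0, the usual CartPole-v0 solve threshold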