In [None]:
import torch
import ray
import gym
from IPython import display
import ray.rllib.agents.ppo as ppo
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Report whether PyTorch can see a CUDA device on this machine.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
from ray.rllib.agents import ppo
from ray import tune

# Start from RLlib's PPO defaults; the hyperparameter search is layered on top.
config = ppo.DEFAULT_CONFIG.copy()
#Edit default config to do hyperparameter search
config['framework'] = 'torch'
# NOTE: grid_search() yields a Tune placeholder, not a number. It is only
# resolved into concrete values when this dict is handed to tune.run().
config['lr'] = tune.grid_search([0.01, 0.001, 0.0001])

# A trainer constructed directly (outside of Tune) needs a concrete learning
# rate, so the default network is inspected with a copy of the config that
# falls back to PPO's default lr.
ray.init(ignore_reinit_error=True)
inspect_config = dict(config, lr=ppo.DEFAULT_CONFIG['lr'])
trainer = ppo.PPOTrainer(env='Breakout-v0', config=inspect_config)
policy = trainer.get_policy()
# PPO policies expose their network as `.model`; `.q_model` only exists on
# DQN-style policies.
model = policy.model
print(model)

In [None]:
import logging

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_torch

# Let RLlib resolve the PyTorch modules. This rebinds the `torch` name that the
# first cell imported and provides `nn` for the model class defined below.
torch, nn = try_import_torch()

# Module-level logger (not referenced by the cells visible in this notebook).
logger = logging.getLogger(__name__)


class ConvNet(TorchModelV2, nn.Module):
    """Convolutional policy/value network for channel-last image observations.

    The trunk is three conv -> ReLU -> max-pool stages. Two 12x12 convolution
    heads sit on top of it: the last layer of ``_conv_layers`` produces the
    ``num_outputs`` policy outputs, and ``_value_branch`` produces the scalar
    state value that PPO requests through ``value_function()``.

    Each pooling stage maps a spatial size H to floor(H / 2) + 1, so an 84x84
    input reaches the heads as a 12x12 map (84 -> 43 -> 22 -> 12) and the
    12x12 kernels reduce it to 1x1. Other input sizes will not reshape
    cleanly in ``forward``.

    Args:
        obs_space: Observation space; its last dimension is used as the
            number of input channels.
        action_space: Action space, forwarded to ``TorchModelV2``.
        num_outputs: Number of policy outputs produced per observation.
        model_config: Model config dict, forwarded to ``TorchModelV2``.
        name: Model name, forwarded to ``TorchModelV2``.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        in_channels = obs_space.shape[-1]
        self._conv_layers = nn.Sequential(
            torch.nn.Conv2d(in_channels, 8, kernel_size=[7,7], padding=3),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=2, padding=1),
            torch.nn.Conv2d(8, 16, kernel_size=[5,5], padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=2, padding=1),
            torch.nn.Conv2d(16, 32, kernel_size=[3,3], padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=2, padding=1),
            torch.nn.Conv2d(32, num_outputs, kernel_size=[12,12])
        )
        # Value head: mirrors the policy head's kernel size so it collapses
        # the same trunk feature map to a single scalar per observation.
        self._value_branch = torch.nn.Conv2d(32, 1, kernel_size=[12,12])
        # Trunk activations of the most recent forward pass; consumed by
        # value_function().
        self._features = None
        self._num_outputs = num_outputs

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute policy outputs for a batch of observations.

        Args:
            input_dict: Dict whose ``"obs"`` entry is the observation batch,
                laid out channel-last (it is permuted to channel-first here).
            state: Passed through unchanged.
            seq_lens: Unused.

        Returns:
            Tuple of the policy outputs reshaped to ``(-1, num_outputs)`` and
            the unchanged ``state``.
        """
        obs = input_dict["obs"].float().permute(0,3,1,2) #reshape input
        # Run the shared trunk (everything but the final policy conv) once and
        # cache it so value_function() can reuse it without a second pass.
        self._features = self._conv_layers[:-1](obs)
        logits = self._conv_layers[-1](self._features)
        return logits.view(-1, self._num_outputs), state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate for the batch last passed to forward().

        Returns:
            Tensor flattened to one dimension (one value per observation when
            the value head reduces the feature map to 1x1).
        """
        assert self._features is not None, "must call forward() first"
        return self._value_branch(self._features).view(-1)


In [None]:
import torch.nn as nn

import ray
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

# Restart Ray so that re-running this cell always starts from a clean cluster.
ray.shutdown()
ray.init()

# Register the custom model once Ray is up so the registration is stored
# in the running cluster rather than queued for a later flush.
ModelCatalog.register_custom_model("ConvNet", ConvNet)

config["env"] = "Breakout-v0"
# Must not exceed the "gpu" entry of resources_per_trial passed to tune.run(),
# otherwise the trainer asks for more GPUs than the trial was granted.
config["num_gpus"] = 1
config["model"] = {
        "custom_model": "ConvNet",
        # Extra kwargs to be passed to your model's c'tor.
        "custom_model_config": {},
        # PPO has no top-level "hiddens" option (that key belongs to DQN and
        # is rejected as an unknown config parameter); hidden sizes live in
        # the model config. ConvNet itself does not read this value.
        "fcnet_hiddens": [256, 256],
    }

def train_ppo(config):
    """Tune function-trainable that trains a PPO agent until Tune stops it.

    Every iteration's result dict is reported to Tune so that the ``stop``
    criteria given to ``tune.run`` can be evaluated. The mean episode reward
    is printed every 10 iterations and a checkpoint is saved every 100.

    Args:
        config: Trainer config dict as resolved by Tune. If it carries no
            "env" entry, "Breakout-v0" is used.
    """
    # Imported locally so the function is self-contained when Tune executes
    # it inside a trial process.
    from ray import tune

    agent = ppo.PPOTrainer(config=config,
                           env=config.get("env") or "Breakout-v0")
    phase = 0

    def _set_phase(env):
        # Only curriculum-style envs implement set_phase(); plain Gym
        # environments are left untouched instead of raising AttributeError.
        if hasattr(env, "set_phase"):
            env.set_phase(phase)

    agent.workers.foreach_worker(lambda ev: ev.foreach_env(_set_phase))
    i = 0
    try:
        while True:
            result = agent.train()
            if i % 10 == 0:
                # Reuse this iteration's result rather than training again.
                print(result['episode_reward_mean'])
            if i % 100 == 0:
                checkpoint = agent.save()
                print('checkpoint saved at', checkpoint)
            # Without reporting, Tune never sees training_iteration advance
            # and the stop criterion can never trigger.
            tune.report(**result)
            i+=1
    finally:
        # Release rollout workers even if training is interrupted.
        agent.stop()
        
# tune.run() allows setting a custom log directory (other than ``~/ray-results``)
# and automatically saving the trained agent
trainingSteps = 1000000
# Directory that receives the Tune trial logs; relative to the notebook's
# working directory. Change it here to store results elsewhere.
log_dir = "./ray_results"
analysis = ray.tune.run(
    train_ppo,
    config=config,
    resources_per_trial={
            "cpu": 7,
            "gpu": 1,
            "extra_cpu": 0,
        },
    local_dir=log_dir,
    stop={
        "training_iteration": trainingSteps,
    },
    checkpoint_at_end=True)

In [None]:
from IPython import display
# Set this to a checkpoint path printed during training
# ("checkpoint saved at ...") to watch the trained agent.
checkpoint_path = ""

# Build a plain PPO config for playback. The training `config` cannot be
# reused here because its learning rate is a Tune grid_search placeholder.
ppo_config = ppo.DEFAULT_CONFIG.copy()
ppo_config["framework"] = "torch"
ppo_config["env"] = "Breakout-v0"
ppo_config["model"] = {
    "custom_model": "ConvNet",
    "custom_model_config": {},
}
trainer = ppo.PPOTrainer(config=ppo_config)
if checkpoint_path:
    trainer.restore(checkpoint_path)
else:
    # restore("") would fail; make it explicit what is being rendered instead.
    print("checkpoint_path is empty: rendering the untrained policy")

env = trainer.workers.local_worker().env
episode_reward = 0
episode_rewards = []  # total reward of every episode finished during playback
done = False
obs = env.reset()
for i in range(500):
    action = trainer.compute_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward

    # Clear the figure first so frames do not pile up on the same axes.
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    display.display(plt.gcf())
    display.clear_output(wait=True)

    if done:
        # Stepping a finished episode is undefined; start a new one.
        episode_rewards.append(episode_reward)
        episode_reward = 0
        obs = env.reset()

print('finished episode rewards:', episode_rewards)