In [5]:
!apt-get install python-opengl -y > /dev/null
!apt install xvfb -y > /dev/null
!pip install pyvirtualdisplay > /dev/null
!pip install piglet > /dev/null



Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package xdpyinfo1


In [8]:
!apt-get install xdotool wmctrl

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libxdo3
The following NEW packages will be installed:
  libxdo3 wmctrl xdotool
0 upgraded, 3 newly installed, 0 to remove and 16 not upgraded.
Need to get 81.5 kB of archives.
After this operation, 258 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 wmctrl amd64 1.07-7build1 [20.0 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libxdo3 amd64 1:3.20160805.1-3 [20.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 xdotool amd64 1:3.20160805.1-3 [41.0 kB]
Fetched 81.5 kB in 0s (581 kB/s)
Selecting previously unselected package wmctrl.
(Reading database ... 133304 files and directories currently installed.)
Preparing to un

In [9]:
import argparse
import gym
import numpy as np
from itertools import count
from pyvirtualdisplay import Display
Display().start()
from IPython import display
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
%matplotlib inline

gamma = 0.99
seed = 543
log_interval = 10

env = gym.make('CartPole-v0')
env.reset()
env.seed(seed)
torch.manual_seed(seed)


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(128, 2)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        x = self.dropout(x)
        x = F.relu(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)


policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()


def finish_episode():
    R = 0
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]


def main():
    ng_reward = 10
    img = plt.show(env.render('rgb-array'))
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            state, reward, done, _ = env.step(action)
            
            img.set_data(env.render(mode='rgb_array'))
            display.display(plt.gcf())
            display.clear_output(wait=True)
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break
            
main()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


AttributeError: ignored