In [1]:
!apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg- dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
!pip install pyvirtualdisplay
!pip install piglet
!apt-get install xvfb

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Package libav-tools is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source
However the following packages replace it:
  ffmpeg

E: Unable to locate package libjpeg
E: Unable to locate package dev
E: Package 'libav-tools' has no installation candidate
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 11 not upgraded.


In [2]:
!pip install gym
!pip install gym[atari]
!pip install gym-retro
!pip install tqdm retrowrapper gym-retro
!pip install -U git+git://github.com/frenchie4111/dumbrain.git

Collecting git+git://github.com/frenchie4111/dumbrain.git
  Cloning git://github.com/frenchie4111/dumbrain.git to /tmp/pip-req-build-t2eaa14g
  Running command git clone -q git://github.com/frenchie4111/dumbrain.git /tmp/pip-req-build-t2eaa14g
Building wheels for collected packages: dumbrain
  Building wheel for dumbrain (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-epz6msm7/wheels/50/8e/6f/47c68c95113aa8c02ac02bde75673ace7c3d3636842c75fcb6
Successfully built dumbrain
Installing collected packages: dumbrain
  Found existing installation: dumbrain 0.1
    Uninstalling dumbrain-0.1:
      Successfully uninstalled dumbrain-0.1
Successfully installed dumbrain-0.1


In [3]:
# Baselines supplies the logger, FrameStack, SubprocVecEnv, AbstractEnvRunner and helper
# functions that the later cells import.
!pip install baselines



In [4]:
from pyvirtualdisplay import Display
# Start a non-visible 1400x900 virtual display; the recorded output of this cell shows it
# is backed by an Xvfb process.
# NOTE(review): the variable name `display` shadows IPython's built-in `display()` helper
# for the rest of the kernel session.
display = Display(visible=0, size=(1400, 900))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1013'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1013'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

Environment Setup

In [0]:
#@title
import numpy as np
import gym

from baselines import logger
from baselines.common.atari_wrappers import FrameStack

import cv2

# Custom observation wrapper to preprocess frames
class PreprocessFrame(gym.ObservationWrapper):
    """Observation wrapper that turns RGB frames into small single-channel images.

    Each frame is converted to grayscale, resized to ``width`` x ``height`` with
    area interpolation, and given a trailing channel axis so the result has
    shape ``(height, width, 1)``.

    Args:
        env: The environment to wrap.
        width: Output frame width in pixels (default 96, the original fixed value).
        height: Output frame height in pixels (default 96, the original fixed value).
    """

    def __init__(self, env, width=96, height=96):
        super(PreprocessFrame, self).__init__(env)
        self.width = width
        self.height = height
        # Advertise the post-processing observation shape to downstream wrappers/models.
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(self.height, self.width, 1),
            dtype=np.uint8,
        )

    def observation(self, frame):
        """Return the grayscale, resized version of ``frame`` with shape (height, width, 1)."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes the target size as (width, height), the reverse of array shape order.
        resized = cv2.resize(gray, (self.width, self.height), interpolation=cv2.INTER_AREA)
        # Re-add the channel axis that the grayscale conversion dropped.
        return resized[:, :, None]

# Scale rewards down to a more reasonable magnitude
class RewardScaler(gym.RewardWrapper):
    """Reward wrapper that multiplies every reward by a constant factor.

    Args:
        env: The environment to wrap.
        scale: Multiplicative factor applied to each reward (default 0.01, the
            original hard-coded value).
    """

    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        """Return ``reward`` multiplied by the configured scale."""
        return reward * self.scale
        
        
# Create environment
def make_env():
    """Build one wrapped "PongDeterministic-v0" environment.

    Wrapper order, innermost first: PreprocessFrame, RewardScaler, then a
    4-frame FrameStack.
    """
    preprocessed = RewardScaler(PreprocessFrame(gym.make("PongDeterministic-v0")))
    return FrameStack(preprocessed, 4)
    


Network Architecture

In [0]:
#@title
import numpy as np
import tensorflow as tf

from baselines.common.distributions import make_pdtype

def conv_layer(inputs, filters, kernel, strides, gain=1.0):
    """Create a ReLU 2-D convolution with an orthogonally initialised kernel.

    ``strides`` is a single integer applied to both spatial dimensions; ``gain``
    is forwarded to the orthogonal initializer.
    """
    initializer = tf.orthogonal_initializer(gain=gain)
    return tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel,
        strides=(strides, strides),
        activation=tf.nn.relu,
        kernel_initializer=initializer,
    )
    
def fc_layer(inputs, nodes, activation_fn=tf.nn.relu, gain=1.0):
    """Create a dense layer of ``nodes`` units with an orthogonally initialised kernel.

    Pass ``activation_fn=None`` for a linear output; ``gain`` is forwarded to
    the orthogonal initializer.
    """
    initializer = tf.orthogonal_initializer(gain)
    return tf.layers.dense(
        inputs=inputs,
        units=nodes,
        activation=activation_fn,
        kernel_initializer=initializer,
    )
    
### Neural Network Class
# init: constructs a convolutional network with actor and critic streams
# step: given a state, return recommended action and value of that state
# value: given a state, return value of that state
# select_action: given a state, return action of that state

class A2CNetwork(object):
    """Convolutional actor-critic network with a shared trunk and two heads.

    The constructor builds the whole graph under the "model" variable scope and
    exposes three session-bound callables as attributes: ``step``, ``value``
    and ``select_action``.
    """

    def __init__(self, sess, observation_space, action_space, batches, steps, reuse = False):
        """Build the graph and bind the inference helpers to ``sess``.

        Args:
            sess: Session used by ``step`` / ``value`` / ``select_action``.
            observation_space: Space whose ``shape`` unpacks to (height, width, channel).
            action_space: Space passed to ``make_pdtype`` to choose the action distribution.
            batches: Not referenced anywhere in this constructor.
            steps: Not referenced anywhere in this constructor.
            reuse: Forwarded to ``tf.variable_scope("model", ...)`` so a second
                network can share the variables created by the first.
        """
        # Gain used for the orthogonal initializers of the trunk layers.
        gain = np.sqrt(2)
        
        self.pdtype = make_pdtype(action_space)
        
        height, width, channel = observation_space.shape
        observation_shape = (height, width, channel)

        # Batch of observations; the leading dimension is left unspecified.
        inputs = tf.placeholder(tf.float32, [None, *observation_shape], name="input")
        
        # Divide by 255 (the cast is a no-op because the placeholder is already float32).
        scaled_images = tf.cast(inputs, tf.float32) / 255.
        
        with tf.variable_scope("model", reuse = reuse):
            # Arguments are (filters, kernel size, stride): 32/8/4, 64/4/2, 64/3/1.
            conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
            conv2 = conv_layer(conv1, 64, 4, 2, gain)
            conv3 = conv_layer(conv2, 64, 3, 1, gain)
            conv_flatten = tf.layers.flatten(conv3)
            # 512-unit dense layer feeding both the policy and the value head.
            common_fc = fc_layer(conv_flatten, 512, gain=gain)
            
            # self.pi - logits, self.pd - prob distribution
            self.pd, self.pi = self.pdtype.pdfromlatent(common_fc, init_scale=0.01)
            
            # value function: linear single-unit layer; [:, 0] drops the unit axis
            vf = fc_layer(common_fc, 1, activation_fn=None)[:, 0]
            
        # Always None here; exposed as an attribute and re-exported by Model.
        self.initial_state = None
        
        # Op that samples an action from the policy distribution.
        a0 = self.pd.sample()
        
        
        def step(state_in):
            """Return (sampled action, value estimate) for a batch of states."""
            action, value = sess.run([a0, vf], feed_dict={inputs: state_in})
            return action,value
        
        def value(state_in):
            """Return the value estimate for a batch of states."""
            value = sess.run(vf, {inputs: state_in})
            return value
        
        def select_action(state_in):
            """Return a sampled action for a batch of states."""
            action = sess.run(a0, {inputs: state_in})
            return action
        
        # Expose the placeholder, the value tensor and the helpers to callers.
        self.inputs = inputs
        self.vf = vf
        self.step = step
        self.value = value
        self.select_action = select_action
        

Model

In [0]:
import numpy as np
import tensorflow as tf
import os
import time
from baselines import logger
import cv2

import matplotlib.pyplot as plt

from baselines.a2c.utils import cat_entropy


from baselines.common import explained_variance
from baselines.common.runners import AbstractEnvRunner

def mse(pred, target):
    """Return the element-wise half squared error, (pred - target)^2 / 2 (no reduction)."""
    difference = pred - target
    return tf.square(difference) / 2

### A2C Model Class
# init: create two neural nets, the step and train net, and constructs loss graph
# train: takes as input a set of states, actions, returns, and values, performs training, and outputs losses
# save: saves model
# load: loads model

class Model():
    """A2C model: builds a step network, a training network and the loss graph.

    Both networks are created by ``policy`` under the same variable scope (the
    second with ``reuse=True``), so they share weights. After construction the
    instance exposes ``train``, ``save``, ``load``, ``step`` and ``value``.
    """

    def __init__(self, policy, observation_space,
    action_space, number_envs, number_steps, ent_coef, vf_coef, max_grad_norm):
        """Build the graph in the default session and initialise all variables.

        Args:
            policy: Network class/callable with the A2CNetwork constructor signature.
            observation_space: Observation space forwarded to ``policy``.
            action_space: Action space forwarded to ``policy``.
            number_envs: Number of parallel environments.
            number_steps: Rollout length per environment.
            ent_coef: Weight of the entropy bonus in the total loss.
            vf_coef: Weight of the value loss in the total loss.
            max_grad_norm: Global-norm clipping threshold, or None to disable clipping.
        """
        # Must be called inside a `with tf.Session(...)` block; otherwise this is None.
        sess = tf.get_default_session()

        actions = tf.placeholder(tf.int32, [None], name="actions")
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        lr = tf.placeholder(tf.float32, name="learning_rate")

        # Network used for acting; it creates the shared variables.
        step_model = policy(sess, observation_space, action_space, number_envs, 1, reuse=False)

        # Network used for training; it reuses the variables created above.
        train_model = policy(sess, observation_space, action_space, number_envs*number_steps, number_steps, reuse=True)

        # Total loss: policy gradient loss - (entropy * ent_coef) + (value loss * vf_coef)

        # -log pi(a|s): cross entropy between the policy logits and the taken actions
        neglogp = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions)

        # policy gradient loss: mean over the batch of A(s_i, a_i) * -log pi(a_i | s_i)
        pg_loss = tf.reduce_mean(advantages * neglogp)

        # value loss: mean of 1/2 * (return - V(s))^2; the `rewards` placeholder is fed the returns
        vf_loss = tf.reduce_mean (mse(tf.squeeze(train_model.vf), rewards))

        # entropy bonus: discourages the policy from collapsing too early
        entropy = tf.reduce_mean(train_model.pd.entropy())

        loss = pg_loss - (entropy * ent_coef) + (vf_loss * vf_coef)

        # trainable parameters living under the "model" scope
        params = tf.trainable_variables("model")

        # gradients, optionally clipped by global norm
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads,params))

        trainer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=1e-5)

        train_obj = trainer.apply_gradients(grads)

        # One Saver for the lifetime of the model. Constructing a new Saver on every
        # save()/load() call adds another set of save/restore ops to the graph each time.
        # It is created after apply_gradients so the optimizer's variables are included.
        saver = tf.train.Saver()

        def train(states_in, actions_, returns_, values_, lr_):
            """Run one optimisation step and return (policy_loss, value_loss, entropy).

            The advantage fed to the policy loss is ``returns_ - values_``.
            """
            # returns = (bootstrap) q-value estimation (reward + gamma*V(s'))
            advantages_ = returns_ - values_

            feed_dict = {train_model.inputs: states_in,
            actions: actions_,
            advantages: advantages_,
            rewards: returns_,
            lr: lr_}

            policy_loss, value_loss, entropy_, _ = sess.run([pg_loss, vf_loss, entropy, train_obj], feed_dict=feed_dict)

            return policy_loss, value_loss, entropy_

        def save(save_path):
            """Write a checkpoint to ``save_path``, creating its directory if needed."""
            directory = os.path.dirname(save_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            saver.save(sess, save_path)

        def load(save_path):
            """Restore all variables from the checkpoint at ``save_path``."""
            saver.restore(sess, save_path)
            print("Model loaded.")


        self.train = train
        self.train_model = train_model
        self.save = save
        self.load = load
        self.step = step_model.step
        self.step_model = step_model
        self.value = step_model.value
        self.initial_state = step_model.initial_state

        tf.global_variables_initializer().run(session=sess)

# Collects rollouts from the environments and computes GAE advantages and returns
class Runner(AbstractEnvRunner):
    """Collects fixed-length rollouts and converts them into GAE training targets.

    ``self.obs``, ``self.dones``, ``self.env``, ``self.model`` and ``self.nsteps``
    are expected to be provided by ``AbstractEnvRunner.__init__`` (not shown in
    this notebook).
    """

    def __init__(self, env, model, number_steps, total_timesteps, gamma, lam):
        """Store the discount (``gamma``) and GAE lambda (``lam``) next to the base-runner state."""
        super().__init__(env=env, model=model, nsteps=number_steps)

        self.gamma = gamma

        self.lam = lam

        self.total_timesteps = total_timesteps

    def run(self):
        """Step the environments ``nsteps`` times and build the training batch.

        Returns:
            An iterator (``map`` object) yielding, in order, the flattened
            observations, actions, returns, values and rewards. Each array has
            been passed through ``sf01``, which merges the step and environment
            axes.
        """
        observations_list, actions_list, rewards_list, values_list, dones_list = [], [], [], [], []

        for n in range(self.nsteps):
            actions,values = self.model.step(self.obs)

            # Record the state the action was chosen in. The done flags stored here are
            # the ones observed *before* this step is taken.
            observations_list.append(np.copy(self.obs))
            actions_list.append(actions)
            values_list.append(values)
            dones_list.append(np.copy(self.dones))

            self.obs[:], rewards, self.dones, _ = self.env.step(actions)

            rewards_list.append(rewards)

        observations_list = np.asarray(observations_list, dtype=np.uint8)
        actions_list = np.asarray(actions_list, dtype=np.int32)
        rewards_list = np.asarray(rewards_list, dtype=np.float32)
        values_list = np.asarray(values_list, dtype=np.float32)
        # Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
        dones_list = np.asarray(dones_list, dtype=bool)

        # Value of the observation after the last step, used to bootstrap the final advantage.
        last_values = self.model.value(self.obs)

        returns_list = np.zeros_like(rewards_list)
        advantages_list = np.zeros_like(rewards_list)

        last_gae_lam = 0

        for t in reversed(range(self.nsteps)):

            # The last collected step bootstraps from the post-rollout observation and the
            # current done flags. Earlier steps use step t+1's stored value and done flag.
            # next_non_terminal is 0 where the episode ended, which removes the bootstrap term.
            if t == self.nsteps - 1:
                next_non_terminal = 1.0 - self.dones
                next_values = last_values
            else:
                next_non_terminal = 1.0 - dones_list[t+1]
                next_values = values_list[t+1]

            # TD residual: r_t + gamma * V(s_{t+1}) * next_non_terminal - V(s_t)
            delta = rewards_list[t] + self.gamma * next_values * next_non_terminal - values_list[t]

            # GAE recursion: delta + gamma * lambda * next_non_terminal * A_{t+1}
            advantages_list[t] = delta + self.gamma * self.lam * next_non_terminal * last_gae_lam
            last_gae_lam = advantages_list[t]

        # Returns are the advantages added back onto the value baseline.
        returns_list = advantages_list + values_list

        return map(sf01, (observations_list, actions_list, returns_list, values_list, rewards_list))
        
def sf01(arr):
    """Swap the first two axes of ``arr`` and merge them into a single leading axis.

    An array of shape (a, b, ...) becomes (a * b, ...), ordered so that all
    entries for index 0 of the original second axis come first.
    """
    first, second = arr.shape[0], arr.shape[1]
    trailing = tuple(arr.shape[2:])
    return arr.swapaxes(0, 1).reshape((first * second,) + trailing)
    
## Learn Function
# Takes as input a policy, environment, number of steps, total number of time steps, gamma, lambda, value coefficient, entropy coefficient, learning rate, max gradient norm, log_interval and a restart flag (True = train from scratch, False = first load ./models/model.ckpt), then executes training
def learn(policy, env, nsteps, total_timesteps, gamma, lam, vf_coef, ent_coef, lr, max_grad_norm, log_interval, restart):
    """Train ``policy`` on the vectorised ``env`` and checkpoint it periodically.

    Each update collects ``nsteps`` steps from every environment, then runs 4
    epochs over the batch in 8 shuffled mini-batches. Must be called inside an
    active ``tf.Session`` context, because ``Model`` uses the default session.

    Args:
        policy: Network class handed to ``Model``.
        env: Vectorised environment exposing ``num_envs``; it is closed on exit.
        nsteps: Rollout length per environment per update.
        total_timesteps: Total number of environment steps to train for.
        gamma: Discount factor.
        lam: GAE lambda.
        vf_coef: Value-loss weight.
        ent_coef: Entropy-bonus weight.
        lr: Learning rate fed to every training step.
        max_grad_norm: Global-norm clipping threshold (None disables clipping).
        log_interval: Log and checkpoint every this many updates (and on update 1).
        restart: If False, load "./models/model.ckpt" before training.
    """
    number_epochs = 4
    number_mini_batches = 8
    checkpoint_path = "./models/model.ckpt"

    nenvs = env.num_envs

    observation_space = env.observation_space
    action_space = env.action_space

    batch_size = nenvs * nsteps
    batch_train_size = batch_size // number_mini_batches

    assert batch_size % number_mini_batches == 0

    # try/finally so the environment is closed even when training raises or is interrupted.
    try:
        # Construct model
        model = Model(policy=policy,
                    observation_space=observation_space,
                    action_space=action_space,
                    number_envs=nenvs,
                    number_steps=nsteps,
                    ent_coef=ent_coef,
                    vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm
                    )

        if not restart:
            model.load(checkpoint_path)

        # Make sure the checkpoint directory exists before the first save.
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

        runner = Runner(env, model, nsteps, total_timesteps, gamma, lam)

        time_first_start = time.time()

        # For every batch of the game
        for update in range(1, total_timesteps // batch_size+1):
            time_start = time.time()

            # Generate observations
            observations, actions, returns, values, rewards = runner.run()

            reward_sum = np.sum(rewards)

            losses_list = []

            indices = np.arange(batch_size)
            for epoch in range(number_epochs):
                np.random.shuffle(indices)

                # Feed minibatches to model for training
                for mini_start in range(0, batch_size, batch_train_size):
                    mini_end = mini_start + batch_train_size
                    mini_indices = indices[mini_start:mini_end]

                    mini_states = observations[mini_indices]
                    mini_actions = actions[mini_indices]
                    mini_returns = returns[mini_indices]
                    mini_values = values[mini_indices]

                    losses_list.append(model.train(mini_states,mini_actions,mini_returns,mini_values,lr))

            # Column order matches Model.train's return: (policy_loss, value_loss, entropy).
            loss_values = np.mean(losses_list, axis=0)
            time_now = time.time()
            fps = int(batch_size / (time_now - time_start))

            # Print out updates
            if update % log_interval == 0 or update == 1:
                ev = explained_variance(values, returns)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update*batch_size)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_loss", float(loss_values[0]))
                logger.record_tabular("policy_entropy", float(loss_values[2]))
                logger.record_tabular("value_loss", float(loss_values[1]))
                logger.record_tabular("explained_variance", float(ev))
                logger.record_tabular("time_elapsed", float(time_now - time_first_start))
                logger.record_tabular("total_reward", float(reward_sum))
                logger.dump_tabular()

                model.save(checkpoint_path)
                print("Model saved.")
    finally:
        env.close()

def play(policy, env):
    """Load the saved checkpoint and play one episode, rendering every step.

    Builds a ``Model`` for a single environment, restores
    "./models/model.ckpt", then steps ``env`` with the model's sampled actions
    until it reports done. The accumulated reward is printed and the
    environment is always closed. Must be called inside an active
    ``tf.Session`` context.

    Args:
        policy: Network class handed to ``Model``.
        env: Environment to play in.
            NOTE(review): the network input has a leading batch axis, so ``env``
            presumably has to return batched observations — confirm.
    """
    observation_space = env.observation_space
    action_space = env.action_space

    # try/finally so the environment is closed even if loading or stepping fails.
    try:
        # Loss coefficients and the clip norm are irrelevant here: no training step is run.
        model = Model(policy=policy,
                      observation_space=observation_space,
                      action_space=action_space,
                      number_envs=1,
                      number_steps=1,
                      ent_coef=0,
                      vf_coef=0,
                      max_grad_norm=0)

        model.load("./models/model.ckpt")

        obs = env.reset()

        score = 0
        done = False
        while not done:
            actions, values = model.step(obs)
            obs, rewards, done, _ = env.step(actions)
            score += rewards

            env.render()

        print("Score: ", score)
    finally:
        env.close()

Agent Trainer

In [0]:
import math
import os
import tensorflow as tf



from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def main():
    """Launch training on 10 parallel Pong environments with fixed hyperparameters."""
    # Restrict CUDA to device "0" and let TensorFlow grow GPU memory on demand.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # SubprocVecEnv receives environment factories, not environment instances.
    environment_list = [make_env for _ in range(10)]
    env_vector = SubprocVecEnv(environment_list)

    hyperparameters = dict(
        nsteps=2048,
        total_timesteps=10000000,
        gamma=0.99,
        lam=0.95,
        vf_coef=0.5,
        ent_coef=0.01,
        lr=2e-4,
        max_grad_norm=0.5,
        log_interval=10,
        restart=True,
    )

    # Model picks up the default session, so learn() must run inside this context.
    with tf.Session(config=config):
        learn(policy=A2CNetwork, env=env_vector, **hyperparameters)

if __name__ == "__main__":
    main()

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Logging to /tmp/openai-2019-05-09-02-53-44-418423
---------------------------------
| explained_variance | -0.422   |
| fps                | 519      |
| nupdates           | 1        |
| policy_entropy     | 1.79     |
| policy_loss        | 0.0958   |
| time_elapsed       | 39.4     |
| total_reward       | -4.35    |
| total_timesteps    | 2.05e+04 |
| value_loss         | 0.000716 |
---------------------------------
Model saved.
---------------------------------
| explained_variance | -2.81    |
| fps                | 594      |
| nupdates           | 10       |
| policy_entropy     | 1.79     |
| policy_loss  

In [0]:
# List the working directory, zip the checkpoint folder written during training, and
# hand the archive to the Colab file helper for download.
!ls
!zip -r models.zip models/
from google.colab import files
files.download("./models.zip")