In [1]:
%env THEANO_FLAGS=device=gpu0
%env THEANO_FLAGS='floatX=float32'
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1
        
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


env: THEANO_FLAGS=device=gpu0
env: THEANO_FLAGS='floatX=float32'
bash: ../xvfb: No such file or directory
env: DISPLAY=:1


In [2]:
import gym

# `.env` takes the environment wrapped inside the object returned by gym.make.
env = gym.make("BipedalWalker-v2").env
# Gym's API requires reset() before the first step(); use the initial
# observation to discover the state dimensionality instead of stepping an
# environment that was never reset.
obs = env.reset()
state_size = len(obs)
# Continuous action space: one entry per action component.
n_actions = env.action_space.shape[0]
print(obs)

[ 0.00245766 -0.00692379  0.00638609  0.01958699 -0.29070678 -0.71052462
  1.47233391  0.99359345  1.          0.30071586 -0.01757671  0.16331172
  0.33354759  1.          0.45289594  0.45803925  0.47406957  0.50296789
  0.54874158  0.61897343  0.72858536  0.91021365  1.          1.        ]


### Архитектура сети

In [3]:
import lasagne
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer, batch_norm,dropout,GaussianNoiseLayer
import theano.tensor as T

# Shared network body: observation input followed by two 256-unit dense layers.
# The layer names ('dense1', 'dense2') are what shows up in the parameter listing.
observation_layer = InputLayer(shape=(None, state_size))
dense0 = DenseLayer(observation_layer, num_units=256, name='dense1')  # default nonlinearity
nn = dense1 = DenseLayer(dense0, num_units=256, nonlinearity=T.tanh, name='dense2')

Using cuDNN version 5110 on context None
Mapped name None to device cuda0: Tesla K80 (0000:00:04.0)


In [4]:
from lasagne.nonlinearities import elu

# Two heads on top of the shared body `nn`:
#   policy_layer - one ELU output per action component,
#   V_layer      - a single linear output (state value).
policy_layer = DenseLayer(nn, num_units=n_actions, nonlinearity=elu)
V_layer = DenseLayer(nn, num_units=1, nonlinearity=None)

In [5]:
from agentnet.resolver import ProbabilisticResolver
from lasagne.layers import InputLayer,DenseLayer,batch_norm,dropout,NonlinearityLayer,ElemwiseSumLayer,GaussianNoiseLayer
from agentnet.learning.qlearning_naf import LowerTriangularLayer,NAFLayer
import theano

# Standard deviation of the exploration noise; a shared variable so it can be
# changed without rebuilding the graph. Starts at 0.
epsilon = theano.shared(np.float32(0.0))
low = env.action_space.low
high = env.action_space.high


def clip_to_action_bounds(a):
    """Clip a symbolic action tensor to the environment's [low, high] box."""
    return a.clip(low, high)


# The action space is continuous, so the policy output is clipped to the
# environment's bounds, perturbed with Gaussian noise of scale `epsilon`,
# and clipped again so the noisy action stays inside the bounds.
action = NonlinearityLayer(policy_layer, clip_to_action_bounds)
action = GaussianNoiseLayer(action, sigma=epsilon)
action_layer = NonlinearityLayer(action, clip_to_action_bounds)

### Агент

In [6]:
from agentnet.agent import Agent

# Wire the layers into an AgentNet agent: observations in, clipped noisy action
# out, with the raw policy output and the value head exposed as estimators.
agent = Agent(
    observation_layers=observation_layer,
    policy_estimators=(policy_layer, V_layer),
    action_layers=action_layer,
)

In [7]:
# Collect every trainable parameter reachable from the action and value outputs.
weights = lasagne.layers.get_all_params(
    [action_layer, V_layer],
    trainable=True,
)
weights

[dense1.W, dense1.b, dense2.W, dense2.b, W, b, W, b]

### Pool

In [8]:
from gym.wrappers.time_limit import TimeLimit

class env_wrapper(object):
    """Callable factory producing a gym environment with a custom episode cap.

    Parameters
    ----------
    name : id passed to ``gym.make``.
    t_max : value passed to ``TimeLimit`` as ``max_episode_steps``.
    """

    def __init__(self, name, t_max):
        self.name, self.t_max = name, t_max

    def __call__(self):
        """Build a fresh environment and wrap it in ``TimeLimit``."""
        # Take the inner env of what gym.make returns, then apply our own limit.
        inner_env = gym.make(self.name).env
        return TimeLimit(inner_env, max_episode_steps=self.t_max)

In [9]:
from agentnet.experiments.openai_gym.pool import EnvPool

# Pool of 70 parallel BipedalWalker games (episodes capped at 10000 steps)
# played by `agent`; `max_size` is forwarded to EnvPool unchanged.
pool = EnvPool(
    agent,
    make_env=env_wrapper("BipedalWalker-v2", 10000),
    n_games=70,
    max_size=1000,
)

In [11]:
# Number of steps per pool update; also used as `session_length` when the
# training graph is built from replayed sessions.
SEQ_LENGTH = 10
# First interaction with the environments so the pool has data to sample from
# (presumably plays SEQ_LENGTH steps in every game -- confirm against EnvPool.update).
pool.update(SEQ_LENGTH)

### Actor-critic

In [12]:
# Symbolic batch drawn from the pool's experience replay (argument: 100).
replay = pool.experience_replay.sample_session_batch(100)

# Re-run the agent over the replayed sessions; only the last of the five
# returned items -- the (policy, V) estimator sequences -- is needed here.
session_outputs = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)
_, _, _, _, (policy_seq, V_seq) = session_outputs

In [13]:
from agentnet.learning import a2c                                                   

# Element-wise actor-critic objective from AgentNet, fed with the first
# component of the policy and value sequences, the replayed rewards and the
# is_alive mask (1-step, gamma = 0.95).
elwise_mse_loss = a2c.get_elementwise_objective(
    policy_seq[:, :, 0],
    V_seq[:, :, 0],
    replay.rewards,
    replay.is_alive,
    gamma_or_gammas=0.95,
    n_steps=1,
)

# Normalise the summed objective by the sum of the is_alive mask.
alive_total = replay.is_alive.sum()
# Regularization: L2 penalty on the shared body `nn`, scaled by 1e-5.
l2_penalty = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2) * 1e-5
loss = elwise_mse_loss.sum() / alive_total + l2_penalty

In [15]:
# RMSProp updates for all trainable weights; 1e-4 is the same float as the
# previously written 10e-5.
updates = lasagne.updates.rmsprop(
    loss,
    weights,
    learning_rate=1e-4,
)

In [16]:
import theano
# Compile one training step: takes no inputs (data comes from the replay's
# shared variables), returns the loss and applies the RMSProp updates.
train_step = theano.function(inputs=[], outputs=loss, updates=updates)

### Train loop

In [18]:
# 1-based iteration counter. Defined outside the training-loop cell, so it keeps
# counting when that cell is re-run.
epoch_counter = 1
# Maps epoch_counter -> result of pool.evaluate(...) recorded at that iteration.
rewards = {}

In [19]:
from IPython.display import clear_output

In [25]:
# Exponential moving average of the training loss. It has its own name so the
# symbolic `loss` expression (used to build `updates` / `train_step`) is not
# overwritten by a number, which would break re-running those cells.
loss_ma = 0
for i in range(100000):

    # Interact with the environments for SEQ_LENGTH more steps
    # (append=True: presumably extends the replay instead of replacing it).
    pool.update(SEQ_LENGTH,append=True)

    loss_ma = loss_ma*0.99 + train_step()*0.01

    if epoch_counter%100==0:
        if epoch_counter%500==0:
            # Clear BEFORE logging so this iteration's log line is not wiped
            # right after being printed.
            clear_output(True)

        pool_mean_reward = np.average(pool.experience_replay.rewards.get_value()[:,:-1],
                                      weights=1+pool.experience_replay.is_alive.get_value()[:,:-1])
        print("iter=%i\treward/step=%.5f\tloss ma=%.5f"%(epoch_counter,
                                                        pool_mean_reward,
                                                        loss_ma))

    if epoch_counter%500 ==0:
        n_games = 10
        rewards[epoch_counter] = pool.evaluate(record_video=False,n_games=n_games,
                                               verbose=False)
        # Sort by iteration number only; the reward lists never need comparing.
        iters,session_rewards=zip(*sorted(rewards.items(),key=lambda item:item[0]))
        mean_rewards = [np.mean(x) for x in session_rewards]
        plt.figure(figsize=(12, 8))
        plt.plot(iters, mean_rewards)
        plt.show()

        current_score = np.mean(rewards[epoch_counter])
        # Report the score before the stop check so the final result is shown too.
        print("Current score(mean over %i) = %.3f"%(n_games,current_score))
        if current_score > 290:
            break

    if i % 100 == 0:
        print(i)

    epoch_counter +=1


