In [7]:
import agentnet
import gym
import numpy as np
import itertools

In [3]:
env = gym.make('MountainCar-v0')

[2016-11-25 14:17:32,888] Making new env: MountainCar-v0


In [4]:
env.observation_space, env.action_space

(Box(2,), Discrete(3))

In [5]:
n_actions = env.action_space.n
input_shape = (1,)+env.observation_space.shape

** Want to do SARSA control with linear approximation of q(s, a) **

* On each tick (being in state S):
    1. Estimate action values with model -> (q(S, 0), q(S, 1), q(S, 2))
    2. Sample action A with epsilon-greedy resolver.
    3. Take action A and observe new state S' and reward R.
    4. Sample action A' from state S'.
    5. Update q(S, A) with SARSA update taking (R+gamma*q(S', A')) as target.
    6. S <- S', A <- A'

In [6]:
from lasagne.layers import *
import lasagne
import theano.tensor as T
import theano
from theano.gradient import disconnected_grad
from agentnet.resolver import EpsilonGreedyResolver
from agentnet.target_network import TargetNetwork


class model:
    epsilon = theano.shared(0.1)
    gamma = theano.shared(0.99)
    
    state = T.matrix('S')
    action = T.iscalar('A')
    reward = T.scalar('R')
    next_state = T.matrix("S'")
    next_action = T.iscalar("A'")
    
    # Define action-value function approximator.
    state_layer = InputLayer(input_shape, name='state_layer')
#     dense_layer = DenseLayer(state_layer, 4, name='dense_layer')
    qvalues_layer = DenseLayer(state_layer, n_actions, nonlinearity=None, name='q_eval')
    resolver_layer = EpsilonGreedyResolver(qvalues_layer, epsilon)
    
    # Define target network for not propagating gradient over Q(next_state, next_action)
    # We will load weights after each gradient descent step.
#     target_network = TargetNetwork([qvalues_layer])
#     target_qvalues_layer, = target_network.output_layers

    picked_action = get_output(resolver_layer, inputs=state)
    next_picked_action = disconnected_grad(get_output(resolver_layer, inputs=next_state))
    
    # We predict qvalue for (state, action) pair.
    predicted_qvalue = get_output(qvalues_layer, inputs=state)[0, action]
    # But for (next_state, next_action) we predict qvalue from target network.
    reference_qvalue = reward + gamma * get_output(target_qvalues_layer, inputs=next_state)[0, next_action]
    
    loss = ((predicted_qvalue - reference_qvalue)**2).sum()
    all_params = get_all_params(qvalues_layer, trainable=True)
    
class training:
#     learning_rate = theano.shared(0.1)

    updates = lasagne.updates.adam(model.loss, model.all_params)
    
    act_fn = theano.function([model.state], model.picked_action[0])
    train_fn = theano.function([model.state, model.action, model.reward,
                                model.next_state, model.next_action], model.loss, updates=updates)

[2016-11-25 14:17:44,032] We did not found a dynamic library into the library_dir of the library we use for blas. If you use ATLAS, make sure to compile it with dynamics library.


### The sketch of training function:
* Get **A** (action) with ```<training.act_fn(S)>```
* Do this action and observe **S'**, ```<R = env.step(A)>```
* Get **A'** with ```<traning.act_fn(S')>```
* Now we can update our q-function doing ```<train_fn(S, A, R, S', A')>```
* And copy weights to target network with ```<model.target_network.load_weights()>```

In [89]:
theano.function([], model.all_params)()

[array([[ 0.43176488, -0.67488989,  0.92007666,  0.25039531],
        [-0.62802407, -0.66000446,  0.40852582, -0.95264395]]),
 array([ 0.,  0.,  0.,  0.]),
 array([[-0.23852516,  0.66602076,  0.67461779],
        [-0.70842526, -0.26032232,  0.83247447],
        [-0.40304877, -0.2702351 , -0.37738067],
        [ 0.89064078, -0.25781148,  0.12057828]]),
 array([ 0.,  0.,  0.])]

In [90]:
qvalues_fn = theano.function([model.state], get_output(model.qvalues_layer, inputs=model.state))

In [64]:
pred_qv = theano.function([model.state, model.action], model.predicted_qvalue)

In [65]:
pred_qv([env.reset()], 0)

array(-0.9216315335377956)

In [95]:
qvalues_fn([env.reset()])

array([[-0.23128884, -0.08499082,  0.2717888 ]])

In [68]:
import itertools
n_episodes = 100
loss_history = []
rewards_history = []

target_net_delay = 10

for i in xrange(n_episodes):
    current_epsilon = 0.05 + 0.45*np.exp(-i/10.)
    model.epsilon.set_value(np.float32(current_epsilon))
    
    state = env.reset()
    total_loss = 0
    total_reward = 0
    for t in itertools.count():
        action = training.act_fn([state])
        next_state, reward, done, _ = env.step(action)
        next_action = training.act_fn([next_state])
        loss = training.train_fn([state], action, reward/100., [next_state], next_action)
        
        if t % 1000 == 0:
            print t
        
        if (t+1) % target_net_delay == 0:
            model.target_network.load_weights()
        
        total_loss += loss
        total_reward += reward
        if done:
            print "Episode #{0}\tReward={1}\tTime={2}\tLoss={3:.2f}\tEpsilon={4:.3f}".format(i, total_reward, t, total_loss, current_epsilon)
            break
    loss_history.append(total_loss)
    rewards_history.append(total_reward)

0
1000
2000
Episode #0	Reward=-2174.0	Time=2173	Loss=83.01	Epsilon=0.500
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000


KeyboardInterrupt: 