In [1]:
import numpy as np
import gym
from numpy.random import choice
import random
from tensorbuilder.api import *
import tensorflow as tf

env = gym.make("FrozenLake-v0")

hdf5 not supported (please install/reinstall h5py)


[2017-01-29 00:33:45,867] Making new env: FrozenLake-v0


In [2]:
def select_columns(tensor, indexes):
    idx = tf.stack((tf.range(tf.shape(indexes)[0]), indexes), 1)
    return tf.gather_nd(tensor, idx)

In [15]:
model_name = "shallow.model"
model_path = "/models/" + model_name
n_actions = env.action_space.n
n_states = env.observation_space.n

class Model(object):
    
    def __init__(self, y, restore=False):
        
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        
        with self.graph.as_default():
            with tf.device("cpu:0"):
                s = tf.placeholder(tf.int32, [None], name='s')
                a = tf.placeholder(tf.int32, [None], name='a')
                r = tf.placeholder(tf.float32, [None], name='r')
                max_Qs1 = tf.placeholder(tf.float32, [None], name='maxQs1')
                lr = tf.placeholder(tf.float32, [], name='lr')

                ops = dict(trainable=True, weights_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01), biases_initializer=None) #tf.random_uniform_initializer(minval=0, maxval=0.01))


                Qs = Pipe(
                    s,
                    T.one_hot(n_states)
                    .linear_layer(n_actions, scope='linear_layer', **ops)
                )

                Qsa = select_columns(Qs, a)

                max_Qs = tf.reduce_max(Qs, 1)

                error = r + y * max_Qs1 - Qsa
                loss = Pipe(error, tf.nn.l2_loss, tf.reduce_sum)
                update = tf.train.GradientDescentOptimizer(lr).minimize(loss)

                self.writer = tf.summary.FileWriter('/logs/' +  model_name)
                self.saver = tf.train.Saver()
                
                self.variables_initializer = tf.global_variables_initializer()
                
        

            if restore:
                self.saver.restore(self.sess, model_path)
            else:
                self.sess.run(self.variables_initializer)

        self.s = s; self.a = a; self.r = r; self.max_Qs1 = max_Qs1
        self.max_Qs = max_Qs; self.Qs = Qs; self.Qsa = Qsa; self.update = update
        self.lr = lr
                
    def next_action(self, state, e=0.05):
        actions = self.sess.run(self.Qs, feed_dict={self.s: [state]})[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.argmax(actions)

    def train(self, s, a, r, s1, lr):
        #calculate next Qs
        [maxQs1, qs1] = self.sess.run([self.max_Qs, self.Qs], feed_dict={self.s: [s1]})

        #train
        self.sess.run(self.update, feed_dict={
            self.s: [s], self.a: [a], self.r: [r], 
            self.lr: lr,
            self.max_Qs1: np.max(qs1, 1)
        })

    def save(self, model_path):
        self.saver.save(self.sess, model_path)

    def restore(self, model_path):
        self.sess.close()
        self.sess = tf.Session(graph=self.graph)
        self.saver.restore(self.sess, model_path)

    @staticmethod
    def learning_rate(t, b, k):
        return b * k / (k + t)

In [18]:
y = 0.95
b = 0.5
k = 2000.0
e = 0.05

model = Model(y, restore=False)

r_total = 0.0

for t in range(200000):
    lr = model.learning_rate(t, b, k)
    s = env.reset()
    
    done = False
    while not done:
        #next action
        a = model.next_action(s, e)

        #take step
        s1, r, done, info = env.step(a)
        r_total += r

        #train
        model.train(s, a, r, s1, lr)
        
        #update state
        s = s1

    if t % 500 == 0:
        print r_total, "of", 500, ", lr:", lr
        r_total = 0
        model.save(model_path)
        


0.0 of 500 , lr: 0.5
26.0 of 500 , lr: 0.4
139.0 of 500 , lr: 0.333333333333
193.0 of 500 , lr: 0.285714285714
186.0 of 500 , lr: 0.25
201.0 of 500 , lr: 0.222222222222
220.0 of 500 , lr: 0.2
203.0 of 500 , lr: 0.181818181818
266.0 of 500 , lr: 0.166666666667
217.0 of 500 , lr: 0.153846153846
226.0 of 500 , lr: 0.142857142857
219.0 of 500 , lr: 0.133333333333


KeyboardInterrupt: 

In [19]:
s = env.reset()
    
for i in range(100):
    a = model.next_action(s, e=0)
    s, r, done, info = env.step(a)
    env.render()
    print("")

    if done:
        print(r)
        break

SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)

SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)

SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)

SFFF
FHFH
F[41mF[0mFH
HFFG
  (Up)

SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)

SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)

SF[41mF[0mF
FHFH
FFFH
HFFG
  (Left)

SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)

SF[41mF[0mF
FHFH
FFFH
HFFG
  (Left)

SF[41mF[0mF
FHFH
FFFH
HFFG
  (Left)

S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)

S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)

S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)

S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)

SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)

S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)

SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)

SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)

SFFF
