In [1]:
import numpy as np
from multiprocessing import Pool
import gym
import os

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

from gym.envs.registration import registry, register, make, spec
import gym_unicycle
#from gym import wrappers # xvfb-run -s "-screen 0 1400x900x24" python <your_script.py>
import os.path
from time import time
from statistics import mean
import pprint as pp
import json
import sys

Using TensorFlow backend.


In [2]:

ENV_NAME = 'MATTENV-v0'
register(
    id=ENV_NAME,
    entry_point='gym_unicycle.envs:UnicycleEnv',
    max_episode_steps=2000,
    reward_threshold=800.0,
)

In [3]:

def attempt(args):
    lr = args['lr']
    nb_steps = args['nb_steps']
    activation = args['activation']
    layerType = args['layerType']
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    #env = wrappers.Monitor(env, './videos/' + str(time()) + '/', force=True)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    print("env.observation_space.shape: " + str(env.observation_space.shape))

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    if layerType == 0:
        model.add(Dense(16))
        model.add(Activation(activation))
        model.add(Dense(16))
        model.add(Activation(activation))
        model.add(Dense(16))
        model.add(Activation(activation))
    elif layerType == 1:
        model.add(Dense(16))
        model.add(Activation(activation))
        model.add(Dense(13))
        model.add(Activation(activation))
        model.add(Dense(10))
        model.add(Activation(activation))
    else:
        model.add(Dense(16))
        model.add(Activation(activation))
        model.add(Dense(13))
        model.add(Activation(activation))
        model.add(Dense(3))
        model.add(Activation(activation))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    if not os.path.exists(args['fnamePrefix']):
        os.makedirs(args['fnamePrefix'])
    weights_fname = '%s/weights.h5f' % args['fnamePrefix']
    if os.path.isfile(weights_fname):
        print("Loading weights from before")
        print("Skipping training")
        dqn.load_weights(weights_fname)
    else:
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)

        # After training is done, we save the final weights.
        dqn.save_weights(weights_fname, overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    env.reset()
    env.close()
    result = dqn.test(env, nb_episodes=5, visualize=False)
    means = {
        'reward': mean(result.history['episode_reward']),
        'steps': mean(result.history['nb_steps'])
            }
    json_fname = args['fnamePrefix'] + '/result.json'
    with open(json_fname,"w") as f:
            json.dump(result.history,f)
    return(means)

In [4]:

def attemptWrap(args):

    if not os.path.exists(args['fnamePrefix']):
        os.makedirs(args['fnamePrefix'])
    old_stdout = sys.stdout
    new_stdout_fname = args['fnamePrefix'] + '/stdout.txt'
    sys.stdout = open(new_stdout_fname,"w")
    result = attempt(args)
    sys.stdout = old_stdout
    return(result)

In [5]:

def mergeDicts(x,y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return(z)

def testMergeDict():
    a = {1:2}
    b = {'asd':123}
    z = mergeDicts(a,b)
    assert(z == {1:2,'asd':123})
    print("Test passed")
    
testMergeDict()
print("check")

Test passed
check


In [6]:

def main():
    if not os.path.exists('results'):
        os.makedirs('results')
    args = []
    for lr in [5e-3, 2e-3, 1e-3, 5e-4, 1e-4]:
        for nb_steps in [10**x for x in range(4,7)]:
            for activation in ['tanh','relu']:
                for layerType in range(3):
                    fname = 'results/%s_%f_%d_%d/' % (ENV_NAME,lr,nb_steps,layerType)
                    arg = {
                        'lr':lr,
                        'nb_steps':nb_steps,
                        'activation':activation,
                        'layerType':layerType,
                        'fnamePrefix':fname
                        }
                    args.append(arg)

#     pp.pprint(args)
    with Pool(4) as p:
        results = p.map(attemptWrap, args)
    pp.pprint(results)
    data = [mergeDicts(a,r) for (a,r) in zip(args,results)]
    data.sort(key=lambda x: x['reward'])
    pp.pprint(data)
    return(data)

In [7]:
results = main()

  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)


[{'reward': 78.94686712717453, 'steps': 25.6},
 {'reward': 32.10549781478155, 'steps': 10.4},
 {'reward': 33.871938017218454, 'steps': 10.8},
 {'reward': 65.89540354251014, 'steps': 19},
 {'reward': 120.51817068869562, 'steps': 41},
 {'reward': 33.60560893816412, 'steps': 10.6},
 {'reward': 29.651334683162844, 'steps': 9.4},
 {'reward': 76.28108349693915, 'steps': 24.8},
 {'reward': 91.93629129337222, 'steps': 27.4},
 {'reward': 3111.6421257208526, 'steps': 963.2},
 {'reward': 2791.6998746946783, 'steps': 875.8},
 {'reward': 55.44675674703746, 'steps': 16},
 {'reward': 4512.494581348294, 'steps': 1602.6},
 {'reward': 1000.1892970769126, 'steps': 318.8},
 {'reward': 421.7994214732127, 'steps': 141.8},
 {'reward': 59.093269018830654, 'steps': 18.6},
 {'reward': 27.53499153539131, 'steps': 8.8},
 {'reward': 47.17073816779185, 'steps': 13.8},
 {'reward': 49.03273059059149, 'steps': 15},
 {'reward': 50.045252091481345, 'steps': 15.4},
 {'reward': 25.57798457206883, 'steps': 8.4},
 {'reward'