In [1]:
import numpy as np
import os
import random
import sys
import tensorflow as tf
from tensorflow.contrib.distributions import MultivariateNormalFullCovariance
import vrep
import keras
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import BatchNormalization, Dense, Input
from keras.callbacks import TensorBoard
from keras import backend as K
from common import *
from GaussianPolicy import GaussianPolicy
from VREPEnvironments import VREPPushTaskEnvironment

# Auto-reload modules
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [2]:
def getModel():
    """
    Model for transition dynamics, rewards and termination of episodes.
    Inputs: [PrevState(24,), Action(6,)](30,)
    Outputs: [NestState-PrevState(24,), Reward](25,)
    """
    prevState_action_l = Input(shape=(30,), dtype="float32", name="prevState_action_l")
    H_l = Dense(256, kernel_initializer="normal", activation="relu", name="hidden_l1")(prevState_action_l)
    H_l = BatchNormalization()(H_l)
    H_l = Dense(64, kernel_initializer="normal", activation="relu", name="hidden_l2")(H_l)
    H_l = BatchNormalization()(H_l)
    nextState_reward_l = Dense(25, kernel_initializer="normal", name="nextState_l")(H_l)
    #dest_l = Dense(1, kernel_initializer="normal", activation="sigmoid", name="dest_l")(H_l)
    model = Model(inputs=prevState_action_l, outputs=nextState_reward_l)
    model.compile(loss="mse", optimizer="rmsprop")
    return model

def getPolicy():
    return GaussianPolicy()


def getAdvantages(rewards, discount_factor):
    eps = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    running_discounted_advantages = 0
    for i in range(eps - 1, -1, -1):
        running_discounted_advantages = running_discounted_advantages * discount_factor + rewards[i]
        advantages[i] = running_discounted_advantages
    return advantages

In [8]:
GREEDY_FACTOR = 0.9 # Fraction of times to choose greedy actions TODO: Increase over time

EPS_LENGTH = 200
NUM_EPS = 2

NEW_MODEL = False
MODEL_FILE = "model/PolicyGradient/model01.h5"

NEW_POLICY = True
POLICY_FILE = "model/PolicyGradient/policy02.h5"
SEED = 10

MONITOR_LOG_FILE = "" 

DISCOUNT_FACTOR = 0.9

EPSILON = 0.1

BATCH_SIZE = 128
EPOCHS = 20

random.seed(SEED)

model = getModel() if NEW_MODEL else load_model(MODEL_FILE)
policy = getPolicy() if NEW_POLICY else GaussianPolicy(model_file=POLICY_FILE)
with VREPPushTaskEnvironment() as env, policy as policy:
    # obtain first state
    current_state = env.reset(True)
    Xs_mean = None
    Xs_std = None
    ys_mean = None
    ys_std = None
    
    policy_Xs_mean = None
    policy_Xs_std = None
    policy_ys_mean = None
    policy_ys_std = None
    policy_rs_mean = None
    policy_rs_std = None
    for i in range(NUM_EPS):
        print("{}th episode".format(i))
        states = [current_state[np.newaxis, :]]
        actions = []
        rewards = []
        GREEDY_FACTOR = 0.5 if i < 100 else 0.9
        # collect trajectory
        for step in range(EPS_LENGTH):
            # select greedy action according to GREEDY_FACTOR
            if (not NEW_POLICY) or (np.random.rand() < GREEDY_FACTOR and policy_Xs_mean is not None):
                #action = policy.sampleAction(invStandardise(current_state[np.newaxis, :], policy_Xs_mean, policy_Xs_std))
                #action = invStandardise(action, policy_ys_mean, policy_ys_std)
                action = policy.sampleAction(current_state[np.newaxis, :])
            else:
                action = generateRandomVel(env.MAX_JOINT_VELOCITY)[np.newaxis, :]
            next_state, reward = env.step(action)
           
            actions.append(action)
            states.append(next_state[np.newaxis, :])
            rewards.append(reward)
            # proceed to next state
            current_state = next_state[:]
            
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        rewards = np.array(rewards)
        advantages = getAdvantages(rewards, DISCOUNT_FACTOR)
        if NEW_MODEL:
            # X = [current_states(,24), actions(,6)]
            # y = [next_states - current_states(,24), rewards]
            X = np.concatenate([states[:-1, :], actions], axis=1)
            y = np.concatenate([states[1:, :] - states[:-1, :], rewards], axis=1)
            # standardise training data
            if Xs_mean is None:
                Xs_mean = np.mean(X, axis=0)
                Xs_std = np.std(X, axis=0)
                ys_mean = np.mean(y, axis=0)
                ys_std = np.std(y, axis=0)
            X = standardise(X, Xs_mean, Xs_std)
            y = standardise(y, ys_mean, ys_std)    
            # add zero-mean gaussian noise
            X += np.random.normal(0, 0.05, X.shape)
            y += np.random.normal(0, 0.05, y.shape)

            # train model
            model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)
        if NEW_POLICY:
            policy_X = states[:-1, :]
            policy_y = actions[:]
            # standardise training data
            """
            if policy_Xs_mean is None:
                policy_Xs_mean = np.mean(policy_X, 0)
                policy_Xs_std = np.std(policy_X, 0)
                policy_ys_mean = np.mean(policy_y, 0)
                policy_ys_std = np.std(policy_y, 0)
                #policy_rs_mean = np.mean(advantages, 0)
                #policy_rs_std = np.std(advantages, 0)
            policy_X = standardise(policy_X, policy_Xs_mean, policy_Xs_std)
            policy_y = standardise(policy_y, policy_ys_mean, policy_ys_std)
            """
            advantages = standardise(advantages, np.mean(advantages), np.std(advantages))

            # train policy
            policy.train(policy_X, policy_y, advantages)
        # Reset the scene
        env.reset(False)
        
    if NEW_MODEL:
        model.save(MODEL_FILE)
    if NEW_POLICY:
        policy.save(POLICY_FILE)

[<tf.Variable 'hidden_l1_7/kernel:0' shape=(24, 256) dtype=float32_ref>, <tf.Variable 'hidden_l1_7/bias:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'batch_normalization_7/gamma:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'batch_normalization_7/beta:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'hidden_l2_7/kernel:0' shape=(256, 64) dtype=float32_ref>, <tf.Variable 'hidden_l2_7/bias:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'batch_normalization_8/gamma:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'batch_normalization_8/beta:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'action_l_3/kernel:0' shape=(64, 6) dtype=float32_ref>, <tf.Variable 'action_l_3/bias:0' shape=(6,) dtype=float32_ref>]
0th episode


KeyboardInterrupt: 

In [None]:
env = VREPPushTaskEnvironment()
states = env.reset(True)

In [None]:
del model
del policy
del env

In [6]:
del env

NameError: name 'env' is not defined

In [None]:
K.learning_phase()

In [None]:
states

In [None]:
import time
for _ in range(200):
    states = env.step(generateRandomVel(2)[np.newaxis, :])
    print(states)
    #time.sleep(0.05)