In [None]:
import numpy as np
import os
import random
import sys
import tensorflow as tf
from tensorflow.contrib.distributions import MultivariateNormalFullCovariance
import vrep
import keras
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import BatchNormalization, Dense, Input
from keras.callbacks import TensorBoard
from keras import backend as K
from common import *
from GaussianPolicy import GaussianPolicy
from VREPEnvironments import VREPPushTaskEnvironment

# Auto-reload modules
%load_ext autoreload
%autoreload 2

In [None]:
def getModel():
    """
    Build and compile the network that models transition dynamics and rewards.

    Input:  [PrevState(24,), Action(6,)] concatenated into a (30,) vector.
    Output: [NextState - PrevState(24,), Reward] concatenated into a (25,) vector.

    No episode-termination output is produced (that head is not built here).
    The model is compiled with an MSE loss and the RMSprop optimizer.
    """
    net_input = Input(shape=(30,), dtype="float32", name="prevState_action_l")

    # Two ReLU hidden layers, each followed by batch normalisation.
    hidden = net_input
    for units, layer_name in ((256, "hidden_l1"), (64, "hidden_l2")):
        hidden = Dense(units, kernel_initializer="normal", activation="relu", name=layer_name)(hidden)
        hidden = BatchNormalization()(hidden)

    # Linear output layer: regression targets are unbounded.
    net_output = Dense(25, kernel_initializer="normal", name="nextState_l")(hidden)

    dynamics_model = Model(inputs=net_input, outputs=net_output)
    dynamics_model.compile(loss="mse", optimizer="rmsprop")
    return dynamics_model

def getPolicy():
    """Create a GaussianPolicy using its default constructor arguments."""
    policy = GaussianPolicy()
    return policy


def getAdvantages(rewards, discount_factor):
    """
    Compute the discounted reward-to-go for every step of one trajectory.

    For step i the result is
        rewards[i] + discount_factor * rewards[i+1] + discount_factor**2 * rewards[i+2] + ...
    accumulated along the first axis. No baseline is subtracted here.

    Parameters
    ----------
    rewards : array-like
        Per-step rewards; the first axis indexes time steps. Any trailing axes
        are carried through unchanged.
    discount_factor : float
        Multiplicative discount applied per step.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `rewards`. Floating-point inputs keep
        their dtype; any other dtype (e.g. integers) is returned as float64.
    """
    rewards = np.asarray(rewards)
    # Integer rewards would otherwise give an integer output array, and every
    # discounted value would be truncated on assignment.
    if np.issubdtype(rewards.dtype, np.inexact):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    advantages = np.zeros_like(rewards, dtype=out_dtype)

    running_discounted_advantages = 0
    # Walk backwards so each step reuses the already-discounted tail.
    for i in range(rewards.shape[0] - 1, -1, -1):
        running_discounted_advantages = running_discounted_advantages * discount_factor + rewards[i]
        advantages[i] = running_discounted_advantages
    return advantages


def getGreedyFactor(eps):
    """
    Greedy factor scheduler.

    Logistic curve in the episode index `eps` (scalar or NumPy array):
    it approaches 0.95 for large `eps` and equals 0.475 at eps == 100.
    """
    exponent = -0.02 * (eps - 100)
    denominator = 1 + np.exp(exponent)
    return 0.95 / denominator

In [None]:
# Scratch value; `a` is not referenced by any other cell visible in this notebook.
a = np.array([1])

In [None]:
# Spot-check the greedy-factor schedule at episode 7.
getGreedyFactor(7)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
x = np.arange(500)
plt.plot(x, getGreedyFactor(x), '-')


In [None]:
# ---------------------------------------------------------------------------
# Training run: collect trajectories (from V-REP or from the learned dynamics
# model), fit the dynamics model on them, and train the Gaussian policy.
# ---------------------------------------------------------------------------

# --- Run configuration -----------------------------------------------------
EPS_LENGTH = 200   # maximum number of steps collected per episode
NUM_EPS = 200      # number of episodes to run

NEW_MODEL = True   # True: build a fresh dynamics model and train it; False: load MODEL_FILE (no training)
TRAINING_SESSION_ROOT = "TrainingSessions/01/"
MODEL_FILE = TRAINING_SESSION_ROOT + "model.h5"
SAMPLE_FROM_MODEL = False  # flipped after every episode once i >= SWITCH_POINT_EP

NEW_POLICY = True  # True: create and train a policy; False: load it from POLICY_FILE (no training)
POLICY_FILE = TRAINING_SESSION_ROOT + "policy.h5"

SEED = 10

MODEL_TRAINING_LOG_FILE = TRAINING_SESSION_ROOT + "modeltraining.log"
POLICY_TRAINING_LOG_FILE = TRAINING_SESSION_ROOT + "policytraining.log"
REWARDS_SUM_LOG_FILE = TRAINING_SESSION_ROOT + "RewardSums.log"

DISCOUNT_FACTOR = 0.99

#EPSILON = 0.1

SWITCH_POINT_EP = 100

BATCH_SIZE = 128
EPOCHS = 100

# Seed both generators: the training noise below is drawn from np.random, which
# random.seed() alone does not affect.
random.seed(SEED)
np.random.seed(SEED)

# np.save() and open(..., 'a') do not create missing parent directories.
if not os.path.isdir(TRAINING_SESSION_ROOT):
    os.makedirs(TRAINING_SESSION_ROOT)

with VREPPushTaskEnvironment() as env:
    model = getModel() if NEW_MODEL else load_model(MODEL_FILE)
    policy = GaussianPolicy(model_file=POLICY_FILE) if NEW_POLICY else GaussianPolicy(model_file=POLICY_FILE, load_model=True)

    # obtain first state (a leading batch axis is added to what reset returns)
    current_state = env.reset(True)[np.newaxis, :]

    # Standardisation statistics for the dynamics model: computed from the first
    # non-empty episode for a new model, otherwise restored from disk.
    if NEW_MODEL:
        Xs_mean = None
        Xs_std = None
        ys_mean = None
        ys_std = None
    else:
        Xs_mean = np.load(TRAINING_SESSION_ROOT + "Xs_mean.npy")
        Xs_std = np.load(TRAINING_SESSION_ROOT + "Xs_std.npy")
        ys_mean = np.load(TRAINING_SESSION_ROOT + "ys_mean.npy")
        ys_std = np.load(TRAINING_SESSION_ROOT + "ys_std.npy")
    #if NEW_POLICY:
    #    policy_Xs_mean = None
    #    policy_Xs_std = None
    #    policy_ys_mean = None
    #     policy_ys_std = None
    #    policy_rs_mean = None
    #    policy_rs_std = None

    for i in range(NUM_EPS):
        print("{}th episode".format(i))
        states = [current_state]
        actions = []
        rewards = []
        # NOTE(review): greedy_factor is only printed; the branch that would use
        # it is disabled by `if False` below, so every action is random.
        greedy_factor = getGreedyFactor(i)
        print("Greedy Factor: {}".format(greedy_factor))
        # collect trajectory
        print(current_state.shape)
        for step in range(EPS_LENGTH):
            #if (not NEW_POLICY) or (np.random.rand() < greedy_factor):
            if False:
                #action = policy.sampleAction(invStandardise(current_state[np.newaxis, :], policy_Xs_mean, policy_Xs_std))
                #action = invStandardise(action, policy_ys_mean, policy_ys_std)
                action = policy.sampleAction(current_state)[np.newaxis, :]
            else:
                action = generateRandomVel(env.MAX_JOINT_VELOCITY_DELTA)[np.newaxis, :]
            # The model can only be queried once standardisation statistics exist;
            # until then fall back to the real environment.
            if SAMPLE_FROM_MODEL and Xs_mean is not None:
                X = np.concatenate([current_state, action], axis=1)
                # The dynamics model is fit on standardised inputs and standardised
                # [next_state - prev_state, reward] targets (see the training step
                # below), so standardise the query and undo the target scaling
                # before integrating the predicted state delta.
                # NOTE(review): the inverse assumes standardise(x, mean, std) is
                # (x - mean) / std -- confirm against common.py.
                pred_delta_reward = model.predict(standardise(X, Xs_mean, Xs_std))
                pred_delta_reward = pred_delta_reward * ys_std + ys_mean
                next_state = current_state + pred_delta_reward[:, :-1]
                reward = pred_delta_reward[:, -1]
            else:
                next_state, reward = env.step(action)
            # Normalise everything to one row per step so episodes can be stacked.
            action = action.reshape(1, -1)
            next_state = next_state.reshape(1, -1)
            reward = reward.reshape(1, -1)
            # Terminate if the current velocity is too big (avoid bad data)
            if np.any(np.abs(next_state[:, :6]) >= env.MAX_JOINT_VELOCITY ):
                break
            actions.append(action)
            states.append(next_state)
            rewards.append(reward)
            # proceed to next state
            current_state = next_state

        if not actions:
            # The episode ended on its very first step; np.concatenate would raise
            # on the empty lists, so there is nothing to log or train on.
            print("Episode {} produced no transitions; skipping training.".format(i))
        else:
            # states has one more row than actions/rewards (it includes the start state).
            states = np.concatenate(states, axis=0)
            actions = np.concatenate(actions, axis=0)
            rewards = np.concatenate(rewards, axis=0)
            advantages = getAdvantages(rewards, DISCOUNT_FACTOR)

            # Record collected reward
            with open(REWARDS_SUM_LOG_FILE, 'a') as f:
                f.write("{}, {}\n".format(i, np.sum(rewards)))
            if NEW_MODEL:
                # NOTE(review): this also runs for episodes rolled out from the
                # model itself, i.e. the model is fit on its own predictions --
                # confirm this is intended.
                # X = [current_states(,24), actions(,6)]
                # y = [next_states - current_states(,24), rewards]
                X = np.concatenate([states[:-1, :], actions], axis=1)
                y = np.concatenate([states[1:, :] - states[:-1, :], rewards], axis=1)
                # standardise training data (statistics are fixed by the first
                # non-empty episode and persisted for later runs)
                if Xs_mean is None:
                    Xs_mean = np.mean(X, axis=0)
                    Xs_std = np.std(X, axis=0)
                    ys_mean = np.mean(y, axis=0)
                    ys_std = np.std(y, axis=0)
                    np.save(TRAINING_SESSION_ROOT + "Xs_mean.npy", Xs_mean)
                    np.save(TRAINING_SESSION_ROOT + "Xs_std.npy", Xs_std)
                    np.save(TRAINING_SESSION_ROOT + "ys_mean.npy", ys_mean)
                    np.save(TRAINING_SESSION_ROOT + "ys_std.npy", ys_std)
                X = standardise(X, Xs_mean, Xs_std)
                y = standardise(y, ys_mean, ys_std)
                # add zero-mean gaussian noise
                X += np.random.normal(0, 0.05, X.shape)
                y += np.random.normal(0, 0.05, y.shape)
                # train model
                model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2, 
                         callbacks=[keras.callbacks.ModelCheckpoint(MODEL_FILE, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, period=10)
                                   ,keras.callbacks.CSVLogger(MODEL_TRAINING_LOG_FILE, append=True)
                                   ,keras.callbacks.TerminateOnNaN()])
            if NEW_POLICY:
                policy_X = states[:-1, :]
                policy_y = actions[:]
                # standardise training data
                """
                if policy_Xs_mean is None:
                    policy_Xs_mean = np.mean(policy_X, 0)
                    policy_Xs_std = np.std(policy_X, 0)
                    policy_ys_mean = np.mean(policy_y, 0)
                    policy_ys_std = np.std(policy_y, 0)
                    #policy_rs_mean = np.mean(advantages, 0)
                    #policy_rs_std = np.std(advantages, 0)
                policy_X = standardise(policy_X, policy_Xs_mean, policy_Xs_std)
                policy_y = standardise(policy_y, policy_ys_mean, policy_ys_std)
                """
                # Only the advantages are standardised (per episode); states and
                # actions are passed to the policy unscaled.
                advantages = standardise(advantages, np.mean(advantages), np.std(advantages))

                # train policy
                policy.train(policy_X, policy_y, advantages, POLICY_TRAINING_LOG_FILE)
        # Reset the scene
        current_state = env.reset(False)[np.newaxis, :]
        
        # From episode SWITCH_POINT_EP onward switch between sampling from model and sampling from real environment
        if i >= SWITCH_POINT_EP:
            SAMPLE_FROM_MODEL = not SAMPLE_FROM_MODEL
        
    # Persist the final model/policy state once all episodes are done.
    if NEW_MODEL:
        model.save(MODEL_FILE)
    if NEW_POLICY:
        policy.save(POLICY_FILE)

In [None]:
# Open a standalone environment for manual experiments. Unlike the training
# cell, it is not wrapped in a `with` block. The value returned by reset is
# kept in `states`.
env = VREPPushTaskEnvironment()
states = env.reset(True)

In [None]:
# Drop the notebook-level references to the model, policy and environment.
# Each `del` raises NameError if that name is not currently defined.
del model
del policy
del env

In [None]:
# NOTE(review): the preceding cell already deletes `env`, so running the cells
# top to bottom raises NameError here; this cell only works on its own.
del env

In [None]:
# Display what the Keras backend reports as its learning phase.
K.learning_phase()

In [None]:
# Display the current value of the notebook-level `states` variable.
states

In [None]:
# Drive the environment with 200 random actions and print what each step returns.
# NOTE(review): requires a live notebook-level `env`; the cleanup cells above
# delete it, so this fails with NameError when the notebook is run top to bottom.
# `time` is only needed by the commented-out sleep below.
import time
for _ in range(200):
    # The training cell unpacks env.step(...) as (next_state, reward), so
    # `states` presumably holds that pair here rather than a bare state.
    states = env.step(generateRandomVel(2)[np.newaxis, :])
    print(states)
    #time.sleep(0.05)