In [1]:
import shutup
# Call the `please()` hook of the third-party `shutup` package.
# NOTE(review): presumably this silences Python warnings for the rest of the
# session; confirm against the package documentation.
shutup.please()

In [5]:
import gfootball.env as football_env
import numpy as np

import gym
import copy
import tensorflow as tf

from keras.callbacks import TensorBoard
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras import backend as K
from keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.python.framework.ops import disable_eager_execution

# Run TensorFlow in graph (non-eager) mode for everything that follows.
disable_eager_execution()

# --- PPO hyper-parameters ---------------------------------------------------
clipping_val = 0.2      # clip range read by the PPO loss functions below
critic_discount = 0.5   # NOTE(review): not referenced by any code visible in this notebook
entropy_beta = 0.001    # weight of the entropy term in actor_ppo_loss
gamma = 0.99            # discount factor read by get_advantages
lmbda = 0.95            # GAE lambda read by get_advantages


# --- Environment ------------------------------------------------------------
env = gym.make('CartPole-v1')

state = env.reset()
state_dims = env.observation_space.shape

# Aliases of the observation shape / action count used throughout the notebook.
state_size = state_dims
n_actions = env.action_space.n
action_space = n_actions
input_shape = env.observation_space.shape

# All-zero placeholder arrays.
dummy_n = np.zeros((1, 1, n_actions))
dummy_1 = np.zeros((1, 1, 1))

# TensorBoard callback writing to ./logs/.
tensor_board = TensorBoard(log_dir='./logs/')


# --- Training bookkeeping / schedule ----------------------------------------
target_reached = False
best_reward = 0
iters = 0
max_iters = 50
training_batch=1000     # environment steps collected per update in run_batch
episode = 0
replay_count = 0
shuffle = False         # forwarded to Model.fit in replay
epochs = 10             # forwarded to Model.fit in replay



def get_advantages(values, masks, rewards):
    """Compute GAE returns and normalised advantages for one rollout.

    Reads the module-level ``gamma`` and ``lmbda``.

    Args:
        values: value estimates with one more entry than ``rewards``
            (``values[i + 1]`` is read for every step ``i``); must support
            slicing and subtraction from a NumPy array.
        masks: per-step multipliers applied to the bootstrap and to the
            running GAE term.
        rewards: per-step rewards.

    Returns:
        ``(returns, advantages)`` where ``returns`` is a list of
        ``gae + values[i]`` in time order and ``advantages`` is
        ``returns - values[:-1]`` standardised to zero mean / unit std.
    """
    n_steps = len(rewards)
    running_gae = 0
    newest_first = []
    for step in range(n_steps - 1, -1, -1):
        td_error = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        running_gae = td_error + gamma * lmbda * masks[step] * running_gae
        newest_first.append(running_gae + values[step])

    # Results were collected walking backwards; flip them into time order.
    returns = newest_first[::-1]

    adv = np.array(returns) - values[:-1]
    normalised_adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
    return returns, normalised_adv

def critic_PPO2_loss(values):
    """Build a clipped value loss closed over ``values``.

    Args:
        values: tensor of reference value estimates that predictions are
            clipped around.

    Returns:
        A Keras-compatible ``loss(y_true, y_pred)`` function. It returns half
        the mean of the element-wise maximum of the clipped and unclipped
        squared errors, with the clip range taken from the module-level
        ``clipping_val``.
    """
    def loss(y_true, y_pred):
        clip_range = clipping_val
        # Prediction restricted to stay within +/- clip_range of `values`.
        bounded_pred = values + K.clip(y_pred - values, -clip_range, clip_range)
        sq_err_bounded = (y_true - bounded_pred) ** 2
        sq_err_raw = (y_true - y_pred) ** 2

        worst_case = K.maximum(sq_err_bounded, sq_err_raw)
        return 0.5 * K.mean(worst_case)
    return loss

def critic_ppo_loss(y_true, y_pred):
    """Mean squared error between target values and critic predictions."""
    squared_error = (y_true - y_pred) ** 2
    return K.mean(squared_error)

def actor_ppo_loss(y_true, y_pred):
    """Clipped-surrogate PPO loss for a discrete policy (arXiv:1707.06347).

    ``y_true`` packs three things column-wise: the advantage (first column),
    the old policy's action probabilities (next ``action_space`` columns) and
    the one-hot taken action (remaining columns). ``y_pred`` is the current
    policy's action probabilities.

    Returns:
        The negated mean clipped surrogate objective minus an entropy term
        weighted by the module-level ``entropy_beta``.
    """
    clip_range = clipping_val
    entropy_weight = entropy_beta

    # Unpack the columns that replay() stacked into y_true.
    advantages = y_true[:, :1]
    old_predictions = y_true[:, 1:1 + action_space]
    taken_actions = y_true[:, 1 + action_space:]

    # Mask probabilities with the one-hot action, then clamp away from zero
    # so the logarithms below stay finite.
    new_prob = K.clip(taken_actions * y_pred, 1e-10, 1.0)
    old_prob = K.clip(taken_actions * old_predictions, 1e-10, 1.0)

    ratio = K.exp(K.log(new_prob) - K.log(old_prob))

    surrogate = ratio * advantages
    clipped_surrogate = K.clip(ratio, min_value=1 - clip_range, max_value=1 + clip_range) * advantages
    actor_loss = -K.mean(K.minimum(surrogate, clipped_surrogate))

    entropy_term = entropy_weight * K.mean(-(y_pred * K.log(y_pred + 1e-10)))

    return actor_loss - entropy_term

def actor_ppo_loss_continuous(y_true, y_pred):
    """Clipped-surrogate PPO loss for a continuous (Gaussian) policy.

    ``y_true`` packs, column-wise: the advantage (column 0), the taken action
    (next ``action_space`` columns) and the old log-probability of that
    action (the following column). ``y_pred`` is the predicted action mean.

    Returns:
        Scalar tensor: the negated mean of ``min(ratio * A, clip_bound(A))``.
    """
    # Slice the advantage as a 1-D column so it has the same shape (batch,)
    # as `ratio`. Slicing it as (batch, 1) would broadcast the product below
    # to (batch, batch), pairing every ratio with every advantage.
    advantages = y_true[:, 0]
    actions = y_true[:, 1:1 + action_space]
    logp_old_ph = y_true[:, 1 + action_space]

    LOSS_CLIPPING = clipping_val
    logp = gaussian_likelihood(actions, y_pred)

    ratio = K.exp(logp - logp_old_ph)

    p1 = ratio * advantages
    # Closed form of clip(ratio, 1 - eps, 1 + eps) * A once the minimum with
    # p1 is taken: the upper bound applies for A > 0, the lower one otherwise.
    p2 = tf.where(advantages > 0, (1.0 + LOSS_CLIPPING)*advantages, (1.0 - LOSS_CLIPPING)*advantages) # minimum advantage

    actor_loss = -K.mean(K.minimum(p1, p2))

    return actor_loss

def gaussian_likelihood(actions, pred): # for keras custom loss
    """Log-likelihood of ``actions`` under a diagonal Gaussian centred on ``pred``.

    The log standard deviation is fixed at -0.5 for each of the
    ``action_space`` dimensions.

    This is a module-level function. The stray ``self`` parameter left over
    from a class-based version was removed, because the only caller passes
    ``(actions, pred)`` and the old signature raised ``TypeError``.

    Args:
        actions: tensor of shape (batch, action_space).
        pred: tensor of predicted means, same shape as ``actions``.

    Returns:
        Tensor of shape (batch,): per-sample log-likelihood summed over the
        action dimensions.
    """
    log_std = -0.5 * np.ones(action_space, dtype=np.float32)
    pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
    return K.sum(pre_sum, axis=1)
    
def get_common_layer(X_input, model="MLP"):
    """Attach a shared feature-extraction trunk to ``X_input``.

    Args:
        X_input: Keras input tensor.
        model: ``"CNN"`` for two Conv1D/MaxPooling1D stages followed by
            Flatten, ``"LSTM"`` for two stacked LSTM layers, anything else
            for Flatten + a single Dense(512, relu) layer.

    Returns:
        The output tensor of the selected trunk.
    """
    # Shared CNN layers:
    if model=="CNN":
        # Imported here because the notebook's import cell only brings in the
        # 2-D variants; without this the branch raises NameError.
        from keras.layers import Conv1D, MaxPooling1D

        X = Conv1D(filters=64, kernel_size=6, padding="same", activation="tanh")(X_input)
        X = MaxPooling1D(pool_size=2)(X)
        X = Conv1D(filters=32, kernel_size=3, padding="same", activation="tanh")(X)
        X = MaxPooling1D(pool_size=2)(X)
        X = Flatten()(X)

    # Shared LSTM layers:
    elif model=="LSTM":
        # Same reason as above: LSTM is not part of the top-level imports.
        from keras.layers import LSTM

        X = LSTM(512, return_sequences=True)(X_input)
        X = LSTM(256)(X)

    # Shared Dense layers:
    else:
        X = Flatten()(X_input)
        X = Dense(512, activation="relu")(X)

    return X

def get_model_actor_simple(input_shape=input_shape, n_actions=n_actions, continuous=False):
    """Build and compile the actor (policy) network.

    Architecture: Dense 512 -> 256 -> 64 (ReLU, normal init with stddev 0.01)
    followed by an ``n_actions``-wide head.

    Args:
        input_shape: shape of a single observation.
        n_actions: width of the output layer.
        continuous: if True the head uses ``tanh``, otherwise ``softmax``.

    Returns:
        A compiled Keras ``Model`` (Adam, learning rate 0.00025).
    """
    X_input = Input(input_shape)
    # The hidden stack hangs directly off the input. An earlier
    # get_common_layer(X_input) call was removed: its result was overwritten
    # on the very next line, so those layers never reached the output.
    X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
    X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
    X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
    if continuous:
        output = Dense(n_actions,activation="tanh")(X)
    else:
        output = Dense(n_actions, activation="softmax")(X)

    model = Model(inputs = X_input, outputs = output)
    # NOTE(review): the discrete-action loss is compiled in even when
    # continuous=True; confirm whether a continuous loss was intended there.
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(loss=actor_ppo_loss, optimizer=Adam(learning_rate=0.00025))
    return model


def get_model_critic_simple(input_shape=input_shape):
    """Build and compile the critic (state-value) network.

    Architecture: Dense 512 -> 256 -> 64 (ReLU, ``he_uniform`` init) followed
    by a single linear output unit.

    Args:
        input_shape: shape of a single observation.

    Returns:
        A compiled single-input Keras ``Model`` trained with
        ``critic_ppo_loss`` (Adam, learning rate 0.00025).
    """
    X_input = Input(input_shape)
    # The hidden stack hangs directly off the input. An earlier
    # get_common_layer(X_input) call was removed: its result was overwritten
    # on the very next line, so those layers never reached the output.
    V = Dense(512, activation="relu", kernel_initializer='he_uniform')(X_input)
    V = Dense(256, activation="relu", kernel_initializer='he_uniform')(V)
    V = Dense(64, activation="relu", kernel_initializer='he_uniform')(V)
    value = Dense(1, activation=None)(V)

    model = Model(inputs=X_input, outputs = value)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(loss=critic_ppo_loss, optimizer=Adam(learning_rate=0.00025))
    return model
    
# Instantiate both networks with the builders' default arguments (the
# environment's observation shape and action count captured at definition time).
actor = get_model_actor_simple()
critic = get_model_critic_simple()

def act(state):
    """Sample an action from the actor's predicted distribution.

    Example: with probabilities ``[0.05, 0.85, 0.1]`` index 1 is the most
    likely draw, but any index can be returned.

    Args:
        state: batch containing a single observation, as accepted by
            ``actor.predict``.

    Returns:
        ``(action, action_onehot, prediction)``: the sampled action index,
        its one-hot encoding of length ``n_actions``, and the probability
        vector it was drawn from.
    """
    # First (and only) row of the batch prediction.
    probabilities = actor.predict(state)[0]
    chosen = np.random.choice(n_actions, p=probabilities)

    one_hot = np.zeros([n_actions])
    one_hot[chosen] = 1
    return chosen, one_hot, probabilities

def discount_rewards(reward, gamma=0.99):#gaes is better
    """Compute standardised gamma-discounted returns over one episode.

    Args:
        reward: sequence of per-step rewards (ints or floats).
        gamma: discount rate applied per step.

    Returns:
        Float NumPy array of the same length as ``reward`` holding the
        discounted returns shifted to zero mean and scaled by their standard
        deviation. An empty input yields an empty array.
    """
    rewards = np.asarray(reward)
    # Integer rewards would give an integer buffer from zeros_like. That
    # truncates every discounted return and makes the in-place normalisation
    # below raise a casting error, so promote to float first.
    if rewards.dtype.kind != "f":
        rewards = rewards.astype(np.float64)

    discounted_r = np.zeros_like(rewards)
    if rewards.size == 0:
        # Nothing to discount; avoid mean/std of an empty array.
        return discounted_r

    running_add = 0.0
    for i in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[i]
        discounted_r[i] = running_add

    discounted_r -= np.mean(discounted_r) # normalizing the result
    discounted_r /= (np.std(discounted_r) + 1e-8) # divide by standard deviation
    return discounted_r

def get_gaes(rewards, dones, values, next_values, gamma = 0.99, lamda = 0.9, normalize=True):
    """Generalised Advantage Estimation over a rollout.

    Args:
        rewards: per-step rewards.
        dones: per-step terminal flags; a true flag cuts both the bootstrap
            and the accumulation at that step.
        values: value estimates for each state (array-like supporting
            addition with a NumPy array).
        next_values: value estimates for each successor state.
        gamma: discount factor.
        lamda: GAE smoothing factor.
        normalize: standardise the advantages to zero mean / unit std.

    Returns:
        ``(gaes, target)`` as column vectors of shape ``(n, 1)``; ``target``
        is the un-normalised advantage plus ``values``.
    """
    # One-step TD errors.
    td_errors = []
    for reward, done, next_value, value in zip(rewards, dones, next_values, values):
        td_errors.append(reward + gamma * (1 - done) * next_value - value)
    td_errors = np.stack(td_errors)

    # Accumulate backwards; the last step keeps its plain TD error.
    gaes = td_errors.copy()
    step = len(td_errors) - 2
    while step >= 0:
        gaes[step] = gaes[step] + (1 - dones[step]) * gamma * lamda * gaes[step + 1]
        step -= 1

    # Targets are computed before normalisation on purpose.
    target = gaes + values
    if normalize:
        gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
    return np.vstack(gaes), np.vstack(target)

def test_reward():
    """Run one short greedy evaluation episode and return its total reward.

    The actor's most probable action is taken at every step. The episode is
    cut off once more than 20 steps have been taken, so the result is a
    quick sanity check rather than a full-episode score.

    Returns:
        Sum of rewards collected before termination or the step cap.
    """
    state = env.reset()
    done = False
    total_reward = 0

    limit = 0
    while not done:
        # Add a batch axis with NumPy. The previous version built a symbolic
        # tensor and called an undefined `model_actor` with four extra dummy
        # inputs, which does not match the single-input `actor` model.
        state_input = np.expand_dims(state, 0)
        action_probs = actor.predict(state_input)
        action = int(np.argmax(action_probs))
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
        limit += 1
        if limit > 20:
            break
    print('testing...', total_reward)
    return total_reward

def critic_predict(state):
    """Return the critic's value estimates for a batch of states.

    Args:
        state: array of observations, one row per state.

    Returns:
        Whatever ``critic.predict`` returns for that batch (one value per
        row for the single-output critic built in this notebook).
    """
    # The critic is built with a single input, so pass the states alone. The
    # extra all-zeros array that used to be sent along belongs to a two-input
    # critic variant and does not match this model.
    return critic.predict(state)

def replay(states, actions, rewards, predictions, dones, next_states):
    """Fit the actor and critic on one collected batch of transitions.

    Args:
        states: list of observation arrays (stacked row-wise).
        actions: list of one-hot action vectors.
        rewards: list of per-step rewards.
        predictions: list of the policy's probability vectors at act time.
        dones: list of per-step terminal flags.
        next_states: list of successor observation arrays.

    Side effects:
        Trains ``actor`` and ``critic`` in place for ``epochs`` epochs each
        and prints the mean loss of both fits.
    """
    # Bring the per-step memories into 2-D batch form.
    state_batch = np.vstack(states)
    next_state_batch = np.vstack(next_states)
    action_batch = np.vstack(actions)
    prediction_batch = np.vstack(predictions)

    # Critic estimates for the visited states and their successors.
    state_values = critic_predict(state_batch)
    next_state_values = critic_predict(next_state_batch)

    # GAE advantages and value targets.
    advantages, target = get_gaes(
        rewards,
        dones,
        np.squeeze(state_values),
        np.squeeze(next_state_values),
    )

    # The custom actor loss receives everything through y_true, so pack
    # advantages, old predictions and taken actions side by side; the loss
    # unpacks them again by column.
    y_true = np.hstack([advantages, prediction_batch, action_batch])

    # Train the actor, then the critic, reporting the mean loss of each fit.
    print()

    actor_history = actor.fit(state_batch, y_true, epochs=epochs, verbose=0, shuffle=shuffle)
    print('actor loss',np.mean(actor_history.history['loss']))
    print()

    critic_history = critic.fit(state_batch, target, epochs=epochs, verbose=0, shuffle=shuffle)
    print('critic loss',np.mean(critic_history.history['loss']))
    print()

def run_batch(max_episodes=1000): # train every training_batch environment steps
    """Collect rollouts from ``env`` and train with PPO until enough episodes ran.

    Each iteration gathers ``training_batch`` environment steps (episodes are
    restarted transparently inside a batch), then calls ``replay`` once on
    the gathered transitions. After every finished episode the mean score of
    the last 50 episodes is printed.

    Args:
        max_episodes: stop after the first update at which at least this
            many episodes have finished. Defaults to the previously
            hard-coded 1000.

    Side effects:
        Trains the module-level ``actor``/``critic`` and closes ``env`` when
        done.
    """
    scores_ = []
    averages_ = []
    episode = 0
    score = 0
    state = np.reshape(env.reset(), [1, state_size[0]])
    while True:
        # Instantiate or reset games memory
        states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
        for t in range(training_batch):
            # Actor picks an action
            action, action_onehot, prediction = act(state)
            # Retrieve new state, reward, and whether the state is terminal
            next_state, reward, done, _ = env.step(action)
            # Reshape once; the same row vector is stored and becomes the
            # next current state.
            next_state = np.reshape(next_state, [1, state_size[0]])
            # Memorize the transition for training
            states.append(state)
            next_states.append(next_state)
            actions.append(action_onehot)
            rewards.append(reward)
            dones.append(done)
            predictions.append(prediction)
            # Update current state
            state = next_state
            score += reward
            if done:
                episode += 1
                scores_.append(score)
                recent = scores_[-50:]
                averages_.append(sum(recent) / len(recent))

                print('score', averages_[-1])

                # Start a fresh episode inside the same batch.
                state = np.reshape(env.reset(), [1, state_size[0]])
                score = 0

        replay(states, actions, rewards, predictions, dones, next_states)
        if episode >= max_episodes:
            break
    env.close()




In [None]:

run_batch() # train with PPO; the networks are updated after every training_batch environment steps

score 17.0
score 24.5
score 24.0
score 22.5
score 22.6
score 20.833333333333332
score 24.285714285714285
score 23.75
score 23.333333333333332
score 21.8
score 20.90909090909091
score 21.5
score 20.923076923076923
score 20.928571428571427
score 21.533333333333335
score 22.25
score 21.88235294117647
score 22.166666666666668
score 21.526315789473685
score 21.65
score 22.857142857142858
score 22.59090909090909
score 22.043478260869566
score 21.833333333333332
score 21.48
score 21.615384615384617
score 21.59259259259259
score 21.678571428571427
score 22.03448275862069
score 22.133333333333333
score 22.483870967741936
score 22.6875
score 22.272727272727273
score 23.294117647058822
score 23.057142857142857
score 23.77777777777778
score 24.135135135135137
score 24.07894736842105
score 23.974358974358974
score 24.2
score 23.951219512195124

actor loss -0.014927236577868463

critic loss 17.086272804641723

score 23.857142857142858
score 23.72093023255814
score 24.068181818181817
score 23.8
score

In [None]:
# Display the observation shape taken from env.observation_space.shape.
state_size

In [None]:
# Display the Python type of the critic model object.
type(critic)