In [None]:
# environment: Flappy Bird
    # 4 states (normalized), 2 actions
    #  reward: -5 die, +0.05 survive, +5 pass pipe

# framework: tf.keras
# model: RL PPO (2 NN)
    # method: the policy ratio is the ratio between two predictions of the same actor model; it is clipped to prevent overshooting
    # Advantage = Q - value where Q = R + discount * Q' 
    # Actor: 4 inputs | 64 relu | 64 relu | softmax -> policy. loss: mean(min(policy ratio * advantage, clipped ratio * advantage))
    # Critic: 4 inputs | 64 relu | 64 relu | linear -> value. loss: mean squared advantage
    # hyperparams: discount 0.999; optimizer Adam, learning rate 0.001; clipping epsilon 0.2 (best)

# result:

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import time
from enviroment import FlappyBird
from modelPPO import PPO

In [None]:
# Training configuration -- tune here rather than inside the loop.
DISCOUNT = 0.999
NUM_EPISODES = 2000
# Second argument to PPO.training_loop; presumably a per-episode step cap --
# TODO confirm against modelPPO.
MAX_STEPS_PER_EPISODE = 10000
ACTOR_CHECKPOINT = 'models/PPOactor.keras'
CRITIC_CHECKPOINT = 'models/PPOcritic.keras'

env = FlappyBird()
scores, rewards = [], []  # per-episode history, consumed by the plotting cells

# Resume from the saved actor/critic instead of the freshly built networks.
model = PPO(DISCOUNT)
model.actor = tf.keras.models.load_model(ACTOR_CHECKPOINT)
model.critic = tf.keras.models.load_model(CRITIC_CHECKPOINT)

# The interrupt handler wraps the whole loop: a single kernel interrupt stops
# training cleanly and keeps everything collected so far, so the plotting and
# saving cells can still run. (Handling it inside the loop swallowed the
# interrupt and silently resumed training after a short sleep.)
try:
    for episode in range(NUM_EPISODES):
        model.training_loop(env, MAX_STEPS_PER_EPISODE)
        scores.append(model.score)
        rewards.append(model.reward)

        # Replace the previous episode's report instead of flooding the output.
        display.clear_output(wait=True)
        print(f"Episode {episode}:")
        print(f"current score: {model.score:.2f}, highest score: {max(scores):.2f}, avg score: {np.mean(scores):.2f}")
        print(f"current reward: {model.reward:.2f}, highest reward: {max(rewards):.2f}, avg reward: {np.mean(rewards):.2f}")
except KeyboardInterrupt:
    print(f"Training interrupted after {len(scores)} completed episodes.")

In [None]:
# Reward EMA
# Smooth the per-episode rewards with an exponential moving average so the
# training trend is visible through the episode-to-episode noise.
previous_reward = None  # None (not 0) marks "no EMA yet": 0 is a legitimate reward/EMA value
R_EMA = []
factor = 2 / (1+50)  # smoothing factor; matches the conventional alpha = 2 / (span + 1) with span = 50
for reward in rewards:
    if previous_reward is None:
        # Seed the average with the first observation, whatever its value.
        new_reward = reward
    else:
        new_reward = previous_reward * (1 - factor) + reward * factor
    previous_reward = new_reward

    R_EMA.append(new_reward)

# x defaults to the episode index 0..len-1 when only y values are given.
plt.plot(rewards, alpha=0.5, label="Reward")
plt.plot(R_EMA, label="Exponential Moving Average")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.legend()
plt.show()

In [None]:
# Score EMA
# Smooth the per-episode scores with an exponential moving average so the
# training trend is visible through the episode-to-episode noise.
previous_score = None  # None (not 0) marks "no EMA yet": a score of 0 must not re-seed the average
S_EMA = []
factor = 2 / (1+50)  # smoothing factor; matches the conventional alpha = 2 / (span + 1) with span = 50
for score in scores:
    if previous_score is None:
        # Seed the average with the first observation, even when it is 0.
        new_score = score
    else:
        new_score = previous_score * (1 - factor) + score * factor
    previous_score = new_score

    S_EMA.append(new_score)

# x defaults to the episode index 0..len-1 when only y values are given.
plt.plot(scores, alpha=0.5, label="Score")
plt.plot(S_EMA, label="Exponential Moving Average")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.legend()
plt.show()

In [None]:
# Persist both trained networks as .keras files, actor first, then critic.
# NOTE(review): these relative paths write into the current working directory,
# whereas the training cell loads from 'models/PPOactor.keras' and
# 'models/PPOcritic.keras' -- confirm whether the checkpoints are meant to be
# written back under models/ so the next run resumes from them.
for network, checkpoint_path in (
    (model.actor, 'PPOactor.keras'),
    (model.critic, 'PPOcritic.keras'),
):
    network.save(checkpoint_path)