In [None]:
'''
environment: flappy bird
    input: 5 states, reward: -5 die, +0.05 survive, +5 pass pipe
    output: 2 actions

framework: tf.keras 
model: 2 fully connected neural networks
    layers:
        Actor: 5 input | 64 relu | 64 relu | softmax -> policy
        Critic: 5 input | 64 relu | 64 relu | linear -> value (value[action] is probs of action)
    params: state s, action a, reward R, value V 
    hyperparams: discount 0.999, learning rate 0.0003 optimizer adam, clipping epsilon: 0.2 (best)
    algorithm: PPO
        bellman Q = R + discount * Q', advantage A = Q - V, ratio = new_probs/old_probs, clipped ratio
        Actor loss: mean(min(ratio * advantage, clipped ratio * advantage))
        Critic loss: mean square (advantage)

result: performs almost exactly like A2C... or at least shows no significant change
    test:
    ep 250: model passes the first pipe for the first time
    ep 450: model starts passing the first pipe more often, peak score 2
    ep 1000: model passes the first pipe most of the time, trying to reduce... (well, it is exactly the same)
    ep ~5000: typical scores are 1-4 with a few 5s, peak score 9
'''

In [None]:
from IPython import display
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pygame
import sys
from enviroment import FlappyBird
from modelPPO import PPO

In [None]:
# Train the PPO agent on Flappy Bird, resuming from the saved checkpoints and
# recording the per-episode score and reward for the analysis cells below.

# Number of episodes to run in this training session.
N_EPISODES = 10000

env = FlappyBird()
scores, rewards = [], []

# Resume from previously saved networks.
# NOTE(review): both checkpoint files must already exist under models/ --
# on a fresh checkout this cell cannot run until they have been created.
model = PPO()
model.actor = tf.keras.models.load_model('models/PPOactor.keras')
model.critic = tf.keras.models.load_model('models/PPOcritic.keras')

try:
    for episode in range(N_EPISODES):
        # NOTE(review): the second argument is presumably a per-episode step cap --
        # confirm against modelPPO.PPO.training_loop.
        model.training_loop(env, 10000)
        scores.append(model.score)
        rewards.append(model.reward)

        # Replace the previous status report instead of appending to it.
        display.clear_output(wait=True)
        print(f"Episode {episode}:")
        print(f"current score: {model.score:.2f}, highest score: {max(scores):.2f}, avg score: {np.mean(scores):.2f}")
        print(f"current reward: {model.reward:.2f}, highest reward: {max(rewards):.2f}, avg reward: {np.mean(rewards):.2f}")
        print(f"actor loss: {model.actor_loss:.4f}, critic loss: {model.critic_loss:.4f}")
finally:
    # Release pygame even when training is interrupted or raises, so the
    # collected scores/rewards can still be analysed and the models saved.
    pygame.quit()

# Deliberately no sys.exit() here: inside a notebook it raises SystemExit,
# which marks this cell as failed and stops "Run All" before the plotting,
# tally and save cells get a chance to execute.

In [None]:
# Reward EMA
# Smooth the per-episode rewards with an exponential moving average.
# The first sample seeds the average; every later sample is blended in with
# weight `factor`.
EMA_SPAN = 50
factor = 2 / (1 + EMA_SPAN)

# None (rather than 0) marks "no sample seen yet": an average that happens to
# be exactly 0 is a legitimate value and must not restart the smoothing.
previous_reward = None
R_EMA = []
for reward in rewards:
    if previous_reward is None:
        new_reward = reward
    else:
        new_reward = previous_reward * (1 - factor) + reward * factor
    previous_reward = new_reward

    R_EMA.append(new_reward)

# Plot the raw per-episode rewards with their moving average on top.
# With no explicit x values, matplotlib plots each series against its index
# (0..N-1), i.e. the episode number.
fig, ax = plt.subplots()
ax.plot(rewards, alpha=0.5, label="Reward")
ax.plot(R_EMA, label="Exponential Moving Average")
ax.set(title="Reward per episode", xlabel="Episode", ylabel="Reward")
ax.legend()
# Render explicitly so the cell does not echo the Legend object's repr.
plt.show()

In [None]:
# Tally how often each episode score occurred and list the scores from the
# most to the least frequent.
score_counts = Counter(scores)
trending = score_counts.most_common()

print("Trending Scores:")
for rank, entry in enumerate(trending, start=1):
    score, count = entry
    print(f"{rank}. Score: {score} | Count: {count}")

In [None]:
# Write both networks back to the checkpoint paths the training cell loads from.
checkpoints = (
    (model.actor, 'models/PPOactor.keras'),
    (model.critic, 'models/PPOcritic.keras'),
)
for network, path in checkpoints:
    network.save(path)