In [None]:
'''
environment: Flappy Bird
    input: 5 states, reward: -5 die, +0.05 survive, +5 pass pipe
    output: 2 actions

framework: tf.keras 
model: 2 fully connected neural networks
    layers:
        Actor: 5 input | 64 relu | 64 relu | softmax -> policy
        Critic: 5 input | 64 relu | 64 relu | linear -> value
    params: state s, action a, reward R, value V
    hyperparams: discount 0.999, learning rate 0.001 and 0.0003 optimizer adam
    algorithm: A2C
        bellman Q = R + discount * Q', advantage A = Q - V, log policy
        Actor loss: mean (log policy * advantage)
        Critic loss: mean square (advantage)

result: learning-average and highest score/reward slowly increase over time
    test:
    ep400: actor loss begins to converge around +-1; critic loss starts to converge slowly, around 10-20 = able to cross a pipe smoothly
    ep1000: model scores the first pipe most of the time, is trying to reduce critic loss at the second pipe, and peaked at a score of 3
    ep3000: model scores 2 pipes most of the time, is trying to reduce critic loss at the third pipe, and peaked at a score of 4
    ep13000: 1-4 are the trending scores, with fewer 0s and more 5s and 6s, peaking at 11. Critic loss increases at pipes 1 and 2, but the model still passes them anyway. Loss at the other pipes is decreasing
    ep...: model stops learning. Trending scores are, in order, 2 3 4 1 5; peak at 12; the moving average varies around a score of 3
'''

In [None]:
from IPython import display
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pygame
import sys
from enviroment import FlappyBird
from modelA2C import A2C

In [None]:
# Continue A2C training from the saved networks and report progress after
# every episode.
env = FlappyBird()
scores, rewards = [], []  # per-episode history; the plotting cells read these

model = A2C()
# Use the previously saved actor/critic networks for this run.
model.actor = tf.keras.models.load_model('models/A2Cactor.keras')
model.critic = tf.keras.models.load_model('models/A2Ccritic.keras')

try:
    for episode in range(50000):
        # NOTE(review): 10000 is presumably a per-episode step limit -- confirm in modelA2C.
        model.trainingLoop(env, 10000)
        scores.append(model.score)
        rewards.append(model.reward)
        # Redraw the status block in place instead of scrolling the output.
        display.clear_output(wait=True)
        print(f"Episode {episode}:")
        print(f"current score: {model.score:.2f}, highest score: {max(scores):.2f}, avg score: {np.mean(scores):.2f}")
        print(f"current reward: {model.reward:.2f}, highest reward: {max(rewards):.2f}, avg reward: {np.mean(rewards):.2f}")
        print(f"actor loss: {model.actorLoss:.4f}, critic loss: {model.criticLoss:.4f}")
finally:
    # Release pygame even when training raises or is interrupted from the
    # notebook; the error itself still propagates.
    pygame.quit()
# Deliberately no sys.exit(): inside a notebook it only raises SystemExit,
# which marks this cell as failed and stops "Run All" before the plotting and
# model-saving cells get a chance to run.

In [None]:
# Reward EMA
# Smooth the per-episode rewards with an exponential moving average whose
# smoothing factor is 2 / (1 + 50).
R_EMA = []
factor = 2 / (1 + 50)
# None (not 0) marks "no previous value yet": 0 is a legitimate average, and
# using it as the sentinel would re-seed the EMA from the raw reward every
# time the running average happened to be exactly 0.
previous_reward = None
for reward in rewards:
    if previous_reward is None:
        # First sample seeds the average.
        new_reward = reward
    else:
        new_reward = previous_reward * (1 - factor) + reward * factor
    previous_reward = new_reward

    R_EMA.append(new_reward)

plt.plot(list(range(len(rewards))), rewards, alpha=0.5, label="Reward")
plt.plot(list(range(len(R_EMA))), R_EMA, label="Exponential Moving Average")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.title("Reward per episode")
plt.legend()

In [None]:
# Score EMA
# Smooth the per-episode scores with an exponential moving average whose
# smoothing factor is 2 / (1 + 50).
S_EMA = []
factor = 2 / (1 + 50)

# None (not 0) marks "no previous value yet". A score of 0 is a real data
# point; with 0 as the sentinel, a run of leading zero scores left the EMA
# "unseeded" and the first non-zero score was copied verbatim instead of being
# smoothed.
previous_score = None
for score in scores:
    if previous_score is None:
        # First sample seeds the average.
        new_score = score
    else:
        new_score = previous_score * (1 - factor) + score * factor
    previous_score = new_score

    S_EMA.append(new_score)

plt.plot(list(range(len(scores))), scores, alpha=0.5, label="Scores")
plt.plot(list(range(len(S_EMA))), S_EMA, label="Exponential Moving Average")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.title("Score per episode")
plt.legend()

In [None]:
# Frequency table of the recorded scores, most frequent first.
score_counts = Counter(scores)
trending = score_counts.most_common()
print("Trending Scores:")
for rank, entry in enumerate(trending, start=1):
    score, count = entry
    print(f"{rank}. Score: {score} | Count: {count}")

In [None]:
# Write the current actor and critic networks back to the files the training
# cell loads them from.
for network, path in (
    (model.actor, 'models/A2Cactor.keras'),
    (model.critic, 'models/A2Ccritic.keras'),
):
    network.save(path)