In [None]:
# environment: flappy bird.

# model: reinforcement learning A2C. 4 states (normalized), 2 actions
# actor: 4 input | 64 relu | 64 relu | softmax. critic: 4 input | 64 relu | 64 relu | linear
# params: discount 0.999, optimizer adam-learning rate 0.001, no e-greedy
# reward: -5 die, +0.05 survive, +5 pass pipe

# result: moving average of 2 points at ep1000, peak of 10 points at ep1400

In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from IPython import display

from envBird import FlappyBird
from modelBird import A2Cmodel

In [None]:
# Training run: build the environment and the A2C model, optionally resume
# from saved networks, then train while recording per-episode score/reward.
ACTOR_CHECKPOINT = 'actor.keras'
CRITIC_CHECKPOINT = 'critic.keras'
N_EPISODES = 2000

env = FlappyBird()
scores, rewards = [], []  # one entry per completed episode

# Per the notebook header: 4 state inputs, 2 actions, discount 0.999.
model = A2Cmodel(4, 2, 0.999)

# Resume from the saved networks only when BOTH checkpoints are present.
# They are written by the final cell of this notebook, so on a fresh
# checkout they do not exist yet and an unconditional load would abort the
# run before any training happens.
if os.path.exists(ACTOR_CHECKPOINT) and os.path.exists(CRITIC_CHECKPOINT):
    model.actor = tf.keras.models.load_model(ACTOR_CHECKPOINT)
    model.critic = tf.keras.models.load_model(CRITIC_CHECKPOINT)
else:
    # NOTE(review): assumes A2Cmodel builds its own actor/critic in its
    # constructor - confirm in modelBird.
    print("No saved actor/critic checkpoints found; using the model as constructed.")

for episode in range(N_EPISODES):
    try:
        # NOTE(review): 10000 is presumably a per-episode step cap - confirm
        # against A2Cmodel.training_loop.
        model.training_loop(env, 10000)
        scores.append(model.score)
        rewards.append(model.reward)

        # Replace the previous episode's report instead of scrolling.
        display.clear_output(wait=True)
        print(f"Episode {episode}:")
        print(f"current score: {model.score:.2f}, highest score: {max(scores):.2f}, avg score: {np.mean(scores):.2f}")
        print(f"current reward: {model.reward:.2f}, highest reward: {max(rewards):.2f}, avg reward: {np.mean(rewards):.2f}")
    except KeyboardInterrupt:
        # A single interrupt only abandons the episode in progress and pauses
        # for 5 s. The sleep runs outside the `try`, so a second interrupt
        # during the pause propagates and stops the whole loop.
        print("Interrupted - resuming in 5 s; interrupt again now to stop training.")
        time.sleep(5)

In [None]:
# EMA: smooth the noisy per-episode rewards with an exponential moving
# average and plot both curves.
EMA_SPAN = 50
factor = 2 / (1 + EMA_SPAN)  # smoothing factor: weight given to the newest sample


def exponential_moving_average(values, alpha):
    """Return the exponential moving average of ``values``.

    The first output equals the first input; every later output is
    ``previous * (1 - alpha) + value * alpha``.

    Args:
        values: iterable of numbers, in chronological order.
        alpha: weight of the newest sample.

    Returns:
        A list with one smoothed value per input value (empty for empty input).
    """
    smoothed = []
    previous = None  # None marks "no sample seen yet"
    for value in values:
        # Test against None rather than 0: a running average that is exactly
        # 0 is a legitimate value and must not restart the smoothing.
        if previous is None:
            current = value
        else:
            current = previous * (1 - alpha) + value * alpha
        previous = current
        smoothed.append(current)
    return smoothed


EMA = exponential_moving_average(rewards, factor)

fig, ax = plt.subplots()
ax.plot(range(len(rewards)), rewards, alpha=0.5, label="Reward")
ax.plot(range(len(EMA)), EMA, label="Exponential Moving Average")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.legend()
plt.show()

In [None]:
# Persist both networks to disk, each under its own .keras file.
for network, checkpoint_path in (
    (model.actor, 'actor.keras'),
    (model.critic, 'critic.keras'),
):
    network.save(checkpoint_path)