# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import matplotlib.pyplot as plt
import time
import numpy as np
import gym
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
#from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Activation
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
import os
    
class DQNAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and replay memory.

    The agent owns a Keras model that maps a state (one row of length
    ``state_size``) to one Q-value per action, a bounded replay memory of
    transitions, and the exploration rate ``epsilon`` that is decayed every
    time :meth:`replay` trains.

    Args:
        state_size: Length of the observation vector (model input width).
        action_size: Number of discrete actions (model output width).
        arqTrained: Path of a saved Keras model. If that file exists it is
            loaded; otherwise a fresh network is built.
    """

    def __init__(self, state_size, action_size, arqTrained):
        self.state_size = state_size    # observation vector length
        self.action_size = action_size  # number of discrete actions
        self.gamma = 0.95               # discount factor for future rewards
        self.learning_rate = 0.001
        self.arqTrained = arqTrained
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.epsilon = 0.7              # probability of taking a random action
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.loss = []                  # per-sample losses of the last replay()

        # Rewards appended by remember(). The agent itself never clears this
        # list; callers that want per-episode statistics must reset it.
        self.ep_rewards = []

        # Check the same path that load_model() receives, so absolute paths
        # work as well as paths relative to the working directory.
        if os.path.isfile(self.arqTrained):
            print("LOAD existing keras model....")
            self.model = load_model(self.arqTrained)
            # summary() prints by itself and returns None; do not print it.
            self.model.summary()
        else:
            print("Build a new model")
            self.model = self.buildNetworkModel()

    def buildNetworkModel(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model: two 128-unit ReLU layers followed
            by a linear layer with ``action_size`` outputs, trained with MSE
            loss and Adam at ``self.learning_rate``.
        """
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `lr` is the deprecated spelling of this argument; `learning_rate`
        # is the supported keyword in tf.keras.
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in replay memory and log its reward."""
        self.memory.append((state, action, reward, next_state, done))
        self.ep_rewards.append(reward)

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: Model input for a single state; the first row of the
                prediction is used.

        Returns:
            An action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the model on a random sample of remembered transitions.

        Each sampled transition is fitted individually (one ``fit`` call per
        sample), so later targets are computed with the already-updated
        model. After training, ``epsilon`` is decayed once, never dropping
        below ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample. If fewer are
                available, all remembered transitions are used.

        Returns:
            ``(history, meanLoss)``: the Keras ``History`` of the last fit
            and the mean loss over the sampled transitions. When there is
            nothing to train on, returns ``(None, nan)`` and leaves
            ``epsilon`` untouched.
        """
        self.loss = []

        # random.sample() raises ValueError when asked for more items than
        # are stored, so cap the request at what memory actually holds.
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return None, float("nan")

        minibatch = random.sample(self.memory, sample_size)
        history = None

        for state, action, reward, next_state, done in minibatch:
            # Bellman target: immediate reward, plus the discounted best
            # next-state value unless the transition was terminal.
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)

            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            history = self.model.fit(state, target_f, epochs=1, verbose=0)

            # Store the loss of this training step.
            self.loss.append(history.history['loss'])

        meanLoss = np.mean(self.loss)

        # Decay exploration, clamped so it never falls below the floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)
        return history, meanLoss

    def save(self, name):
        """Save the current model to the path ``name``."""
        self.model.save(name)
 
    
if __name__ == '__main__':
    # Training script: runs EPISODES episodes of LunarLander-v2 with a
    # DQNAgent, trains on one replay batch at the end of every episode,
    # saves the model whenever an episode scores >= 200, then plots the
    # loss, epsilon and reward curves and saves the reward history.

    viewRendering = input("Visualize rendering in Training ? [y/n]:  ")

    RENDER_REWARD_MIN = 5000
    # Flag for rendering the environment.
    RENDER_ENV = viewRendering == 'y'

    arqTrained = "lunar-lander_solved.h5"

    batch_size = 16
    EPISODES = 9000    # Number of episodes

    # Seed both generators the agent draws from (np.random for the
    # exploration test, random for action choice and replay sampling),
    # and do it before the agent is created.
    seed = 666
    np.random.seed(seed)
    random.seed(seed)

    env = gym.make('LunarLander-v2')
    # Work on the raw environment; this script applies its own 15 s
    # wall-clock cap per episode below.
    env = env.unwrapped

    # Observation and Action array length
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size, arqTrained)

    rewards_over_time = []  # total reward of each episode
    error = []              # mean replay loss after each episode
    epsilon = []            # exploration rate after each episode
    rew_var = []            # variance of the step rewards of each episode
    rew_mean = []           # mean of the step rewards of each episode
    mean_100 = []           # mean episode reward, sampled every 100 episodes

    w = 0  # number of episodes with total reward >= 200

    try:
        # Start running the episodes
        for i in range(EPISODES):
            state = env.reset()
            state = state.reshape(1, -1)

            # remember() appends every step reward to agent.ep_rewards and
            # nothing else clears it, so reset it here; otherwise the totals
            # below would accumulate across all episodes.
            agent.ep_rewards = []
            start = time.time()

            while True:
                if RENDER_ENV:
                    env.render()

                action = agent.choose_action(state)
                next_state, reward, done, _ = env.step(action)

                # NOTE(review): the environment's terminal reward is replaced
                # by a flat -10 on every terminal step, whatever the outcome
                # of the episode. Confirm this is intended for LunarLander.
                reward = reward if not done else -10

                next_state = next_state.reshape(1, -1)

                agent.remember(state, action, reward, next_state, done)
                state = next_state

                # TODO: review carefully from here on.
                end = time.time()
                time_space = end - start

                # Abort episodes that run longer than 15 s of wall-clock time.
                if time_space > 15:
                    done = True

                # Sum the episode rewards; give up on hopeless episodes.
                ep_rew_total = sum(agent.ep_rewards)
                if ep_rew_total < -300:
                    done = True

                if not done:
                    continue

                # Per-episode statistics are only needed once, at the end.
                rewards_over_time.append(ep_rew_total)
                rew_mean.append(np.mean(agent.ep_rewards))
                rew_var.append(np.var(agent.ep_rewards))
                max_reward = np.max(rewards_over_time)
                episode_max = np.argmax(rewards_over_time)
                if ep_rew_total >= 200:
                    w = w + 1
                    agent.save(arqTrained)

                print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                print("Episode: ", i)
                print("Time: ", np.round(time_space, 2), "secs")
                print("Reward:", ep_rew_total)
                print("Maximum Reward: " + str(max_reward) + "  on Episode: " + str(episode_max))
                print("Times win: " + str(w))

                if i % 100 == 0:
                    print("Mean reward of the past 100 episodes: ", str(np.mean(rewards_over_time[-100:])))
                    mean_100.append(np.mean(rewards_over_time[-100:]))

                # Train the network once the memory holds a full batch;
                # sampling more transitions than are stored would raise.
                if len(agent.memory) >= batch_size:
                    _, meanLoss = agent.replay(batch_size)
                else:
                    meanLoss = float("nan")

                epsilon.append(agent.epsilon)
                error.append(meanLoss)

                if max_reward > RENDER_REWARD_MIN:
                    RENDER_ENV = True

                break
    finally:
        # Release the environment (and any render window) even on errors.
        env.close()

    # Each plot gets its own figure so curves never land on a shared canvas.
    plt.figure(1)
    plt.plot(error)
    plt.xlabel("Episodes")
    plt.ylabel("Average Error")
    plt.title("Average_Loss Vs Episodes")
    plt.show()

    plt.figure(2)
    plt.plot(epsilon)
    plt.xlabel("Episodes")
    plt.ylabel("Epsilon value")
    plt.title("Epsilon Vs Episodes")
    plt.show()

    np.save("rewards_over_time", rewards_over_time)
    np.save("mean100", mean_100)

    plt.figure(3)
    plt.plot(rewards_over_time, label="Rewards")
    plt.plot(rew_mean, label="Mean")
    plt.plot(rew_var, label="Variance")
    plt.xlabel("Episodes")
    plt.ylabel("Rewards")
    plt.title("Rewards per Episode")
    plt.legend(loc=0)
    plt.show()