# In [None]:
import numpy as np
import pygame
import time
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear


class DeepQNetwork:
    """Epsilon-greedy Deep Q-Network agent trained with experience replay.

    The agent keeps a bounded replay memory of transitions, a small
    fully-connected Keras network mapping a state vector to one Q-value per
    action, and an exploration rate ``epsilon`` that is decayed after every
    training step performed by ``replayFunction``.
    """

    def __init__(self, action_space, state_space, learning_rate=0.001, *,
                 gamma=.95, epsilon=1.0, epsilon_min=.01, epsilon_decay=.90,
                 batch_size=64, memory_size=100000):
        """Create the agent and build its Q-network.

        Args:
            action_space: Number of discrete actions (network output size).
            state_space: Length of the state vector (network input size).
            learning_rate: Learning rate handed to the Adam optimizer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of choosing a random action.
            epsilon_min: Floor that ``epsilon`` is never decayed below.
            epsilon_decay: Factor ``epsilon`` is multiplied by per training step.
            batch_size: Number of transitions sampled per training step.
            memory_size: Maximum number of transitions kept in replay memory.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # Validate before building the model so a bad value fails fast.
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.epsilon_decay = epsilon_decay
        # deque with maxlen silently drops the oldest transition once full.
        self.memory = deque(maxlen=memory_size)
        self.action_space_size = action_space
        self.state_space_shape = state_space
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model: two ReLU hidden layers (20 and 25
            units) followed by a linear layer with one output per action,
            trained with mean-squared-error loss and the Adam optimizer.
        """
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space_shape, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space_size, activation=linear))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def rememberFunction(self, state, action, reward, nextState, done):
        """Append one transition ``(state, action, reward, nextState, done)`` to memory."""
        self.memory.append((state, action, reward, nextState, done))

    def act(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: A single state vector (no batch axis).

        Returns:
            A random action index with probability ``epsilon``; otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space_size)
        state = np.expand_dims(state, axis=0)  # the model expects a batch axis
        # verbose=0 keeps Keras from printing a progress bar on every step.
        actValues = self.model.predict(state, verbose=0)
        return np.argmax(actValues[0])

    def replayFunction(self):
        """Run one training step on a random mini-batch from replay memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        Otherwise it samples ``batch_size`` transitions, builds Q-learning
        targets ``reward + gamma * max_a Q(nextState, a)`` (the bootstrap term
        is dropped where ``done`` is true), fits the model for one epoch and
        decays ``epsilon`` towards ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        miniBatchVar = random.sample(self.memory, self.batch_size)
        states, actions, rewards, nextStates, dones = zip(*miniBatchVar)

        # Reshape to (batch, features) explicitly. np.squeeze would also drop
        # the batch axis when batch_size == 1 (or the feature axis when the
        # state has a single component), feeding the model a wrong shape.
        states = np.asarray(states).reshape(self.batch_size, -1)
        nextStates = np.asarray(nextStates).reshape(self.batch_size, -1)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

        nextQ = np.asarray(self.model.predict_on_batch(nextStates))
        targets = rewards + self.gamma * np.amax(nextQ, axis=1) * (1 - dones)

        # Copy into a fresh ndarray before editing in place; the object
        # returned by the backend may not be writable (assumption, depends on
        # the installed Keras/TensorFlow version).
        targetsFull = np.array(self.model.predict_on_batch(states))

        # Only the Q-value of the action actually taken is moved to its
        # target; the other outputs keep their predictions (zero error).
        targetsFull[np.arange(self.batch_size), actions] = targets

        self.model.fit(states, targetsFull, epochs=1, verbose=0)

        # Clamp so the decay can never push epsilon below its floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


def rewardFunction(nextStateInfo):
    """Compute the shaped reward from a step result.

    Args:
        nextStateInfo: Sequence whose first item is the next-state vector;
            only that vector's first component is inspected (the car's
            position when used with MountainCar).

    Returns:
        10 when the first component is at least 0.5 (also prints a message),
        ``(1 + value) ** 2`` when it is above -0.4, and 0 otherwise.
    """
    observation = nextStateInfo[0]
    position = observation[0]

    reachedGoal = position >= 0.5
    if reachedGoal:
        print("Car reached the top")
        return 10

    # Reward grows with the position once it is past -0.4; nothing before that.
    return (1 + position) ** 2 if position > -0.4 else 0


def trainDQNetwork(environment, agent, episode):
    """Train ``agent`` on ``environment`` and return the score of each episode.

    Every step renders the environment, drains the pygame event queue,
    lets the agent pick an action, replaces the environment's reward with
    ``rewardFunction``, stores the transition and runs one replay step.

    Args:
        environment: Environment whose ``reset()`` returns a sequence with the
            initial state first and whose ``step()`` returns a tuple starting
            with ``(nextState, reward, terminated, ...)``.
        agent: Object providing ``act``, ``rememberFunction`` and
            ``replayFunction``.
        episode: Number of episodes to run.

    Returns:
        List with the accumulated shaped reward of each episode.

    Raises:
        SystemExit: When the pygame window is closed during training.
    """
    episodeScores = []
    maxSteps = 1000  # hard cap on steps per episode
    for e in range(episode):
        state = np.array(environment.reset()[0])  # extracting the state array
        score = 0
        for _ in range(maxSteps):
            environment.render()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    # Same exception the quit() builtin raises, without
                    # relying on the site module having installed it.
                    raise SystemExit

            action = agent.act(state)
            stepResult = environment.step(action)
            nextState, _, terminated = stepResult[:3]
            # A 5-tuple step result carries a separate truncation flag in
            # position 3 (time limit reached); a 4-tuple has no such flag.
            truncated = bool(stepResult[3]) if len(stepResult) == 5 else False

            # The environment's own reward is discarded in favour of the
            # shaped reward computed from the step result.
            reward = rewardFunction(stepResult)
            score += reward

            nextState = np.array(nextState)
            # Store only true termination as `done`, so a time-limit cut-off
            # does not zero out the bootstrapped value in the replay targets.
            agent.rememberFunction(state, action, reward, nextState, terminated)
            state = nextState
            agent.replayFunction()

            # End the episode on termination or truncation; continuing to
            # step an environment that has signalled truncation is not valid.
            if terminated or truncated:
                print("Episode: {}/{}, score: {}".format(e, episode, score))
                break
        episodeScores.append(score)
    return episodeScores

def main():
    """Train a DQN agent on MountainCar-v0 and plot the per-episode scores.

    Creates the environment with on-screen rendering, trains for 60 episodes,
    always releases the environment and pygame afterwards, then shows a plot
    of score against episode number.
    """
    pygame.init()  # initialize Pygame
    environment = gym.make('MountainCar-v0', render_mode="human")  # render as human

    # The agent draws from both NumPy's RNG (exploration test) and the stdlib
    # RNG (random actions, replay sampling), so seed both.
    np.random.seed(10)
    random.seed(10)

    try:
        print(environment.observation_space)
        print(environment.action_space)
        agent = DeepQNetwork(
            environment.action_space.n,
            environment.observation_space.shape[0],
            learning_rate=0.001,
        )
        episodes = 60
        episodeScores = trainDQNetwork(environment, agent, episodes)
    finally:
        # Release the render window even if training fails or is aborted.
        environment.close()
        pygame.quit()

    plt.plot(range(1, episodes + 1), episodeScores)
    plt.show()


# Run the training only when executed as a script, not on import.
if __name__ == '__main__':
    main()
