# Disclaimer
***
This project was conducted for University of Toronto - School of Continuing Studies (SCS) as part of the Intelligent Agents & Reinforcement Learning - 3547 Course.
***
**Project Title:** Safe Landings In Deep Space<br><br>
**Team Members:** Adnan Lanewala, Nareshkumar Patel, Nisarg Patel<br><br>
**Course:** UFT 3547 - Intelligent Agents & Reinforcement Learning<br><br>
**Instructor:** Larry Simon<br><br>
**Session:** December 2019<br><br>
**Open AI Gym Environment:** https://github.com/openai/gym<br><br>
**Lunar Lander:** http://gym.openai.com/envs/LunarLander-v2/<br><br>
**DQN Algorithm Reference:** https://arxiv.org/pdf/1312.5602.pdf

# Import All Dependencies
***

In [None]:
### IMPORT ALL LIBRARIES AND FUNCTIONS TO BE USED ###
import gym # Lunar Lander environment
import numpy as np
from collections import deque
import random
import os

### KERAS IMPORTS ###
import keras
from keras.utils import plot_model
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Model


In [None]:
print("Gym version:",gym.__version__) # Print the Gym version and ensure it is > 0.15.4
print("Keras version:",keras.__version__) # Print the Keras version

# Setup
***

In [None]:
# Loop bounds referenced by the training loop in the Main cell
# (steps per episode and number of episodes).
MAX_EPISODES = 500
MAX_STEPS = 3000

# Working directory of the running process; weights are stored beneath it.
ROOT_PATH = os.getcwd()
print("Root Path:", ROOT_PATH)

# Folder used for saving and loading model weights.
WEIGHTS_PATH = os.path.join(ROOT_PATH, "modelweights")
print("Weights Path:", WEIGHTS_PATH)

# Q-Learning
***

In [None]:
class LunarLanderDQNAgent:
    """Deep Q-Learning agent: a Keras Q-network, a replay memory and an
    epsilon-greedy action policy.

    Attributes:
        state_size: Number of input features fed to the Q-network.
        action_size: Number of discrete actions (outputs of the Q-network).
        memory: Bounded deque of (state, action, reward, next_state, done)
            tuples; once 2000 entries are stored the oldest are discarded.
        gamma: Discount rate applied to the estimated next-state value.
        learning_rate: Step size handed to the Adam optimizer.
        epsilon: Current probability of choosing a random action.
        epsilon_min: Floor that epsilon is not decayed below.
        epsilon_decay: Factor epsilon is multiplied by after each replay.
        model: Compiled Keras model producing one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Store the problem dimensions, set hyper-parameters and build the model.

        Args:
            state_size: Dimension of the observation vector.
            action_size: Number of discrete actions available.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)

        # Hyper-parameters

        # Discount rate. If small, the agent looks for immediate reward.
        # If big, the agent looks for long-term reward.
        self.gamma = 0.99

        # How fast the agent learns (optimizer step size)
        self.learning_rate = 0.001

        # Exploration parameters
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # minimum exploration probability
        self.epsilon_decay = 0.9  # multiplicative decay applied per replay call

        # Build the Keras model last: it reads state_size, action_size
        # and learning_rate set above.
        self.model = self.build_keras_model()

    def build_keras_model(self):
        """Create and compile the neural network used for Deep Q-Learning.

        The network is a Sequential stack of two 30-unit ReLU layers followed
        by a linear output layer with one unit per action. It is compiled with
        mean-squared-error loss and the Adam optimizer, and its summary is
        printed as a side effect.

        Returns:
            The compiled Keras model.
        """
        model = Sequential()

        # 1st layer: takes state_size inputs, hidden layer with 30 nodes
        model.add(Dense(30, input_dim = self.state_size, activation = "relu", name = "Input_Layer"))

        # 2nd layer: hidden layer with 30 nodes
        model.add(Dense(30, activation = "relu", name = "Hidden_Layer"))

        # 3rd layer: output layer with one linear unit per action
        model.add(Dense(self.action_size, activation="linear", name = "Output_Layer"))

        # Compile the model: loss is Mean Squared Error, optimizer is Adam.
        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; `lr` is kept for the Keras version this notebook
        # was written against -- confirm before upgrading.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        model.summary()  # Print model summary
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Args:
            state: Observation before the action was taken.
            action: Index of the action taken.
            reward: Reward received for the action.
            next_state: Observation after the action was taken.
            done: Whether the episode ended with this transition.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action with an epsilon-greedy policy.

        With probability epsilon a uniformly random action index is returned;
        otherwise the index of the largest predicted Q-value is returned.

        Args:
            state: Observation passed straight to ``model.predict``; only the
                first row of the prediction is used, so a batch of one is
                expected (the training loop in this file reshapes states to
                ``[1, STATE_SIZE]``).

        Returns:
            The chosen action index.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)  # Q-value estimate per action
        return np.argmax(act_values[0])  # greedy action

    def replay(self, batch_size):
        """Fit the model on a random minibatch from memory, then decay epsilon.

        For each sampled transition the target for the taken action is the
        reward, plus (when the episode did not end) gamma times the largest
        predicted Q-value of the next state. The model is fitted for one epoch
        on that single sample.

        If fewer than ``batch_size`` transitions are stored, nothing is
        trained and epsilon is left unchanged.

        Args:
            batch_size: Number of transitions to sample from memory.
        """
        # random.sample raises ValueError when asked for more items than are
        # stored, so wait until enough experience has been collected.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q_values = self.model.predict(next_state)[0]
                target = reward + self.gamma * np.amax(next_q_values)
            # Only the taken action's Q-value is replaced; the other outputs
            # keep the model's own predictions as their targets.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the multiplicative step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:

# observation = env.reset() # reset the environment
# for _ in range(1000): # run the environment for 1000 iterations
#     env.render()
#     action = env.action_space.sample() # take a random action
#     observation, reward, done, info = env.step(action)
#     if done:
#         env.reset() # reset the environment and change it so we can try it on a diff env
# env.close() # close the environment and free up the memory space

# for i_episode in range(20):
#     observation = env.reset()
#     for t in range(100):
#         env.render()
#         #print(observation,"\n")
#         action = env.action_space.sample()
#         observation, reward, done, info = env.step(action)
#         print("Observation:",observation)
#         print("Reward:",reward)
#         print("Done:",done)
#         print("Info:",info)
#         if done:
#             print("Episode finished after {} timesteps".format(t+1))
#             break
# env.close()

# Main
***

In [None]:
# Entry point: creates the LunarLander-v2 environment and a DQN agent sized to it.
# NOTE: the training loop at the bottom of this cell is commented out, so as
# written the cell only builds the environment and the agent; env.close() is
# part of the disabled section and is therefore not called here.
if __name__ == "__main__":
    
    env = gym.make('LunarLander-v2')
    
    # Get Action Size from the Action Space
    ACTION_SIZE = env.action_space.n # 4 discrete actions (do nothing, fire left engine, fire main engine, fire right engine)
    print("Action Space:", ACTION_SIZE)

    # Get State Size from the Observation Space (length of its first dimension)
    STATE_SIZE = env.observation_space.shape[0]
    print("Observation Space:", STATE_SIZE)
    
    # Building the agent also builds the Keras model and prints its summary
    agent = LunarLanderDQNAgent(STATE_SIZE, ACTION_SIZE)
    # Episode-termination flag and replay minibatch size used by the
    # (currently disabled) training loop below
    done = False
    batch_size = 32

    
    # Disabled training loop: runs up to MAX_EPISODES episodes of at most
    # MAX_STEPS steps each, storing every transition and replaying a
    # minibatch once more than batch_size transitions are in memory.
#     for episode in range(MAX_EPISODES):
#         state = env.reset()
#         state = np.reshape(state, [1, STATE_SIZE])
#         score = 0
        
#         for step in range(MAX_STEPS):
#             env.render()
#             action = agent.act(state)
            
#             next_state, reward, done, _ = env.step(action) # take action and get results
#             score += reward
#             next_state = np.reshape(next_state, [1, STATE_SIZE])
#             agent.remember(state, action, reward, next_state, done)
#             state = next_state
            
#             if done:
#                 print("episode: {}/{}, score: {}".format(episode, MAX_EPISODES, score))
#                 break
                
#             if len(agent.memory) > batch_size:
#                 agent.replay(batch_size)      
#     env.close()