### Lunar Lander Environment
Observation Space: 8-dimensional vector: the coordinates of the lander, its linear velocities, its angle, angular velocity, and bools representing whether each leg is touching the ground  
[x, y, vx, vy, angle, angle_vel, left_leg_on_ground, right_leg_on_ground]  
Action Space: 4 Discrete actions: do nothin, fire left orinetation engine, fire main engine, fire right orientation engine.



### Theory
V(s) = max(Q(s,a)  
Q(s,a) = R(s,a) + gV(s')  
p(s) = max Q(s, a)

In [7]:
import gym

### Imports


In [62]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sys

In [87]:
class LLAgent:
    """A learning Agent for the Lunar Landar environment"""
    
    def __init__(self, env = gym.make("LunarLander-v2")):
        """
        Create a new LLAgent. Initialize the q_network and the target_q_network
        
        Params:
        env:
            the environment that the agent will learn from
        """
        
        self.env = env
        self.q_function_input_size = env.observation_space.shape[0] + 1
        self.q_function_output_size = 0
        
        self.q_function = self.get_q_function(self.q_function_input_size,
                                              self.q_function_output_size)
        self.q_target_function = self.get_q_function(self.q_function_input_size,
                                              self.q_function_output_size)
        
        
        
    
    def get_optimum_action(self, state: np.ndarray, q_function: keras.Model) -> int:
        """
        Return an optimal action based on the state
        
        Params:
        state: np.ndarray
            the numpy array representing the state for wchich an optimal action is needed
        q_function: keras.Model
            the model that will be used to predict the reward of each action
        """
        states_actions = np.array([np.append(state, [i]) for i in range(4)])
        print(q_function.predict(states_actions))
        return np.argmax(q_function.predict(states_actions))
        
    
    def train(history):
        """
        Based on new history data, train the model
        """
    
    # HELPER METHODS
    def get_q_function(self, 
                         input_size: tuple, 
                         output_size: tuple,
                         num_layers: int = 3,
                         layer_sizes: list[int] = [64, 32, 16],
                         activation: str = "relu") -> keras.Model:
        """
        Create a neural net to represent the q-function
        
        Params:
        input_size: tuple
            the size/dimensions of the function input (should be the shape of the observation space)
        output_size: tuple
            the size/dimensions of the function output (should be the shape of the action space)
        num_layers: int
            the number of hidden layers in the neural network
        layer_sizes: list[int]
            the sizes of each hidden layer: [hidden layer 1 size, hiddden layer 2 size...hidden layer -num-layyers- size]
        activation: str
            the activation function of the neural network
        """
        # Assertions
        assert num_layers == len(layer_sizes), f"Number of layers must be the same as the length of layer sizes: num: {num_layers} != sizes: {len(layer_sizes)}"
        
        # Build Neural Net
        inputs = layers.Input(shape=input_size) 
        layer = inputs
        for layer_num in range(len(layer_sizes)):
            layer = layers.Dense(layer_sizes[layer_num], activation = "relu")(layer)
        output = layers.Dense(output_size, activation = "sigmoid")(layer)
        
        model = keras.Model(inputs = inputs, outputs = output)
        model.summary()
        return model
        
        
        
    
        

In [88]:
a = LLAgent()
a.get_optimum_action(np.array([0, 0, 0, 0, 0, 0, 0, 0]), a.q_function)

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 9)]               0         
                                                                 
 dense_56 (Dense)            (None, 64)                640       
                                                                 
 dense_57 (Dense)            (None, 32)                2080      
                                                                 
 dense_58 (Dense)            (None, 16)                528       
                                                                 
 dense_59 (Dense)            (None, 0)                 0         
                                                                 
Total params: 3,248
Trainable params: 3,248
Non-trainable params: 0
_________________________________________________________________
Model: "model_15"
______________________________________

ValueError: attempt to get argmax of an empty sequence

In [11]:
env = gym.make("LunarLander-v2")
env.reset()

(array([ 4.9562455e-04,  1.4062922e+00,  5.0182693e-02, -2.0568186e-01,
        -5.6747103e-04, -1.1367132e-02,  0.0000000e+00,  0.0000000e+00],
       dtype=float32),
 {})

In [69]:
env.action_space.contains(4)


False

In [43]:
a

False

In [75]:
state = np.array([1, 1, 1])
np.array([np.append(state, [i]) for i in range(4)])

array([[1, 1, 1, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 2],
       [1, 1, 1, 3]])

In [73]:
state.a

array([3, 3, 3])

In [77]:
type(state)

numpy.ndarray