# Deep Reinforcement Learning

## Imports and optimizer setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation, rc

from keras.models import Sequential
from keras.layers import Dense, ReLU
from keras.optimizers import RMSprop

# Module-level RMSprop optimizer instance, created with the library's default
# hyperparameters (no arguments are passed to the constructor).
optimizer = RMSprop()

2023-03-07 17:41:52.733336: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Brain class

In [2]:
class Brain:
    """Q-network plus epsilon-greedy policy used by the agent.

    The network maps a state vector of length ``n_states`` to one Q value per
    action, through two ReLU hidden layers of ``n_hidden`` units each.
    """

    def __init__(self, n_states, n_hidden, n_actions, gamma=0.9, r=0.99):
        """Build and compile the Q-network.

        Args:
            n_states: length of the state vector fed to the network.
            n_hidden: number of units in each of the two hidden layers.
            n_actions: number of actions, i.e. network outputs.
            gamma: discount factor applied to the best Q value of the next state.
            r: factor epsilon is multiplied by after every action while
                epsilon is still above 0.1.
        """
        self.epsilon = 1.0  # initial exploration rate
        self.gamma = gamma  # discount factor
        self.r = r  # decay rate of epsilon

        model = Sequential()
        model.add(Dense(n_hidden, input_shape=(n_states,)))
        model.add(ReLU())
        model.add(Dense(n_hidden))
        model.add(ReLU())
        model.add(Dense(n_actions))
        # Every brain gets its own optimizer instance. Sharing one optimizer
        # object between several models is rejected by recent Keras versions
        # once the optimizer has been built for the first model's variables.
        model.compile(loss='mse', optimizer=RMSprop())
        self.model = model

    def train(self, states, next_states, action, reward, terminal):
        """Fit the network on one Q-learning target.

        The target equals the current prediction except in the column of the
        action taken, which is set to ``reward`` for a terminal transition and
        to ``reward + gamma * max(Q(next_states))`` otherwise.

        Args:
            states: state array passed to ``model.predict`` / ``model.fit``.
            next_states: state array observed after the action.
            action: index of the action taken; a scalar or an array whose
                first element is used.
            reward: reward for the transition.
            terminal: True when the episode ended on this transition.
        """
        # Collapse the action to a plain column index so the assignment below
        # also broadcasts correctly when the batch holds more than one row.
        column = int(np.ravel(action)[0])

        q = self.model.predict(states, verbose=0)
        t = np.copy(q)
        if terminal:
            t[:, column] = reward
        else:
            # The next-state prediction is only needed for non-terminal steps.
            q_next = self.model.predict(next_states, verbose=0)
            t[:, column] = reward + self.gamma * np.max(q_next, axis=1)
        self.model.fit(states, t, verbose=0)

    def get_action(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        With probability ``epsilon`` a uniformly random action is drawn,
        otherwise the action with the highest predicted Q value is taken.

        Args:
            state: state array passed to ``model.predict``.

        Returns:
            Integer array with one action index per row of the prediction, in
            both the random and the greedy case.
        """
        q = self.model.predict(state, verbose=0)
        if np.random.rand() < self.epsilon:
            # Return an array here as well, so callers can index the result
            # the same way as the argmax output of the greedy branch.
            action = np.random.randint(0, q.shape[1], size=q.shape[0])
        else:
            action = np.argmax(q, axis=1)
        if self.epsilon > 0.1:  # decay the exploration rate
            self.epsilon *= self.r
        return action

## Agent class

In [5]:
class Agent:
    """Point agent that moves right at constant speed and may jump.

    The agent keeps its position ``(x, y)`` and vertical velocity ``v_y``,
    asks its brain for an action on every step and trains the brain on the
    resulting transition.
    """

    def __init__(self, v_x, v_y_sigma, v_jump, brain):
        """Store the motion parameters and the brain, then reset the state.

        Args:
            v_x: amount added to ``x`` on every step.
            v_y_sigma: standard deviation of the initial vertical velocity.
            v_jump: vertical velocity assigned when the agent jumps.
            brain: object providing ``get_action(states)`` and
                ``train(states, next_states, action, reward, terminal)``.
        """
        self.v_x = v_x  # velocity in the x direction
        self.v_y_sigma = v_y_sigma  # std of the velocity in the y direction
        self.v_jump = v_jump  # velocity in the y direction when the agent jumps
        self.brain = brain
        self.reset()

    def reset(self):
        """Return to the start position (-1, 0) with a random vertical velocity.

        The velocity is ``v_y_sigma`` times a standard-normal sample.
        """
        self.x = -1.  # x position
        self.y = 0.  # y position
        self.v_y = self.v_y_sigma * np.random.randn()  # y velocity

    def step(self, g):
        """Advance one time step, act, and train the brain on the transition.

        The position is moved by the current velocities first. The reward is
        -1 if ``y`` left the interval [-1, 1], 1 if ``x`` exceeded 1, and 0
        otherwise; the first two cases end the episode. Action 1 sets the
        vertical velocity to ``v_jump``, any other action subtracts ``g``
        from it. After training, a finished episode resets the agent.

        Args:
            g: gravity, subtracted from ``v_y`` when the agent does not jump.
        """
        states = np.array([[self.y, self.v_y]])
        self.x += self.v_x
        self.y += self.v_y

        # set the reward and terminal flag
        reward = 0
        terminal = False
        if self.y < -1 or self.y > 1:
            reward = -1
            terminal = True
        elif self.x > 1:
            reward = 1
            terminal = True
        reward = np.array([reward])

        # The brain may hand back either a scalar or an array of action
        # indices; normalise to a 1-D array so indexing never fails.
        action = np.atleast_1d(self.brain.get_action(states))
        if action[0] == 1:
            self.v_y = self.v_jump
        else:
            self.v_y -= g

        # update the brain
        next_states = np.array([[self.y, self.v_y]])
        self.brain.train(states, next_states, action, reward, terminal)

        # if the episode is terminated, reset the agent
        if terminal:
            self.reset()

## Environment class

In [6]:
class Enviroment:
    """Couples an agent with the gravity value used to step it."""

    def __init__(self, agent, g):
        """Remember the agent to drive and the gravity ``g`` to apply."""
        self.g = g
        self.agent = agent

    def step(self):
        """Step the agent once with the stored gravity.

        Returns:
            Tuple ``(x, y)`` read from the agent after the step.
        """
        actor = self.agent
        actor.step(self.g)
        position = (actor.x, actor.y)
        return position

## Animation

In [7]:
# show the animation of the agent action
def animation(enviroment, interval, frames):
    """Create a matplotlib animation that steps the environment once per frame.

    Args:
        enviroment: object whose ``step()`` returns the agent position as an
            ``(x, y)`` pair.
        interval: delay between frames, passed to ``FuncAnimation``.
        frames: number of frames, passed to ``FuncAnimation``.

    Returns:
        The ``FuncAnimation`` object driving the scatter plot.
    """
    # This function is itself named ``animation`` and therefore hides the
    # ``matplotlib.animation`` module imported at the top of the file, so the
    # animation class is imported here under its own name.
    from matplotlib.animation import FuncAnimation

    fig, ax = plt.subplots()
    plt.close(fig)  # close the figure so only the animation is displayed
    ax.set_xlim((-1, 1))
    ax.set_ylim((-1, 1))
    sc = ax.scatter([], [], s=100, c='r')

    def plot(data):
        """Advance the environment and move the marker to the new position."""
        x, y = enviroment.step()
        # set_offsets takes an (N, 2) array of points; one point is drawn here.
        sc.set_offsets(np.array([[x, y]]))
        return sc,

    return FuncAnimation(fig, plot, frames=frames, interval=interval, blit=True)

## Training

In [None]:
# train the agent and show the animation
def main(r=0.99):
    """Wire up brain, agent and environment and return their animation.

    Args:
        r: epsilon decay factor handed to ``Brain``.

    Returns:
        The object returned by ``animation`` for the assembled environment.
    """
    # brain: 2 state inputs, 32 hidden units per layer, 2 possible actions
    brain = Brain(n_states=2, n_hidden=32, n_actions=2, r=r)

    # agent: horizontal speed, std of the initial vertical speed, jump speed
    agent = Agent(v_x=0.05, v_y_sigma=0.1, v_jump=0.2, brain=brain)

    # environment applying a gravity of 0.2 on every step
    enviroment = Enviroment(agent=agent, g=0.2)

    # animation: 1024 frames with an interval of 50 between them
    ani = animation(enviroment=enviroment, interval=50, frames=1024)
    rc('animation', html='jshtml')
    return ani

## Random action

In [None]:
# observe the agent acting randomly
# r is forwarded to Brain as the epsilon decay factor; with r=1.0 epsilon
# stays at its initial value of 1.0, so every action is drawn at random
ani = main(r=1.0)

## DQN

In [None]:
# observe the agent while its brain learns
# with r=0.99 epsilon is multiplied by 0.99 after each action until it is no
# longer above 0.1, and the brain is trained on every step of the agent
ani = main(r=0.99)