In [21]:
#https://www.youtube.com/watch?v=ewRw996uevM&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv&index=18
#https://www.youtube.com/watch?v=0bt0SjbS3xc&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv&index=13

In [22]:
import os
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import RMSprop

# File the solver writes its best model to, and reads back when testing.
MODEL_FILE_NAME = "dqncartpole.h5"
# Single seed shared by every random number generator seeded below.
SEED = 200

env = gym.make('CartPole-v0')

# Seed all three generators the agent draws from: `random` picks replay
# minibatches, NumPy decides explore-vs-exploit, TensorFlow initialises weights.
# Seeding only TensorFlow leaves runs non-reproducible.
# NOTE: the environment's own RNG is left unseeded here because the seeding
# API differs between gym versions.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Query the device list once and reuse it for both the flag and the message.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu = len(gpu_devices) > 0
print("Num GPUs Available: ", len(gpu_devices))
# tf.debugging.set_log_device_placement(True)
tf.test.is_built_with_cuda()


Num GPUs Available:  0


True

In [23]:
# Length of one observation vector; later passed to the solver as the
# network's input size.
observation_shape = env.observation_space.shape
state_space = observation_shape[0]
observation_shape

(4,)

In [24]:
# Number of discrete actions; later passed to the solver as the width of the
# network's output layer.
action_space = env.action_space.n
action_space

2

In [25]:
class DQNQLearnCartPoleSolver():
    def __init__(self, env,  input_shape, action_shape, episodes, epsilon_decay_rate=0.995, min_epsilon=0.001):
        self.input_size = input_shape
        self.episodes = episodes
        self.env = env
        self.action_size = action_shape
        self.memory = deque([],maxlen=2000)
        self.min_epsilon=min_epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon = 0.1
        self.state_size = input_shape
        self.batch_size = 128
        self.gamma = 0.99
        self.train_start = 128
        self.model = Sequential()
        self.model.add(Dense(24, input_dim=input_shape, activation='relu', kernel_initializer='he_uniform'))
        self.model.add(Dense(action_shape, activation="linear", kernel_initializer='he_uniform'))

        self.model.compile(loss="mse", optimizer=RMSprop(
            learning_rate=0.001), metrics=["accuracy"])

 

    def action(self, state):
        # print(f" rand nr {np.random.random()}  eps {self.epsilon}")
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.model.predict(state))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def preprocess_state(self, state):
        return np.reshape(state, [1, self.state_size]) 
  
    def update_q_func(self,reward, next_state, done):
        if done:
            return reward
        else:
            return reward + self.gamma * np.max(next_state)

    def update_q_values(self, minibatch, target, target_next ):
        for index, (_, action, reward, _, done) in enumerate(minibatch):
            target[index][action] = self.update_q_func(reward, target_next[index], done)

    def update_epsilon(self):
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.min_epsilon, self.epsilon)


    def replay(self):
        if len(self.memory) < self.train_start:
            return
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        for index, (state, _, _, next_state, _ )in enumerate(minibatch):
            states[index] = state
            next_states[index] = next_state
        target = self.model.predict(states)
        target_next = self.model.predict(next_states)
        self.update_q_values(minibatch, target, target_next)
        self.model.fit(np.array(states), np.array(target), batch_size=self.batch_size, verbose=0)
        self.update_epsilon()
    

    def get_reward(self, done, step, reward):
        if not done or step == self.env._max_episode_steps-1:
                return reward
        else:
            return -100
    
            
    def train(self):
        scores = []
        for episode in range(self.episodes):
            done = False
            state = self.preprocess_state(self.env.reset())
            step = 0
            while not done:
                action = self.action(state)
                next_state, reward, done, _ = self.env.step(action) 
                next_state =  self.preprocess_state(next_state)
                reward = self.get_reward(done, step, reward)
                step +=1
                self.remember(state, action, reward, next_state, done)
                state = next_state
            scores.append(step)
            print(f"{scores[episode]}  score for ep {episode+1} epsilon {self.epsilon}")
            if step == 200:
                print(f"Saving trained model as {MODEL_FILE_NAME}")
                self.model.save(MODEL_FILE_NAME)
            self.replay()
        print('Finished training!')

    def test(self):
        self.model = load_model(MODEL_FILE_NAME)
        state = self.preprocess_state(self.env.reset())
        done = False
        score = 0
        while not done:
            self.env.render()
            action = np.argmax(self.model.predict(state))
            next_state, reward, done, _ = self.env.step(action)
            state = self.preprocess_state(next_state)
            score += 1
        print(f"{score}  score")
        self.env.close()

In [26]:

# Build the solver for the CartPole environment and train it for 10 episodes.
model = DQNQLearnCartPoleSolver(
    env=env,
    input_shape=state_space,
    action_shape=action_space,
    episodes=10,
)
model.train()

69  score for ep 1 epsilon 0.1
79  score for ep 2 epsilon 0.1
180  score for ep 3 epsilon 0.0995
122  score for ep 4 epsilon 0.09900250000000001
61  score for ep 5 epsilon 0.0985074875
58  score for ep 6 epsilon 0.09801495006250001
62  score for ep 7 epsilon 0.09752487531218751
124  score for ep 8 epsilon 0.09703725093562657
131  score for ep 9 epsilon 0.09655206468094843
46  score for ep 10 epsilon 0.09606930435754368
Finished training!


In [27]:
# Play one rendered greedy episode with the solver and print its score.
model.test()

69  score


# Explanation

## Model
```python
self.model = Sequential()
self.model.add(Dense(24, input_dim=input_shape, activation='relu',kernel_initializer='he_uniform'))
self.model.add(Dense(action_shape, activation="linear", kernel_initializer='he_uniform'))
```

We start with an input layer sized to the environment's observation space, which is 4 as shown at the top of the notebook. Then comes the neural-network part of the model: the dense layers, each of which creates a hidden layer with n nodes. Every hidden layer is wrapped in a ReLU activation function, which simplifies the values flowing through the network by applying max(0, x), so only non-negative values are passed on. Every layer also has its kernel initializer set to He uniform, which initializes all the weights in the layer to non-zero values. More specifically, it draws samples from a uniform distribution within [-limit, limit], where limit is sqrt(6 / fan_in) and fan_in is the number of input units in the weight tensor. The last layer uses a linear activation so that the model outputs one unbounded Q-value per action. This model has only one hidden layer because CartPole is a fairly easy problem to solve.

## Optimizer
```python
self.model.compile(loss="mse", optimizer=RMSprop(
            learning_rate=0.001), metrics=["accuracy"])
```

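The RMSprop optimizer is used here, as in the original DQN paper. Note that a DQN is not a recurrent network (RNN); this model is a plain feed-forward network. RMSprop adapts the step size per weight, which tends to cope better than plain gradient descent with the constantly shifting targets of Q-learning. Read more about RMSprop, gradient descent and Adam here: [Optimizers](https://ruder.io/optimizing-gradient-descent/index.html)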

## Code
Most of the logic here is the same as in QLearn-cartpole.ipynb

When training, we first loop over the n episodes given to the model on creation. For each episode we reset the env and use the result as the initial state:
 ```python  
 self.preprocess_state(self.env.reset())

 def preprocess_state(self, state):
        return np.reshape(state, [1, self.state_size]) 
  
 ```


The state is always transformed into a (1, n) array so it can be used as an input to the model. In every episode we loop until the agent has either failed by tipping over or won by reaching 200 points. In each step we choose an action and perform it.
```python
action = self.action(state)
next_state, reward, done, _ = self.env.step(action)


 def action(self, state):
    if np.random.random() <= self.epsilon:
        return random.randrange(self.action_size)
    else:
        return np.argmax(self.model.predict(state))
```
When choosing an action we either explore or exploit. This is decided by generating a random number between 0 and 1 and comparing it to epsilon. If the number is smaller than or equal to epsilon, we choose a random action in the env (explore). If the number is larger than epsilon, we use the model to predict the best action (exploit), the same as in normal Q-learning.


In normal Q-learning we would now update the Q-table with new Q-values by optimizing the Q-function, but in DQN we instead push the current state, action, reward, next state, and whether the agent is done into replay memory. Since we don't update a table of Q-values in a DQN, we instead use the stored values to "replay" previous attempts and train the model on the values stored in replay memory, which is where we put all the values from each step during the while loop. This is done by first picking out a random subset of values from the replay memory:
```python
minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
```
we then use the model to predict q values for the current state and the next state

```python
target = self.model.predict(states)
target_next = self.model.predict(next_states)
```
We then update the target values, which we are going to use for training, with the predicted Q-values of the next state:

```python
def update_q_func(self,reward, next_state, done):
    if done:
        return reward
    else:
        return reward + self.gamma * np.max(next_state)

def update_q_values(self, minibatch, target, target_next ):
    for index, (_, action, reward, _, done) in enumerate(minibatch):
        target[index][action] = self.update_q_func(reward, target_next[index], done)
```

Then we train the model with the states from the batch and the targets:
```python
self.model.fit(np.array(states), np.array(target), batch_size=self.batch_size, verbose=0)
```
We then reduce the epsilon value: since the model is now better, we should trust it more and exploit more than we explore.

```python
def update_epsilon(self):
    self.epsilon *= self.epsilon_decay_rate
    self.epsilon = max(self.min_epsilon, self.epsilon)
```

## Side notes
With a lower epsilon the DQN seems to perform better sooner. This might be because it results in less exploration and more exploitation, which might be good in this case.

"I think the problem is with openAI gym CartPole-v0 environment reward structure. The reward is always +1 for each time step. So if pole falls reward is +1 itself. So we need to check and redefine the reward for this case. So in the train function try this:"

```python
if not done:
    new_q = reward + DISCOUNT * np.max(future_qs_list)
else:
    # if done assign some negative reward
    new_q = -20
```
[Source](https://ai.stackexchange.com/questions/22986/my-deep-q-learning-network-does-not-learn-for-openai-gyms-cartpole-problem)

       