In this notebook we will implement Deep Q Learning for Mountain Car environment.
As we know that the Deep Q Learning takes states as an input to the deep neural network and computes Q values as a outputs.
The squared error between expected Q value and computed Q value will be treated as an error for neural network. 
Based on this error the network parameters which are weights will be updated.

## So lets begin.



In [1]:
import gym
env = gym.make("MountainCar-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


<img src="poster.jpg">

# Description
Get an under powered car to the top of a hill (top = 0.5 position)

# Environment
## Observation
Type: Box(2)

| Num	| Observation |	Min | Max |
| --- | --- | --- | --- |
| 0 | position | -1.2 |	0.6 |
| 1 | velocity | -0.07 | 0.07 |


## Actions
Type: Discrete(3)

| Num | Action |
| --- | --- |
| 0 | push left |
| 1 | no push |
| 2 | push right |


## Reward
-1 for each time step, until the goal position of 0.5 is reached. As with MountainCarContinuous v0, there is no penalty for climbing the left hill, which upon reached acts as a wall.

## Starting State
Random position from -0.6 to -0.4 with no velocity.

## Episode Termination
The episode ends when you reach 0.5 position, or if 200 iterations are reached.

In [2]:
print("Action space ", env.action_space)
print("Observation space ", env.observation_space)

Action space  Discrete(3)
Observation space  Box(2,)


In [3]:
# What will be stored inside a single memory
# State, Action, Reward, Next Action
from collections import deque
import numpy as np

class memory_storage():
    def __init__(self, mem_size):
        self.memory = deque(maxlen = mem_size)
        
    def add(self, experience):
        self.memory.append(experience)
    
    def retrive_mem(self, batch_size):
        idx = np.random.choice(range(len(self.memory)), size=batch_size)
        return [self.memory[id] for id in idx]
    
    def generate_save_samples(self, no_episodes):
        for i in range(no_episodes):
            current_state = env.reset()
            while True:
                current_action = env.action_space.sample()
                next_state, reward, done, _ = env.step(current_action)
                if done == True:
                    next_state = np.zeros(current_state.shape)
                    self.memory.append([current_state, current_action, reward, next_state])
                    break
                else:
                    self.memory.append([current_state, current_action, reward, next_state])
                    current_state = next_state

In [4]:
# Here we generate samples from no of multiple episodes
myMemory = memory_storage(10000)
myMemory.generate_save_samples(10)

In [5]:
# Our network will be 2 --> 10 --> 10 --> 3

import tensorflow as tf

class NeuralNetwork():
    def __init__(self, ip, h1, h2, op, lr):
        self.inputs_ = tf.placeholder(shape=[None, ip], dtype=tf.float32)
        self.targets = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions_ = tf.placeholder(shape=[None], dtype=tf.float32)
        one_hot_actions = tf.one_hot(tf.cast(self.actions_, tf.int32), op)
        
        self.w1 = tf.Variable(initial_value = tf.truncated_normal(shape=[ip, h1]))
        self.b1 = tf.Variable(initial_value = tf.truncated_normal(shape=[h1]))
        self.y1 = tf.nn.relu(tf.matmul(self.inputs_, self.w1) + self.b1)

        self.w2 = tf.Variable(initial_value = tf.truncated_normal(shape=[h1, h2]))
        self.b2 = tf.Variable(initial_value = tf.truncated_normal(shape=[h2]))
        self.y2 = tf.nn.relu(tf.matmul(self.y1, self.w2) + self.b2)

        self.w3 = tf.Variable(initial_value = tf.truncated_normal(shape=[h2, op]))
        self.b3 = tf.Variable(initial_value = tf.truncated_normal(shape=[op]))
        self.output_ = tf.matmul(self.y2, self.w3) + self.b3
        
        self.Q = tf.reduce_sum(tf.multiply(self.output_, one_hot_actions), axis=1)
        
        self.loss = tf.reduce_mean(tf.squared_difference(self.Q, self.targets))
        self.optimizer = tf.train.AdamOptimizer(lr).minimize(self.loss)

  from ._conv import register_converters as _register_converters


In [6]:
total_eps = 10000
learning_rate = 0.001
batch_retrive_size = 20
gamma = 0.99
decay_rate = 0.001

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01   

In [7]:
myNeuralNetwork = NeuralNetwork(ip=env.observation_space.shape[0], h1=128, h2=128, op=env.action_space.n, lr=learning_rate)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
avg_reward = deque(maxlen=25)
total_reward = []

with tf.Session() as sess:
    sess.run(init)

    for i in range(total_eps):
        episode_reward = 0
        current_state = env.reset()
        while True:
            """
            take action based on either from network or on random basis
            initially you need to take more random
            but as the no of episodes are increasing the actions will be taken more with the help of network instead of random action
            """
            
            myVar = np.exp(-decay_rate*i)
            if myVar > np.random.rand():
                current_action = env.action_space.sample()
            else:
                op = sess.run(myNeuralNetwork.output_, feed_dict={myNeuralNetwork.inputs_: current_state.reshape((1, *current_state.shape))})
                current_action = np.argmax(op)
            
            
            next_state, reward, done, _ = env.step(current_action)
            episode_reward += reward
            if done == True:
                next_state = np.zeros(current_state.shape)
                myMemory.add([current_state, current_action, reward, next_state])
                avg_reward.append(reward)
                total_reward.append((i, episode_reward))
                print("-----------")
                print("Episode reward: ", episode_reward)
                print("-----------")
                break
            else:
                myMemory.add([current_state, current_action, reward, next_state])
                avg_reward.append(reward)
                current_state = next_state
                
                
            # At the end of each time step we will update the network weights.
            batch = myMemory.retrive_mem(batch_retrive_size)
            batch_states = [a[0] for a in batch]
            #print(np.array(batch_states).shape)
            actions = [a[1] for a in batch]
            rewards = [a[2] for a in batch]
            next_s = [a[3] for a in batch]
            
            q_st = sess.run(myNeuralNetwork.output_, feed_dict={myNeuralNetwork.inputs_: batch_states})
            
            target_q = rewards + gamma*np.max(q_st, axis=1)
            #print(np.array(target_q).shape)
            
            loss, _ = sess.run([myNeuralNetwork.loss, myNeuralNetwork.optimizer], 
                               feed_dict={myNeuralNetwork.inputs_: batch_states, myNeuralNetwork.targets: target_q, myNeuralNetwork.actions_: actions})
    
        print("Episode: {}/{}".format(i+1, total_eps), 
              "Average reward: {}".format(np.mean(avg_reward)),
              "Loss: {}".format(loss))
        
    saver.save(sess, "checkpoints/mountaincar_model.ckpt")
    

-----------
Episode reward:  -200.0
-----------
Episode: 1/10000 Average reward: -1.0 Loss: 0.5732564330101013
-----------
Episode reward:  -200.0
-----------
Episode: 2/10000 Average reward: -1.0 Loss: 0.23500461876392365
-----------
Episode reward:  -200.0
-----------
Episode: 3/10000 Average reward: -1.0 Loss: 0.0725632980465889
-----------
Episode reward:  -200.0
-----------
Episode: 4/10000 Average reward: -1.0 Loss: 0.013959852047264576
-----------
Episode reward:  -200.0
-----------
Episode: 5/10000 Average reward: -1.0 Loss: 0.10314229875802994
-----------
Episode reward:  -200.0
-----------
Episode: 6/10000 Average reward: -1.0 Loss: 0.044416218996047974


KeyboardInterrupt: 

In [None]:
current_state = env.reset()
current_state.reshape((1, current_state.shape))

In [None]:
test_eps = 10
test_max_step = 400
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint("checkpoints"))
    t_reward = []
    for ep in range(test_eps):
        t = 0
        eps_reward = 0
        current_state = env.reset()
        while t < test_max_step:
            env.render()
            
            action = sess.run(myNeuralNetwork.output_, feed_dict={myNeuralNetwork.inputs_: current_state.reshape(1, *current_state.shape)})
            
            action = np.argmax(action)
            next_state, reward, done, _ = env.step(action)
            eps_reward += reward
            
            if done == True:
                t_reward.append((ep, eps_reward))
                break
            else:
                current_state = next_state
                t += 1
    
    env.close()