# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran



In [1]:
import gym
import pandas as pd
import random
import numpy as np

In [2]:
# Create the FrozenLake-v1 environment; the cells below read the sizes of its
# observation and action spaces from `env` and interact with it during training.
env = gym.make('FrozenLake-v1')

In [3]:

# Tabular Q function: one entry per (state, action) pair, all starting at 0.0.
Q = {
    (state, action): 0.0
    for state in range(env.observation_space.n)
    for action in range(env.action_space.n)
}

In [4]:
def epsilon_greedy(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]. If it is below ``epsilon`` a
    random action is sampled from the environment's action space; otherwise
    the action with the largest entry in the global ``Q`` table for ``state``
    is returned (on ties the lowest-numbered action wins).

    Args:
        state: State whose ``Q[(state, action)]`` entries are compared.
        epsilon: Threshold the uniform draw is compared against.

    Returns:
        The selected action.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()

    # Exploit: pick the action whose current Q estimate is highest.
    candidate_actions = range(env.action_space.n)
    return max(candidate_actions, key=lambda action: Q[(state, action)])

In [5]:
# Hyperparameters used by the Q-learning loop below.
alpha = 0.85    # learning rate: step size applied to each Q update
gamma = 0.9     # discount factor applied to the next state's value
epsilon = 0.8   # probability threshold for taking a random (exploratory) action

In [13]:
# Training budget.
num_episodes = 50_000   # number of episodes to run
num_steps = 1_000       # upper bound on the steps taken within one episode

Compute the optimal policy using the Q-learning update rule:

$$ Q(s,a) = Q(s,a) + \alpha \left( r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right) $$

In [14]:
# Run Q-learning: one pass of the outer loop per episode.
for episode in range(num_episodes):

    # Begin the episode from the state returned by reset()
    # (reset() returns a tuple whose first element is the state).
    s = env.reset()[0]

    # Take at most num_steps steps in this episode.
    for step in range(num_steps):

        # Behaviour policy: epsilon-greedy with respect to the current Q table.
        a = epsilon_greedy(s, epsilon)

        # Apply the action and observe the resulting transition.
        s_, r, done, truncate, _ = env.step(a)

        # Value of the best action available in the next state, max_a' Q(s', a').
        next_state_value = max(Q[(s_, b)] for b in range(env.action_space.n))

        # Move Q(s, a) towards the target r + gamma * max_a' Q(s', a').
        td_error = r + gamma * next_state_value - Q[(s, a)]
        Q[(s, a)] += alpha * td_error

        # Continue from the next state.
        s = s_

        # Stop the episode once the environment reports it has ended.
        episode_over = done or truncate
        if episode_over:
            break

After all the episodes, `Q` holds our estimate of the optimal Q function. We can then
extract the policy by selecting, in each state, the action that has the maximum Q value.

In [15]:
# Allocate a zero-filled array with one row per state and one column per
# action; a later cell fills it from the Q dictionary.
# (numpy is already imported as `np` in the notebook's first cell, so it is
# not re-imported here.)
Q_values = np.zeros((env.observation_space.n, env.action_space.n))

In [16]:
# Show the freshly allocated array (all zeros until it is filled from Q).
Q_values

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [17]:
# Copy every value from the Q dictionary into the array, using the state as
# the row index and the action as the column index.
for (state, action), value in Q.items():
    Q_values[state, action] = value

In [18]:
# Show the array after the values from Q have been copied into it.
Q_values

array([[2.82763270e-01, 2.47221871e-01, 2.68378238e-01, 2.52279818e-01],
       [3.99028943e-02, 2.43860929e-01, 2.85520442e-02, 1.58472439e-01],
       [1.52971967e-01, 1.52656589e-01, 1.90797216e-01, 1.86326385e-01],
       [1.68459259e-01, 2.13277897e-01, 2.89332386e-02, 1.97691633e-01],
       [2.85328286e-01, 3.14777994e-01, 2.15008909e-01, 3.71195678e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.56307663e-01, 1.05591796e-01, 7.23899987e-02, 7.82432632e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.75298244e-03, 3.01230373e-01, 3.13788016e-01, 3.82190102e-01],
       [4.02236372e-01, 4.21854385e-01, 6.65861010e-02, 3.12999261e-01],
       [6.63611924e-01, 3.84397910e-01, 4.05657843e-01, 4.99864871e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.68212011e-01, 8.50189351e-02, 4.16244157e

In [19]:
# Greedy policy: for each state (row), the index of the action (column) with
# the largest Q value.
Q_values.argmax(axis=1)

array([0, 1, 2, 1, 1, 0, 0, 0, 3, 1, 0, 0, 0, 0, 1, 0])