# Preliminaries

This notebook lets you import a gym environment and set up an agent that acts within the environment. Your task is then to implement some of the classical RL algorithms: Value Iteration and Policy Iteration. Pay attention to how you are going to evaluate your agents.

First, we make sure that all dependencies are met

In [1]:
# Install the gym package; stdout and stderr of pip are both discarded.
!pip install gym > /dev/null 2>&1

# Testing the Gym environments

Our next step is to import the gym package, create an environment, and make sure that we can use it.

In [2]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Instantiate the cliff-walking grid world.
env = gym.make('CliffWalking-v0')

# Begin an episode from the start state ...
state = env.reset()
# ... and take a handful of random actions.
for _step in range(4):
  # draw the current grid
  env.render()

  # pick an action at random and apply it, keeping the
  # resulting state, reward and termination flag
  action = env.action_space.sample()
  state, reward, done, _ = env.step(action)
  print("Acted: {}".format(action))
  print("State: {}".format(state))
  print("Reward: {}".format(reward))
  if not done:
    continue
  # this environment only terminates once the goal is reached
  print("Done.")
  break

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 1
State: 36
Reward: -100
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 2
State: 36
Reward: -1
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 0
State: 24
Reward: -1
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

Acted: 0
State: 12
Reward: -1


# Defining an agent

The next step is to define a class for our agents. We will derive from this class to later implement a Value Iteration, Policy Iteration and Monte Carlo control agent. The base class will only provide simple functionality.

In [4]:
class Agent:
  """Base agent that picks actions by sampling the environment's action space.

  Subclasses override `act` to implement an actual policy; `evaluate`
  rolls out one episode using whatever `act` returns.
  """

  def __init__(self, env, discount_factor):
    """Store the environment and the discount factor (kept as `self.gamma`)."""
    self.env = env
    self.gamma = discount_factor

  def act(self, state):
    """Return an action sampled from the action space; `state` is ignored."""
    return self.env.action_space.sample()

  def evaluate(self, n_steps=100):
    """Run a single episode and return the undiscounted sum of rewards.

    Args:
      n_steps: maximum number of steps before the episode is cut off.

    Returns:
      The sum of rewards collected until the environment reports
      termination or `n_steps` steps have been taken.
    """
    # Use the agent's own environment rather than a notebook-global `env`,
    # so the agent is always evaluated on the environment it was built with.
    state = self.env.reset()
    episode_reward = 0

    for _ in range(n_steps):
      state, reward, done, _info = self.env.step(self.act(state))
      episode_reward += reward
      if done:
        break
    return episode_reward

# Sanity check of the base class: roll out one episode with randomly sampled actions.
random_agent = Agent(env, discount_factor=0.99)
episode_reward = random_agent.evaluate()
print("Episode return {}".format(episode_reward))

Episode return -1387


# Value Iteration Agent

In this section you are to implement an agent that solves the environment using Value Iteration.

In [12]:
class ValueAgent(Agent):
  """Agent that plans with value iteration on the environment's transition model.

  The environment is expected to expose `P[state][action]` as a sequence of
  `(prob, next_state, reward, done)` tuples, plus `shape`, `nS` and `nA`.
  The value table `V` has the grid shape `env.shape`; flat state indices
  are mapped onto it with `np.unravel_index`.
  """

  def __init__(self, env, discount_factor, theta):
    """Create the agent with a randomly initialised value table.

    Args:
      env: environment providing `P`, `shape`, `nS` and `nA`.
      discount_factor: discount applied to next-state values.
      theta: approximation error threshold; `iterate` stops once the
        largest value change in a sweep falls below it.
    """
    super().__init__(env, discount_factor)
    self.theta = theta
    self.V = np.random.rand(self.env.shape[0], self.env.shape[1])
    # set terminal state (last grid cell) to 0
    self.V[-1, -1] = 0

  def _action_value(self, state, action):
    """Return the expected one-step return of `action` in `state` under `self.V`."""
    total = 0
    for prob, next_state, reward, _done in self.env.P[state][action]:
      next_position = np.unravel_index(next_state, self.env.shape)
      # Weight every outcome by its probability so stochastic transition
      # models are handled correctly, not only deterministic ones.
      total += prob * (reward + self.gamma * self.V[next_position])
    return total

  def act(self, state):
    """Return the greedy action for `state` with respect to the current values."""
    values = [self._action_value(state, action) for action in range(self.env.nA)]
    return np.argmax(values)

  def iterate(self):
    """Run in-place value iteration sweeps until the value table converges.

    Each sweep replaces the value of every state except the last one (the
    terminal state, whose value stays 0) with the best one-step action
    value. The largest change of the sweep is printed after every sweep,
    and once more when it drops below `self.theta`.
    """
    while True:
      delta = 0.0
      for state in range(self.env.nS - 1):
        position = np.unravel_index(state, self.env.shape)
        old_value = self.V[position]
        # Bellman optimality backup: value of the best action.
        self.V[position] = max(
            self._action_value(state, action) for action in range(self.env.nA))
        delta = max(delta, np.abs(old_value - self.V[position]))
      print(delta)
      if delta < self.theta:
        print(delta)
        break

# Compact array printing: three decimals and wide lines.
np.set_printoptions(linewidth=200, precision=3)

agent = ValueAgent(env, discount_factor=0.99, theta=0.001)
# value table before planning
print(agent.V)
# run value iteration until the value table converges
agent.iterate()
# roll out one episode with the planned agent and report its return
episode_reward = agent.evaluate()
print("Episode return {}".format(episode_reward))

# value table after planning
print(agent.V)

[[0.733 0.018 0.813 0.48  0.47  0.784 0.332 0.432 0.061 0.743 0.494 0.451]
 [0.844 0.065 0.237 0.095 0.276 0.247 0.857 0.574 0.779 0.696 0.957 0.972]
 [0.121 0.337 0.367 0.15  0.674 0.457 0.063 0.027 0.114 0.297 0.422 0.91 ]
 [0.268 0.428 0.84  0.969 0.665 0.588 0.957 0.033 0.293 0.841 0.939 0.   ]]
2.9200050072988297
1.3800795863819826
1.3662787905181628
1.3526160026129816
1.3390898425868518
1.226340800469469
1.0266721934966148
1.0164054715616482
0.931713174529043
0.9223960427837534
0.9131720823559153
0.9040403615323562
0.8949999579170314
0.8860499583378623
0.844364598550488
0.0
0.0
Episode return -13
[[-13.125 -12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97 ]
 [-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99 ]
 [-11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99   -1.   ]
 [-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -1.      0.   ]]

# Policy Iteration Agent
Follow the same procedure for implementing a policy iteration agent

In [14]:
class PolicyAgent(Agent):
  """Agent that plans with policy iteration on the environment's transition model.

  The environment is expected to expose `P[state][action]` as a sequence of
  `(prob, next_state, reward, done)` tuples, plus `shape`, `nS` and `nA`.
  Both the value table `V` and the `policy` table have the grid shape
  `env.shape`; flat state indices are mapped with `np.unravel_index`.
  """

  def __init__(self, env, discount_factor, theta):
    """Create the agent with random initial values and a random policy.

    Args:
      env: environment providing `P`, `shape`, `nS` and `nA`.
      discount_factor: discount applied to next-state values.
      theta: approximation error threshold; `evaluate_policy` stops once
        the largest value change in a sweep falls below it.
    """
    super().__init__(env, discount_factor)
    self.theta = theta
    self.V = np.random.rand(self.env.shape[0], self.env.shape[1])
    # set terminal state (last grid cell) to 0
    self.V[-1, -1] = 0
    # One random action index per grid cell, drawn from the env's action count.
    self.policy = np.random.randint(self.env.nA, size=self.env.shape)

  def _action_value(self, state, action):
    """Return the expected one-step return of `action` in `state` under `self.V`."""
    total = 0
    for prob, next_state, reward, _done in self.env.P[state][action]:
      next_position = np.unravel_index(next_state, self.env.shape)
      # Weight every outcome by its probability so stochastic transition
      # models are handled correctly, not only deterministic ones.
      total += prob * (reward + self.gamma * self.V[next_position])
    return total

  def act(self, state):
    """Return the greedy action for `state` with respect to the current values."""
    values = [self._action_value(state, action) for action in range(self.env.nA)]
    return np.argmax(values)

  def evaluate_policy(self):
    """Update `self.V` in place until it matches the current policy.

    Sweeps over every state except the last one (the terminal state, whose
    value stays 0), backing up the value of the action the policy
    prescribes, until the largest change in a sweep is below `self.theta`.
    """
    while True:
      delta = 0.0
      for state in range(self.env.nS - 1):
        position = np.unravel_index(state, self.env.shape)
        old_value = self.V[position]
        self.V[position] = self._action_value(state, self.policy[position])
        delta = max(delta, np.abs(old_value - self.V[position]))
      if delta < self.theta:
        break

  def improve(self):
    """Alternate greedy policy improvement and policy evaluation until stable.

    Every pass makes the policy greedy with respect to the current value
    table; if any action changed, the new policy is re-evaluated and
    another pass follows. The first pass is greedy with respect to the
    randomly initialised value table.
    """
    while True:
      policy_stable = True
      for state in range(self.env.nS - 1):
        position = np.unravel_index(state, self.env.shape)
        old_action = self.policy[position]
        self.policy[position] = self.act(state)
        if self.policy[position] != old_action:
          policy_stable = False
      if policy_stable:
        break
      self.evaluate_policy()





agent = PolicyAgent(env, discount_factor=0.99, theta=0.001)
# value table before planning
print(agent.V)

# policy table before planning
print(agent.policy)
# run policy iteration until the policy is stable
agent.improve()
# roll out one episode with the planned agent and report its return
episode_reward = agent.evaluate()
print("Episode return {}".format(episode_reward))
np.set_printoptions(linewidth=200, precision=3)
# value table after planning
print(agent.V)

# policy table after planning
print(agent.policy)

[[0.815 0.366 0.065 0.349 0.204 0.262 0.921 0.389 0.414 0.5   0.034 0.062]
 [0.497 0.24  0.617 0.461 0.992 0.602 0.378 0.842 0.234 0.693 0.563 0.414]
 [0.726 0.342 0.655 0.282 0.184 0.533 0.38  0.094 0.556 0.787 0.122 0.224]
 [0.4   0.221 0.105 0.263 0.728 0.009 0.538 0.016 0.796 0.87  0.871 0.   ]]
[[1 0 0 2 0 3 1 1 2 3 0 3]
 [3 3 1 2 0 0 2 1 1 0 3 2]
 [1 3 2 2 0 3 2 0 3 3 3 2]
 [0 1 1 2 3 0 0 1 1 0 1 2]]
Episode return -13
[[-13.125 -12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97 ]
 [-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99 ]
 [-11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99   -1.   ]
 [-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -1.      0.   ]]
[[1 1 1 1 1 1 1 1 1 1 1 2]
 [1 1 1 1 1 1 1 1 1 1 1 2]
 [1 1 1 1 1 1 1 1 1 1 1 2]
 [0 0 0 0 0 0 0 0 0 0 1 2]]


# Monte Carlo control agent
Follow the same procedure for implementing a Monte Carlo control agent

In [13]:
#code here