# Preliminaries

This notebook lets you import a gym environment and set up an agent that acts within the environment. Your task is then to implement some of the classical RL algorithms: Value Iteration and Policy Iteration. Pay attention to how you are going to evaluate your agents.

First, we make sure that all dependencies are met

In [None]:
!pip install gym > /dev/null 2>&1

# Testing the Gym environments

Our next step is to import the gym package, create an environment, and make sure that we can use it.

In [None]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Smoke test: create the CliffWalking environment and drive it with a few
# random actions, printing what comes back from every step.
env = gym.make('CliffWalking-v0')

# reset() puts the agent in the start state and returns that state
state = env.reset()
# take at most four random actions
for i in range(4):
  # render the current grid as text
  env.render()

  # sample an action uniformly from the action space
  action = env.action_space.sample()
  # take a step and record next state, reward and termination flag
  # NOTE(review): unpacking four values assumes the classic gym step API --
  # confirm against the installed gym version
  state, reward, done, _ = env.step(action)
  print("Acted: {}".format(action))
  print("State: {}".format(state))
  print("Reward: {}".format(reward))
  if done:
    #this environment only terminates once the goal is reached
    print("Done.")
    break

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 1
State: 36
Reward: -100
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 3
State: 36
Reward: -1
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 3
State: 36
Reward: -1
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Acted: 1
State: 36
Reward: -100


In [None]:

# Inspect the transition table of state 47: a dict keyed by action whose values
# are lists of (probability, next_state, reward, done) tuples.
env.P[47]

{0: [(1.0, 35, -1, False)],
 1: [(1.0, 47, -1, True)],
 2: [(1.0, 47, -1, True)],
 3: [(1.0, 36, -100, False)]}

# Defining an agent

The next step is to define a class for our agents. We will derive from this class to later implement a Value Iteration, Policy Iteration and Monte Carlo control agent. The base class will only provide simple functionality.

In [None]:
class Agent:
  """Base agent that picks uniformly random actions.

  Subclasses override ``act`` to implement an actual policy; ``evaluate``
  then measures that policy on the agent's own environment.
  """

  def __init__(self, env, discount_factor):
    """Store the environment and the discount factor.

    Args:
      env: environment providing ``action_space.sample()``, ``reset()`` and
        ``step(action)`` returning ``(state, reward, done, info)``.
      discount_factor: discount applied to future rewards (kept as ``gamma``).
    """
    self.env = env
    self.gamma = discount_factor

  def act(self, state):
    """Return an action for ``state``; the base agent ignores it and samples."""
    return self.env.action_space.sample()

  def evaluate(self, n_steps=100):
    """Run a single episode and return its undiscounted return.

    Args:
      n_steps: maximum number of steps before the episode is cut off.

    Returns:
      The sum of the rewards collected until termination or the step limit.
    """
    # Use the agent's own environment rather than whatever global ``env``
    # happens to exist, so the agent is evaluated on the model it planned on.
    state = self.env.reset()
    episode_reward = 0

    for _step in range(n_steps):
      state, reward, done, _info = self.env.step(self.act(state))
      episode_reward += reward
      if done:
        break
    return episode_reward

# Sanity-check the evaluation loop with the random base agent (discount 0.99)
# and print the return of a single episode.
random_agent = Agent(env,0.99)
episode_reward=random_agent.evaluate()
print("Episode return {}".format(episode_reward))

Episode return -1783


# Value Iteration Agent

In this section you are to implement an agent that solves the environment, using Value Iteration

In [None]:
class ValueAgent(Agent):
  """Agent that plans with value iteration on a tabular environment.

  The environment must expose its model as ``env.P[state][action]``, a list
  of ``(probability, next_state, reward, done)`` tuples, together with
  discrete observation and action spaces.
  """

  def __init__(self, env, discount_factor, theta):
    """Create the agent with a randomly initialised value table.

    Args:
      env: tabular environment (see class docstring).
      discount_factor: discount applied to successor state values.
      theta: approximation error threshold; sweeps stop once the largest
        value change in a sweep drops below it.
    """
    super().__init__(env, discount_factor)
    self.theta = theta
    self.V = np.random.rand(self.env.observation_space.n)
    # The last state index is treated as the terminal state: its value is
    # pinned to 0 and it is never updated by ``iterate``.
    self.V[-1] = 0

  def _action_values(self, state):
    """Return the expected one-step backup of every action in ``state``."""
    return [
        sum(prob * (reward + self.gamma * self.V[next_state])
            for prob, next_state, reward, _done in self.env.P[state][action])
        for action in range(self.env.action_space.n)
    ]

  def act(self, state):
    """Return the greedy action for ``state`` under the current values.

    Each outcome is weighted by its transition probability, so the choice
    stays correct when an action has several possible successors.
    """
    return np.argmax(self._action_values(state))

  def iterate(self, verbose=True):
    """Run in-place value-iteration sweeps until convergence.

    Args:
      verbose: when true, print the largest value change after every sweep.
    """
    n_states = self.env.observation_space.n
    while True:
      delta = 0.0
      # Skip the last (terminal) state so its value stays at 0.
      for state in range(n_states - 1):
        old_value = self.V[state]
        # Bellman optimality backup: value of the best action.
        self.V[state] = max(self._action_values(state))
        delta = max(delta, abs(old_value - self.V[state]))
      if verbose:
        print(delta)
      if delta < self.theta:
        break


# Build a value-iteration agent (discount 0.99, threshold 0.001) and show its
# initial value table, twelve states per printed line.
agent = ValueAgent(env,0.99,0.001)
print(agent.V[:12])
print(agent.V[12:24])
print(agent.V[24:36])
print(agent.V[36:])
#perform value iteration
agent.iterate()
#evaluate the agent for one episode and print its return
episode_reward=agent.evaluate()
print("Episode return {}".format(episode_reward))
# compact float formatting for the value table printed below
np.set_printoptions(precision=3, linewidth=200)
print(agent.V[:12])
print(agent.V[12:24])
print(agent.V[24:36])
print(agent.V[36:])

[0.46223671 0.25684455 0.19497257 0.3056002  0.05850981 0.26538718
 0.09940028 0.17186533 0.34198701 0.27755251 0.59974448 0.93047607]
[1.44170801e-01 8.27591536e-01 2.43928019e-01 5.37591319e-01
 5.00579445e-01 2.51550393e-01 5.67495723e-04 7.04946268e-01
 5.83921930e-01 8.85068984e-01 1.11293850e-01 8.20413242e-01]
[0.73032607 0.81293906 0.88875561 0.08099286 0.12760931 0.09946621
 0.1845644  0.58056678 0.7167344  0.37649339 0.27969584 0.88737672]
[0.12975515 0.77636782 0.47056165 0.6169239  0.68605769 0.90463783
 0.62139236 0.93365853 0.80285221 0.88560684 0.24507194 0.        ]
2.713746259756124
1.7904869481518353
1.4822449503966837
1.4674225008927166
1.4527482758837893
1.4382207931249518
1.4238385851937023
1.4096001993417655
1.3955041973483473
0.9220173068321298
0.9127971337638083
0.9036691624261692
0.8946324708019073
0.8856861460938887
0.8083471863971727
0.0
0.0
Episode return -13
[-13.125 -12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97 ]
[-

# Policy Iteration Agent
Follow the same procedure for implementing a policy iteration agent

In [None]:
class PolicyAgent(Agent):
  """Agent that plans with policy iteration on a tabular environment.

  The environment must expose its model as ``env.P[state][action]``, a list
  of ``(probability, next_state, reward, done)`` tuples, together with
  discrete observation and action spaces.
  """

  def __init__(self, env, discount_factor, theta):
    """Create the agent with random initial values and a random policy.

    Args:
      env: tabular environment (see class docstring).
      discount_factor: discount applied to successor state values.
      theta: approximation error threshold used by ``evaluate_policy``.
    """
    super().__init__(env, discount_factor)
    self.theta = theta
    n_states = self.env.observation_space.n
    self.V = np.random.rand(n_states)
    # The last state index is treated as the terminal state: its value is
    # pinned to 0 and it is never updated.
    self.V[-1] = 0
    # One action index per state, drawn from the environment's action space.
    self.policy = np.random.randint(self.env.action_space.n, size=n_states)

  def _expected_return(self, state, action):
    """Return the expected one-step backup of taking ``action`` in ``state``."""
    return sum(
        prob * (reward + self.gamma * self.V[next_state])
        for prob, next_state, reward, _done in self.env.P[state][action])

  def act(self, state):
    """Return the greedy action for ``state`` under the current values.

    Each outcome is weighted by its transition probability, so the choice
    stays correct when an action has several possible successors.
    """
    values = [self._expected_return(state, action)
              for action in range(self.env.action_space.n)]
    return np.argmax(values)

  def evaluate_policy(self):
    """Sweep in place until ``V`` approximates the value of ``self.policy``."""
    n_states = self.env.observation_space.n
    while True:
      delta = 0.0
      # Skip the last (terminal) state so its value stays at 0.
      for state in range(n_states - 1):
        old_value = self.V[state]
        self.V[state] = self._expected_return(state, self.policy[state])
        delta = max(delta, abs(old_value - self.V[state]))
      if delta < self.theta:
        break

  def improve(self):
    """Run policy iteration until the greedy policy stops changing.

    The current policy is evaluated before every improvement sweep, so the
    stability check is always made against values that belong to the policy
    being improved rather than against the random initial table.
    """
    n_states = self.env.observation_space.n
    while True:
      self.evaluate_policy()
      policy_stable = True
      for state in range(n_states - 1):
        old_action = self.policy[state]
        self.policy[state] = self.act(state)
        if self.policy[state] != old_action:
          policy_stable = False
      if policy_stable:
        break





# Build a policy-iteration agent (discount 0.99, threshold 0.001) and show its
# initial value table and policy, twelve states per printed line.
agent = PolicyAgent(env,0.99,0.001)
print(agent.V[:12])
print(agent.V[12:24])
print(agent.V[24:36])
print(agent.V[36:])
print(agent.policy[:12])
print(agent.policy[12:24])
print(agent.policy[24:36])
print(agent.policy[36:])
#perform policy iteration
agent.improve()
#evaluate the agent for one episode and print its return
episode_reward=agent.evaluate()
print("Episode return {}".format(episode_reward))
# compact float formatting for the value table printed below
np.set_printoptions(precision=3, linewidth=200)
print(agent.V[:12])
print(agent.V[12:24])
print(agent.V[24:36])
print(agent.V[36:])

# final policy, one action index per state
print(agent.policy[:12])
print(agent.policy[12:24])
print(agent.policy[24:36])
print(agent.policy[36:])

[0.62  0.038 0.394 0.641 0.63  0.001 0.778 0.652 0.477 0.39  0.317 0.127]
[0.797 0.394 0.321 0.839 0.143 0.684 0.495 0.748 0.139 0.081 0.348 0.629]
[0.503 0.344 0.746 0.549 0.756 0.772 0.263 0.929 0.954 0.536 0.897 0.045]
[0.442 0.779 0.972 0.627 0.917 0.191 0.324 0.112 0.465 0.386 0.467 0.   ]
[0 3 2 2 0 2 0 3 2 0 1 2]
[2 1 3 1 1 0 2 1 3 1 1 1]
[2 2 1 2 3 3 2 1 0 0 0 1]
[0 2 2 1 1 2 0 3 0 1 2 1]
Episode return -13
[-13.125 -12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97 ]
[-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99 ]
[-11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -2.97   -1.99   -1.   ]
[-12.248 -11.362 -10.466  -9.562  -8.648  -7.726  -6.793  -5.852  -4.901  -3.94   -1.      0.   ]
[1 1 1 1 1 1 1 1 1 1 1 2]
[1 1 1 1 1 1 1 1 1 1 1 2]
[1 1 1 1 1 1 1 1 1 1 1 2]
[0 0 0 0 0 0 0 0 0 0 1 1]


Modifying the transition probabilities to create a non-deterministic environment

# Monte Carlo control agent
Follow the same procedure for implementing a Monte Carlo control agent

In [None]:
#code here