In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import sys

In [4]:
sys.path.append('../')

In [5]:
import gym

In [6]:
import random
import numpy as np
import pandas as pd

In [7]:
from mdp.policyEvaluation import PolicyEvaluator


In [8]:
from mdp.policyIteration import PolicyIterator

In [9]:
from mdp.utils.env import make_env
from mdp.utils.generate import generate_arbitrary_policy

In [10]:
import matplotlib.pyplot as plt 

In [11]:
seed = 0  # Random number generator seed
gamma = 0.95  # Discount factor
num_iters = 10  # Number of iterations
random.seed(seed)  # Set the random seed to ensure results can be reproduced
np.random.seed(seed)

##  Load and set-up Environment 

In [12]:
# Now set up the environment
env_info = make_env('FrozenLake-v1', seed=seed)

In [13]:
env=env_info['env']

## Policy Evaluation 

### Generate an arbitrary policy

In [14]:
policy = generate_arbitrary_policy(env)

In [15]:
policy

{0: {0: 0.2275676958864965,
  1: 0.29655610706227664,
  2: 0.2499382243253878,
  3: 0.22593797272583904},
 1: {0: 0.17660310472827,
  1: 0.2692449274061814,
  2: 0.18241091619631808,
  3: 0.37174105166923044},
 2: {0: 0.3612302752574101,
  1: 0.1437335664175093,
  2: 0.2967791899378672,
  3: 0.19825696838721354},
 3: {0: 0.34389290839676195,
  1: 0.5603541372383523,
  2: 0.043005070953708896,
  3: 0.05274788341117701},
 4: {0: 0.008084102239973011,
  1: 0.3329138215977038,
  2: 0.31113735655710123,
  3: 0.34786471960522186},
 5: {0: 0.3240688320611365,
  1: 0.26464084246212655,
  2: 0.15281859279728796,
  3: 0.258471732679449},
 6: {0: 0.06406309990748013,
  1: 0.34661190718973506,
  2: 0.07764701376380721,
  3: 0.5116779791389775},
 7: {0: 0.2641869271128106,
  1: 0.20992357194778705,
  2: 0.13393189419097806,
  3: 0.3919576067484244},
 8: {0: 0.2746223369775663,
  1: 0.3422219571717576,
  2: 0.011312276990131154,
  3: 0.371843428860545},
 9: {0: 0.21442448354447244,
  1: 0.2161193890

### State sweep policy evaluation

In [16]:
policy_evaluator = PolicyEvaluator(env=env, policy=policy)
sync_state_values, sync_iterations = policy_evaluator.synchronous_state_sweep_policy_evaluation()


### In place policy evaluation 

In [17]:
async_state_values, async_iterations = policy_evaluator.asynchronous_inplace_policy_evaluation()

In [18]:
N = 1000
s_number_iterations_batch = []
as_number_iterations_batch = []
for _ in range(N):
    policy = generate_arbitrary_policy(env)
    sync_state_values, sync_iterations = policy_evaluator.synchronous_state_sweep_policy_evaluation()
    async_state_values, async_iterations = policy_evaluator.asynchronous_inplace_policy_evaluation()
    
    s_number_iterations_batch.append(sync_iterations)
    as_number_iterations_batch.append(async_iterations)
    
    
    


In [19]:
pd.Series(as_number_iterations_batch).mean()/pd.Series(s_number_iterations_batch).mean() -1

-0.2592592592592593

## Policy Iteration

### Synchronous 

In [20]:
policy_iterator = PolicyIterator(env=env, synchronous_evaluation=True)
sync_final_policy = policy_iterator.policy_iteration()

Iteration - 0 - of policy iteration algorithm
Iteration - 1 - of policy iteration algorithm
Policy iteration algorithm converged!


In [21]:
sync_final_policy

{0: {0: 0, 1: 1, 2: 0, 3: 0},
 1: {0: 0, 1: 0, 2: 1, 3: 0},
 2: {0: 0, 1: 1, 2: 0, 3: 0},
 3: {0: 1, 1: 0, 2: 0, 3: 0},
 4: {0: 0, 1: 1, 2: 0, 3: 0},
 5: {0: 1, 1: 0, 2: 0, 3: 0},
 6: {0: 0, 1: 1, 2: 0, 3: 0},
 7: {0: 1, 1: 0, 2: 0, 3: 0},
 8: {0: 0, 1: 0, 2: 1, 3: 0},
 9: {0: 0, 1: 1, 2: 0, 3: 0},
 10: {0: 0, 1: 1, 2: 0, 3: 0},
 11: {0: 1, 1: 0, 2: 0, 3: 0},
 12: {0: 1, 1: 0, 2: 0, 3: 0},
 13: {0: 0, 1: 0, 2: 1, 3: 0},
 14: {0: 0, 1: 0, 2: 1, 3: 0},
 15: {0: 1, 1: 0, 2: 0, 3: 0}}

### Asynchronous 

In [22]:
policy_iterator = PolicyIterator(env=env, synchronous_evaluation=True)
asynch_final_policy = policy_iterator.policy_iteration()

Iteration - 0 - of policy iteration algorithm
Iteration - 1 - of policy iteration algorithm
Policy iteration algorithm converged!


In [23]:
asynch_final_policy

{0: {0: 0, 1: 1, 2: 0, 3: 0},
 1: {0: 0, 1: 0, 2: 1, 3: 0},
 2: {0: 0, 1: 1, 2: 0, 3: 0},
 3: {0: 1, 1: 0, 2: 0, 3: 0},
 4: {0: 0, 1: 1, 2: 0, 3: 0},
 5: {0: 1, 1: 0, 2: 0, 3: 0},
 6: {0: 0, 1: 1, 2: 0, 3: 0},
 7: {0: 1, 1: 0, 2: 0, 3: 0},
 8: {0: 0, 1: 0, 2: 1, 3: 0},
 9: {0: 0, 1: 1, 2: 0, 3: 0},
 10: {0: 0, 1: 1, 2: 0, 3: 0},
 11: {0: 1, 1: 0, 2: 0, 3: 0},
 12: {0: 1, 1: 0, 2: 0, 3: 0},
 13: {0: 0, 1: 0, 2: 1, 3: 0},
 14: {0: 0, 1: 0, 2: 1, 3: 0},
 15: {0: 1, 1: 0, 2: 0, 3: 0}}