In [1]:
import sys
# Append the parent and grandparent directories to the module search path.
# Presumably this is what makes the project's `src` package importable from
# the notebook's location — confirm against the repository layout.
sys.path.append("..")
sys.path.append("../..")
import gym
from src.monte_carlo import MonteCarlo
from src.policy import StaticPolicy, EpsilonGreedyPolicy
from src.temporal_difference import TemporalDifferenceZero, SARSA, QLearning
import numpy as np

## Environment

In [2]:
# Create the FrozenLake environment; this single `env` object is passed to
# every agent built further down (MonteCarlo, TD(0), SARSA, Q-learning).
# NOTE(review): no random seed is set in the visible cells, so the recorded
# scores will presumably differ from run to run.
env = gym.make("FrozenLake-v0")

[2017-12-24 15:20:21,560] Making new env: FrozenLake-v0


In [3]:
# Sizes of the discrete state and action spaces.
# Read them from the public Gym space objects rather than reaching through
# the wrapper (`env.env.nS` / `env.env.nA`): the wrapper forwards
# `observation_space` / `action_space`, so this keeps working if the wrapper
# stack or the environment's internal attributes change.
nb_states = env.observation_space.n
nb_actions = env.action_space.n

## Policy

In [4]:
# Hand-written deterministic policy: maps a state index to an action index.
# Action encoding per Gym's FrozenLake documentation: 0=left, 1=down,
# 2=right, 3=up. Entries are grouped four states per line; states 5, 7, 11,
# 12 and 15 have no entry — presumably the terminal (hole/goal) cells of the
# default 4x4 map; confirm against the environment.
policy = {
    0: 1, 1: 2, 2: 1, 3: 0,
    4: 1, 6: 1,
    8: 2, 9: 0, 10: 1,
    13: 2, 14: 2,
}

# Value passed as the last argument to every EpsilonGreedyPolicy below.
epsilon = 0.1

## Monte Carlo

### Static Policy

In [5]:
# Build the MonteCarlo object around a StaticPolicy. `policy.copy()` hands
# over a shallow copy, so the notebook-level `policy` dict itself is not the
# object given to the policy class.
mc = MonteCarlo(
    StaticPolicy(policy.copy()),
    env,
    nb_states,
    nb_actions,
)

In [6]:
# Run the evaluation with the method's default arguments. Presumably this
# populates `mc.q`, which the next cell prints.
mc.evaluate_n_episodes()

In [7]:
# Show the action-value table. The recorded output is 16 rows by 4 columns
# (presumably states x actions); for each state only the column of the action
# chosen by the static policy is non-zero.
print(mc.q)

[[ 0.          0.02354353  0.          0.        ]
 [ 0.          0.          0.01993959  0.        ]
 [ 0.          0.04077042  0.          0.        ]
 [ 0.01816479  0.          0.          0.        ]
 [ 0.          0.0282282   0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.08515027  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.05802593  0.        ]
 [ 0.14644137  0.          0.          0.        ]
 [ 0.          0.26168479  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.37820648  0.        ]
 [ 0.          0.          0.62854189  0.        ]
 [ 0.          0.          0.          0.        ]]


In [8]:
# Compute `mc.v` from `mc.q` and `mc.n` (see the expression echoed in the output).
# NOTE(review): the echoed line looks like a NumPy warning raised at that
# division — presumably `np.sum(self.n, axis=1)` is zero for some states.
# TODO confirm and guard the division in the library code.
mc.evaluate_v_with_q()

  self.v = np.sum(self.q * self.n, axis=1) / np.sum(self.n, axis=1)


In [9]:
# Show the state values computed by the previous cell (16 entries in the
# recorded output).
print(mc.v)

[ 0.02354353  0.01993959  0.04077042  0.01816479  0.0282282   0.
  0.08515027  0.          0.05802593  0.14644137  0.26168479  0.          0.
  0.37820648  0.62854189  0.        ]


### Epsilon Greedy Policy

In [10]:
# Rebind `mc` to a new MonteCarlo object, this time wrapping an
# EpsilonGreedyPolicy seeded with a copy of the same hand-written policy.
# The static-policy instance created earlier is no longer reachable via `mc`.
mc = MonteCarlo(
    EpsilonGreedyPolicy(policy.copy(), nb_actions, epsilon),
    env,
    nb_states,
    nb_actions,
)

In [11]:
# Baseline: score of the initial policy, measured before `control()` is run.
# The cell displays the value returned by `test_performance()`.
mc.test_performance()

0.0255

In [12]:
# Run Monte Carlo control with the method's default arguments; the score is
# measured again in the next cell for comparison with the baseline.
mc.control()

In [13]:
# Score after `control()`; compare with the baseline value shown above.
mc.test_performance()

0.51235

In [25]:
# Compute `mc.v` from `mc.q` and `mc.n` for the epsilon-greedy agent.
# NOTE(review): this cell carries execution count 25, i.e. it was run after
# the Q-learning cells further down rather than in document order. Re-run the
# notebook top to bottom (Restart & Run All) to confirm the recorded output.
mc.evaluate_v_with_q()

  self.v = np.sum(self.q * self.n, axis=1) / np.sum(self.n, axis=1)


In [26]:
# Show the state values of the epsilon-greedy Monte Carlo agent.
# NOTE(review): executed out of document order (execution count 26).
print(mc.v)

[ 0.22391967  0.19670757  0.19004753  0.09329338  0.25758352  0.
  0.19986692  0.          0.31858355  0.42013942  0.43310024  0.          0.
  0.55585659  0.74042053  0.        ]


## Temporal Difference

### Temporal Difference Zero

In [14]:
# Build the TemporalDifferenceZero object around a StaticPolicy holding a copy
# of the hand-written policy. Note the argument order differs from
# MonteCarlo: here the state count comes before `env`, and no action count
# is passed.
td0 = TemporalDifferenceZero(
    StaticPolicy(policy.copy()),
    nb_states,
    env,
)

In [15]:
# Run the evaluation with the method's default arguments. Presumably this
# fills `td0.v`, which the next cell displays.
td0.evaluate_n_episodes()

In [16]:
# Display the state values (a 16-element array in the recorded output), for
# comparison with the static-policy Monte Carlo values printed earlier.
td0.v

array([ 0.02700241,  0.01860272,  0.05346401,  0.02436115,  0.03536224,
        0.        ,  0.11653935,  0.        ,  0.06603041,  0.22005818,
        0.29926902,  0.        ,  0.        ,  0.40604435,  0.66370938,  0.        ])

### SARSA

In [17]:
# Build the SARSA object around an EpsilonGreedyPolicy seeded with a copy of
# the hand-written policy. Argument order here is (policy, nb_states,
# nb_actions, env) — `env` comes last, unlike MonteCarlo.
sarsa = SARSA(
    EpsilonGreedyPolicy(policy.copy(), nb_actions, epsilon),
    nb_states,
    nb_actions,
    env,
)

In [18]:
# Baseline: score of the initial policy, measured before `control()` is run.
sarsa.test_performance()

0.02552

In [19]:
# Run SARSA control with the method's default arguments; the score is
# measured again in the next cell.
sarsa.control()

In [20]:
# Score after `control()`; compare with the SARSA baseline shown above.
sarsa.test_performance()

0.63912

### Q Learning

In [21]:
# Build the QLearning object around an EpsilonGreedyPolicy seeded with a copy
# of the hand-written policy, using the same argument order as SARSA:
# (policy, nb_states, nb_actions, env).
q_learning = QLearning(
    EpsilonGreedyPolicy(policy.copy(), nb_actions, epsilon),
    nb_states,
    nb_actions,
    env,
)

In [22]:
# Baseline: score of the initial policy, measured before `control()` is run.
q_learning.test_performance()

0.02579

In [23]:
# Run Q-learning control with the method's default arguments; the score is
# measured again in the next cell.
q_learning.control()

In [24]:
# Score after `control()`; compare with the Q-learning baseline shown above.
q_learning.test_performance()

0.74259