In [15]:
import gym
import numpy as np
import random
# Seed NumPy's global RNG so np.random draws repeat across runs.
# Only NumPy is seeded here; Python's `random` module (imported above) is not.
np.random.seed(44)

In [2]:
def test_policy(env, policy, n_epoch=1000):
    rewards = []
    episode_counts = []
    for i in range(n_epoch):
        current_state = env.reset()
        ep = 0
        done = False
        episode_reward = 0
        while not done and ep < 10000:
            ep += 1
            act = int(policy[current_state])
            new_state, reward, done, _ = env.step(act)
            episode_reward += reward
            current_state = new_state
        rewards.append(episode_reward)
        episode_counts.append(ep)
    
    # all done
    mean_reward = sum(rewards)/len(rewards)
    mean_eps = sum(episode_counts)/len(episode_counts)
    return mean_reward, mean_eps, rewards, episode_counts 

In [10]:
def value_iteration(env, discount=0.9, epsilon=1e-12):
    """Solve a tabular MDP with synchronous value iteration.

    ``env`` must expose ``observation_space.n``, ``action_space.n`` and a
    transition table ``env.P[state][action]`` that yields
    ``(prob, new_state, reward, done)`` tuples.

    Sweeps continue while the largest per-state value change exceeds
    ``epsilon``. Each sweep backs up from the values of the previous sweep,
    and ties between actions go to the lowest action index.

    Returns ``(policy, sweeps)``: a 1-D float array holding the greedy action
    per state, and the number of sweeps performed.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    greedy = np.zeros(n_states)
    values = np.zeros(n_states)
    sweeps = 0
    delta = 1
    while delta > epsilon:
        sweeps += 1
        # Snapshot so every backup in this sweep reads last sweep's values.
        previous = values.copy()
        for state in range(n_states):
            best = -np.inf
            for action in range(n_actions):
                backup = 0
                for prob, nxt, reward, terminal in env.P[state][action]:
                    # Terminal transitions contribute the reward only.
                    if terminal:
                        target = reward
                    else:
                        target = reward + discount * previous[nxt]
                    backup += target * prob
                # Strict comparison keeps the first best action on ties.
                if backup > best:
                    best = backup
                    greedy[state] = action
                    values[state] = best
        delta = np.max(np.abs(values - previous))
    print("Solved in: ", sweeps, " episodes")
    return greedy, sweeps

In [11]:
def _expected_return(transitions, value_list, discount):
    """Expected one-step return over ``(prob, new_state, reward, done)`` tuples."""
    total = 0
    for prob, new_state, reward, done in transitions:
        # A terminal transition has no successor value to bootstrap from.
        target = reward if done else reward + discount * value_list[new_state]
        total += prob * target
    return total


def policy_iteration(env, number_of_states, number_of_actions,
                     discount=0.9, epsilon=1e-12):
    """Solve a tabular MDP with policy iteration.

    Parameters
    ----------
    env : object
        Must expose ``env.P[state][action]`` yielding
        ``(prob, new_state, reward, done)`` tuples.
    number_of_states, number_of_actions : int
        Sizes of the state and action spaces.
    discount : float, optional
        Discount factor applied to successor values (previously fixed at 0.9).
    epsilon : float, optional
        Policy evaluation stops once the largest in-sweep value change falls
        below this threshold (previously fixed at 1e-12).

    Returns
    -------
    numpy.ndarray
        1-D integer array with the chosen action for each state.

    Notes
    -----
    The starting policy is drawn with ``np.random.randint``, so results depend
    on NumPy's global RNG state. Evaluation updates values in place, and
    improvement keeps the lowest-index action on ties.
    """
    # 1. Initialisation: random policy, zero values.
    policy = np.random.randint(number_of_actions, size=number_of_states)
    value_list = np.zeros(number_of_states)
    episode = 0

    policy_stable = False
    while not policy_stable:
        episode += 1

        # 2. Policy evaluation: sweep until the values stop moving.
        while True:
            delta = 0
            for s in range(number_of_states):
                old_value = value_list[s]
                value_list[s] = _expected_return(
                    env.P[s][int(policy[s])], value_list, discount)
                delta = max(delta, np.abs(old_value - value_list[s]))
            if delta < epsilon:
                break

        # 3. Policy improvement: act greedily w.r.t. the evaluated values.
        policy_stable = True
        for s in range(number_of_states):
            old_action = policy[s]
            max_value = -np.inf
            for a in range(number_of_actions):
                action_value = _expected_return(env.P[s][a], value_list, discount)
                if action_value > max_value:
                    max_value = action_value
                    policy[s] = a
            if old_action != policy[s]:
                policy_stable = False
    print("Solved in: ", episode, " episodes")

    return policy
        

In [12]:
def q_learning(env, number_of_states, number_of_actions,
               total_episodes=int(1e6), learning_rate=0.1, gamma=0.9,
               max_epsilon=1.0, min_epsilon=0.01):
    """Learn a greedy policy with tabular epsilon-greedy Q-learning.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``step(action)`` returning
        ``(new_state, reward, done, info)``, and ``action_space.sample()``.
    number_of_states, number_of_actions : int
        Dimensions of the Q-table.
    total_episodes : int, optional
        Number of training episodes (previously fixed at 1e6).
    learning_rate : float, optional
        Step size of the Q update.
    gamma : float, optional
        Discount factor applied to the bootstrapped next-state value.
    max_epsilon, min_epsilon : float, optional
        Exploration rate bounds. Epsilon decays linearly from ``max_epsilon``
        by ``episode / total_episodes`` and is clipped at ``min_epsilon``.

    Returns
    -------
    numpy.ndarray
        1-D array with the arg-max action of the learned Q-table per state.

    Notes
    -----
    The explore/exploit draw uses ``np.random`` so that seeding NumPy makes
    that draw repeatable. ``env.action_space.sample()`` uses whatever RNG the
    environment provides. An episode only ends when the environment reports
    ``done``; there is no step cap in this function.
    """
    total_episodes = int(total_episodes)
    qtable = np.zeros((number_of_states, number_of_actions))

    # Guard the degenerate "no training" case against division by zero.
    decay_rate = 1.0 / total_episodes if total_episodes > 0 else 0.0
    epsilon = max_epsilon

    for episode in range(total_episodes):
        state = env.reset()
        done = False

        while not done:
            # Exploit with probability (1 - epsilon), otherwise explore.
            if np.random.uniform(0, 1) > epsilon:
                action = np.argmax(qtable[state, :])
            else:
                action = env.action_space.sample()

            new_state, reward, done, _info = env.step(action)

            # Q(s,a) += lr * (target - Q(s,a)), where
            # target = r + gamma * max_a' Q(s',a'), or just r on a terminal step.
            if done:
                target = reward
            else:
                target = reward + gamma * np.max(qtable[new_state, :])
            qtable[state, action] += learning_rate * (target - qtable[state, action])

            state = new_state

        # Linear epsilon decay, floored at min_epsilon.
        epsilon = max(max_epsilon - decay_rate * episode, min_epsilon)

    return np.argmax(qtable, axis=1)

In [13]:
env = gym.make('FrozenLake-v0')
# value_iteration returns (policy, sweep_count); unpack it so test_policy
# receives only the per-state action array and not the whole tuple.
vi_policy, vi_episodes = value_iteration(env, discount=0.999, epsilon=1e-6)
print("training done")
vi_mrews, vi_meps, vi_rews, vi_eps = test_policy(env, vi_policy)
print(vi_mrews)

Solved in:  414  episodes
training done
0.76


In [None]:
# Create a FrozenLake environment and bind it to `env`
# (this rebinds `env` if an earlier cell already created one).
env = gym.make('FrozenLake-v0')

In [14]:
def train_and_test(env, P, R, discount=[0.9], epsilon=[1e-9], 
                   n_iters=[1e6], mute=False):
    """Grid-run mdptoolbox VI, PI and Q-learning and score each policy on ``env``.

    Parameters
    ----------
    env : object
        Environment passed to ``test_policy`` to evaluate each solved policy.
    P, R : object
        Transition and reward definitions handed unchanged to the
        ``mdptoolbox.mdp`` solvers.
    discount : iterable of float
        Discount factors to try for all three solvers.
    epsilon : iterable of float
        Stopping thresholds to try for value iteration.
    n_iters : iterable of number
        Iteration counts to try for Q-learning.
    mute : bool
        When False, print a short summary after every run.

    Returns
    -------
    tuple of dict
        ``(vi_dict, pi_dict, q_dict)`` keyed as ``vi_dict[discount][epsilon]``,
        ``pi_dict[discount]`` and ``q_dict[discount][n_iter]``. Each leaf holds
        ``"mean_reward"``, ``"mean_eps"`` and ``"policy"``.

    Notes
    -----
    Requires ``mdptoolbox`` and ``test_policy`` to be available in the
    enclosing namespace. The list defaults are only iterated, never mutated.
    """
    def _score(policy):
        # Roll the policy out on env and keep only the summary statistics.
        mean_reward, mean_eps, _rewards, _counts = test_policy(env, policy)
        return {"mean_reward": mean_reward, "mean_eps": mean_eps, "policy": policy}

    # run value iteration
    vi_dict = {}
    for dis in discount:
        vi_dict[dis] = {}
        for eps in epsilon:
            # The solver was never constructed before, so `vi` was undefined.
            vi = mdptoolbox.mdp.ValueIteration(P, R, discount=dis, epsilon=eps)
            vi.run()
            vi_dict[dis][eps] = _score(vi.policy)
            if not mute:
                print("Value iteration for {} discount and {} eps is done".format(dis, eps))
                print("Mean reward: {} - mean eps: {}".format(
                    vi_dict[dis][eps]["mean_reward"], vi_dict[dis][eps]["mean_eps"]))

    # run policy iteration
    pi_dict = {}
    for dis in discount:
        pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=dis)
        pi.run()
        pi_dict[dis] = _score(pi.policy)
        if not mute:
            print("Policy iteration for {} discount is done".format(dis))
            print("Mean reward: {} - mean eps: {}".format(
                pi_dict[dis]["mean_reward"], pi_dict[dis]["mean_eps"]))

    # run Q-learning
    q_dict = {}
    for dis in discount:
        q_dict[dis] = {}
        for n_iter in n_iters:
            q = mdptoolbox.mdp.QLearning(P, R, discount=dis, n_iter=n_iter)
            q.run()
            q_dict[dis][n_iter] = _score(q.policy)
            if not mute:
                print("Q-learning iteration for {} discount and {} iterations is done".format(dis, n_iter))
                print("Mean reward: {} - mean eps: {}".format(
                    q_dict[dis][n_iter]["mean_reward"], q_dict[dis][n_iter]["mean_eps"]))

    return vi_dict, pi_dict, q_dict