Notice:
Actions 0, 1, 2, 3 correspond to left, down, right, up.
Reward: 0 for every step and for falling into a hole, 1 for reaching the goal.
The env returns done once 100 steps have been taken.

The environment’s step function returns exactly what we need. In fact, step returns four values. These are:

    observation (object): an environment-specific object representing your observation of the environment. For example, pixel data from a camera, joint angles and joint velocities of a robot, or the board state in a board game.

    reward (float): amount of reward achieved by the previous action. The scale varies between environments, but the goal is always to increase your total reward.

    done (boolean): whether it’s time to reset the environment again. Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. (For example, perhaps the pole tipped too far, or you lost your last life.)

    info (dict): diagnostic information useful for debugging. It can sometimes be useful for learning (for example, it might contain the raw probabilities behind the environment’s last state change). However, official evaluations of your agent are not allowed to use this for learning.

    env.step(env.action_space.sample()) # take a random action


In [1]:
import os
import time

import gym
import numpy as np
from IPython import display

In [2]:
# Create the Taxi-v3 environment used by the exploratory cells that follow.
env = gym.make('Taxi-v3')

In [None]:
# Draw the environment as it currently stands, then reset it.
env.render()
env.reset()

In [None]:
# Inspect the transition table entry for state 1, action 1. The planning code
# in this file unpacks each element as (prob, next_state, reward, done).
(env.P[1][1])

In [3]:
def value_iteration(env, max_iters, gamma):
    """Estimate optimal state values with synchronous value iteration.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        max_iters: Maximum number of sweeps over the state space.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        A ``(v_values, iterations)`` tuple. ``v_values`` is a 1-D array
        holding one value per state. ``iterations`` is the zero-based index
        of the first sweep that left every value unchanged (within the
        default ``np.isclose`` tolerance), or ``max_iters`` when no sweep
        did so.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    v_values = np.zeros(n_states)
    iterations = 0

    for i in range(max_iters):
        # Back up from a frozen snapshot so every state in this sweep sees
        # the values produced by the previous sweep.
        prev_v_values = np.copy(v_values)

        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                q_value = 0
                for prob, next_state, reward, done in env.P[state][action]:
                    # A transition flagged `done` ends the episode, so there
                    # is no future return to bootstrap from. Without this
                    # mask, rewards reachable *after* termination leak into
                    # the value of the terminating action.
                    future = 0.0 if done else gamma * prev_v_values[next_state]
                    q_value += prob * (reward + future)
                q_values.append(q_value)

            # The state value is the value of its best action.
            v_values[state] = max(q_values)

        if np.all(np.isclose(v_values, prev_v_values)):
            iterations = i
            break
    else:
        # Loop ran to completion (or not at all) without converging.
        iterations = max(max_iters, 0)

    return v_values, iterations

def policy_extraction(env, v_values, gamma=0.9):
    """Derive the greedy policy implied by a state-value table.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        v_values: Sequence indexable by state giving each state's value.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        An ``int64`` array with one action index per state; ties go to the
        lowest action index (``np.argmax`` behaviour).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.zeros(n_states, dtype=np.int64)

    for state in range(n_states):
        q_values = []
        for action in range(n_actions):
            q_value = 0
            for prob, next_state, reward, done in env.P[state][action]:
                # A transition flagged `done` ends the episode, so the
                # successor's value must not be counted as future return.
                future = 0.0 if done else gamma * v_values[next_state]
                q_value += prob * (reward + future)
            q_values.append(q_value)

        policy[state] = np.argmax(q_values)

    return policy

In [4]:
def play(env):
    """Plan with value iteration, then run one greedy episode.

    The environment is reset first; value iteration (at most 1000 sweeps,
    gamma 0.9) and policy extraction are then timed with ``time.time()``,
    and finally the extracted policy is followed until ``env.step``
    reports ``done``.

    Returns:
        ``(total_reward, steps, planning_time, iterations)`` where
        ``planning_time`` is the wall-clock difference measured around the
        planning calls and ``iterations`` is the count returned by
        ``value_iteration``.
    """
    current = env.reset()

    # Time only the planning phase, not the rollout.
    t_begin = time.time()
    values, iterations = value_iteration(env, max_iters=1000, gamma=0.9)
    greedy = policy_extraction(env, values, gamma=0.9)
    t_finish = time.time()

    episode_return = 0
    n_steps = 0
    while True:
        current, reward, finished, info = env.step(greedy[current])
        episode_return += reward
        n_steps += 1
        if finished:
            break

    return episode_return, n_steps, t_finish - t_begin, iterations

def play_multiple_times(env, itera, max_episodes):
    """Run ``play`` repeatedly and log one CSV row per episode.

    Rows are written to
    ``./FrozenLake-v0/Value-FrozenLake-v0-<itera>.txt`` under the header
    ``Episode,Steps,Time,Converged at Iterations``.

    Args:
        env: Environment passed unchanged to ``play``.
        itera: Run identifier embedded in the log file name.
        max_episodes: Number of episodes to play.

    Returns:
        The number of episodes whose total reward was greater than zero.
    """
    log_path = './FrozenLake-v0/Value-FrozenLake-v0-' + str(itera) + '.txt'
    # open() does not create missing parent directories, so make sure the
    # log directory exists instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    success = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = play(env)
            row = ','.join(map(str, (i, steps, timeTakes, iterations)))
            writer.write(row + '\n')

            if reward > 0:
                success += 1

    return success

# Run 50 batches of 1000 episodes on FrozenLake-v0 and record each batch's
# success count, one per line, under a 'Success' header.
# open() does not create missing directories, so create ./Success up front.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Value-FrozenLake-v0.txt','w+') as writer:
    writer.write('Success\n')
    for i in range(50):
        # A fresh environment is built for every batch.
        env = gym.make("FrozenLake-v0")
        success = play_multiple_times(env,i,max_episodes=1000)
        writer.write(str(success)+'\n')


In [5]:
def play(env):
    """Reset the environment, plan, and roll out one greedy episode.

    Planning consists of ``value_iteration`` (up to 1000 sweeps, gamma 0.9)
    followed by ``policy_extraction``; only that phase is timed. The
    resulting policy is then followed until ``env.step`` reports ``done``.

    Returns:
        ``(total_reward, steps, planning_time, iterations)``.
    """
    obs = env.reset()

    planning_started = time.time()
    state_values, iterations = value_iteration(env, max_iters=1000, gamma=0.9)
    action_for = policy_extraction(env, state_values, gamma=0.9)
    planning_ended = time.time()

    reward_sum = 0
    step_count = 0
    episode_over = False
    while not episode_over:
        obs, reward, episode_over, info = env.step(action_for[obs])
        reward_sum += reward
        step_count += 1

    elapsed = planning_ended - planning_started
    return reward_sum, step_count, elapsed, iterations


def play_multiple_times(env, itera, max_episodes):
    """Run ``play`` repeatedly and log one CSV row per episode.

    Rows are written to
    ``./FrozenLake8x8-v0/Value-FrozenLake8x8-v0-<itera>.txt`` under the
    header ``Episode,Steps,Time,Converged at Iterations``.

    Args:
        env: Environment passed unchanged to ``play``.
        itera: Run identifier embedded in the log file name.
        max_episodes: Number of episodes to play.

    Returns:
        The number of episodes whose total reward was greater than zero.
    """
    log_path = './FrozenLake8x8-v0/Value-FrozenLake8x8-v0-' + str(itera) + '.txt'
    # open() does not create missing parent directories, so make sure the
    # log directory exists instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    success = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = play(env)
            row = ','.join(map(str, (i, steps, timeTakes, iterations)))
            writer.write(row + '\n')

            if reward > 0:
                success += 1

    return success

# Run 50 batches of 1000 episodes on FrozenLake8x8-v0 and record each
# batch's success count, one per line, under a 'Success' header.
# open() does not create missing directories, so create ./Success up front.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Value-FrozenLake8x8-v0.txt','w+') as writer:
    writer.write('Success\n')
    for i in range(50):
        # A fresh environment is built for every batch.
        env = gym.make("FrozenLake8x8-v0")
        success = play_multiple_times(env,i,max_episodes=1000)
        writer.write(str(success)+'\n')

In [9]:
def playTaxi(env):
    """Plan with value iteration and play one greedy episode.

    After resetting the environment, ``value_iteration`` (up to 1000
    sweeps, gamma 0.9) and ``policy_extraction`` are run and timed, then
    the extracted policy is followed until ``env.step`` reports ``done``.

    Returns:
        ``(total_reward, steps, planning_time, iterations)``.
    """
    position = env.reset()

    # Only the planning calls fall inside the timed window.
    tic = time.time()
    v_table, iterations = value_iteration(env, max_iters=1000, gamma=0.9)
    chosen_action = policy_extraction(env, v_table, gamma=0.9)
    toc = time.time()

    collected = 0
    moves = 0
    while True:
        position, reward, terminated, info = env.step(chosen_action[position])
        collected += reward
        moves += 1
        if terminated:
            return collected, moves, toc - tic, iterations

def play_multiple_times_Taxi(env, itera, max_episodes):
    """Run ``playTaxi`` repeatedly, log each episode, and average the reward.

    Rows are written to ``./Taxi-v3/Value-Taxi-v3-<itera>.txt`` under the
    header ``Episode,Steps,Time,Converged at Iterations``.

    Args:
        env: Environment passed unchanged to ``playTaxi``.
        itera: Run identifier embedded in the log file name.
        max_episodes: Number of episodes to play.

    Returns:
        The mean total reward per episode, or ``0.0`` when
        ``max_episodes`` is zero (the log then contains only the header).
    """
    log_path = './Taxi-v3/Value-Taxi-v3-' + str(itera) + '.txt'
    # open() does not create missing parent directories, so make sure the
    # log directory exists instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    avg_rwd = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = playTaxi(env)
            row = ','.join(map(str, (i, steps, timeTakes, iterations)))
            writer.write(row + '\n')

            avg_rwd += reward

    # Avoid dividing by zero when no episodes were requested.
    if max_episodes == 0:
        return 0.0
    return avg_rwd / max_episodes

# Run 50 batches of 100 Taxi-v3 episodes on a single shared environment and
# record each batch's average reward, one per line, under a 'Success' header.
# open() does not create missing directories, so create ./Success up front.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Value-Taxi-v3.txt','w+') as writer:
    env = gym.make("Taxi-v3")
    writer.write('Success\n')
    for i in range(50):
        writer.write(str(play_multiple_times_Taxi(env, i,max_episodes=100))+'\n')
