In [1]:
import datetime
import os
import random
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import display

%matplotlib inline


In [2]:
def policyImprovement(v_values, env, gamma):
    """Build the greedy policy for a given state-value estimate.

    For every state, the one-step expected return of each action is computed
    from the transition table ``env.P[state][action]`` (an iterable of
    ``(prob, next_state, reward, done)`` tuples) and the action with the
    largest return is selected. Ties go to the lowest action index.

    Args:
        v_values: indexable of state values, addressed by state id.
        env: object exposing ``observation_space.n``, ``action_space.n`` and ``P``.
        gamma: discount factor applied to the value of the next state.

    Returns:
        np.ndarray of dtype int64 holding the chosen action for each state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    greedy = np.zeros(n_states, dtype=np.int64)

    for s in range(n_states):
        # Expected one-step return of every action available in state s.
        action_returns = [
            sum(p * (r + gamma * v_values[s_next]) for p, s_next, r, _ in env.P[s][a])
            for a in range(n_actions)
        ]
        greedy[s] = np.argmax(action_returns)

    return greedy

def policyEvaluation(policy, env, gamma):
    """Iteratively estimate the state values of a fixed deterministic policy.

    Each sweep recomputes every state's value from the values of the previous
    sweep (the update reads ``prev_v_values``, not the partially updated
    array). Sweeps repeat until two consecutive value arrays are element-wise
    equal according to ``np.isclose`` with its default tolerances.

    Args:
        policy: indexable mapping each state id to the action taken there.
        env: environment exposing ``observation_space.n`` and a transition
            table ``P[state][action]`` of ``(prob, next_state, reward, done)``
            tuples. The table is read from ``env.unwrapped`` when that
            attribute exists, otherwise from ``env`` itself.
        gamma: discount factor applied to the value of the next state.

    Returns:
        np.ndarray of float state values, one entry per state.
    """
    # `unwrapped` reaches the base environment through any number of wrappers;
    # falling back to `env` keeps plain (unwrapped) objects working as well.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    v_values = np.zeros(n_states)

    while True:
        prev_v_values = np.copy(v_values)  # snapshot used for this sweep

        for state in range(n_states):
            action = policy[state]
            v_values[state] = sum(
                prob * (reward + gamma * prev_v_values[next_state])
                for prob, next_state, reward, done in transitions[state][action]
            )

        # Stop once a full sweep no longer changes the estimate.
        if np.all(np.isclose(v_values, prev_v_values)):
            return v_values

# Policy iteration (used by both the FrozenLake and the Taxi-v3 experiments below)
def policyIteration(env, gamma, theta, maxIterations):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from a uniformly random policy and repeats evaluate -> improve
    until the improved policy equals the previous one, or until
    ``maxIterations`` rounds have been performed.

    Args:
        env: environment exposing ``observation_space.n`` and
            ``action_space.n``; it is forwarded to ``policyEvaluation`` and
            ``policyImprovement``.
        gamma: discount factor forwarded to both helper functions.
        theta: accepted for interface compatibility; not used by this function.
        maxIterations: upper bound on the number of evaluate/improve rounds.

    Returns:
        Tuple ``(policy, iterations)``. ``policy`` is an integer array with
        one action per state. ``iterations`` is the zero-based index of the
        round in which the policy stopped changing, or ``maxIterations`` when
        no stable policy was reached within the budget.
    """
    policy = np.random.choice(env.action_space.n, size=env.observation_space.n)

    # Default for the non-converged case (including maxIterations == 0);
    # without it the return below would hit an unassigned local.
    iterations = maxIterations

    for i in range(maxIterations):
        prev_policy = np.copy(policy)
        v_values = policyEvaluation(policy, env, gamma)
        policy = policyImprovement(v_values, env, gamma)

        if np.array_equal(policy, prev_policy):
            iterations = i
            break

    return policy, iterations
        


In [4]:
def play(env):
    """Plan with policy iteration, then roll out one episode with that policy.

    The environment is reset first, the policy is computed (only this phase
    is timed), and the episode is then played until ``env.step`` reports done.

    Args:
        env: environment providing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.

    Returns:
        Tuple ``(total_reward, steps, planning_seconds, iterations)`` where
        ``iterations`` is the second value returned by ``policyIteration``.
    """
    current_state = env.reset()

    # Measure only the time spent computing the policy.
    t_begin = time.time()
    policy, iterations = policyIteration(env, gamma=0.9, theta=.00001, maxIterations=1000)
    t_finish = time.time()

    episode_return = 0
    step_count = 0
    finished = False
    while not finished:
        current_state, reward, finished, _ = env.step(policy[current_state])
        episode_return += reward
        step_count += 1

    return episode_return, step_count, t_finish - t_begin, iterations

def play_multiple_times(env, itera, max_episodes):
    """Run ``max_episodes`` episodes and log one CSV row per episode.

    The log is written to ``./FrozenLake-v0/Policy-FrozenLake-v0-<itera>.txt``
    (truncated if it already exists). The directory is created when missing.

    Args:
        env: environment passed unchanged to ``play`` for every episode.
        itera: label embedded in the log file name.
        max_episodes: number of episodes to run.

    Returns:
        Number of episodes whose total reward was greater than zero.
    """
    log_path = './FrozenLake-v0/Policy-FrozenLake-v0-' + str(itera) + '.txt'
    # open() does not create intermediate directories; without this a fresh
    # working directory fails with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    success = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = play(env)
            writer.write(str(i) + ',' + str(steps) + ',' + str(timeTakes) + ',' + str(iterations) + '\n')

            if reward > 0:
                success += 1

    return success

# Run 50 rounds of 1000 episodes on FrozenLake-v0 and record, one line per
# round, the number of successful episodes returned by play_multiple_times.
# open() does not create missing directories, so make sure ./Success exists.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Policy-FrozenLake-v0.txt','w+') as writer:
    writer.write('Success\n')
    for i in range(50):
        env = gym.make("FrozenLake-v0")  # fresh environment for every round
        success = play_multiple_times(env,i,max_episodes=1000)
        writer.write(str(success)+'\n')

In [5]:
def play(env):
    """Plan with policy iteration, then roll out one episode with that policy.

    The environment is reset first, the policy is computed (only this phase
    is timed), and the episode is then played until ``env.step`` reports done.

    Args:
        env: environment providing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.

    Returns:
        Tuple ``(total_reward, steps, planning_seconds, iterations)`` where
        ``iterations`` is the second value returned by ``policyIteration``.
    """
    current_state = env.reset()

    # Measure only the time spent computing the policy.
    t_begin = time.time()
    policy, iterations = policyIteration(env, gamma=0.9, theta=.00001, maxIterations=1000)
    t_finish = time.time()

    episode_return = 0
    step_count = 0
    finished = False
    while not finished:
        current_state, reward, finished, _ = env.step(policy[current_state])
        episode_return += reward
        step_count += 1

    return episode_return, step_count, t_finish - t_begin, iterations


def play_multiple_times(env, itera, max_episodes):
    """Run ``max_episodes`` episodes and log one CSV row per episode.

    The log is written to
    ``./FrozenLake8x8-v0/Policy-FrozenLake8x8-v0-<itera>.txt`` (truncated if
    it already exists). The directory is created when missing.

    Args:
        env: environment passed unchanged to ``play`` for every episode.
        itera: label embedded in the log file name.
        max_episodes: number of episodes to run.

    Returns:
        Number of episodes whose total reward was greater than zero.
    """
    log_path = './FrozenLake8x8-v0/Policy-FrozenLake8x8-v0-' + str(itera) + '.txt'
    # open() does not create intermediate directories; without this a fresh
    # working directory fails with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    success = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = play(env)
            writer.write(str(i) + ',' + str(steps) + ',' + str(timeTakes) + ',' + str(iterations) + '\n')

            if reward > 0:
                success += 1

    return success

# Run 50 rounds of 1000 episodes on FrozenLake8x8-v0 and record, one line per
# round, the number of successful episodes returned by play_multiple_times.
# open() does not create missing directories, so make sure ./Success exists.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Policy-FrozenLake8x8-v0.txt','w+') as writer:
    writer.write('Success\n')
    for i in range(50):
        env = gym.make("FrozenLake8x8-v0")  # fresh environment for every round
        success = play_multiple_times(env,i,max_episodes=1000)
        writer.write(str(success)+'\n')

In [6]:
# Policy Iteration implementation for Taxi-v3

In [7]:
def playTaxi(env):
    """Plan with policy iteration, then roll out one episode with that policy.

    The environment is reset first, the policy is computed (only this phase
    is timed), and the episode is then played until ``env.step`` reports done.

    Args:
        env: environment providing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.

    Returns:
        Tuple ``(total_reward, steps, planning_seconds, iterations)`` where
        ``iterations`` is the second value returned by ``policyIteration``.
    """
    current_state = env.reset()

    # Measure only the time spent computing the policy.
    t_begin = time.time()
    policy, iterations = policyIteration(env, gamma=0.9, theta=.00001, maxIterations=1000)
    t_finish = time.time()

    episode_return = 0
    step_count = 0
    finished = False
    while not finished:
        current_state, reward, finished, _ = env.step(policy[current_state])
        episode_return += reward
        step_count += 1

    return episode_return, step_count, t_finish - t_begin, iterations

def play_multiple_times_Taxi(env, itera, max_episodes):
    """Run ``max_episodes`` Taxi episodes, log each one, and return the mean reward.

    The log is written to ``./Taxi-v3/Policy-Taxi-v3-<itera>.txt`` (truncated
    if it already exists). The directory is created when missing.

    Args:
        env: environment passed unchanged to ``playTaxi`` for every episode.
        itera: label embedded in the log file name.
        max_episodes: number of episodes to run.

    Returns:
        Sum of the per-episode total rewards divided by ``max_episodes``.
        When ``max_episodes`` is zero or negative no episode is run and 0.0
        is returned instead of dividing by zero.
    """
    log_path = './Taxi-v3/Policy-Taxi-v3-' + str(itera) + '.txt'
    # open() does not create intermediate directories; without this a fresh
    # working directory fails with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    reward_sum = 0
    with open(log_path, "w+") as writer:
        writer.write('Episode,Steps,Time,Converged at Iterations\n')

        for i in range(max_episodes):
            reward, steps, timeTakes, iterations = playTaxi(env)
            writer.write(str(i) + ',' + str(steps) + ',' + str(timeTakes) + ',' + str(iterations) + '\n')
            reward_sum += reward

    if max_episodes <= 0:
        # Nothing was played; avoid ZeroDivisionError.
        return 0.0
    return reward_sum / max_episodes

# Run 50 rounds of 100 episodes on Taxi-v3 and record one line per round.
# The column header is 'Success', but each value is whatever
# play_multiple_times_Taxi returns (an average reward, not a success count).
# open() does not create missing directories, so make sure ./Success exists.
os.makedirs('./Success', exist_ok=True)
with open('./Success/Success-Policy-Taxi-v3.txt','w+') as writer:
    env = gym.make("Taxi-v3")  # one environment shared by all rounds
    writer.write('Success\n')
    for i in range(50):
        writer.write(str(play_multiple_times_Taxi(env, i,max_episodes=100))+'\n')
