In [1]:
from pathlib import Path

import numpy as np

In [2]:
class Gridworld():
    """Stochastic 5x5 gridworld with two wall cells, a penalty cell and a goal.

    The agent starts every episode at (x, y) = (0, 0). Cells (2, 2) and
    (2, 3) can never be entered. Being in (2, 4) after a move attempt yields
    -10, arriving at (4, 4) yields +10, and running out of time yields -100.
    Rewards for the penalty and goal cells are returned already discounted by
    GAMMA ** (t - 1); the timeout penalty is returned undiscounted.

    Attributes:
        MAX_EPS: Number of episodes the driver script is expected to run.
        MAX_TIME_STEPS: Time step at which an episode is forcibly terminated.
        STATE_DIM: Length of the one-hot state vector (25 cells minus 2 walls).
        NUM_ACTIONS: Number of actions (0: y-1, 1: x+1, 2: y+1, 3: x-1).
        GAMMA: Discount factor applied to the returned rewards.
        rng: Source of uniform random numbers used for transition noise.
        x, y: Current agent position.
        t: Number of Transition calls made in the current episode.
        TAS: True once the episode has entered the terminal absorbing state.
    """

    # Side length of the square grid.
    _SIZE = 5
    # (dx, dy) displacement for each action index; anything else means "stay".
    _MOVES = {0: (0, -1), 1: (1, 0), 2: (0, 1), 3: (-1, 0)}
    # Cells that can never be occupied.
    _WALLS = frozenset({(2, 2), (2, 3)})
    # Cell that yields the -10 penalty.
    _PENALTY_CELL = (2, 4)
    # Cell that yields +10 on arrival and ends the episode on the next step.
    _GOAL_CELL = (4, 4)

    def __init__(self, rng=None):
        """Create the environment and start the first episode.

        Args:
            rng: Optional random source exposing a ``random()`` method that
                returns a float in [0, 1), e.g. ``np.random.default_rng(seed)``
                or ``np.random.RandomState(seed)``. Defaults to the global
                ``np.random`` module, which preserves the previous behaviour.
        """
        self.MAX_EPS = 100000
        self.MAX_TIME_STEPS = 100
        self.STATE_DIM = 23
        self.NUM_ACTIONS = 4
        self.GAMMA = .95
        self.rng = np.random if rng is None else rng
        self.NewEpisode()

    def NewEpisode(self):
        """Reset position, clock and terminal flag for a fresh episode."""
        self.x = 0
        self.y = 0
        self.t = 0
        self.TAS = False

    def GetState(self):
        """Return the current state as a one-hot vector of length STATE_DIM.

        The terminal absorbing state is encoded as the all-zero vector.
        """
        result = np.zeros(self.STATE_DIM)

        if not self.TAS:
            state = self.y * self._SIZE + self.x

            # The two wall cells (raw indices 12 and 17) have no state of
            # their own, so every index after a wall is shifted down by one.
            # The second test runs on the already-shifted index.
            if state > 12:
                state -= 1
            if state > 16:
                state -= 1
            result[state] = 1

        return result

    def Transition(self, a):
        """Advance the environment by one step and return the reward.

        Args:
            a: Intended action index in [0, NUM_ACTIONS).

        Returns:
            0 if the agent was already on the goal (the episode then ends),
            -100 if the time limit is hit (the episode then ends), otherwise
            the cell reward (-10, 0 or +10) multiplied by GAMMA ** (t - 1).
        """
        self.t += 1

        # The step taken *from* the goal moves into the terminal state.
        if (self.x, self.y) == self._GOAL_CELL:
            self.TAS = True
            return 0

        if self.t == self.MAX_TIME_STEPS:
            self.TAS = True
            return -100

        # Transition noise: 10% stay, 5% rotate one way, 5% rotate the other,
        # 80% execute the intended action.
        effective_action = a
        noise = self.rng.random()
        if noise <= 0.1:
            effective_action = -1  # stay
        elif noise <= 0.15:
            effective_action = (a + 1) % self.NUM_ACTIONS
        elif noise <= 0.2:
            effective_action = (a - 1) % self.NUM_ACTIONS

        dx, dy = self._MOVES.get(effective_action, (0, 0))
        x_prime = self.x + dx
        y_prime = self.y + dy

        # Moves off the grid or into a wall leave the agent where it is.
        in_bounds = (0 <= x_prime < self._SIZE) and (0 <= y_prime < self._SIZE)
        if in_bounds and (x_prime, y_prime) not in self._WALLS:
            self.x = x_prime
            self.y = y_prime

        # The reward depends only on the cell occupied after the move attempt.
        reward = 0
        if (self.x, self.y) == self._PENALTY_CELL:
            reward = -10
        elif (self.x, self.y) == self._GOAL_CELL:
            reward = 10
        return reward * (self.GAMMA ** (self.t - 1))

In [3]:
# Single environment instance shared by the cells below; the rollout loop
# calls NewEpisode() on it before each episode rather than re-creating it.
gridworld = Gridworld()

In [4]:
# Behaviour policy used to generate the data: a (STATE_DIM, NUM_ACTIONS) table
# indexed as policy[state, action].
# Alternative: a uniform random policy.
# policy = np.full((gridworld.STATE_DIM, gridworld.NUM_ACTIONS),.25)

# Build the path from components so it resolves on POSIX as well as Windows
# (a literal with backslashes is a single file name outside Windows).
POLICY_PATH = Path("policies") / "gw" / "delta_0.01" / "safety_0.6097323533319994.npy"
policy = np.load(POLICY_PATH)

In [5]:
def ExecutePolicy(policy, state):
    """Sample an action for ``state`` from a tabular stochastic policy.

    Args:
        policy: 2-D array indexed as ``policy[state, action]``; each row is
            expected to hold action probabilities summing to 1.
        state: Row index of the current state.

    Returns:
        Tuple ``(action, probability)`` where ``probability`` is
        ``policy[state, action]`` for the sampled action.

    Raises:
        ValueError: If the row has no positive entry or its sum differs from
            1 by more than a small tolerance.
    """
    temp = np.random.random_sample()
    total = 0
    last_supported = None
    for a in range(policy.shape[1]):
        prob = policy[state, a]
        total += prob
        if temp < total:
            return a, prob
        if prob > 0:
            last_supported = a

    # Reaching this point with a valid row means rounding left the cumulative
    # sum a hair below the drawn number; attribute the draw to the last action
    # that has positive probability instead of failing.
    if last_supported is None or abs(total - 1.0) > 1e-6:
        raise ValueError(
            "policy row for state " + str(state)
            + " is not a probability distribution (sum = " + str(total) + ")"
        )
    return last_supported, policy[state, last_supported]

In [6]:
# Roll out MAX_EPS episodes under `policy` and log them to gridworld_data.csv.
# File layout: the first line is the episode count; each episode is then one
# line with its length (gridworld.t) followed by one
# "state,action,reward,pi(a|s)" line per step, where reward is the value
# returned by Transition.
# Pieces are collected in lists and written once at the end instead of being
# appended to an ever-growing string.
chunks = [str(gridworld.MAX_EPS) + "\n"]
for ep in range(gridworld.MAX_EPS):
    if(ep % 10000 == 0):
        print("Episode: " + str(ep) + " / " + str(gridworld.MAX_EPS))
    gridworld.NewEpisode()
    step_lines = []
    while not gridworld.TAS:
        # Index of the single non-zero entry of the one-hot state vector.
        state = np.where(gridworld.GetState() != 0)[0][0]
        action, pi_s_a = ExecutePolicy(policy, state)
        reward = gridworld.Transition(action)
        step_lines.append(str(state) + "," + str(action) + "," + str(reward) + "," + str(pi_s_a) + "\n")
    # The episode length is only known once the episode has terminated.
    chunks.append(str(gridworld.t) + "\n")
    chunks.append("".join(step_lines))

with open("gridworld_data.csv", "w") as f:
    f.writelines(chunks)

Episode: 0 / 100000
Episode: 10000 / 100000
Episode: 20000 / 100000
Episode: 30000 / 100000
Episode: 40000 / 100000
Episode: 50000 / 100000
Episode: 60000 / 100000
Episode: 70000 / 100000
Episode: 80000 / 100000
Episode: 90000 / 100000
