In [2]:
import gym
import numpy as np
import gym_maze
import random

In [14]:
# Maze environment built from the registered id 'maze-sample-5x5-v0'.
env = gym.make('maze-sample-5x5-v0')
# Number of discrete states; get_state() uses sqrt(state_n) as the row stride
# when flattening an observation, i.e. 5 for the value 25 used here.
state_n = 25
# Number of discrete actions handed to the agents
# (presumably the four move directions of the maze -- confirm against gym_maze).
action_n = 4
# NOTE(review): N is not referenced anywhere in the visible code.
N = 100

class RandomAgent():
    """Baseline agent that ignores the state and picks actions uniformly."""

    def __init__(self, action_n):
        """Remember how many discrete actions (0 .. action_n - 1) exist."""
        self.action_n = action_n

    def get_action(self, state):
        """Return a uniformly random action index in [0, action_n).

        `state` is accepted only so the agent is interchangeable with other
        agents; it is not used.
        """
        # randrange(n) draws from the same range as randint(0, n - 1).
        return random.randrange(self.action_n)

    
class CEM():
    """Tabular cross-entropy-method agent for discrete states and actions.

    The policy is a (state_n, action_n) array whose row `s` is the probability
    distribution over actions used when the agent is in state `s`.
    """

    def __init__(self, state_n, action_n):
        """Create an agent with a uniform policy.

        Args:
            state_n: number of discrete states (rows of the policy table).
            action_n: number of discrete actions (columns of the policy table).
        """
        self.state_n = state_n
        self.action_n = action_n
        # Every action is equally likely in every state to begin with.
        self.policy = np.ones((self.state_n, self.action_n)) / self.action_n

    def get_action(self, state):
        """Sample an action index for `state` from the current policy row.

        Args:
            state: integer state index into the policy table.

        Returns:
            A Python int in [0, action_n).
        """
        # Use the agent's own action count rather than a module-level global,
        # so the class works for any action_n it was constructed with.
        return int(np.random.choice(np.arange(self.action_n),
                                    p=self.policy[state, :]))

    def update_policy(self, elite_trajectories):
        """Refit the policy to the state/action pairs of the elite trajectories.

        Each row starts from a uniform prior of 1 / action_n per action, gets
        +1 for every (state, action) pair observed in `elite_trajectories`, and
        is then normalised to sum to 1. States that do not occur in any elite
        trajectory therefore end up with a uniform row again.

        Args:
            elite_trajectories: iterable of dicts with parallel 'states' and
                'actions' sequences of integer indices.
        """
        pre_policy = np.ones((self.state_n, self.action_n)) / self.action_n

        for trajectory in elite_trajectories:
            for state, action in zip(trajectory['states'], trajectory['actions']):
                pre_policy[state][action] += 1

        # The prior makes every row sum to at least 1, so the normalisation is
        # always well defined. Write in place to keep the same policy array.
        self.policy[:] = pre_policy / pre_policy.sum(axis=1, keepdims=True)
                
    
def get_state(obs):
    """Flatten a two-component observation into a single integer state index.

    The result is obs[1] * sqrt(state_n) + obs[0], truncated to int, where
    `state_n` is the module-level state count.
    """
    first, second = obs[0], obs[1]
    stride = np.sqrt(state_n)
    return int(second * stride + first)

def get_trajectory(agent, trajectory_len):
    """Roll out one episode in the module-level `env` using `agent`.

    Args:
        agent: object exposing get_action(state) -> action.
        trajectory_len: maximum number of environment steps to take.

    Returns:
        dict with keys 'states' and 'actions' (parallel lists holding the state
        each action was chosen in, and that action) and 'total_reward' (sum of
        the rewards returned by env.step). The state reached after the last
        step is not appended.
    """
    visited_states = []
    chosen_actions = []
    reward_sum = 0

    state = get_state(env.reset())

    for _step in range(trajectory_len):
        action = agent.get_action(state)
        visited_states.append(state)
        chosen_actions.append(action)

        obs, reward, done, _info = env.step(action)
        state = get_state(obs)
        reward_sum += reward

        # Stop as soon as the environment reports the episode is over.
        if done:
            break

    return {'states': visited_states,
            'actions': chosen_actions,
            'total_reward': reward_sum}


def get_elite_trajectories(trajectories, q_param):
    """Select the trajectories whose total reward beats the q-quantile.

    Args:
        trajectories: list of dicts carrying a 'total_reward' entry.
        q_param: quantile level in [0, 1] passed to np.quantile.

    Returns:
        List (in input order) of the trajectories whose 'total_reward' is
        strictly greater than the quantile. Because the comparison is strict,
        the list is empty when all rewards are equal.
    """
    rewards = [item['total_reward'] for item in trajectories]
    threshold = np.quantile(rewards, q_param)
    return list(filter(lambda item: item['total_reward'] > threshold, trajectories))
    
    

# Training hyperparameters.
epoch_n = 100         # number of policy updates
trajectory_n = 100    # k: trajectories sampled per epoch
trajectory_len = 100  # step cap for a single trajectory
q_param = 0.9         # quantile level used to pick elite trajectories

agent = CEM(state_n, action_n)

for _ in range(epoch_n):
    # Sample a batch of episodes with the current policy.
    trajectories = [get_trajectory(agent, trajectory_len) for _ in range(trajectory_n)]

    # Report the batch's average return as a progress signal.
    mean_total_reward = np.mean([trajectory['total_reward'] for trajectory in trajectories])
    print(mean_total_reward)

    elite_trajectories = get_elite_trajectories(trajectories, q_param)

    # Keep the current policy when no trajectory made it into the elite set.
    if not elite_trajectories:
        continue
    agent.update_policy(elite_trajectories)
    


-0.3787600000000002
0.35527999999999976
0.8494799999999999
0.8908
0.9286
0.93436
0.9336800000000003
0.93612
0.92168
0.9329999999999998
0.9360399999999999
0.9249199999999999
0.9349200000000001
0.9352399999999998
0.93884
0.90748
0.9400799999999998
0.9331599999999999
0.93964
0.9348800000000002
0.9358
0.9359199999999998
0.9368799999999999
0.9322
0.9337199999999999
0.9334
0.9221199999999999
0.9258799999999999
0.9216
0.9204399999999999
0.9383599999999997
0.9406399999999999
0.93372
0.9252400000000001
0.9375199999999999
0.93848
0.9364
0.9133999999999999
0.9191599999999999
0.93852
0.9350799999999998
0.9353599999999999
0.9352799999999999
0.9321200000000001
0.9355999999999999
0.9128399999999998
0.93152
0.9348399999999999
0.93472
0.9373600000000001
0.9214000000000001
0.93128
0.9363199999999999
0.93792
0.9251999999999997
0.9351199999999998
0.9395199999999998
0.9219199999999997
0.9257599999999999
0.92964
0.9362
0.9393600000000001
0.93448
0.9388799999999999
0.9303199999999998
0.9250399999999999
0.935

In [15]:
# Roll out one episode with the trained agent and print every observation,
# starting with the one returned by reset().
obs = env.reset()
state = get_state(obs)
print(obs)
for _ in range(trajectory_len):
    # Actions are still sampled from the policy table (agent.get_action draws
    # with np.random.choice), so the printed path can differ between runs.
    action = agent.get_action(state)
    
    obs, reward, done, _ = env.step(action)
    
    print(obs)
    
    state = get_state(obs)
    
    # Stop once the environment signals the end of the episode.
    if done:
        break

[0. 0.]
[1 0]
[0 0]
[1 0]
[2 0]
[2 1]
[2 2]
[2 3]
[2 4]
[3 4]
[3 3]
[3 2]
[3 2]
[3 1]
[4 1]
[4 2]
[4 3]
[4 4]
