In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from rlemmc import environment, policy, trajectory, montecarlo

In [2]:
# Environment
# `env` is the task instance used by the rollouts and by plot_iteration;
# `horizon` is handed to trajectory.rollout_trajectories (presumably the
# number of steps per trajectory -- confirm in rlemmc.trajectory).
env = environment.AngularMovement()
horizon = 10

In [3]:
# Inference
# E-step routine; the training loop calls it as
# selectedInference(states, actions, rewards[:, -1]).
selectedInference = montecarlo.importance_sampling
# Number of passes of the training loop.
iterationCount = 100
# Forwarded to trajectory.rollout_trajectories on every pass.
sampleCount = 100

In [4]:
# Policy
# `p` is the learner that the training loop refits via p.m_step(...) on every
# pass. `kNearest` is forwarded to the KnnPolicyContinuous constructor
# (presumably the k of a k-nearest-neighbour model -- confirm in rlemmc.policy).
kNearest = 5
p = policy.KnnPolicyContinuous(kNearest)

In [5]:
def plot_trajectories(states, color='red'):
    """Draw every trajectory contained in ``states`` with pyplot.

    Each trajectory is drawn as a line through its points plus a round marker
    on every point; its final point is then drawn once more with the same
    marker and colour.

    Args:
        states: 3-D array indexed as ``[sample, step, dimension]``. Only
            dimensions 0 and 1 are used (as x and y respectively).
        color: Matplotlib colour applied to all lines and markers.
    """
    # Unpacking into exactly three names rejects anything that is not 3-D.
    trajectory_count, _, _ = states.shape

    for index in range(trajectory_count):
        path = states[index]
        xs, ys = path[:, 0], path[:, 1]

        plt.plot(xs, ys, '-', color=color)
        plt.plot(xs, ys, 'o', color=color)
        # Draw the last state of the trajectory once more.
        plt.plot(xs[-1], ys[-1], 'o', color=color)

In [None]:
def plot_iteration(states, newStates):
    """Plot two batches of trajectories together with the target state.

    ``states`` is drawn in red and ``newStates`` in green via
    ``plot_trajectories``; the ``targetState`` of the module-level ``env`` is
    added as a large blue dot. A grid is enabled and the figure is shown.

    Args:
        states: First trajectory batch, passed to ``plot_trajectories``.
        newStates: Second trajectory batch, passed to ``plot_trajectories``.
    """
    for batch, shade in ((states, 'red'), (newStates, 'green')):
        plot_trajectories(batch, color=shade)

    target = env.targetState
    plt.plot(target[0], target[1], 'o', color='blue', markersize=20)
    plt.grid()
    plt.show()

In [None]:
# One entry per iteration: mean over samples of the last reward column.
iterationRewards = []

for i in range(iterationCount):

    # The first pass samples from a uniform policy; every later pass samples
    # from `p`, the policy refitted in the previous M-steps.
    iterationPolicy = policy.UniformPolicyContinuous() if i == 0 else p

    states, actions, rewards = trajectory.rollout_trajectories(
        env, iterationPolicy, horizon, sampleCount)

    # E-Step: selectedInference turns the rollouts and their last reward
    # column into a new set of states and actions.
    newStates, newActions = selectedInference(states, actions, rewards[:, -1])

    # M-Step: refit the policy on the E-step output.
    p.m_step(newStates, newActions)

    # Uncomment to visualise every iteration:
    # plot_iteration(states, newStates)

    # Average Reward: record it, then report it for this iteration.
    iterationRewards.append(np.mean(rewards[:, -1]))
    print(f'Iteration {i+1} - Average Reward : {iterationRewards[i]}')

Iteration 1 - Average Reward : 0.0018033297931797552
Iteration 2 - Average Reward : 0.10143965046892006
Iteration 3 - Average Reward : 0.19111506545831614
Iteration 4 - Average Reward : 0.1766406643583903
Iteration 5 - Average Reward : 0.1745899855592202
Iteration 6 - Average Reward : 0.22885611304261647
Iteration 7 - Average Reward : 0.24223060598027615
Iteration 8 - Average Reward : 0.24005548183781045
Iteration 9 - Average Reward : 0.26953042824462914
Iteration 10 - Average Reward : 0.26973754238291203
Iteration 11 - Average Reward : 0.30664679400148276
Iteration 12 - Average Reward : 0.28596407239288524
Iteration 13 - Average Reward : 0.3334368822073654
Iteration 14 - Average Reward : 0.4050659731439192
Iteration 15 - Average Reward : 0.50148404925754
Iteration 16 - Average Reward : 0.49074668166701685
Iteration 17 - Average Reward : 0.5361835724599693
Iteration 18 - Average Reward : 0.5707763702518044
Iteration 19 - Average Reward : 0.6169011852160771
Iteration 20 - Average Reward

In [None]:
# Learning curve: the average reward recorded for each training iteration.
# The x-values are 1-based so they line up with the "Iteration N" numbers
# printed by the training loop, and the axes are labelled so the figure can
# be read on its own.
plt.plot(range(1, len(iterationRewards) + 1), iterationRewards)
plt.xlabel('Iteration')
plt.ylabel('Average Reward')
plt.grid()
plt.show()

In [None]:
# Visualise the final pass of the training loop: `states` (red) and
# `newStates` (green) still hold the values assigned in its last iteration;
# the blue dot is env.targetState.
plot_iteration(states,newStates)