In [1]:
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('..')
import SeqGen
import RacetrackSetup as rt
import MCM

In [2]:
def initLearning(track_shape, seed=113, q_init=-300.0, noise_scale=0.001):
    """Create the initial action-value array Q for a track.

    The array has shape
    ``tuple(track_shape) + (rt.MAX_VELOCITY + 1, rt.MAX_VELOCITY + 1, rt.ACTIONS_NUM)``
    (presumably position axes, two velocity components and the action index --
    confirm against RacetrackSetup). Every entry is ``q_init`` plus uniform
    noise drawn from ``[-noise_scale / 2, noise_scale / 2)``.

    Parameters
    ----------
    track_shape : sequence of int
        Leading dimensions of Q. Any sequence is accepted (tuple, list, ...).
    seed : int or None, optional
        Value handed to ``np.random.seed``. The default (113) reproduces the
        table this notebook has always used.
    q_init : float, optional
        Base value around which the entries are scattered.
    noise_scale : float, optional
        Width of the uniform noise interval.

    Returns
    -------
    numpy.ndarray
        Freshly allocated float array of the shape described above.

    Notes
    -----
    This reseeds NumPy's *global* legacy generator as a side effect, so any
    later ``np.random`` draws in the session are affected as well.
    """
    # tuple() lets callers pass a list (or any sequence) as the shape;
    # a bare `list + tuple` would raise TypeError.
    QShape = tuple(track_shape) + (rt.MAX_VELOCITY + 1, rt.MAX_VELOCITY + 1, rt.ACTIONS_NUM)
    np.random.seed(seed)
    # Centre the noise on zero, scale it, then shift to the base value.
    Q = (np.random.random(QShape) - 0.5) * noise_scale + q_init
    return Q

# Reward appended to every transition tuple by the step lambda in setupEnvironment.
REWARD = -1

def setupEnvironment(rt_contour, start_line, finish_line, Q, epsilon=0.1, episodes_max=1000):
    """Build the episode generator for one racetrack.

    Parameters
    ----------
    rt_contour, finish_line
        Forwarded unchanged to ``rt.getTransition``.
    start_line
        Forwarded to ``rt.getStartPosition`` whenever a start state is needed.
    Q
        Action-value array handed to ``SeqGen.EpsilonGreedyPolicy``.
    epsilon : float, optional
        Second argument of ``SeqGen.EpsilonGreedyPolicy`` (previously the
        hard-coded 0.1).
    episodes_max : int, optional
        Forwarded to ``SeqGen.SequenceGeneratorPlus`` (previously the
        hard-coded 1000).

    Returns
    -------
    The ``SeqGen.SequenceGeneratorPlus`` instance.
    """
    def get_start():
        # Used both for the first state of an episode and as the callback
        # passed to rt.getTransition.
        return rt.getStartPosition(start_line)

    def step(s, a):
        # rt.getTransition returns a tuple; the reward is appended as its
        # last element.
        return rt.getTransition(
            rt_contour, s, a, finish_line,
            getStartPosition=get_start,
        ) + (REWARD,)

    sequence = SeqGen.SequenceGeneratorPlus(
        SeqGen.EpsilonGreedyPolicy(Q, epsilon),
        get_start,
        step,
        episodes_max=episodes_max,
    )
    return sequence

class ImmutableGreedyPolicy:
    """Greedy policy frozen at construction time.

    The arg-max over the last axis of ``Q`` is computed once in ``__init__``
    and stored, so later changes to ``Q`` do not affect the actions returned.
    """

    def __init__(self, Q):
        """Store the index of the largest entry along the last axis of ``Q``."""
        greedy_actions = np.argmax(Q, axis=-1)
        self.action = greedy_actions

    def __call__(self, state):
        """Return the stored action for ``state`` (used directly as an index)."""
        table = self.action
        return table[state]

### Setup Environment #2

In [3]:
# Fresh Q array and episode generator for track #2.
Q = initLearning(track_shape=rt.track_shape_2)
sequence = setupEnvironment(
    rt_contour=rt.contour_2,
    start_line=rt.start_line_2,
    finish_line=rt.finish_line_2,
    Q=Q,
)

### Run Off-policy MC Control learning

In [4]:
# Accumulator passed to MCM.OffPolicyMCControl alongside Q; it has Q's shape.
# NOTE(review): presumably the cumulative importance-sampling weights --
# confirm against MCM.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24; the resulting dtype (float64) is the same.
C = np.zeros(shape=Q.shape, dtype=float)
MCM.learnByEpisode(
    sequence,
    lambda e, i: MCM.OffPolicyMCControl(
        e, Q, C, sequence.get_action.getDistribution, 0.99),
)

 Episode #:    1000; length:      13
Episodes generated: 1000


### Run On-policy First Visit MC Control learning

In [None]:
# Integer array of Q's shape, handed to MCM.OnPolicyFirstVisitMCControl
# together with Q.
# NOTE(review): `sequence` here is the generator already driven by the
# off-policy cell (built with episodes_max=1000); the every-visit section
# rebuilds it before learning -- confirm whether this cell needs a fresh one too.
C = np.zeros(Q.shape, dtype=int)
MCM.learnByEpisode(
    sequence,
    lambda ep, k: MCM.OnPolicyFirstVisitMCControl(ep, Q, C, 0.99),
)

### Run On-policy Every Visit MC Control learning

In [5]:
# New episode generator for track #2. Q is not re-initialised here, so the
# generator's policy is built on the Q array the earlier cells already updated.
sequence = setupEnvironment(
    rt_contour=rt.contour_2,
    start_line=rt.start_line_2,
    finish_line=rt.finish_line_2,
    Q=Q,
)

In [7]:
# Integer array of Q's shape, handed to MCM.OnPolicyEveryVisitMCControl
# together with Q.
C = np.zeros(Q.shape, dtype=int)
MCM.learnByEpisode(
    sequence,
    lambda ep, k: MCM.OnPolicyEveryVisitMCControl(ep, Q, C, 0.99),
)

 Episode #:    1000; length:      30
Episodes generated: 1000


### Setup Environment #1

In [None]:
# Fresh Q array and episode generator for track #1; rebinds the notebook-level
# names `Q` and `sequence` used by the learning cells.
Q = initLearning(track_shape=rt.track_shape_1)
sequence = setupEnvironment(
    rt_contour=rt.contour_1,
    start_line=rt.start_line_1,
    finish_line=rt.finish_line_1,
    Q=Q,
)