In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt


In [None]:
def lin_approx(x,w):
    """Return the logistic sigmoid of the linear score ``np.matmul(w, x)``.

    x: feature vector.
    w: weight array that is matrix-multiplied with ``x``.
    The result has whatever shape ``np.matmul(w, x)`` produces, with every
    entry squashed into the open interval (0, 1).
    """
    score = np.matmul(w, x)
    denominator = 1.0 + np.exp(-score)
    return 1.0/denominator

def lin_approx_grad(x,w):
    """Return ``(q, grad)`` for the sigmoid approximator at features ``x``.

    ``q`` is ``lin_approx(x, w)`` and ``grad`` is ``q * (1 - q) * x``, i.e.
    the derivative of the sigmoid output with respect to the weights.
    """
    value = lin_approx(x,w)
    # Derivative of the sigmoid at the current score, scaled by the features.
    slope = value*(1.0-value)
    return (value, slope*x)


In [None]:
# Position of the reward inside a step tuple
# (state, is_terminal, next_state, action, reward).
REWARD_I = 4

def SARSAnApprox(sequence, q_approx, q_grad, w, n, gamma, alpha, episodes=50000):
    """Semi-gradient n-step SARSA with a differentiable action-value approximator.

    Parameters
    ----------
    sequence : iterable
        Yields step tuples ``(state, is_terminal, next_state, action, reward)``.
        A step whose ``is_terminal`` flag is true is the last step of an
        episode; the following item starts the next episode.
    q_approx : callable
        ``q_approx(state, action, w)`` -> approximated action value.
    q_grad : callable
        ``q_grad(state, action, w)`` -> ``(value, gradient_wrt_w)``.
    w : array-like supporting in-place ``+=``
        Weights; updated in place.
    n : int
        Number of steps in the n-step return.
    gamma : float
        Discount factor.
    alpha : float
        Step size.
    episodes : int, optional
        Number of episodes to consume from ``sequence`` (default 50000, the
        value that used to be hard-coded).

    Returns
    -------
    None. The learned weights are left in ``w``.

    Raises
    ------
    StopIteration
        If ``sequence`` is exhausted before ``episodes`` episodes are seen.
    """
    # gamma_powered[k] == gamma**k for k = 0..n
    gamma_powered = [gamma**k for k in range(n+1)]
    sequence_iter = iter(sequence)
    for _ in range(episodes):
        step = next(sequence_iter)
        state, is_terminal, _, action, _ = step
        T = sys.maxsize
        t = 0
        history = [step]

        while True:
            # Time step whose estimate is updated in this iteration. It is
            # computed first so the progress message below never reads an
            # unassigned (or stale) value.
            tau = t-n+1

            if t < T:
                if is_terminal:
                    T = t+1
                    print("\rEpisode length is {:7}; tau={:7}".format(T,tau), end='', flush=True)
                else:
                    step = next(sequence_iter)
                    state, is_terminal, _, action, _ = step
                    history.append(step)

            if tau >= 0:
                # Discounted sum of the rewards observed from step tau onwards.
                G = np.sum( [gamma_powered[j]*history[tau+j][REWARD_I]
                             for j in range(0,min(n,T-tau))])
                if tau+n < T:
                    # Bootstrap from the most recently observed pair, which is
                    # (S_{tau+n}, A_{tau+n}) whenever this branch is taken.
                    G = G + gamma_powered[n] * q_approx(state, action, w)
                # The value and gradient must be taken at the pair being
                # updated, (S_tau, A_tau), not at the latest pair.
                Stau, _, _, Atau, _ = history[tau]
                q,grad = q_grad(Stau, Atau, w)
                w += alpha*(G - q)*grad

            if tau == T-1: break
            t += 1
    print("\nFinished.")

In [None]:
def tiling(value, low_bound, high_bound, pitch):
    """Encode ``value`` as a list of booleans over equally spaced cells.

    The range ``[low_bound, high_bound]`` is cut into
    ``int((high_bound - low_bound) / pitch + 1)`` cells. The cell holding
    ``value`` and up to two neighbours on each side are marked ``True``;
    every other cell is ``False``.
    """
    cell_count = int((high_bound-low_bound)/pitch + 1)
    centre = int((value - low_bound)/pitch)
    first_active = centre-2
    last_active = centre+2
    return [first_active <= cell <= last_active for cell in range(cell_count)]


In [None]:
# Smoke test of the Gym MountainCar environment: create it, read the
# observation/action space sizes, reset, and take one step.
import gym
env_name = 'MountainCar-v0'
env = gym.make(env_name)
state_shape = env.env.observation_space.shape[0]
actions_num = env.env.action_space.n
state = env.reset()
# `action` was not defined anywhere before this cell, so the step below raised
# NameError on a fresh kernel. Draw a random valid action instead.
action = env.action_space.sample()
# NOTE(review): assumes the 4-tuple step API; newer gym releases return five
# values from step() — confirm against the installed version.
next_state, reward, done, info = env.step(action)


### Mountain Car Problem Setup

In [None]:
def mc_getStartPosition():
    """Return a random start state ``(position, velocity)``.

    The position is one uniform draw scaled by 0.2 and shifted by -0.6;
    the velocity is always 0.0.
    """
    draw = np.random.rand(1)[0]
    start_position = draw*0.2-0.6
    start_velocity = 0.0
    return (start_position, start_velocity)

# Reward handed out on every transition.
REWARD = -1.0

def mc_getTransition(position, velocity, push):
    """Advance the mountain-car dynamics by one step.

    Returns ``(new_position, new_velocity, finished, REWARD)``.
    The velocity is clipped to [-0.07, 0.07]. Reaching -1.2 or below pins the
    position at -1.2 and zeroes the velocity; reaching 0.5 or above pins the
    position at 0.5 and sets ``finished`` to True.
    """
    raw_velocity = velocity + 0.001*push - 0.0025*np.cos(3.0*position)
    new_velocity = np.clip(raw_velocity, -0.07, 0.07)
    new_position = position + new_velocity

    # Left wall: stop the car dead at the boundary.
    if new_position <= -1.2:
        return (-1.2, 0.0, False, REWARD)
    # Goal on the right: the episode is over.
    if new_position >= 0.5:
        return (0.5, new_velocity, True, REWARD)
    return (new_position, new_velocity, False, REWARD)

### Run the learning

In [None]:
sys.path.append('..')
import SeqGen

def get_features(pos,vel):
    """Return the float32 feature vector for a (position, velocity) pair.

    The position tiling and the velocity tiling are concatenated, in that
    order, and converted to float32.
    """
    position_tiles = tiling(pos, -1.2, 0.5, 0.2)
    velocity_tiles = tiling(vel, -0.07, 0.06999, 0.02)
    return np.float32(position_tiles + velocity_tiles)

# Length of the state feature vector, measured on an arbitrary sample state.
features_dim = get_features(.6, 0.01).shape[0]

def getFeatures(k):
    """Feature vector for a state given as an indexable (position, velocity)."""
    return get_features(k[0],k[1])

# One-hot encoding of each of the three actions; row index == action id.
ACTION_FEATURES = [
    [1,0,0],
    [0,1,0],
    [0,0,1],
]
actions_dim = len(ACTION_FEATURES)

class getActionValues:
    """Indexable view that maps a state to the values of all its actions.

    ``instance[state]`` returns a list with one approximated value per action
    id in ``range(actions_num)``, computed with the weights given at
    construction time.
    """

    def __init__(self, approx_func, w, actions_num):
        """Store the approximator, its weights and the number of actions.

        approx_func: callable ``approx_func(state, action, w)``.
        w: weights handed to ``approx_func`` on every lookup (kept by reference).
        actions_num: number of discrete actions to evaluate.
        """
        self.approx_func = approx_func
        self.w = w
        # Previously dropped, which made __getitem__ depend on a global of
        # the same name.
        self.actions_num = actions_num

    def __getitem__(self, state):
        """Return ``[approx_func(state, a, w) for a in range(actions_num)]``."""
        # Use the instance's own weights and action count, not module globals.
        return [self.approx_func(state, action, self.w)
                for action in range(self.actions_num)]

def tiling_lin_approx(state, action, w):
    """Approximate the value of ``(state, action)`` under weights ``w``.

    The input vector is the state's tile features followed by the action's
    one-hot row from ``ACTION_FEATURES``; it is fed to ``lin_approx``.
    """
    state_part = get_features(state[0],state[1])
    action_part = ACTION_FEATURES[action]
    features = np.concatenate((state_part, action_part), axis=None)
    return lin_approx(features, w)

def tiling_lin_approx_grad(state, action, w):
    """Return ``lin_approx_grad`` evaluated on the features of ``(state, action)``.

    The input vector is the state's tile features followed by the action's
    one-hot row from ``ACTION_FEATURES``.
    """
    state_part = get_features(state[0],state[1])
    action_part = ACTION_FEATURES[action]
    features = np.concatenate((state_part, action_part), axis=None)
    return lin_approx_grad(features, w)

# Small random initial weights centred on zero: one row covering the state
# features followed by the action features.
w = 0.001 * (np.random.rand(1, features_dim + actions_dim) - 0.5)
# Action-value lookup handed to SeqGen.EpsilonGreedyPolicy.
q = getActionValues(approx_func=tiling_lin_approx, w=w, actions_num=actions_dim)

def getStateTransition(s,a):
    """Apply action id ``a`` to state ``s`` and repackage the result.

    ``s`` is indexed as (position, velocity). The push passed to
    ``mc_getTransition`` is ``a - 1.0``. Returns
    ``(finished, (position, velocity), reward)``.
    """
    outcome = mc_getTransition(s[0], s[1], a-1.0)
    new_position, new_velocity, finished, reward = outcome
    new_state = (new_position, new_velocity)
    return (finished, new_state, reward)

In [None]:
# Build the step stream that SARSAnApprox iterates over. It is wired from a
# policy over `q`, the start-state sampler and the transition function.
# NOTE(review): 0.1 is presumably the exploration rate (epsilon) and the
# trailing 1 is an opaque SeqGen argument — confirm both against SeqGen.
sequence = SeqGen.SequenceGenerator(SeqGen.EpsilonGreedyPolicy(q, 0.1), 
                                  mc_getStartPosition,
                                  getStateTransition,
                                  1
                                 )

In [None]:
# Probe state given as (position, velocity); indexing `q` yields the
# approximated value of every action in that state.
s=(.4001, 0.049)
q[s]

In [None]:
# Train the weights `w` in place with 8-step SARSA on the generated sequence.
SARSAnApprox(
    sequence,
    tiling_lin_approx,
    tiling_lin_approx_grad,
    w,
    n=8,
    gamma=0.99,
    alpha=0.1,
)

In [None]:
# Re-inspect the same probe state once training has run: show the action
# values and the action picked by a policy built over `q`.
# NOTE(review): 0.01 is presumably a smaller epsilon than the one used for
# training — confirm against SeqGen.EpsilonGreedyPolicy.
pol = SeqGen.EpsilonGreedyPolicy(q, 0.01)
s=(.4001, 0.049)
q[s], pol(s)