In [1]:
import sys
import random
import numpy as np
from time import sleep
from IPython.display import clear_output

import gym
from randomwalk import RandomWalkEnv

## Define The Environment

In [11]:
class RandomWalk:
    """Random-walk environment with a built-in n-step TD prediction loop.

    The walker starts at ``START_STATE``. Every move jumps a uniformly drawn
    1..``MAX_JUMP`` states to the left or to the right. Positions at or
    beyond ``LEFT_TERMINAL`` / ``RIGHT_TERMINAL`` are clamped to that
    terminal state and end the episode with reward -1 / +1 respectively;
    every other transition yields reward 0.

    Parameters
    ----------
    step : int
        Number of steps ``n`` used for the n-step return in ``play``.
    lr : float
        Learning rate multiplied into the TD error before it is handed to
        the value function.
    gamma : float
        Discount factor.
    debug : bool
        When true, ``play`` prints a progress line every 5000 rounds.
    """

    START_STATE = 500
    LEFT_TERMINAL = 0
    RIGHT_TERMINAL = 1001
    MAX_JUMP = 100

    def __init__(self, step=1, lr=2e-5, gamma=1, debug=True):

        self.state = self.START_STATE
        self.actions = ['left', 'right']
        self.end = False
        self.step = step
        self.lr = lr
        self.gamma = gamma
        self.debug = debug

    def choose_action(self):
        """Return 'left' or 'right', drawn uniformly at random."""
        return np.random.choice(self.actions)

    def take_action(self, action):
        """Move the walker and return the new (possibly terminal) state.

        Any ``action`` other than 'left' is treated as a move to the right.
        Sets ``self.end`` when a terminal state is reached.
        """
        # jump size is uniform on 1..MAX_JUMP
        steps = np.random.choice(range(1, self.MAX_JUMP + 1))
        if action == 'left':
            state = self.state - steps
        else:
            state = self.state + steps

        # overshooting either edge lands exactly on that terminal state
        if state <= self.LEFT_TERMINAL:
            state = self.LEFT_TERMINAL
            self.end = True
        elif state >= self.RIGHT_TERMINAL:
            state = self.RIGHT_TERMINAL
            self.end = True

        self.state = state
        return state

    def give_reward(self):
        """Return -1 on the left terminal, +1 on the right terminal, else 0."""
        if self.state == self.LEFT_TERMINAL:
            return -1
        if self.state == self.RIGHT_TERMINAL:
            return 1
        return 0

    def reset(self):
        """Put the walker back on the start state and clear the end flag."""
        self.state = self.START_STATE
        self.end = False

    def play(self, value_function, rounds=1e5):
        """Run ``rounds`` episodes, updating ``value_function`` by n-step TD.

        Parameters
        ----------
        value_function : object
            Must provide ``value(state)`` and ``update(delta, state)``.
        rounds : int or float
            Number of episodes. Truncated with ``int()`` because the default
            ``1e5`` is a float and ``range`` only accepts integers.
        """
        for i_round in range(int(rounds)):

            self.reset()
            time_step = 0
            T = np.inf  # episode length; unknown until a terminal state is hit
            action = self.choose_action()

            # states[t] / rewards[t] are the state and reward at time t
            states = [self.state]
            rewards = [0]

            while True:

                if time_step < T:
                    next_state = self.take_action(action)

                    # record the state just reached so that states[t + 1]
                    # lines up with rewards[t + 1]
                    states.append(next_state)
                    rewards.append(self.give_reward())

                    if self.end:
                        if self.debug and (i_round+1) % 5000 == 0:
                            print("Round {}: End at state {} | number of states {}".format(i_round+1, next_state, len(states)))
                        T = time_step+1
                    else:
                        action = self.choose_action()

                # the time whose estimate is being updated
                time_updated = time_step - self.step + 1

                if time_updated >= 0:
                    # discounted sum of the rewards inside the n-step window,
                    # cut off at the end of the episode
                    G = 0
                    for i in range(time_updated+1, min(time_updated + self.step + 1, T+1)):
                        G += np.power(self.gamma, i-time_updated-1) * rewards[i]

                    # bootstrap from the current estimate unless the window
                    # already reaches the terminal state
                    if time_updated + self.step < T:
                        bootstrap_state = states[time_updated + self.step]
                        G += np.power(self.gamma, self.step) * value_function.value(bootstrap_state)

                    # update value function
                    updated_state = states[time_updated]
                    delta = self.lr * (G - value_function.value(updated_state))
                    value_function.update(delta, updated_state)

                if time_updated == T-1: break
                time_step += 1

## Explore The Environment

In [14]:
# Environment instance used by the cells below: n-step window of 1 and a
# learning rate of 1e-3 (gamma and debug keep their constructor defaults).
env = RandomWalk(step=1, lr=1e-3)

In [24]:
print('Environment Display:')
# reset() puts the walker back on its fixed start state (500) and clears the
# end-of-episode flag; it does not randomise anything.
env.reset()

# Each row pairs a label template with the value it reports: the walker's
# current state and the number of available actions.
for _template, _shown in (
    ("State Space {}", env.state),
    ("Action Space {}", len(env.actions)),
):
    print(_template.format(_shown))

Environment Display:
State Space 500
Action Space 2


## Define Function Approximation

In [6]:
# Divisor applied to a raw state by LinearValueFunction before its feature
# functions are evaluated.
num_states = 1000

In [7]:
class LinearValueFunction:
    """Linear state-value approximator over polynomial or Fourier features.

    The estimate for a state is ``dot(features(state / scale), weights)``
    where the features are ``x**i`` ('poly') or ``cos(pi * i * x)``
    ('fourier') for ``i = 0..order``.

    Parameters
    ----------
    order : int
        Highest feature index; ``order + 1`` features and weights are used.
    method : str
        Either 'poly' or 'fourier'.
    state_scale : number, optional
        Divisor applied to a raw state before the features are evaluated.
        When omitted, the module-level ``num_states`` is looked up on every
        call, as before.

    Raises
    ------
    ValueError
        If ``method`` is neither 'poly' nor 'fourier'.
    """

    def __init__(self, order, method='poly', state_scale=None):

        # i=i binds the current index into each lambda; without it every
        # feature would see the final value of the loop variable
        if method == 'poly':
            self.function = [lambda x, i=i: np.power(x, i) for i in range(0, order+1)]
        elif method == 'fourier':
            self.function = [lambda x, i=i: np.cos(np.pi*x*i) for i in range(0, order+1)]
        else:
            # fail here instead of with an AttributeError on the first value() call
            raise ValueError(
                "method must be 'poly' or 'fourier', got {!r}".format(method)
            )

        self.state_scale = state_scale
        self.weights = np.zeros(order+1)

    def _features(self, state):
        """Return the feature vector of ``state`` after scaling."""
        scale = num_states if self.state_scale is None else self.state_scale
        scaled = state / scale
        return np.array([func(scaled) for func in self.function])

    def value(self, state):
        """Return the current value estimate for ``state``."""
        return np.dot(self._features(state), self.weights)

    def update(self, delta, state):
        """Add ``delta`` times the feature vector of ``state`` to the weights.

        For a linear approximator the feature vector is the gradient of the
        estimate with respect to the weights.
        """
        self.weights += delta * self._features(state)

---