### <font color='orange'> **강화학습 • Reinforcement Learning | FINAL EXAM | 2020년12월14일 (월)**
**<font color='red'> Melia Putri Handayani • 2019 55718 • 산업및데이터공학과 부경대학교**

#### <font color='grey'> **Mountain Car Problem**
Solve the mountain car problem with linear function approximation.
Objective: 
* Implement a linear function approximation-based reinforcement learning algorithm
* Do not use non-linear function approximation or a lookup table
* Learn a policy which can complete any episode within 300 steps

Submit the source code to the ‘assignment’ board in the LMS
File format: ‘JunPyoHong_2015111438.py’
Due: 24:00, Dec. 14 

In [12]:
#Importing the Libraries
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
matplotlib.use('Agg')

from tqdm import tqdm

from math import floor

In [13]:
#Index hash table (IHT) used by the tile coder
class IHT:
    """Assign dense integer indices to hashable objects.

    Unseen objects receive consecutive indices (0, 1, 2, ...) in the order
    they are first requested. Once ``size`` distinct objects are stored,
    further unseen objects are no longer recorded; their index becomes
    ``hash(obj) % size``, so they may collide with existing entries.
    """

    def __init__(self, size_val):
        self.size = size_val
        # Number of lookups answered by hashing after the table filled up
        self.overfull_count = 0
        # Maps object -> assigned index
        self.dictionary = {}

    def count(self):
        """Return how many objects have been assigned an index."""
        return len(self.dictionary)

    def full(self):
        """Return True once the table holds at least ``size`` objects."""
        return len(self.dictionary) >= self.size

    def get_index(self, obj, read_only=False):
        """Return the index for ``obj``.

        If ``obj`` is unknown and ``read_only`` is true, ``None`` is
        returned and nothing is stored. Otherwise an unknown ``obj`` gets
        the next free index, or a hashed index when the table is full.
        """
        table = self.dictionary
        try:
            return table[obj]
        except KeyError:
            pass

        if read_only:
            return None

        used = self.count()
        if used < self.size:
            table[obj] = used
            return used

        # Table is full: announce it once, then fall back to plain hashing
        if self.overfull_count == 0:
            print('IHT full, starting to allow collisions')
        self.overfull_count += 1
        return hash(obj) % self.size

In [14]:
#Resolve one tile's coordinates to an index
def hash_coords(coordinates, m, read_only=False):
    """Turn a coordinate sequence into a tile index.

    The behaviour depends on the type of ``m``:

    * ``None`` -- return ``coordinates`` unchanged;
    * ``IHT``  -- look the coordinates (as a tuple) up in the table,
      forwarding ``read_only``;
    * ``int``  -- hash the coordinates (as a tuple) modulo ``m``.

    Any other type yields ``None``.
    """
    if m is None:
        return coordinates

    key = tuple(coordinates)
    if isinstance(m, IHT):
        return m.get_index(key, read_only)
    if isinstance(m, int):
        return hash(key) % m
    return None

In [15]:
#Tile coder entry point
def tiles(iht_or_size,num_tilings,floats,ints=None,read_only=False):
    """Return ``num_tilings`` tile indices for the given floats and ints.

    Each float is quantized as ``floor(f * num_tilings)``. For every tiling
    the quantized values are shifted by a per-dimension offset and integer
    divided by ``num_tilings``; the resulting coordinates, prefixed with the
    tiling number and followed by ``ints``, are resolved to an index with
    ``hash_coords(coords, iht_or_size, read_only)``.
    """
    extra_ints = [] if ints is None else ints
    quantized = [floor(value * num_tilings) for value in floats]

    indices = []
    for tiling in range(num_tilings):
        # The offset of dimension k in this tiling is tiling * (1 + 2 * k)
        stride = 2 * tiling
        coords = [tiling]
        for dim, q in enumerate(quantized):
            coords.append((q + tiling + dim * stride) // num_tilings)
        coords.extend(extra_ints)
        indices.append(hash_coords(coords, iht_or_size, read_only))
    return indices

**The tile-coding helper code ends here**

#### **Problem Description (Refer to the page 5 of the slides “6_Function Approximation_2.pdf”)**
* The car starts at the bottom of a valley.
* For any given state, the agent may choose to accelerate to the left, right or cease any acceleration.

<h5><center> **OBSERVATION** </center></h5>

Index |       Observation     |  Min  |  Max 
:---: | :-------------------: | :---: | :---:
  0   | Car position (x-axis) | -1.2  |  0.6
  1   | Car velocity          | -0.07 |  0.07


<h5><center> **ACTION** </center></h5>

Index |          Action          
:---: | :----------------------:
  0   | Accelerate to the left   
  1   | Do not accelerate        
  2   | Accelerate to the right  
  
  
**EPISODE TERMINATION**
> The car position is more than 0.5 or episode length is greater than 1000 steps

* Refer to “MountainCar_env.py” for the detailed information.

In [5]:
#State bounds: step() clamps position and velocity to these ranges
position_min, position_max = -1.2, 0.5
velocity_min, velocity_max = -0.07, 0.07

#Actions are encoded as signed throttle values (-1, 0, +1). The index
#encoding 0/1/2 was tried and did not work: step() multiplies the action
#value directly into the acceleration term of the velocity update.
acc_left, acc_none, acc_right = -1, 0, 1

#Order in which the actions are enumerated when they are evaluated
actions_order = [acc_left, acc_none, acc_right]

#Optimistic initial values are used, so a purely greedy policy (epsilon=0) is ok
epsilon = 0

In [6]:
#Mountain car dynamics: advance the car by one time step
def step(position,velocity,action):
    """Apply ``action`` at (``position``, ``velocity``) for one time step.

    Returns a tuple ``(new_position, new_velocity, reward)``. Velocity and
    position are clamped to their module-level bounds and the reward is
    always -1.0.
    """
    raw_velocity = velocity + 0.001 * action - 0.0025 * np.cos(3 * position)
    next_velocity = min(max(velocity_min, raw_velocity), velocity_max)
    next_position = min(max(position_min, position + next_velocity), position_max)

    # Reaching the left boundary stops the car
    if next_position == position_min:
        next_velocity = 0.0

    return next_position, next_velocity, -1.0

In [7]:
#Linear state-action value function over tile-coded features
class ValueFunction:
    """Action-value estimate computed as a sum of tile weights.

    A (position, velocity, action) triple activates ``num_of_tilings``
    tiles; its value is the sum of the weights of those tiles.
    """

    def __init__(self, step_size, num_of_tilings=8, max_size=2048):
        self.max_size = max_size
        self.num_of_tilings = num_of_tilings

        # The step size is shared equally between the tilings
        self.step_size = step_size / num_of_tilings

        self.hash_table = IHT(max_size)

        # One weight per tile index
        self.weights = np.zeros(max_size)

        # Scale factors applied to the state before it is handed to tiles()
        position_span = position_max - position_min
        velocity_span = velocity_max - velocity_min
        self.position_scale = self.num_of_tilings / position_span
        self.velocity_scale = self.num_of_tilings / velocity_span

    def get_active_tiles(self, position, velocity, action):
        """Return the indices of the tiles active for this state and action."""
        scaled_state = [self.position_scale * position,
                        self.velocity_scale * velocity]
        return tiles(self.hash_table, self.num_of_tilings, scaled_state, [action])

    def value(self, position, velocity, action):
        """Return the estimated value; 0.0 when ``position`` is ``position_max``."""
        if position == position_max:
            return 0.0
        tile_ids = self.get_active_tiles(position, velocity, action)
        return np.sum(self.weights[tile_ids])

    def learn(self, position, velocity, action, target):
        """Move the estimate for this state and action towards ``target``."""
        tile_ids = self.get_active_tiles(position, velocity, action)
        error = target - np.sum(self.weights[tile_ids])
        delta = self.step_size * error
        # Update tile by tile so that a repeated index is incremented each time
        for tile_id in tile_ids:
            self.weights[tile_id] += delta

    def cost_to_go(self, position, velocity):
        """Return the negated maximum action value at this state."""
        action_values = [self.value(position, velocity, action)
                         for action in actions_order]
        return -np.max(action_values)

In [8]:
#Epsilon-greedy action selection based on a value function
def get_action(position,velocity,value_function):
    """Choose an action at (``position``, ``velocity``) epsilon-greedily.

    With probability ``epsilon`` a uniformly random element of
    ``actions_order`` is returned. Otherwise the action with the highest
    estimated value is returned, with ties broken uniformly at random.

    Args:
        position: current car position.
        velocity: current car velocity.
        value_function: object exposing ``value(position, velocity, action)``.

    Returns:
        One of the elements of ``actions_order``, as drawn by
        ``np.random.choice``.
    """
    if np.random.binomial(1, epsilon) == 1:
        return np.random.choice(actions_order)

    values = [value_function.value(position, velocity, action)
              for action in actions_order]

    # Compute the maximum once instead of once per candidate
    best_value = np.max(values)

    # Select among the actions themselves rather than converting a list
    # index with a hard-coded "-1", which is only correct for [-1, 0, 1]
    greedy_actions = [action for action, value in zip(actions_order, values)
                      if value == best_value]
    return np.random.choice(greedy_actions)

In [9]:
#Semi-gradient n-step Sarsa: run one episode and learn from it
def semi_gradient_n_step_sarsa(value_function,n=1):
    """Run one episode, updating ``value_function`` with n-step returns.

    Args:
        value_function: provides ``value(...)`` and ``learn(...)``; it is
            updated in place.
        n: number of steps in each return.

    Returns:
        The value of the time counter when the loop exits (for ``n=1`` this
        is the number of steps taken in the episode).
    """
    # Start at rest at a random position around the bottom of the valley
    position = np.random.uniform(-0.6, -0.4)
    velocity = 0.0
    action = get_action(position, velocity, value_function)

    # Trajectory so far, indexed by time step. Entry 0 of the reward log is
    # a placeholder: returns are summed from index update_time + 1 >= 1.
    position_log = [position]
    velocity_log = [velocity]
    action_log = [action]
    reward_log = [0.999]

    tick = 0
    # Time step at which the episode ends; unknown until the goal is reached
    horizon = float('inf')

    while True:
        tick += 1

        if tick < horizon:
            # Act, observe the next state and pick the following action
            position, velocity, reward = step(position, velocity, action)
            action = get_action(position, velocity, value_function)

            position_log.append(position)
            velocity_log.append(velocity)
            action_log.append(action)
            reward_log.append(reward)

            if position == position_max:
                horizon = tick

        # Time step whose estimate is updated in this iteration
        update_time = tick - n
        if update_time >= 0:
            bootstrap_time = update_time + n
            last_reward = min(horizon, bootstrap_time)

            # Sum the rewards observed after update_time
            returns = 0.0
            for reward_value in reward_log[update_time + 1:last_reward + 1]:
                returns += reward_value

            # Bootstrap from the estimate n steps ahead
            if bootstrap_time <= horizon:
                returns += value_function.value(position_log[bootstrap_time],
                                                velocity_log[bootstrap_time],
                                                action_log[bootstrap_time])

            # States at the right boundary are not updated
            if position_log[update_time] != position_max:
                value_function.learn(position_log[update_time],
                                     velocity_log[update_time],
                                     action_log[update_time],
                                     returns)

        if update_time == horizon - 1:
            break

    return tick

In [10]:
#Plot the learned cost-to-go over a grid of states
def print_cost(value_function,episode,ax):
    """Scatter-plot ``value_function.cost_to_go`` on a 40x40 state grid.

    Args:
        value_function: object exposing ``cost_to_go(position, velocity)``.
        episode: zero-based episode number; the title shows ``episode + 1``.
        ax: 3D axes to draw on.
    """
    grid_size = 40
    position_grid = np.linspace(position_min, position_max, grid_size)
    velocity_grid = np.linspace(velocity_min, velocity_max, grid_size)

    # Evaluate position-major, velocity-minor, one call per grid point
    samples = [(p, v, value_function.cost_to_go(p, v))
               for p in position_grid
               for v in velocity_grid]
    axis_x, axis_y, axis_z = (list(column) for column in zip(*samples))

    ax.scatter(axis_x, axis_y, axis_z)
    ax.set_xlabel('Position')
    ax.set_ylabel('Velocity')
    ax.set_zlabel('Cost to go')
    ax.set_title('Episode %d' % (episode + 1))

In [11]:
#Train the agent and plot the learned cost-to-go at selected episodes
def figure_10_1():
    """Train for 1000 episodes and save cost-to-go plots to a PNG file.

    The cost-to-go is plotted after the first, the 100th and the last
    episode, side by side in one figure.
    """
    episodes = 1000
    plot_episodes = [0, 99, episodes - 1]

    fig = plt.figure(figsize=(40, 10), facecolor='w')
    # One 3D panel per plotted episode, keyed by episode number
    panels = {}
    for column, plotted_episode in enumerate(plot_episodes, start=1):
        panels[plotted_episode] = fig.add_subplot(
            1, len(plot_episodes), column, projection='3d')

    num_of_tilings = 8
    alpha = 0.5
    value_function = ValueFunction(alpha, num_of_tilings)

    for ep in tqdm(range(episodes)):
        semi_gradient_n_step_sarsa(value_function)
        if ep in panels:
            print_cost(value_function, ep, panels[ep])

    plt.savefig('MeliaPutriHandayani_201955718.png')
    plt.close()

#Run the experiment only when this file is executed as a script
if __name__ == '__main__':
    figure_10_1()

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:24<00:00, 40.98it/s]
