# Pessimistic Neighbourhood Aggregation for States in Reinforcement Learning

Author: Maleakhi Agung Wijaya  
Supervisors: Marcus Hutter, Sultan Javed Majeed  
Date Created: 21/12/2017

In [67]:
import random
import math

## Mountain Car Environment

In [68]:
class MountainCarEnvironment:
    """
    Mountain Car Problem environment (after Sutton & Barto, 1998).

    The environment keeps a reference to the agent (car) and reads/writes the
    agent's ``state`` list, laid out as [velocity, position].
    """
    # Declared ranges of the two state features. They are not enforced by any
    # method of this class; NOTE(review): presumably the driving loop clips
    # the state against them - confirm against the caller.
    velocity_boundaries = (-0.07, 0.07)
    position_boundaries = (-1.2, 0.6)

    # Constructor for MountainCarEnvironment
    # Input: agent (car) whose state this environment drives
    # Output: MountainCarEnvironment object, with the car put in the start state
    def __init__(self, car):
        self.car = car
        self.reset()

    # Compute the state that follows the car's current state under an action
    # Input: action chosen by the agent (-1, 0 or 1 scales the engine push)
    # Output: [new velocity, new position]; the car's own state is NOT modified
    def nextState(self, action):
        current_velocity = self.car.state[0]
        current_position = self.car.state[1]

        # Engine push plus the slope-dependent gravity term
        next_velocity = current_velocity + (action * 0.001 + math.cos(3 * current_position) * -0.0025)
        # The position advances by the already-updated velocity
        next_position = current_position + next_velocity

        return [next_velocity, next_position]

    # Put the car back in the initial state (velocity 0.0, position -0.5),
    # mutating the car's state list in place
    def reset(self):
        self.car.state[0], self.car.state[1] = 0.0, -0.5

    # Reward for the state the car is currently in
    # Output: 0 once the position has reached 0.6 (terminal), otherwise -1
    def calculateReward(self):
        return 0 if self.car.state[1] >= 0.6 else -1

## KNN Agent

In [69]:
class Agent:
    """
    Agent (car) for the Mountain Car Environment, learning with the kNN-TD algorithm.

    Q(s, a) is approximated from a fixed store of sample points. Each point is a
    dict {"state": [velocity, position], "value": float, "action": int}. The value
    of an action in a state is the weighted average of the values of the k stored
    points with that action that lie nearest to the state, where distance is
    measured in standardised feature space.
    """

    # Constructor
    # Input: size of the storage for previous Q values, number k of neighbours used per action
    def __init__(self, size, k):
        self.state = [0.0, -0.5]
        self.actions = [-1, 0, 1]
        self.q_storage = []
        self.k = k  # fixed number of nearest neighbours used per action
        self.alpha = 0.5  # fixed learning rate

        # Neighbours (storage entries) and their (distance, storage index, weight)
        # tuples for the action chosen by the most recent selectAction call
        self.knn = []
        self.weight = []

        # Initialise the storage with random points, all valued -1.0
        for _ in range(size):
            initial_action = random.randint(-1, 1)
            initial_state = [random.uniform(-0.07, 0.07), random.uniform(-1.2, 0.6)]
            self.q_storage.append({"state": initial_state, "value": -1.0, "action": initial_action})

    # Standardise a feature vector so that both features lie in [-1, 1]
    # Input: raw feature vector [velocity, position]
    # Output: standardised feature vector
    def standardiseState(self, state):
        standardised_velocity = 2 * ((state[0]+0.07) / (0.07+0.07)) - 1
        standardised_position = 2 * ((state[1]+1.2) / (0.6+1.2)) - 1

        return [standardised_velocity, standardised_position]

    # Calculate Euclidean distance between 2 vectors
    # Input: 2 feature vectors (only the first two components are used)
    # Output: distance between them
    def calculateDistance(self, vector1, vector2):
        return math.sqrt((vector1[0]-vector2[0])**2 + (vector1[1]-vector2[1])**2)

    # Calculate total weight
    # Input: list of (distance, storage index, weight) tuples
    # Output: sum of the weights (0 for an empty list)
    def calculateTotalWeight(self, weight_list):
        return sum(entry[2] for entry in weight_list)

    # Apply the kNN algorithm to a feature vector, once per action
    # Input: raw feature vector of the state, list of actions to evaluate, and two
    #        lists that receive the results: for every action, in order, exactly k
    #        storage entries are appended to knn_list and the matching
    #        (distance, storage index, weight) tuples are appended to weight_list
    # Output: list with the approximate value of each action, in the order of `actions`
    # Raises: IndexError when fewer than k stored points exist for some action
    def kNNTD(self, state, actions, knn_list, weight_list):
        approximate_action = []
        standardised_state = self.standardiseState(state)

        for action in actions:
            # (distance, storage index, weight) for every stored point with this action
            candidates = []
            for index, data in enumerate(self.q_storage):
                if data["action"] != action:
                    continue
                # Stored states are raw, so bring them into the same standardised
                # space as the query before measuring the distance
                distance = self.calculateDistance(standardised_state, self.standardiseState(data["state"]))
                candidates.append((distance, index, 1 / (1+distance**2)))

            # Fail before touching the output lists so they never hold a partial result
            if len(candidates) < self.k:
                raise IndexError("only {} stored points for action {}, but k = {}".format(
                    len(candidates), action, self.k))

            # Keep the k nearest points; the sort is stable, so ties keep storage order
            neighbours = sorted(candidates, key=lambda entry: entry[0])[:self.k]

            # Normalise the weights of THIS action's neighbours into probabilities
            # and take the expectation of their stored values
            total_weight = self.calculateTotalWeight(neighbours)
            expected_value = 0
            for neighbour in neighbours:
                stored = self.q_storage[neighbour[1]]
                weight_list.append(neighbour)
                knn_list.append(stored)
                expected_value += (neighbour[2] / total_weight) * stored["value"]

            approximate_action.append(expected_value)

        return approximate_action

    # Select which action to take with an epsilon-greedy policy (epsilon = 0.1)
    # and remember the neighbours of the chosen action in self.knn / self.weight
    # Output: the chosen element of self.actions (-1 left, 0 neutral, 1 right)
    def selectAction(self):
        # Evaluate every action into fresh lists, so neighbours left over from a
        # previous step can never shift the per-action slices taken below
        candidate_knn = []
        candidate_weight = []
        action_value = self.kNNTD(self.state, self.actions, candidate_knn, candidate_weight)

        epsilon = 0.1
        random_number = random.uniform(0.0, 1.0)
        if random_number <= epsilon:
            # Explore: uniformly random action
            action_position = random.randrange(len(self.actions))
        else:
            # Exploit: first action with the highest Q(s,a)
            action_position = action_value.index(max(action_value))

        # kNNTD appended k entries per action, in the order of self.actions,
        # so the chosen action's neighbours form one contiguous slice
        start = self.k * action_position
        self.knn = candidate_knn[start:start + self.k]
        self.weight = candidate_weight[start:start + self.k]

        return self.actions[action_position]

    # Calculate TD target based on Q Learning/ SARSAMAX
    # Input: immediate reward given by the environment
    # Output: immediate reward plus the highest action value of the current
    #         self.state (no discount factor is applied)
    def calculateTDTarget(self, immediate_reward):
        # The neighbour lists are only needed by kNNTD itself here, so throw them away
        action_value = self.kNNTD(self.state, self.actions, [], [])

        return immediate_reward + max(action_value)

    # Q learning TD update of every neighbour remembered by the last selectAction
    # call, each in proportion to its normalised weight
    # Input: immediate reward given by the environment
    def TDUpdate(self, immediate_reward):
        td_target = self.calculateTDTarget(immediate_reward)

        total_weight = self.calculateTotalWeight(self.weight)
        for _, index, weight in self.weight:
            probability = weight / total_weight
            entry = self.q_storage[index]

            td_error = td_target - entry["value"]
            entry["value"] = entry["value"] + self.alpha*td_error*probability

## KNN Main Function