## Policy Gradient Method: Learning Algorithm - Single agent

This script implements a policy gradient method (PGM) that directly improves the linear feedback controller (u = Kx) of the single-agent system.

### 'Continuous_car' 

Instantiated with a controller K, and boolean variables defining whether you want system noise or controller noise

### 'PGM'

This carries out a chosen number of updates to the controller, given an initial controller and an update constant (step size), epsilon.
It returns the list of controllers learned: the initial controller followed by the controller obtained after each episode.







In [1]:
import numpy as np
import pdb
import matplotlib.pyplot as plt


class Continuous_car():
    """Two-state (position, velocity) linear system under feedback u = K x.

    Dynamics:  x_next = A x + B u + w, where w only perturbs the velocity state.
    Stage cost: x'Ex + u'Fu.

    Parameters
    ----------
    K : array-like with two entries (shape (2,) or (1, 2))
        Feedback gain used by the policy u = K x.
    noisy_model : bool
        If truthy, the velocity state receives Gaussian noise with standard
        deviation 0.25 at every step; otherwise the model noise is zero.
    noisy_controller : bool
        If truthy, Gaussian noise with standard deviation
        ``sigma_controller`` (0.2) is added to every input.
    """

    def __init__(self, K, noisy_model=False, noisy_controller=False):

        # system matrices
        self.A = np.array([[1, 1], [0, 1]])
        self.B = np.array([[0.5], [1]])

        # cost matrices
        self.E = np.array([[1, 0], [0, 0.5]])
        self.F = np.array([1])

        # Arbitrary gain choice used (stability checked)
        self.K = K

        # Optimal Gains found using LQR on Matlab:
        # K_Optimal = [ 0.4634, 1.0170 ]

        self.disc_fact = 0.99
        self.n_states = 2

        # standard deviation of the model (velocity) noise
        self.sigma_model = 0.25 if noisy_model else 0

        # standard deviation of the controller noise (only applied when
        # noisy_controller is truthy)
        self.sigma_controller = 0.2
        self.noisy_controller = noisy_controller

    def GetPolicyInput(self, x):
        """Return the scalar input u = K x (+ controller noise if enabled).

        x: 2x1 array (any array-like with two entries is accepted)

        K may be stored as shape (2,) or (1, 2); both are flattened so that
        the result is always a scalar. Indexing the matmul result with [0]
        would leave a 1-element array behind for a (1, 2) gain.
        """
        gain = np.asarray(self.K, dtype=float).reshape(-1)
        state = np.asarray(x, dtype=float).reshape(-1)

        inp = float(np.dot(gain, state))  # scalar

        if self.noisy_controller:
            inp += self.GetControllerNoise()  # scalar

        return inp  # scalar

    def GetControllerNoise(self):
        """Return one scalar sample of zero-mean controller noise."""
        return self.sigma_controller * np.random.randn(1)[0]

    def GetCost(self, x, u):
        """Return the scalar one-step cost x'Ex + u'Fu.

        x: 2x1 array
        u: scalar
        """
        x = np.asarray(x)

        # x'Ex
        state_cost = (x.transpose() @ self.E @ x)[0][0]

        # u'Fu
        input_cost = u * u * self.F[0]

        return state_cost + input_cost  # scalar

    def GetNoise(self):
        """Return a 2x1 model-noise vector; only the velocity entry is noisy.

        A random sample is drawn even when sigma_model is 0, so the sequence
        of random numbers consumed does not depend on the noise setting.
        """
        velocity_noise = self.sigma_model * np.random.randn(1)[0]
        return np.array([0, velocity_noise]).reshape(2, 1)

    def GetNextState(self, current_state, current_input):
        """Return the 2x1 next state A x + B u + w."""
        drift = np.matmul(self.A, current_state)
        forced = self.B * current_input
        noise = self.GetNoise()
        return drift + forced + noise

    def RunEpisode(self, episode_length, state_initial):
        """Simulate one trajectory under the current policy.

        episode_length: integer, number of transitions to simulate
        state_initial: list form, e.g. [3,2] for position of 3 and velocity of 2

        Returns (state_list, cost_list, input_list, pos_list, vel_list).
        The state, position and velocity lists include the initial state and
        so have episode_length + 1 entries; the input and cost lists have
        episode_length entries. The k-th cost is evaluated on the state
        reached AFTER applying the k-th input, together with that input.
        """
        x = np.array(state_initial).reshape(2, 1)

        state_list = [x]
        cost_list = []
        input_list = []
        pos_list = [x[0][0]]
        vel_list = [x[1][0]]

        for _ in range(episode_length):
            u = self.GetPolicyInput(x)
            input_list.append(u)

            x = self.GetNextState(x, u)
            cost_list.append(self.GetCost(x, u))

            state_list.append(x)
            pos_list.append(x[0][0])
            vel_list.append(x[1][0])

        return state_list, cost_list, input_list, pos_list, vel_list

    def CollectSimulations(self, number_of_episodes=10, episode_length=50):
        """Run several episodes from random initial states.

        Each initial state is drawn from zero-mean Gaussians with standard
        deviation 4 (position) and 2 (velocity).

        Returns (Pos_Matrix_list, Vel_Matrix_list, U_Matrix_list,
        Cost_Matrix_list), each a list with one per-episode list per episode.
        """
        Vel_Matrix_list = []
        Pos_Matrix_list = []
        U_Matrix_list = []
        Cost_Matrix_list = []

        # spread of the random initial states
        sigma_pos = 4
        sigma_vel = 2

        for _ in range(number_of_episodes):
            # position is sampled before velocity
            x_in = sigma_pos * np.random.randn(1)[0]
            v_in = sigma_vel * np.random.randn(1)[0]

            _, costs, inputs, positions, velocities = self.RunEpisode(
                episode_length, [x_in, v_in]
            )

            Vel_Matrix_list.append(velocities)
            Pos_Matrix_list.append(positions)
            U_Matrix_list.append(inputs)
            Cost_Matrix_list.append(costs)

        return Pos_Matrix_list, Vel_Matrix_list, U_Matrix_list, Cost_Matrix_list

In [1]:
def GetGradLnPi(x_k, u_k, K, sig):
    """Return grad_K ln(pi(u_k | x_k)) for a Gaussian policy u ~ N(K x, sig^2).

    x_k - the current state vector (2x1)
    u_k - the current input value (scalar, or a 1-element array)
    K   - the current policy gain (u = Kx) FOR ONLY ONE INPUT, i.e. a 1x2
          matrix or a flat array with two entries
    sig - the standard deviation of the controller noise (scalar)

    Returns a 2x1 array.

    For a Gaussian with standard deviation sig,
        ln(pi) = -(u - Kx)^2 / (2 sig^2) + const,
    so the gradient with respect to K is (u - Kx) x / sig^2. The division is
    therefore by the variance sig**2, not by sig.
    NOTE(review): compared with dividing by sig, this rescales the gradient by
    1/sig, so a step size tuned against the old scaling may need adjusting.
    """
    state = np.asarray(x_k, dtype=float).reshape(-1, 1)
    gain = np.asarray(K, dtype=float).reshape(-1)
    applied_input = float(np.asarray(u_k, dtype=float).reshape(-1)[0])

    # deviation of the applied input from the policy mean
    residual = applied_input - float(np.dot(gain, state[:, 0]))

    return (residual / sig**2) * state

    
def GetValFunc(x_k,W):
    """Evaluate the quadratic value-function approximation at state x_k.

    x_k: 2x1 state (position in row 0, velocity in row 1)
    W:   4x1 weights for the terms position^2, position*velocity,
         velocity^2 and the constant offset, in that order

    Returns the scalar value.
    """
    pos, vel = x_k[0][0], x_k[1][0]
    w_pos_sq, w_cross, w_vel_sq, w_offset = W[0][0], W[1][0], W[2][0], W[3][0]

    pos_term = pos * pos * w_pos_sq
    cross_term = pos * vel * w_cross
    vel_term = vel * vel * w_vel_sq

    return pos_term + cross_term + vel_term + w_offset  # scalar

def GetGradV(x_k):
    """Return the 4x1 vector [position^2, position*velocity, velocity^2, 1].

    x_k: 2x1 state (position in row 0, velocity in row 1)
    """
    pos = x_k[0][0]
    vel = x_k[1][0]

    features = [pos * pos, pos * vel, vel * vel, 1]

    return np.array(features).reshape(-1, 1)
    


## This first version uses G_t as a Monte Carlo sample, i.e. we collect one trajectory and then, for each transition within it,
# we update Theta based on the actual observed return.




def PGM(K_initial,epsilon,n_episodes=1000,ep_length=500):
    """Improve a feedback gain with Monte Carlo policy-gradient updates.

    K_initial:  initial controller gain (array with two entries)
    epsilon:    update constant (step size)
    n_episodes: number of episodes; the controller is updated once per
                transition within each episode
    ep_length:  number of transitions simulated per episode

    Returns a list with n_episodes + 1 controllers: the controller used at
    the start of every episode, followed by the final controller.

    For every episode one noisy trajectory is simulated with the gain held
    fixed. Then, for each transition t, the gain is moved against
    disc_fact**t * G_t * grad ln(pi), where G_t is the discounted sum of the
    observed costs from t to the end of the episode. The update subtracts
    because the costs are being minimised.
    """
    # Initialise Controller
    K = K_initial

    list_of_controllers = []

    # progress is reported every tenth of the run (loop-invariant)
    ten_perc = n_episodes/10

    for episode in range(n_episodes):

        if episode%ten_perc == 0:
            print(f'{episode} done, {n_episodes-episode} remaining!')

        # record the gain this episode is simulated with
        list_of_controllers.append(K)

        # Generate an episode (we need states, actions, costs at every step)
        car = Continuous_car(K,noisy_model = True, noisy_controller=True)
        X1,V1,U,C = car.CollectSimulations(1,ep_length)

        # a single episode was requested, so unwrap the outer lists
        X1, V1, U, C = X1[0], V1[0], U[0], C[0]

        gamma = car.disc_fact
        sig = car.sigma_controller

        # Discounted return from every transition onwards, computed in one
        # backward pass: G_t = C_t + gamma * G_{t+1}. This replaces re-summing
        # the tail of the cost list for every transition (quadratic in the
        # episode length).
        returns = [0] * len(C)
        running = 0
        for k in range(len(C) - 1, -1, -1):
            running = C[k] + gamma * running
            returns[k] = running

        K_this_ep = K.copy()

        # for each transition, we do an update:
        for transition_idx, u_k in enumerate(U):

            x_k = np.array([X1[transition_idx], V1[transition_idx]]).reshape(2,1)

            # grad(ln(pi)) evaluated at the sampled state and action, using
            # the most recently updated K
            gradJ = GetGradLnPi(x_k, u_k, K_this_ep, sig).reshape(1,2)

            step = epsilon * (gamma**transition_idx) * returns[transition_idx] * gradJ

            # the difference is 1x2; [0] flattens it back to two entries
            K_this_ep = (K_this_ep - step)[0]

        K = K_this_ep

    list_of_controllers.append(K)

    print('All Improvements Done!')

    return list_of_controllers