# Code For: Experimentally Comparing Reinforcement Learning Algorithms with Finite-Time Optimality Guarantees

Contributors: Andrew Lee, Nathan Ng, Sam Poulin, Rose Zhang, and Fivos Kalogiannis 

## Setup Game

Before implementing the Reinforcement Learning algorithms, we first implement a version of Gridworld to test the algorithms on and define constants in.

We first begin with the necessary inputs for the rest of the code:

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Arrow, Circle
from itertools import product
from multiprocessing import Pool
from tqdm import tqdm
from collections import namedtuple

We then construct a class that allows us to store and represent a Grid World, alongside some requisite parameters:

In [3]:
class Gridworld:
    def __init__(self, w, h, gamma, rho = None):
        self.RX = w
        self.RY = h
        self.NSTATES = w*h
        self.NACTIONS = 5

        self.gamma = gamma

        if rho is None:
            self.rho = np.zeros(self.NSTATES)
            self.rho[0] = 1.0
        else:
            self.rho = rho

        #setup transition matrix:
        self.P = np.zeros( (self.RY, self.RX, self.NACTIONS, self.RY, self.RX) )
        for idx in range(self.RY*self.RX):
            x = idx % self.RX
            y = idx // self.RX
            for a in range(self.NACTIONS):
                ny, nx = self.move(y, x, a)
                ny = np.clip(ny, 0, self.RY-1)
                nx = np.clip(nx, 0, self.RX-1)
                self.P[y, x, a, ny, nx] = 1

        R = np.zeros((self.RY, self.RX, self.NACTIONS))
        self.R = R.reshape(self.RY*self.RX, self.NACTIONS)
    
    def set_reward(self, x, y, val):
        self.R = self.R.reshape(self.RY, self.RX, self.NACTIONS)
        if x not in range(0, self.RX) or y not in range(0, self.RY):
            return
        self.R[y, x, 1] = val
        if y-1 in range(0, self.RY):
            self.R[y-1, x, 3] = val
        if y+1 in range(0, self.RY): 
            self.R[y+1, x, 4] = val
        if x-1 in range(0, self.RX): 
            self.R[y, x-1, 2] = val
        if x+1 in range(0, self.RX):
            self.R[y, x+1, 0] = val
        self.R = self.R.reshape(self.RY*self.RX, self.NACTIONS)

    def move(self, y, x, a):
        if a == 0:
            x -= 1
            return y, x
        elif a == 1:
            return y, x
        elif a == 2:
            x += 1
            return y, x
        elif a == 3:
            y += 1
            return y, x
        elif a == 4:
            y -= 1
            return y, x

## Projected Gradient Ascent

From there, we are able to implement a Projected Gradient Ascent Reinforcement Learning algorithm to solve this implementation of Grid World.

We begin by defining functions used when calculating the value and gradient of a given policy for each step of the Projected Gradient algorithm:

In [4]:
def get_d(gamma, Ptheta):
    sz = np.shape(Ptheta)
    return np.linalg.inv(np.eye(*sz) - gamma * Ptheta)

def get_V(gw, policy, d):
    print(np.shape(policy))
    print(np.shape(gw.R))
    r_expect = np.sum(policy * gw.R, axis=1)
    V = np.zeros(gw.NSTATES)
    V = d.dot(r_expect)
    return V

def get_Q(gw, policy, d):
    V = get_V(gw, policy, d)
    return gw.R + gw.gamma * np.einsum( 'ijk,k->ij', gw.P.reshape( gw.NSTATES, gw.NACTIONS, gw.NSTATES) , V )

def get_Ptheta2D(P, policy, gw):
    Pprime = P.reshape(gw.NSTATES * gw.NACTIONS, gw.NSTATES) * policy.reshape(gw.NSTATES * gw.NACTIONS)[:, None]
    Ptheta = np.zeros((gw.NSTATES, gw.NSTATES))

    for s in range(gw.NSTATES):
        z = np.sum( Pprime[s*gw.NACTIONS:(s+1)*gw.NACTIONS, :], axis=0 )
        Ptheta[s, :] = z
    return Ptheta

def get_direct_grad(gw, policy):
    Ptheta = get_Ptheta2D(gw.P, policy, gw)
    d = get_d(gw.gamma, Ptheta)
    V = get_V(gw, policy, d)
    q = get_Q(gw, policy, d)
    grad = (1/(1 - gw.gamma)) * gw.rho.dot(d)[:, None] * q
    return grad

We then implement an algorithm for projecting onto a simplex described by Yunmei Chen and Xiaojing Ye ([arXiv:1101.6081](https://arxiv.org/abs/1101.6081)) to ensure that each state in a policy describes a valid probability distribution.

In [5]:
def projsplx(y):
    """
    Python implementation of:
    https://arxiv.org/abs/1101.6081
    """
    s = np.sort(y)
    n = len(y) ; flag = False

    parsum = 0
    tmax = -np.inf
    for idx in range(n-2, -1, -1):
        parsum += s[idx+1]
        tmax = (parsum - 1) / (n - (idx + 1) )
        if tmax >= s[idx]:
            flag = True ; break

    if not flag:
        tmax = (np.sum(s) - 1) / n

    return np.maximum(y - tmax, 0)

def policy_projsplx(policy):
    new_pol = np.zeros_like(policy)
    for idx, row in enumerate(policy):
        new_pol[idx, :] = projsplx(row.flatten())
    return new_pol

Finally, we use the functions defined above to implement a Projected Gradient Ascent Reinforcement Learning algorithm.

In [6]:
def projected_gradient_ascent(gw, T = 1000, eta = 0.3):
    policy = np.random.rand(gw.RY, gw.RX, gw.NACTIONS)
    policy = policy.reshape(gw.RY*gw.RX, gw.NACTIONS)

    for idx, row in enumerate(policy):
        policy[idx, :] = projsplx(row.flatten())

    Vs = []
    for _ in range(T + 1):
        ptheta = get_Ptheta2D(gw.P, policy, gw)
        d = get_d(gw.gamma, ptheta)
        V = get_V(gw, policy, d)
        Vs.append(gw.rho.dot(V))
        
        grad = get_direct_grad(gw, policy)
        policy = policy + eta * grad
        for idx, row in enumerate(policy):
            policy[idx, :] = projsplx(row.flatten())

    return policy, Vs

We can then run this algorithm 

In [11]:
gw = Gridworld(6, 3, 0.5)
gw.set_reward(4, 2, 10)

gradient_ascent_policies = []
gradient_asent_values_over_learning = []

for _ in range(1):
    policy, values = projected_gradient_ascent(gw)
    gradient_ascent_policies.append(policy)
    gradient_ascent_policies.append(values)

(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)


KeyboardInterrupt: 

## Entropy Regularized Softmax Policy Gradient

In [7]:
def entropy(x):
    eps = 1e-20
    return -np.sum([xi*np.log(xi+eps) for xi in x])

def discounted_entropy(policy, d):
    state_entropies = np.apply_along_axis(entropy, 1, policy)
    return np.sum(d * state_entropies)

def softmax(x):
    exp_vals = np.exp(x)
    return exp_vals/np.sum(exp_vals)
    

In [8]:
def get_V_tilde(gw, policy, d, temperature):
    #print("GVT: " + str(type(gw)))
    return np.sum(get_V(gw, policy, d), temperature * discounted_entropy(policy, d))

def get_Q_tilde(gw, policy, d, temperature):
    #print("GQT: " + str(type(gw)))
    P, R = gw.P, gw.R
    V = get_V_tilde(gw, policy, d, temperature)
    return R + gw.gamma * np.einsum( 'ijk,k->ij', P.reshape( gw.NSTATES, gw.NACTIONS, gw.NSTATES) , V )

def get_A_tilde(gw, policy, temperature, d):
    #print("GAT: " + str(type(gw)))
    return get_Q_tilde(gw, policy, d, temperature) - temperature * numpy.log(policy) - get_V_tilde(gw, policy, d, temperature)

def get_grad_V_tilde(gw, policy, temperature):
    #print("GGVT: " + str(type(gw)))
    Ptheta = get_Ptheta2D(gw.P, policy, gw)
    d = get_d(gw.gamma, Ptheta)
    return (1 / (1 - gw.gamma)) * d * get_A_tilde(gw, policy, temperature, d)

In [None]:
def entropy_softmax_gradient_ascent(gw, T = 1000, eta = 0.3, temp = 0.1):
    logits = np.ones((gw.RY, gw.RX, gw.NACTIONS))
    logits = logits.reshape(gw.RY*gw.RX, gw.NACTIONS)
    policy = np.apply_along_axis(softmax, 1, logits)
    #policy = policy.reshape(gw.RY*gw.RX, gw.NACTIONS)

    Vs = []
    for _ in range(T):
        grad = get_grad_V_tilde(gw, policy, temp)
        logits = logits + eta * grad
        policy = np.apply_along_axis(softmax, 1, logits)

        ptheta = get_Ptheta2D(gw.P, policy, gw)
        d = get_d(gw.gamma, ptheta)
        V = get_V_tilde(gw, policy, temp)
        Vs.append(gw.rho.dot(V))
    return policy, Vs

In [10]:
gw = Gridworld(6, 3, 0.5)
gw.set_reward(4, 2, 10)
entropy_softmax_gradient_ascent(gw)

(3, 6, 5)
(18, 5)


ValueError: operands could not be broadcast together with shapes (3,6,5) (18,5) 

In [None]:
gw = Gridworld(6, 3, 0.5)
gw.set_reward(4, 2, 10)

# print(np.shape(gw.R))
# print(gw.R[13])
# print(gw.R[12])
# print(gw.R[14])
# print(gw.R[8])



bad = 0
for i in range(10):
    policy, Vs = entropy_softmax_gradient_ascent(gw)
    if(Vs[-1]) < 0.6:
        bad += 1
        print(f"bad #{bad} (${i} processed):", Vs)

bad

(3, 6, 5)



KeyboardInterrupt

