# Code For: Experimentally Comparing Reinforcement Learning Algorithms with Finite-Time Optimality Guarantees

## Setup Game

In [81]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Arrow, Circle
from itertools import product
from multiprocessing import Pool
from tqdm import tqdm
from collections import namedtuple

In [82]:
# Reward written into every rewarded (state, action) entry of R.
STAR_VAL = 10

# Grid size: RX columns (x axis) by RY rows (y axis); one state per cell.
RX = 6
RY = 3
NSTATES = RX*RY
# Number of action codes handled by move(): 0..4.
NACTIONS = 5

# Discount factor used in (I - gamma * P_pi)^-1 and in the Q backup.
gamma = 0.5

# Start-state weights: all mass on flat state index 0.
rho = np.zeros(NSTATES)
rho[0] = 1.0

def move(y, x, a):
    """Apply action ``a`` to grid position ``(y, x)`` without bounds checks.

    Action codes:
        0 -> x - 1
        1 -> stay in place
        2 -> x + 1
        3 -> y + 1
        4 -> y - 1

    Returns:
        The new ``(y, x)`` pair, or ``None`` when ``a`` is not one of the
        codes above. The result may lie outside the grid; clamping is left to
        the caller.
    """
    if a == 0:
        return y, x - 1
    if a == 1:
        return y, x
    if a == 2:
        return y, x + 1
    if a == 3:
        return y + 1, x
    if a == 4:
        return y - 1, x
    return None

# Deterministic transition tensor: P[y, x, a, ny, nx] is 1 when taking action
# a in cell (y, x) leads to cell (ny, nx), and 0 otherwise.
P = np.zeros((RY, RX, NACTIONS, RY, RX))
for idx in range(RY * RX):
    # Flat state index -> (row, column).
    y, x = divmod(idx, RX)
    for a in range(NACTIONS):
        ny, nx = move(y, x, a)
        # A move that would leave the grid is clamped to the border cell.
        ny = np.clip(ny, 0, RY - 1)
        nx = np.clip(nx, 0, RX - 1)
        P[y, x, a, ny, nx] = 1


# Reward table R[y, x, a]. With the grid sizes above, each rewarded pair is a
# move that ends in cell (y=2, x=RX-2): stay there, step right from its left
# neighbour, step left from its right neighbour, or push down against the
# bottom border.
# NOTE(review): entering that cell from (y=1, x=RX-2) with action 3 is not
# rewarded -- confirm this is intended.
R = np.zeros((RY, RX, NACTIONS))
R[2, [-2, -3, -1, -2], [1, 2, 0, 3]] = STAR_VAL

# Flatten the grid axes so that R is indexed [state, action].
R = R.reshape(RY * RX, NACTIONS)

## Projected Gradient Ascent

In [83]:
def get_d(gamma, Ptheta):
    """Return ``inv(I - gamma * Ptheta)``.

    Args:
        gamma: Scalar multiplying ``Ptheta``.
        Ptheta: Square matrix; the identity is built with the same shape.

    Returns:
        The matrix inverse computed by ``np.linalg.inv``.
    """
    shape = np.shape(Ptheta)
    identity = np.eye(*shape)
    return np.linalg.inv(identity - gamma * Ptheta)

def get_V(P, R, policy, d):
    """Return the per-state value vector of ``policy``.

    Args:
        P: Unused; kept so the signature matches the sibling helpers.
        R: Reward table indexed ``[state, action]``.
        policy: Action weights with the same shape as ``R``.
        d: Matrix applied to the expected one-step reward (callers in this
            file pass the result of ``get_d``).

    Returns:
        ``d.dot(r)`` where ``r[s] = sum_a policy[s, a] * R[s, a]``.
    """
    # Expected immediate reward in each state under the policy.
    expected_reward = np.sum(policy * R, axis=1)
    return d.dot(expected_reward)

def get_Q(P, R, policy, gamma, d):
    """Return the state-action values ``R + gamma * P V``.

    ``V`` comes from ``get_V(P, R, policy, d)``. ``P`` is viewed as a
    ``(NSTATES, NACTIONS, NSTATES)`` tensor, so the result is indexed
    ``[state, action]`` like ``R``.
    """
    state_values = get_V(P, R, policy, d)
    transitions = P.reshape(NSTATES, NACTIONS, NSTATES)
    # Expected next-state value for every (state, action) pair.
    expected_next_value = np.einsum('ijk,k->ij', transitions, state_values)
    return R + gamma * expected_next_value

def get_Ptheta2D(P, policy):
    """Return the state-to-state transition matrix induced by ``policy``.

    Args:
        P: Transition array holding ``n_states * n_actions * n_states``
            entries in ``[state, action, next_state]`` order (any shape that
            reshapes to that, e.g. the 5-D grid tensor built in this file).
        policy: ndarray of action weights whose last axis is the action axis
            and whose remaining axes enumerate the states.

    Returns:
        ``(n_states, n_states)`` array with
        ``out[s, s2] = sum_a policy[s, a] * P[s, a, s2]``.
    """
    # Sizes come from the policy itself rather than from module globals, so
    # the helper works for any state/action count.
    n_actions = policy.shape[-1]
    n_states = policy.size // n_actions

    transitions = P.reshape(n_states, n_actions, n_states)
    weights = policy.reshape(n_states, n_actions, 1)
    # Weight each action's next-state row, then sum out the action axis.
    return np.sum(transitions * weights, axis=1)

In [101]:
def get_direct_grad(P, R, gamma, policy):
    """Return the gradient of the start-weighted value w.r.t. the policy table.

    Args:
        P: Transition tensor accepted by ``get_Ptheta2D`` / ``get_Q``.
        R: Reward table indexed ``[state, action]``.
        gamma: Discount factor.
        policy: Current policy table, indexed ``[state, action]``.

    Returns:
        Array shaped like ``get_Q``'s result:
        ``(1 / (1 - gamma)) * (rho @ d)[s] * Q[s, a]``, using the module-level
        start weights ``rho``.
    """
    Ptheta = get_Ptheta2D(P, policy)
    d = get_d(gamma, Ptheta)
    # get_Q derives V internally, so no separate get_V call is needed here.
    q = get_Q(P, R, policy, gamma, d)
    # NOTE(review): get_d returns the plain inverse (no (1 - gamma) factor),
    # so the leading 1 / (1 - gamma) only rescales the gradient -- confirm the
    # intended step-size scaling.
    start_weighted_d = rho.dot(d)
    return (1 / (1 - gamma)) * start_weighted_d[:, None] * q

def projsplx(y):
    """Project the vector ``y`` onto the probability simplex.

    Python implementation of the sort-based algorithm from
    https://arxiv.org/abs/1101.6081

    Args:
        y: One-dimensional array-like (ndarray, list or tuple) of numbers.

    Returns:
        ndarray ``max(y - t, 0)`` for the threshold ``t`` selected by the
        algorithm; by construction the entries are non-negative.
    """
    # Accept plain sequences too: the final subtraction needs an ndarray.
    y = np.asarray(y)
    s = np.sort(y)
    n = len(y)

    # Walk the sorted values from the largest down, accumulating the sum of
    # the entries above position idx, and stop at the first threshold that is
    # at least as large as the next smaller entry.
    parsum = 0
    for idx in range(n - 2, -1, -1):
        parsum += s[idx + 1]
        tmax = (parsum - 1) / (n - (idx + 1))
        if tmax >= s[idx]:
            break
    else:
        # No candidate qualified (or n < 2): every entry stays active.
        tmax = (np.sum(s) - 1) / n

    return np.maximum(y - tmax, 0)

def policy_projsplx(policy):
    """Apply ``projsplx`` to every row of ``policy``.

    Returns:
        A new array created with ``np.zeros_like(policy)`` whose rows hold
        the projected rows; ``policy`` itself is left unmodified.
    """
    projected = np.zeros_like(policy)
    for state, action_weights in enumerate(policy):
        flat_weights = action_weights.flatten()
        projected[state, :] = projsplx(flat_weights)
    return projected

In [107]:
def projected_gradient_ascent(T=1000, eta=0.3):
    """Run projected gradient ascent on the directly-parameterised policy.

    Each iteration takes a gradient step on the policy table and projects
    every row back onto the probability simplex.

    Args:
        T: Number of iterations.
        eta: Step size.

    Returns:
        List of length ``T`` with ``rho.dot(V)`` after each iteration.
    """
    policy = np.random.rand(RY, RX, NACTIONS)
    policy = policy.reshape(RY*RX, NACTIONS)
    # Start from a valid policy: normalise each row so the first gradient is
    # evaluated at a probability distribution rather than at raw random
    # numbers whose rows do not sum to one.
    policy /= policy.sum(axis=1, keepdims=True)

    Vs = []
    for _ in range(T):
        grad = get_direct_grad(P, R, gamma, policy)
        # Ascent step followed by the row-wise simplex projection.
        policy = policy_projsplx(policy + eta * grad)

        d = get_d(gamma, get_Ptheta2D(P, policy))
        V = get_V(P, R, policy, d)
        Vs.append(rho.dot(V))

    return Vs

In [108]:
# Run projected gradient ascent; the list echoed below is its return value.
projected_gradient_ascent()

[0.0,
 0.0024126651540275347,
 0.004799276823861764,
 0.007270374753385305,
 0.009932722105797494,
 0.012898260806428382,
 0.016292558845980222,
 0.02026388995836355,
 0.024994005334495802,
 0.030711786469107178,
 0.03727689212879412,
 0.045272121910606415,
 0.05512536560205271,
 0.0673568367375883,
 0.08263905789079973,
 0.10187382714450001,
 0.12260860106020893,
 0.14762778740769253,
 0.17831695204565579,
 0.21621703402552253,
 0.2618178570512038,
 0.30614316090813143,
 0.3463014524978818,
 0.38740231863255997,
 0.4374663684008416,
 0.47666991372144624,
 0.5211827302252576,
 0.5864613297606245,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.625,
 0.6

## Entropy Regularized Softmax Policy Gradient

In [87]:
def entropy(x):
    """Return ``-sum(x * log(x + eps))`` over all entries of ``x``.

    ``eps`` (1e-20) is added inside the logarithm so that zero entries
    contribute zero instead of producing ``nan``.
    """
    eps = 1e-20
    values = np.asarray(x)
    return -np.sum(values * np.log(values + eps))

def discounted_entropy(policy, d):
    """Return the ``d``-weighted sum of per-state policy entropies.

    Args:
        policy: Policy table indexed ``[state, action]``.
        d: Square matrix indexed ``[start_state, state]``.

    Returns:
        Vector with one entry per start state:
        ``out[s] = sum_s2 d[s, s2] * entropy(policy[s2])``.
    """
    state_entropies = np.apply_along_axis(entropy, 1, policy)
    # NOTE(review): the original return statement was cut off after
    # "np.sum(d * state_entro". Summing over axis 1 yields one value per
    # start state, which is the shape get_V returns -- confirm this matches
    # the intended definition.
    return np.sum(d * state_entropies, axis=1)


def logit(p):
    """Return the log-odds ``log(p / (1 - p))`` of ``p``."""
    return np.log(p / (1 - p))


def softmax(x):
    """Return the softmax of the one-dimensional array-like ``x``."""
    x = np.asarray(x)
    # Shifting by the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large logits.
    exp_vals = np.exp(x - np.max(x))
    return exp_vals / np.sum(exp_vals)
    

In [89]:
def get_V_tilde(P, R, policy, d, temperature):
    """Return the entropy-regularised value ``V + temperature * H_d``.

    ``V`` is ``get_V(P, R, policy, d)`` and ``H_d`` is
    ``discounted_entropy(policy, d)``.
    """
    # The two terms are added; np.sum's second positional argument is an
    # axis, not another summand.
    return get_V(P, R, policy, d) + temperature * discounted_entropy(policy, d)


def get_Q_tilde(P, R, policy, gamma, d, temperature=0.0):
    """Return ``R + gamma * P V~`` with ``V~`` from ``get_V_tilde``.

    Args:
        P: Transition tensor; viewed as ``(NSTATES, NACTIONS, NSTATES)``.
        R: Reward table indexed ``[state, action]``.
        policy: Policy table indexed ``[state, action]``.
        gamma: Discount factor.
        d: Matrix from ``get_d`` for the same policy.
        temperature: Entropy weight forwarded to ``get_V_tilde``. The default
            of 0.0 gives the unregularised ``Q``; pass the real temperature
            for the regularised one.
    """
    V = get_V_tilde(P, R, policy, d, temperature)
    return R + gamma * np.einsum('ijk,k->ij', P.reshape(NSTATES, NACTIONS, NSTATES), V)


def get_A_tilde(P, R, policy, gamma, temperature, d=None):
    """Return the entropy-regularised advantage, indexed ``[state, action]``.

    A~(s, a) = Q~(s, a) - temperature * log pi(a|s) - V~(s)

    Args:
        P, R, policy, gamma, temperature: As in ``get_Q_tilde``.
        d: Optional precomputed ``get_d`` result for ``policy``; computed
            here when omitted.
    """
    if d is None:
        d = get_d(gamma, get_Ptheta2D(P, policy))
    q_tilde = get_Q_tilde(P, R, policy, gamma, d, temperature)
    v_tilde = get_V_tilde(P, R, policy, d, temperature)
    # Same guard against log(0) as entropy() uses.
    log_policy = np.log(policy + 1e-20)
    # v_tilde is per state; add an action axis so it broadcasts over actions.
    return q_tilde - temperature * log_policy - v_tilde[:, None]


def get_grad_V_tilde(P, R, policy, gamma, temperature):
    """Return the gradient of the regularised objective w.r.t. the logits.

    Computes ``(1 / (1 - gamma)) * (rho @ d)[s] * pi(a|s) * A~(s, a)``,
    indexed ``[state, action]``, using the module-level start weights ``rho``.

    NOTE(review): the ``pi(a|s)`` factor and the ``rho`` weighting follow the
    softmax policy-gradient expression and mirror ``get_direct_grad``; the
    original line was not runnable, so confirm against the intended formula.
    """
    Ptheta = get_Ptheta2D(P, policy)
    d = get_d(gamma, Ptheta)
    advantage = get_A_tilde(P, R, policy, gamma, temperature, d)
    return (1 / (1 - gamma)) * rho.dot(d)[:, None] * policy * advantage

In [None]:
def entropy_softmax_gradient_ascent(T=1000, eta=0.3, temp=.1):
    """Run entropy-regularised softmax policy gradient ascent.

    The policy is the row-wise softmax of a logits table that starts at all
    ones (i.e. a uniform policy) and is updated by plain gradient ascent.

    Args:
        T: Number of iterations.
        eta: Step size.
        temp: Entropy-regularisation temperature.

    Returns:
        ``(policy, Vs)``: the final policy table and the list of
        ``rho.dot(V~)`` values recorded after each iteration.
    """
    # np.ones takes the shape as a single tuple; the table is kept 2-D
    # ([state, action]) so it lines up with the gradient.
    logits = np.ones((RY * RX, NACTIONS))
    policy = np.apply_along_axis(softmax, 1, logits)

    Vs = []
    for _ in range(T):
        grad = get_grad_V_tilde(P, R, policy, gamma, temp)
        logits = logits + eta * grad
        policy = np.apply_along_axis(softmax, 1, logits)

        d = get_d(gamma, get_Ptheta2D(P, policy))
        V = get_V_tilde(P, R, policy, d, temp)
        Vs.append(rho.dot(V))
    return policy, Vs