In [1]:
# Stretch the notebook's main container to the full browser width.
# `display` and `HTML` are taken from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated and is not
# available on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import multiprocessing
import os
import time

import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed

from FH_Functions import get_truncated_normal
from Rollout_Functions import range_act

In [3]:
def Rollout_search(thetaRange, thetaMean, thetaStd, nThetas, nSteps, lam, S, A, pol,
                   improve=False, update_every=100):
    """Run one rollout-guided search per target in ``thetaRange`` and return the costs.

    Every sweep starts at position ``Xc = 0`` with the bracket ``[lb, ub] = [0, 1]``
    and performs ``nSteps`` moves.  At each move every candidate action in ``A`` is
    scored with ``range_act`` (in parallel via joblib) and the action with the
    smallest score is applied: the searcher travels ``state * action`` (snapped to
    the nearest value in ``S``) towards the far end of the bracket, after which the
    bracket is shrunk to the side that still contains the target.

    Parameters
    ----------
    thetaRange : sequence of float
        One target value per sweep; it is only compared against the current
        position to decide which side of the bracket to keep.
    thetaMean, thetaStd : float
        Forwarded to ``get_truncated_normal`` together with the current
        ``min(Xc, Xo)`` / ``max(Xc, Xo)`` bounds (presumably the mean and standard
        deviation of the belief over the target -- confirm against FH_Functions).
    nThetas : int
        Number of samples drawn from that distribution at every step.
    nSteps : int
        Number of moves per sweep.
    lam : float
        Weight of the travelled distance in the sweep cost; also forwarded to
        ``range_act``.
    S : numpy.ndarray
        Grid of admissible distances; each move is snapped to its nearest entry.
    A : numpy.ndarray
        Candidate actions, each a fraction of the current bracket width.
    pol : sequence of float
        Base policy, one entry per step.  The tail ``pol[nn+1:]`` is handed to
        ``range_act`` at step ``nn`` (presumably the policy followed after the
        candidate action -- confirm against Rollout_Functions).
    improve : bool, optional
        When True, the actions chosen at each step are accumulated and, every
        ``update_every`` sweeps, ``pol`` is replaced by their per-step average.
    update_every : int, optional
        Number of sweeps between policy refreshes (only used when ``improve`` is
        True).  Defaults to 100, the value that used to be hard-coded.  Actions
        accumulated after the last refresh are not folded into the returned policy.

    Returns
    -------
    numpy.ndarray
        Cost of every sweep, ``lam * travelled_distance + final_bracket_width``.
    (numpy.ndarray, policy)
        When ``improve`` is True, the costs and the latest policy.

    Raises
    ------
    ValueError
        If ``improve`` is True and ``update_every`` is smaller than 1.

    Notes
    -----
    The function reseeds NumPy's global random state with ``1`` and prints a
    carriage-return progress line after every sweep.
    """
    if improve and update_every < 1:
        # A zero interval would otherwise surface as a ZeroDivisionError mid-run.
        raise ValueError("update_every must be a positive integer, got %r" % (update_every,))

    costs = []
    num_cores = multiprocessing.cpu_count()
    np.random.seed(1)  # makes the per-step sampling reproducible across calls

    if improve:
        # Per-step sum of the actions chosen since the last policy refresh.
        Rollout_acts = np.zeros(nSteps)

    for nTheta, baseTheta in enumerate(thetaRange):
        # Xc: current position, Xo: bracket end the searcher is heading towards.
        Xc = 0
        Xo = 1
        lb = 0
        ub = 1
        totDist = 0

        if improve:
            # Refresh the policy with the average action of the finished window
            # (never before the first sweep) and start a new window.
            if nTheta and (nTheta % update_every == 0):
                pol = Rollout_acts/update_every
                Rollout_acts = np.zeros(nSteps)

        for nn in range(nSteps):

            # Current bracket width.
            state = np.abs(ub - lb)

            # Sample candidate targets between the current position and the far end.
            thetaDist1  = get_truncated_normal(thetaMean, thetaStd, min(Xc,Xo), max(Xc,Xo))
            thetaRange1 = thetaDist1.rvs(size=nThetas)

            # Score every candidate action; the remaining policy tail is passed along.
            Q = Parallel(n_jobs=num_cores)(delayed(range_act)
                        (thetaRange1,Xc, Xo, act, lam, pol[nn+1:], S) for act in A)

            bestAct = A[np.argmin(Q)]

            # Travel a fraction of the bracket, snapped to the nearest grid value.
            dist = state*bestAct
            dist = S[np.argmin(np.abs(S - dist))]
            totDist += dist

            # Move towards Xo (no movement if already there).
            if Xc < Xo:
                Xc += dist

            elif Xc > Xo:
                Xc -= dist

            # Keep the half of the bracket that still contains the target and
            # head for its opposite end next.
            if Xc < baseTheta:
                lb = Xc
                Xo = ub
            else:
                ub = Xc
                Xo = lb

            if improve:
                Rollout_acts[nn] += bestAct

        costs.append(lam*totDist + np.abs(ub - lb))
        print("\rSweep: %d / %d"%(nTheta+1, len(thetaRange)),end="")
    if improve:
        return np.array(costs), pol
    else:
        return np.array(costs)

In [4]:
# Rollout w/ Policy Improvement Search Test

# Cost weight and problem dimensions.
lam = 0.4
nSteps = 5
nStates = 501
nThetas = 10
nActions = 101

# Starting policy: the same action fraction g at every step.
g = 0.4
pol = np.full(nSteps, g)

# Distance grid and candidate action fractions.
S = np.linspace(0, 1.0, num=nStates)
A = np.linspace(0.0, 0.5, num=nActions)

# One more sweep than a multiple of 100, so the last sweep already runs on the
# policy produced by the final refresh (Rollout_search refreshes every 100 sweeps).
nSweeps = 1001

# Targets for the sweeps, drawn reproducibly from the truncated normal on [0, 1].
thetaMean = 0.5
thetaStd = 0.1
thetaDist = get_truncated_normal(thetaMean, thetaStd, 0, 1)
np.random.seed(0)
thetaRange = thetaDist.rvs(size=nSweeps)

# Timed run; with improve=True the function also hands back the updated policy.
start = time.time()
costs, pol = Rollout_search(
    thetaRange, thetaMean, thetaStd, nThetas, nSteps, lam, S, A, pol,
    improve=True,
)
stop = time.time()

print("\nTime: %.2f" % (stop - start))
avg_cost = costs.mean()
print("cost: ", avg_cost)
print("Policy: ", pol)

Sweep: 1001 / 1001
Time: 383.14
cost:  0.2836799200799201
Policy:  [0.40165 0.1473  0.20675 0.28585 0.34075]


In [5]:
# plt.plot(pol)
# Persist the policy to disk (np.save appends the ".npy" extension).
# The output folder is created first so the save does not fail with
# FileNotFoundError when ./Policies does not exist yet.
os.makedirs('./Policies', exist_ok=True)
np.save('./Policies/Rollout_bestAction_mu5_sig1_lam4_N5_s501_A101',pol)

In [None]:
# Rollout Search Test - no improvement

# Cost weight and problem dimensions.
lam = 0.4
nSteps = 5
nStates = 501
nThetas = 10
nActions = 101

# Fixed base policy: the same action fraction g at every step.
g = 0.4
pol = np.full(nSteps, g)

# Distance grid and candidate action fractions.
S = np.linspace(0, 1.0, num=nStates)
A = np.linspace(0.0, 0.5, num=nActions)

nSweeps = 1000

# Targets for the sweeps, drawn reproducibly from the truncated normal on [0, 1].
thetaMean = 0.5
thetaStd = 0.1
thetaDist = get_truncated_normal(thetaMean, thetaStd, 0, 1)
np.random.seed(0)
thetaRange = thetaDist.rvs(size=nSweeps)

# Timed run; without improve only the per-sweep costs come back.
start = time.time()
costs = Rollout_search(
    thetaRange, thetaMean, thetaStd, nThetas, nSteps, lam, S, A, pol
)
stop = time.time()

print("\nTime: %.2f" % (stop - start))
avg_cost = costs.mean()
print("cost: ", avg_cost)