In [1]:
# Inject a CSS rule that forces elements with class `.container` to full width,
# so wide outputs are not squeezed into a narrow centre column.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
# NOTE(review): the `.container` selector presumably targets the classic
# Notebook page layout -- confirm it has an effect in the front-end you use.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import time
import numpy as np
import multiprocessing, time
import scipy.stats as stats
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from Rollout_Functions import range_act

In [3]:
def Rollout_search(thetaRange, nThetas, nSteps, lam, S, A, pol, improve=False,
                   update_every=100):
    """Run a rollout-driven interval search once per target in ``thetaRange``.

    Every sweep starts from the interval ``[lb, ub] = [0, 1]`` with the
    current position ``Xc = 0`` and the opposite end ``Xo = 1``.  At each of
    the ``nSteps`` steps the function

    1. draws ``nThetas`` uniform samples from ``[lb, ub]``,
    2. scores every action in ``A`` with ``range_act`` (one joblib task per
       action) and keeps the action with the smallest score,
    3. moves ``Xc`` towards ``Xo`` by ``(ub - lb) * action``, snapped to the
       nearest value in ``S``,
    4. shrinks the interval by comparing the new ``Xc`` with the sweep's
       target ``baseTheta``.

    The cost of a sweep is ``lam * total_distance + |ub - lb|``.

    Parameters
    ----------
    thetaRange : sequence of float
        One target value per sweep (must support ``len``).
    nThetas : int
        Number of uniform samples drawn from the current interval per step.
    nSteps : int
        Number of moves per sweep.
    lam : float
        Weight of the travelled distance in the sweep cost.
    S : numpy.ndarray
        Grid of admissible distances; each move is snapped to its nearest
        entry.
    A : indexable sequence of float
        Candidate actions, each a fraction of the current interval width.
    pol : numpy.ndarray
        Base policy; the tail ``pol[nn+1:]`` is handed to ``range_act`` at
        step ``nn``.
        NOTE(review): presumably one action per step (length ``nSteps``) --
        confirm against ``range_act``.
    improve : bool, optional
        When true, the chosen actions are accumulated per step and, every
        ``update_every`` sweeps, ``pol`` is replaced by their per-step mean.
    update_every : int, optional
        Number of sweeps between policy refreshes when ``improve`` is true.
        Defaults to 100, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Per-sweep costs, when ``improve`` is false.
    (numpy.ndarray, numpy.ndarray)
        ``(costs, pol)`` when ``improve`` is true, where ``pol`` is the policy
        in effect after the last refresh.

    Raises
    ------
    ValueError
        If ``improve`` is true and ``update_every`` is not positive.

    Notes
    -----
    Side effects: reseeds NumPy's global RNG with seed 1 and prints a
    carriage-return progress line after every sweep.
    """
    if improve and update_every <= 0:
        raise ValueError("update_every must be a positive integer")

    costs = []
    num_cores = multiprocessing.cpu_count()
    # Fixed seed so the per-step uniform samples are reproducible across runs.
    np.random.seed(1)

    if improve:
        # Running per-step sum of the actions chosen since the last refresh.
        Rollout_acts = np.zeros(nSteps)

    for nTheta, baseTheta in enumerate(thetaRange):
        Xc = 0
        Xo = 1
        lb = 0
        ub = 1
        totDist = 0

        # Refresh the base policy with the mean chosen action per step.  The
        # `nTheta` truthiness test skips sweep 0, where nothing is accumulated.
        if improve and nTheta and (nTheta % update_every == 0):
            pol = Rollout_acts / update_every
            Rollout_acts = np.zeros(nSteps)

        for nn in range(nSteps):
            state = np.abs(ub - lb)
            thetaRange1 = np.random.uniform(lb, ub, nThetas)

            # One task per candidate action; only the policy tail for the
            # remaining steps is passed on.
            Q = Parallel(n_jobs=num_cores)(delayed(range_act)
                        (thetaRange1, Xc, Xo, act, lam, pol[nn+1:], S) for act in A)

            bestAct = A[np.argmin(Q)]
            # Distance is a fraction of the interval width, snapped to grid S.
            dist = state * bestAct
            dist = S[np.argmin(np.abs(S - dist))]
            totDist += dist

            # Move towards the opposite end of the interval.
            if Xc < Xo:
                Xc += dist
            elif Xc > Xo:
                Xc -= dist

            # Keep the half of the interval on the target's side; the far end
            # of that half becomes the next heading.
            if Xc < baseTheta:
                lb = Xc
                Xo = ub
            else:
                ub = Xc
                Xo = lb

            if improve:
                Rollout_acts[nn] += bestAct

        costs.append(lam*totDist + np.abs(ub - lb))
        print("\rSweep: %d / %d"%(nTheta+1, len(thetaRange)),end="")

    if improve:
        return np.array(costs), pol
    else:
        return np.array(costs)


In [7]:
# Rollout Improvement Search Test
# Runs Rollout_search with improve=True, starting from a constant policy, and
# reports the run time, the average cost and the final (refreshed) policy.
# Note: not used later in this cell; Rollout_search queries the core count itself.
num_cores = multiprocessing.cpu_count()

lam      = 0.4   # weight of travelled distance in the per-sweep cost
nSteps   = 5     # moves per sweep
nStates  = 501   # number of points in the distance grid S
nThetas  = 10    # uniform samples drawn from the current interval per step
nActions = 101   # number of candidate actions in A

# Initial base policy: the same action g at every step.
g   = 0.4
pol = g*np.ones(nSteps)

S = np.linspace(0, 1.0, nStates)             # grid that each move distance is snapped to
A = np.linspace(0.0, 0.5, nActions)          # candidate actions (fractions of the interval width)

# Rollout_search refreshes the policy every 100 sweeps at sweep indices
# 100, 200, ...; with 1001 sweeps the refresh at index 1000 is included in the
# returned policy.
nSweeps = 1001

# Note: thetaMean / thetaStd are not used in this cell; the targets below are
# drawn uniformly from [0, 1).
thetaMean  = 0.5
thetaStd   = 0.1
np.random.seed(0)
thetaRange = np.random.rand(nSweeps)

start = time.time()
# Note: Rollout_search also calls np.random.seed(1) on entry.
np.random.seed(1)
# `pol` is rebound to the policy returned by the search.
costs, pol = Rollout_search(thetaRange, nThetas, nSteps, lam, S, A, pol, improve=True)
stop = time.time()
print("\nTime: %.2f"%(stop-start))
avg_cost = np.mean(costs)
print("Avg. cost: ", avg_cost)
print("Last 100 avg. cost: ", np.mean(costs[-100:]))
print("Policy: ", pol)

Sweep: 1001 / 1001
Time: 384.12
Avg. cost:  0.3432667332667333
Last 100 avg. cost:  0.34396
Policy:  [0.2147  0.25955 0.2809  0.32045 0.3545 ]


In [10]:
# Persist the current `pol` array (whatever the most recently run cell left in
# the kernel). np.save appends ".npy" to the given name.
# NOTE(review): the ./Policies directory presumably has to exist beforehand,
# and the file name appears to encode lam=0.4, nSteps=5, nStates=501,
# nActions=101 -- confirm it still matches the settings used to produce `pol`.
np.save('./Policies/Rollout_bestAction_Uniform_lam4_N5_s501_A101',pol)

In [None]:
# Rollout Search Test - no improvement
# Runs Rollout_search with its default improve=False, so the constant policy
# below is used for every sweep and only the cost array is returned.
# Note: not used later in this cell; Rollout_search queries the core count itself.
num_cores = multiprocessing.cpu_count()

lam      = 0.4   # weight of travelled distance in the per-sweep cost
nSteps   = 5     # moves per sweep
nStates  = 501   # number of points in the distance grid S
nThetas  = 10    # uniform samples drawn from the current interval per step
nActions = 101   # number of candidate actions in A

# Fixed base policy: the same action g at every step.
g   = 0.4
pol = g*np.ones(nSteps)

S = np.linspace(0, 1.0, nStates)             # grid that each move distance is snapped to
A = np.linspace(0.0, 0.5, nActions)          # candidate actions (fractions of the interval width)

nSweeps = 1000

# Note: thetaMean / thetaStd are not used in this cell; the targets below are
# drawn uniformly from [0, 1).
thetaMean  = 0.5
thetaStd   = 0.1
np.random.seed(0)
thetaRange = np.random.rand(nSweeps)

start = time.time()
# Note: Rollout_search also calls np.random.seed(1) on entry.
np.random.seed(1)
costs = Rollout_search(thetaRange, nThetas, nSteps, lam, S, A, pol)
stop = time.time()
print("\nTime: %.2f"%(stop-start))
avg_cost = np.mean(costs)

In [5]:
# Summarise `costs` / `avg_cost` as left in the kernel by the most recently run
# search cell: overall mean plus the means of the last and first 100 sweeps.
print("Avg. cost:           ", avg_cost)
print("Last 100 avg. cost:  ", np.mean(costs[-100:]))
print("First 100 avg. cost: ", np.mean(costs[:100]))


Avg. cost:  0.3466424000000001
Last 100 avg. cost:   0.35261600000000004
First 100 avg. cost:  0.33605599999999997
