In [3]:
# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets the width of the page's `.container` element to 100%.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import truncnorm

from FH_Functions import get_truncated_normal

In [5]:
#@title Plain Q Training - Step Reward
def QL_stepCost(mean = 0.5, sd = 0.1, lam = 0.4, Nsamples = 5, Nstates = 501, Nactions = 101, 
        epsDecay=0.9999, learningRate = 0.001, Nthetas = 100, Nsweeps = 10000):
    """Learn a finite-horizon, tabular Q-function with epsilon-greedy Q-learning.

    Every episode starts with the current location ``Xc = 0``, the opposite
    end ``Xo = 1`` and the bracket ``[lb, ub] = [0, 1]``.  At each of at most
    ``Nsamples`` steps the agent moves ``Xc`` towards ``Xo`` by
    ``(ub - lb) * A[aa]``.  If the new ``Xc`` is below the episode's ``theta``
    it becomes the lower bound and ``Xo`` becomes ``ub``; otherwise it becomes
    the upper bound and ``Xo`` becomes ``lb``.  A step costs ``lam`` times the
    distance moved, and the terminal cost is ``|S[idXc] - S[idXo]|``.  ``Q``
    stores costs, so the greedy action is the argmin.

    Parameters
    ----------
    mean, sd : float
        Forwarded to ``get_truncated_normal(mean, sd, 0, 1)``, whose ``rvs``
        supplies the thetas (presumably the mean and standard deviation of a
        normal truncated to [0, 1] -- confirm against FH_Functions).
    lam : float
        Weight applied to the distance moved in the per-step cost.
    Nsamples : int
        Number of decision steps per episode; ``Q`` has ``Nsamples + 1`` layers.
    Nstates : int
        Number of grid points in ``S = linspace(0, 1, Nstates)`` used to
        discretise both ``Xc`` and ``Xo``.
    Nactions : int
        Number of actions; action ``k`` moves the fraction
        ``linspace(0, 0.5, Nactions)[k]`` of the current bracket width.
    epsDecay : float
        Factor applied to epsilon after every sweep (epsilon starts at 1 and
        is floored at 0.1).
    learningRate : float
        Step size of the Q update.
    Nthetas : int
        Number of thetas (episodes) drawn per sweep.
    Nsweeps : int
        Number of sweeps.

    Returns
    -------
    numpy.ndarray
        ``Q`` of shape ``(Nsamples + 1, Nstates, Nstates, Nactions)``, indexed
        as ``Q[step, idXc, idXo, action]``.  Layer ``Nsamples`` is never
        updated and keeps the terminal cost.

    Notes
    -----
    Reseeds NumPy's global random state with 1 and prints a progress line
    after every sweep.
    """
    S = np.linspace(0,1,Nstates)
    A = np.linspace(0,0.5,Nactions)

    # Every (step, action) slice starts at the width |S[idXc] - S[idXo]|
    # (smaller is better); broadcasting replaces the explicit triple loop.
    width = np.abs(S[:, None] - S[None, :])
    Q = np.empty((Nsamples+1, Nstates, Nstates, Nactions))
    Q[...] = width[None, :, :, None]

    epsilon = 1
    minEpsilon = 0.1
    np.random.seed(1)

    # The sampler's arguments never change, so build it once rather than on
    # every sweep.
    # NOTE(review): assumes get_truncated_normal is a pure factory with no
    # side effects -- confirm against FH_Functions.
    tn = get_truncated_normal(mean, sd, 0, 1)

    for ii in range(Nsweeps):
        thtRange = tn.rvs(size=Nthetas)

        for theta in thtRange:

            Xc = 0  # current location
            Xo = 1  # opposite end of interval
            lb = 0  # lower bound
            ub = 1  # upper bound

            for nn in range(Nsamples):
                # snap the continuous positions onto the state grid
                idXc = np.argmin(np.abs(Xc-S))
                idXo = np.argmin(np.abs(Xo-S))

                if np.abs(idXc - idXo) <= 1:
                    # Bracket is at most one grid cell wide: nothing is
                    # updated and the state can no longer change, so the
                    # remaining steps of this episode would be no-ops.
                    break

                state = ub - lb

                # choose next action according to epsilon-greedy policy
                if np.random.rand() < epsilon:
                    # explore with probability epsilon
                    aa = np.random.randint(Nactions)
                else:
                    # exploit with probability 1-epsilon
                    aa = np.argmin(Q[nn,idXc,idXo,:])

                dist = state*A[aa]

                # move the current location towards the opposite end
                if Xc < Xo:
                    Xc += dist
                elif Xc > Xo:
                    Xc -= dist

                # shrink the bracket on the side that was just probed
                if Xc < theta:
                    lb = Xc
                    Xo = ub
                else:
                    ub = Xc
                    Xo = lb

                newidXc = np.argmin(np.abs(Xc-S))
                newidXo = np.argmin(np.abs(Xo-S))

                # Reward is the distance travelled
                reward = lam*dist

                # Bootstrap from the best action of the *single* successor
                # state.  (Indexing with `newidXo :` instead of `newidXo, :`
                # would slice over all opposite-end states >= newidXo.)
                nextCost = np.amin(Q[nn+1, newidXc, newidXo, :])
                Q[nn, idXc, idXo, aa] += learningRate*(reward + nextCost - Q[nn, idXc, idXo, aa])

        epsilon = max(minEpsilon, epsilon*epsDecay)

        print('\rsweep: %d of %d. eps: %.4f'%(ii+1, Nsweeps, epsilon),end="")
    return Q

In [6]:
# Train the Q table and measure how long it takes (long-running cell).
# perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments during the run.
start = time.perf_counter()
Q = QL_stepCost(Nsamples = 5, epsDecay = 0.99995, Nsweeps = 50500)
stop = time.perf_counter()

sweep: 50500 of 50500. eps: 0.1000

In [10]:
print("\nTime: ", stop-start)
# np.save does not create missing directories; make sure the target folder
# exists so the trained table is not lost to a FileNotFoundError.
os.makedirs('./Policies', exist_ok=True)
np.save('./Policies/Q_table_mu5_sig1_lam4_N5_s501_A101',Q)


Time:  1405.4810228347778


In [8]:
Nsamples = 5
Nstates  = 501
Nactions = 101

# The sizes above are hard-coded, so fail loudly if they do not describe the
# Q table currently in memory instead of silently extracting a wrong policy.
assert Q.shape == (Nsamples + 1, Nstates, Nstates, Nactions), "Q table shape does not match Nsamples/Nstates/Nactions"

A = np.linspace(0,0.5,Nactions)
# Select the action that gives best (minimum) cost for every non-terminal step
# and every (idXc, idXo) pair; argmin over the action axis replaces the
# explicit triple loop and yields an array of shape (Nsamples, Nstates, Nstates).
bestAct = A[np.argmin(Q[:Nsamples], axis=-1)]

In [9]:
# np.save does not create missing directories; make sure the target folder exists.
os.makedirs('./Policies', exist_ok=True)
np.save('./Policies/QL_bestAction_mu5_sig1_lam4_N5_s501_A101',bestAct)