In [None]:
import numpy as np
import os
import copy
import matplotlib.pylab as plt
import scipy.signal


In [1]:
# Fetch the two input files from GitHub into the working directory, saving them
# under the fixed local names given to -O (these are the names the loading cell reads).
# NOTE(review): all wget output is redirected to /dev/null, so a failed download
# is silent here and only shows up later when the files are loaded.
# NOTE(review): `&>` is bash redirection syntax — confirm the shell behind `!`
# is bash in the environment where this notebook runs.
!wget https://raw.githubusercontent.com/lucianodellier/Estudo_Swimmer/main/DadosNumpy500-5000/evolqlquantities_numpy_500steps.txt -O evolqlquantities.txt &> /dev/null
!wget https://raw.githubusercontent.com/lucianodellier/Estudo_Swimmer/main/DadosNumpy500-5000/deltas.txt -O deltas.txt &> /dev/null

In [None]:
# Directory and file names of the two text tables used by the rest of the notebook.
dir_ = '.'
eqlfile = 'evolqlquantities.txt'
deltasfile = 'deltas.txt'

# EQL: one row per recorded Q-learning transition (columns are split into
# named batches in a later cell).
EQL = np.loadtxt(f'{dir_}/{eqlfile}')

# DELTAS: stacked per-(state, action) tables, unpacked by ReadDeltasTables.
DELTAS = np.loadtxt(f'{dir_}/{deltasfile}')


In [None]:
# Sizes of the tabular problem, all derived from the number of balls.
nballs = 4
nlinks = nballs-1
nstates = 2**nlinks      # states are nlinks-bit binary words
nactions = nlinks        # an action selects one link
ndeltas = 6              # number of (nstates x nactions) tables stacked in DELTAS

# Maximum number of recorded transitions to use from EQL.
tfdata = 500

# Columns 0-2 (state, action, next state) are used as array indices, so they
# are stored as integers.
s_batch  = np.asarray(EQL[:tfdata,0],dtype=np.int32)
a_batch  = np.asarray(EQL[:tfdata,1],dtype=np.int32)
sn_batch = np.asarray(EQL[:tfdata,2],dtype=np.int32)

# Columns 3-8 (x, y, theta and their successors) are kept as floats.  The
# reward below is a float difference of columns 8 and 7, so these columns carry
# non-integer values; casting them to int32 (as was done before) truncates
# them and makes e.g. thetan_batch - theta_batch disagree with r_batch.
x_batch = np.asarray(EQL[:tfdata,3],dtype=np.float64)
y_batch = np.asarray(EQL[:tfdata,4],dtype=np.float64)
xn_batch = np.asarray(EQL[:tfdata,5],dtype=np.float64)
yn_batch = np.asarray(EQL[:tfdata,6],dtype=np.float64)
theta_batch = np.asarray(EQL[:tfdata,7],dtype=np.float64)
thetan_batch = np.asarray(EQL[:tfdata,8],dtype=np.float64)

# Number of Q-learning steps = number of transitions actually available.
# Equals tfdata whenever EQL has at least tfdata rows; if the file is shorter,
# this keeps the learning loop from indexing past the end of the batches.
nqlsteps = s_batch.shape[0]

# Reward definition selector.
rcase = 1

if rcase == 1: #rotation
    r_batch = EQL[:tfdata,8] - EQL[:tfdata,7]
else:
    # Fail here rather than with a NameError on r_batch in a later cell.
    raise ValueError(f'unsupported rcase = {rcase!r}; only rcase == 1 is defined')

In [None]:
def QL_steps_zb(s_batch,sn_batch,a_batch,r_batch,alpha,gamma,nsteps=None,shape=None):
    """Run tabular Q-learning over a recorded batch of transitions.

    Starting from an all-zero table, the transitions are processed in recorded
    order and each one applies the update

        Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (r + gamma * max(Q[s_next, :]))

    Parameters
    ----------
    s_batch, sn_batch, a_batch : integer sequences
        State, next state and action of each transition; used as table indices.
    r_batch : sequence of float
        Reward of each transition.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    nsteps : int, optional
        Number of leading transitions to process.  Defaults to the
        notebook-level ``nqlsteps``.
    shape : tuple of int, optional
        ``(n_states, n_actions)`` of the table.  Defaults to the notebook-level
        ``(nstates, nactions)``.

    Returns
    -------
    MQL : numpy.ndarray
        The learned table.
    DMQL : list of float
        Frobenius norm of the change of the table at every step.
    """
    # Fall back to the notebook globals only when the caller did not say
    # otherwise, so existing six-argument calls behave exactly as before.
    if shape is None:
        shape = (nstates, nactions)
    if nsteps is None:
        nsteps = nqlsteps

    MQL = np.zeros(shape)
    MQL_prev = MQL.copy()
    DMQL = []

    for it in range(nsteps):
        state = s_batch[it]
        action = a_batch[it]
        next_state = sn_batch[it]

        # Bootstrapped target: immediate reward plus discounted best next value.
        target = r_batch[it] + gamma*np.max(MQL[next_state,:])
        MQL[state,action] = (1-alpha)*MQL[state,action] + alpha*target

        DMQL.append(np.linalg.norm(MQL-MQL_prev))
        # A plain array copy is enough for the snapshot (no nested objects).
        MQL_prev = MQL.copy()

    return MQL, DMQL


def replay_zb(MQL,s_batch,sn_batch,a_batch,r_batch,alpha,gamma,DMQL,nreplay=2500,rng=None):
    """Refine a Q-table by replaying randomly sampled recorded transitions.

    A copy of ``MQL`` is updated ``nreplay`` times; each time one transition is
    drawn uniformly (with replacement) from the batch and the update

        Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (r + gamma * max(Q[s_next, :]))

    is applied to it.

    Parameters
    ----------
    MQL : numpy.ndarray
        Starting table.  It is not modified.
    s_batch, sn_batch, a_batch : integer sequences
        State, next state and action of each transition; used as table indices.
    r_batch : sequence of float
        Reward of each transition.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    DMQL : list
        Convergence history.  One value per replay step is appended to this
        list IN PLACE, and the same list object is returned.
    nreplay : int, optional
        Number of replay updates (default 2500).
    rng : numpy.random.Generator, optional
        Source of the random indices.  When omitted, the global ``np.random``
        state is used, so ``np.random.seed`` controls the sampling.

    Returns
    -------
    MQLC : numpy.ndarray
        The refined copy of the table.
    DMQL : list
        The list passed in, extended with the largest absolute change of the
        table at every replay step.

    Raises
    ------
    ValueError
        If the batch is empty while ``nreplay`` is positive.
    """
    n_samples = len(s_batch)
    if n_samples == 0 and nreplay > 0:
        raise ValueError('replay_zb: cannot replay from an empty batch')

    MQLC = MQL.copy()

    for _ in range(nreplay):
        if rng is None:
            rl = np.random.randint(n_samples, size=1)[0]
        else:
            rl = rng.integers(n_samples)

        state = s_batch[rl]
        action = a_batch[rl]
        next_state = sn_batch[rl]

        old_value = MQLC[state,action]
        target = r_batch[rl] + gamma*np.max(MQLC[next_state,:])
        MQLC[state,action] = (1-alpha)*old_value + alpha*target

        # Exactly one entry changes per step, so the max-abs change of the
        # whole table is the magnitude of that entry's change.  The magnitude
        # matters: the signed maximum reports 0 (or a negative number) whenever
        # an update lowers a Q-value, which makes the history look converged.
        DMQL.append(abs(MQLC[state,action] - old_value))

    return MQLC, DMQL

def find_gait_zb(state_dec, MQL):
    """Follow the greedy policy of ``MQL`` for ten steps from ``state_dec``.

    At every step the action with the largest Q-value in the current state's
    row is chosen, encoded as a one-hot vector over the links, and the bit of
    the selected link is flipped in the ``nlinks``-bit binary form of the state
    to obtain the next state.

    Uses the notebook-level ``nlinks``, ``ndeltas``, ``nstates``, ``nactions``
    and ``DELTAS``.

    Returns
    -------
    slist : list
        The ten states visited, starting with ``state_dec``.
    alist : list of numpy.ndarray
        The one-hot action taken in each of those states.
    vel : float
        Sum of the entries of deltas table number 2 for the (state, action)
        pairs of the last four steps, divided by 4.
    """
    delta_tables = ReadDeltasTables(ndeltas, nstates, nactions, DELTAS)
    # Place values of the binary state word, most significant bit first.
    bit_weights = 2**np.arange(nlinks-1,-1,-1,dtype=int)

    slist, alist = [], []
    vel = 0.0
    state = state_dec

    for step in range(10):
        bits = np.array([int(ch) for ch in np.binary_repr(state,width=nlinks)])

        # One-hot encoding of the greedy action.
        action = np.zeros(nlinks, dtype=int)
        action[np.argmax(MQL[state,:])] = 1

        # Adding the one-hot vector modulo 2 toggles the selected bit.
        flipped = (bits + action) % 2

        # Only steps 6..9 contribute; the first six are skipped.
        if step >= 6:
            vel += delta_tables[2][state, action.argmax()]

        slist.append(state)
        alist.append(action)
        state = np.dot(flipped,bit_weights)

    return slist, alist, vel/4

def ReadDeltasTables(ndeltas, nstates, nactions, filetable):
    """Split a stacked 2-D table into ``ndeltas`` separate per-state tables.

    Row ``d * nstates + state`` of ``filetable`` becomes row ``state`` of
    table ``d``.

    Parameters
    ----------
    ndeltas : int
        Number of tables stacked in ``filetable``.
    nstates : int
        Number of rows of each table.
    nactions : int
        Number of columns of each table.
    filetable : numpy.ndarray
        Two-dimensional array holding the tables one after another.

    Returns
    -------
    list of numpy.ndarray
        ``ndeltas`` float arrays, each of shape ``(nstates, nactions)``.
    """
    tables = [np.zeros(shape=(nstates, nactions)) for _ in range(ndeltas)]
    for row in range(ndeltas * nstates):
        table_index, state = divmod(row, nstates)
        tables[table_index][state, :] = filetable[row, :]
    return tables
  

In [None]:
# Sweep over discount factors (gamma) and learning rates (alpha): for every
# pair, learn a Q-table from the recorded batch, extract the greedy gait
# starting from the first recorded state, and print a one-line summary.
gammalist = [0.999,0.99,0.95,0.9,0.7,0.1] #0.96
alphalist = [1.0,0.5,0.1]

for alpha in alphalist:
  for gamma in gammalist:
    # Q-table and per-step change history for this (alpha, gamma) pair.
    MQL, DMQL = QL_steps_zb(s_batch,sn_batch,a_batch,r_batch,alpha,gamma)

    # Greedy rollout from the first state of the batch.
    prev_state_dec  = s_batch[0]
    slist, alist, vel = find_gait_zb(prev_state_dec, MQL)
    # Smooth the change history (Savitzky-Golay, window length 15, polynomial
    # order 0); the last smoothed value is reported as dQ.
    FDMQL = scipy.signal.savgol_filter(DMQL,15,0)
    print(f'alpha = {alpha}, gamma = {gamma:.3f}, dQ = {FDMQL[-1]:e}, {slist}, vel = {vel:.3f}')
    
    

alpha = 1.0, gamma = 0.999, dQ = 4.712329e-03, [0, 4, 6, 2, 3, 7, 5, 1, 0, 4], vel = -0.110
alpha = 1.0, gamma = 0.990, dQ = 4.051837e-04, [0, 4, 5, 1, 0, 4, 5, 1, 0, 4], vel = 0.008
alpha = 1.0, gamma = 0.950, dQ = 2.864333e-03, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 1.0, gamma = 0.900, dQ = 2.129663e-03, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 1.0, gamma = 0.700, dQ = 6.587908e-05, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 1.0, gamma = 0.100, dQ = 1.713336e-11, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 0.5, gamma = 0.999, dQ = 2.059390e-03, [0, 4, 6, 2, 3, 7, 5, 1, 0, 4], vel = -0.110
alpha = 0.5, gamma = 0.990, dQ = 3.101709e-04, [0, 4, 5, 1, 0, 4, 5, 1, 0, 4], vel = 0.008
alpha = 0.5, gamma = 0.950, dQ = 1.018289e-03, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 0.5, gamma = 0.900, dQ = 1.398558e-03, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.000
alpha = 0.5, gamma = 0.700, dQ = 7.966224e-04, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], vel = 0.0