In [1]:
#import packages
import numpy as np
import pandas as pd

In [2]:
class FQlearnTable:
    """Stack of ``FLevel`` Q-learners, one per level of a state hierarchy.

    Level 0 is the top level and works on the coarsest view of the state; the
    last level works on the raw state and chooses among the ``no_actions``
    primitive actions.  The extra action index ``no_actions`` is special: when
    the level above selects it, the level below is trained on the environment
    reward instead of being rewarded for copying the action chosen above.
    """

    def __init__(self, no_actions, no_layers=3):
        """Create one ``FLevel`` per layer.

        Args:
            no_actions: number of primitive actions; also used as the index of
                the special extra action available to the non-bottom levels.
            no_layers: number of levels in the hierarchy.
        """
        self.no_acts = no_actions
        self.no_layers = no_layers
        # Top level: its only choice is the special action.
        self.levels = {0: FLevel(action=[no_actions])}
        if no_layers > 1:
            # Middle levels: primitive actions plus the special action.
            for i in range(1, no_layers - 1):
                self.levels[i] = FLevel(action=list(range(no_actions + 1)))
            # Bottom level: primitive actions only.
            self.levels[no_layers - 1] = FLevel(action=list(range(no_actions)))

    def select_action(self, current_state):
        """Return a list with one selected action per level, level 0 first."""
        lev_state = self.get_lev_st(current_state)
        return [
            self.levels[i].select_action(lev_state[i])
            for i in range(self.no_layers)
        ]

    def memory(self, current_state, action, reward, new_state, done):
        """Update every level's Q-table from one environment transition.

        Args:
            current_state: raw ``(x, y)`` state before the step.
            action: per-level actions as returned by ``select_action``.
            reward: environment reward for the step.
            new_state: raw ``(x, y)`` state after the step.
            done: whether the episode ended with this step.

        Level 0 always learns from ``reward``.  Level ``i > 0`` learns from
        ``reward`` when the level above chose the special action
        (``self.no_acts``); otherwise it gets 0 for matching the action of the
        level above and -1 for deviating from it.
        """
        lev_state = self.get_lev_st(current_state)
        lev_primes = self.get_lev_st(new_state)

        good_r = 0
        bad_r = -1

        for i in range(self.no_layers):
            # The special action index is ``self.no_acts`` (see __init__), not
            # a fixed 4, so this also works for action sets of other sizes.
            if i == 0 or action[i - 1] == self.no_acts:
                rwd = reward
            elif action[i - 1] == action[i]:
                rwd = good_r
            else:
                rwd = bad_r

            self.levels[i].memory(lev_state[i], action[i],
                                  rwd, lev_primes[i], done)

    def get_lev_st(self, current_state):
        """Return the state as seen by each level, level 0 (coarsest) first.

        Each step up the hierarchy halves both coordinates (truncating towards
        zero), so the last entry is ``current_state`` itself and entry 0 is the
        most aggregated view.

        Args:
            current_state: raw state; only indices 0 and 1 are read.

        Returns:
            List of ``no_layers`` states ordered to line up with ``self.levels``.
        """
        state_array = [current_state]
        for _ in range(self.no_layers - 1):
            finer = state_array[-1]
            state_array.append((int(finer[0] / 2), int(finer[1] / 2)))
        # Built finest-first; flip so index i matches level i (the bottom,
        # primitive-action level must receive the raw state).
        state_array.reverse()
        return state_array


In [3]:
class FLevel:
    """Tabular Q-learner for a single level.

    The Q-table is a DataFrame with one column per action and one row per
    visited state; rows are keyed by ``str(state)`` and created on first use
    with all values set to 0.
    """

    def __init__(self, action, lr=0.01, r_decay=0.9, epsilon_g=0.9):
        """Set up the learner.

        Args:
            action: list of action labels (used as Q-table columns).
            lr: learning rate for the Q-value update.
            r_decay: discount factor applied to the next state's best Q-value.
            epsilon_g: probability of picking the greedy action in
                ``select_action`` (a random action is taken otherwise).
        """
        self.action = action
        self.lr = lr
        self.gamma = r_decay
        self.epsilon = epsilon_g
        # np.float64 rather than the np.float_ alias, which NumPy 2.0 removed.
        self.q_table = pd.DataFrame(columns=self.action, dtype=np.float64)

    def select_action(self, obs):
        """Choose an action for state ``obs``.

        With probability ``self.epsilon`` the action with the highest Q-value
        is returned (ties broken at random); otherwise a uniformly random
        action is returned.
        """
        obs = str(obs)
        self.verify_state(obs)
        if np.random.uniform() < self.epsilon:
            best = self.q_table.loc[obs, :]
            # Shuffle first so idxmax does not always favour the first column
            # among equal Q-values.
            best = best.reindex(np.random.permutation(best.index))
            action = best.idxmax()
        else:
            action = np.random.choice(self.action)
        return action

    def memory(self, current_state, act, reward, new_state, done):
        """Apply one Q-learning update for the given transition.

        Args:
            current_state: state in which ``act`` was taken.
            act: action label (a Q-table column).
            reward: reward received for the transition.
            new_state: state reached after the action.
            done: if true, the target is ``reward`` alone (no bootstrap).
        """
        current_state = str(current_state)
        new_state = str(new_state)
        # Register both endpoints so the update also works when the caller
        # never ran select_action / verify_state on current_state.
        self.verify_state(current_state)
        self.verify_state(new_state)
        q_predict = self.q_table.loc[current_state, act]
        if not done:
            q_target = reward + self.gamma * self.q_table.loc[new_state, :].max()
        else:
            q_target = reward
        self.q_table.loc[current_state, act] += self.lr * (q_target - q_predict)

    def verify_state(self, current_state):
        """Ensure the Q-table has a zero-initialised row for ``current_state``.

        The state is converted with ``str`` before the lookup, so passing a
        tuple or its string form refers to the same row.
        """
        key = str(current_state)
        if key in self.q_table.index:
            return
        row = pd.DataFrame(
            [[0.0] * len(self.action)],
            index=[key],
            columns=self.q_table.columns,
            dtype=np.float64,
        )
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # replacement.  Skip concat for the very first row so no empty frame
        # takes part in dtype resolution.
        if len(self.q_table.index) == 0:
            self.q_table = row
        else:
            self.q_table = pd.concat([self.q_table, row])

In [4]:
# Demo: one Q-learning level for a grid world with four actions.
flevel = FLevel(action=['Left', 'Right', 'Top', 'Bottom'])
print("Q_Table at start\n", flevel.q_table)

# Register the starting cell, given as grid-world (x, y) coordinates.
start_state = (0, 0)
flevel.verify_state(start_state)
print("\n\nQ Table after verifying initial state\n", flevel.q_table)

# Record one transition: action 'Right' moves the agent from (0,0) to (0,1)
# with a reward of 4; done=False means the episode has not ended.
flevel.memory(
    current_state=(0, 0),
    act='Right',
    reward=4,
    new_state=(0, 1),
    done=False,
)
print("\n\nQ Table after making a transition and getting a reward\n", flevel.q_table)



Q_Table at start
 Empty DataFrame
Columns: [Left, Right, Top, Bottom]
Index: []


Q Table after verifying initial state
         Left  Right  Top  Bottom
(0, 0)   0.0    0.0  0.0     0.0


Q Table after making a transition and getting a reward
         Left  Right  Top  Bottom
(0, 0)   0.0   0.04  0.0     0.0
(0, 1)   0.0   0.00  0.0     0.0
