## Q-learning

#### Import packages

In [1]:
import pandas as pd
import numpy as np
import os
import random
from random import randint
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD , Adam, RMSprop
from keras.layers.advanced_activations import PReLU

Using TensorFlow backend.


#### Load input data, set hyperparameters and other variables for training

In [2]:
# --- Reproducibility --------------------------------------------------------
# Exploration in this notebook draws from the stdlib `random` module
# (random.uniform / randint), so seed it once for repeatable runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# --- Input data -------------------------------------------------------------
# NOTE(review): absolute, machine-specific path. Point `wd` at the folder that
# holds the two CSV files below before running on another machine.
#wd = 'C:\\Users\\homeuser\\Documents\\MEJ\\City MSc\\09DISSERTATION\\Coding\\RLcoding\\'
wd = 'C:\\Users\\homeuser\\Documents\\MEJ\\City MSc\\09DISSERTATION\\Report\\PicturesRawData\\RL\\'
os.chdir(wd)  # the CSV outputs written later use relative names and land here
rewardFileName = 'RewardComputation.csv'
envFileName = 'Environment.csv'
rewardMatrix = pd.read_csv(os.path.join(wd, rewardFileName))  # columns used: SourceID, TargetID, Cost
environment = pd.read_csv(os.path.join(wd, envFileName))
env = np.array(environment)  # grid; each cell holds a state id

# --- Hyperparameters --------------------------------------------------------
learningAlpha = 0.9       # learning rate used in the Bellman update
discountGamma = 0.9       # discount factor used in the Bellman update
greedyEpsilon = 0.9       # probability of taking a random (exploratory) step
greedyEpsilonRate = 0.1   # step size used to derive the number of epochs
episodeConvergence = 1    # 1 = Not Converged , 0 = Converged

# --- Q-value storage --------------------------------------------------------
# Indexed as [source state id, target state id].
# assumes state ids in the CSV files lie in 0..4 -- TODO confirm
QMat_Old = np.zeros(shape=(5, 5))
QMat_New = np.zeros(shape=(5, 5))
Q_Old = 0
Q_New = 0
QfromEpisodes = pd.DataFrame(
    columns=['Episode', 'QMatrixaverage', 'EpisodeTotalQ', 'Policy', 'PolicyLength', 'QvaluesForEpisode']
)

# --- Rewards ----------------------------------------------------------------
rewardStepChange = -5  # constant added to the reward of every step
rewardCost = 0
currentReward = 0

# --- Grid / action bookkeeping ----------------------------------------------
# (row, col) that ends an episode.
# NOTE(review): hard-coded; `target` below is derived from the grid shape but
# the two are not tied together -- confirm they agree for your Environment.csv.
endPos = (5, 5)
num_actions = 4
nrows, ncols = env.shape
target = (nrows - 1, ncols - 1)
visited_mark = 0.8  # Cells visited by the agent will be painted gray 0.8
#agent_mark = 0.5      # The current agent cell will be painted gray 0.5

# Action code -> action name.
actions_dict = {
    0: 'LEFT',
    1: 'UP',
    2: 'RIGHT',
    3: 'DOWN',
}
# Action name -> action code; derived so the two mappings cannot drift apart.
actionDecode_dict = {name: code for code, name in actions_dict.items()}

#### User-defined functions

In [3]:
def valid_actions(cell):
    """Return the action codes that keep the agent inside the grid.

    Parameters
    ----------
    cell : tuple
        (row, col) position inside the module-level grid ``env``.

    Returns
    -------
    list of int
        The subset of [0, 1, 2, 3] (LEFT, UP, RIGHT, DOWN per
        ``actions_dict``) that does not step past an edge of ``env``.
    """
    row, col = cell
    nrows, ncols = env.shape
    actions = [0, 1, 2, 3]

    # Each edge is tested independently (no elif): on a single-row or
    # single-column grid a cell lies on both opposite edges at once, and
    # both moves must be removed.
    if row == 0:
        actions.remove(1)   # UP would leave the grid
    if row == nrows - 1:
        actions.remove(3)   # DOWN would leave the grid
    if col == 0:
        actions.remove(0)   # LEFT would leave the grid
    if col == ncols - 1:
        actions.remove(2)   # RIGHT would leave the grid
    return actions
        
def act(currentPos, action):
    """Apply a named move to a grid position and look up the cell it lands on.

    Parameters
    ----------
    currentPos : sequence of two ints
        (row, col) the move starts from.
    action : str
        One of 'DOWN', 'UP', 'LEFT', 'RIGHT'. Any other value leaves the
        position unchanged.

    Returns
    -------
    tuple
        ``(nxtState, nxtPos)``: the value of ``env`` at the new position and
        the new position as a ``(row, col)`` tuple.
    """
    # (row delta, col delta) for every recognised move.
    offsets = {
        'DOWN': (1, 0),
        'UP': (-1, 0),
        'LEFT': (0, -1),
        'RIGHT': (0, 1),
    }
    start_row, start_col = currentPos
    d_row, d_col = offsets.get(action, (0, 0))
    landing = (start_row + d_row, start_col + d_col)
    return env[landing[0], landing[1]], landing

def getReward(sourceId,target):
    """Look up the cost of moving between two states in ``rewardMatrix``.

    Parameters
    ----------
    sourceId
        Value matched against the ``SourceID`` column.
    target
        Value matched against the ``TargetID`` column.

    Returns
    -------
    The ``Cost`` entry of the first row matching both ids.

    Raises
    ------
    IndexError
        If no row matches the (sourceId, target) pair.
    """
    from_source = rewardMatrix['SourceID'] == sourceId
    to_target = rewardMatrix['TargetID'] == target
    matching_costs = rewardMatrix.loc[from_source & to_target, 'Cost']
    return matching_costs.iloc[0]
    
def selectGreedyTargetsReward (nextStateReward):
    """Pick the candidate move with the highest immediate reward.

    Reads the module-level lists ``nextPos``, ``nextState`` and
    ``nextAction``, which must be aligned index-for-index with
    ``nextStateReward``.

    Parameters
    ----------
    nextStateReward : list
        Reward of each candidate move.

    Returns
    -------
    tuple
        ``(position, state, reward, action)`` of the first candidate whose
        reward equals the maximum.
    """
    bestReward = max(nextStateReward)
    winner = nextStateReward.index(bestReward)  # first occurrence wins ties
    return nextPos[winner], nextState[winner], bestReward, nextAction[winner]
    
def selectRandomTargetsReward (nextStateReward):
    """Pick one candidate move uniformly at random.

    Reads the module-level lists ``nextPos``, ``nextState`` and
    ``nextAction``, which must be aligned index-for-index with
    ``nextStateReward``.

    Parameters
    ----------
    nextStateReward : list
        Reward of each candidate move.

    Returns
    -------
    tuple
        ``(position, state, reward, action)`` of the randomly chosen candidate.
    """
    lastIndex = len(nextStateReward) - 1
    pick = randint(0, lastIndex)  # inclusive on both ends
    chosen = (nextPos[pick], nextState[pick], nextStateReward[pick], nextAction[pick])
    return chosen
    
def calculatenoOfEpochs():
    """Derive the number of epochs from the epsilon settings.

    Computes ``floor((greedyEpsilon - greedyEpsilonRate) / greedyEpsilonRate) + 1``
    from the module-level ``greedyEpsilon`` and ``greedyEpsilonRate``.

    Returns
    -------
    int
        The epoch count.
    """
    steps = (greedyEpsilon - greedyEpsilonRate) / greedyEpsilonRate
    # Binary floating point can land a hair below the exact quotient
    # (e.g. (0.7 - 0.1) / 0.1 evaluates to 5.999...), and a bare int() would
    # then drop a whole epoch. Nudge by a tiny tolerance before truncating.
    noOfEpochs = int(steps + 1e-9) + 1
    return noOfEpochs
    
def exploreAndExploit(Epsilon):
    """Decide whether the next step explores or exploits.

    Draws one number from ``random.uniform(0, 1)`` and compares it with
    ``Epsilon``.

    Parameters
    ----------
    Epsilon : float
        Threshold; the draw counts as "explore" when it is <= this value.

    Returns
    -------
    int
        1 to explore, 0 to exploit.
    """
    draw = random.uniform(0, 1)
    return 1 if draw <= Epsilon else 0
    
def bellmanEqn(currentStateReward,Q_Old,target):
    """Compute the updated Q-value for a move into state ``target``.

    Applies ``Qnew = Qold + alpha * ((reward + gamma * Q_max) - Qold)`` with
    the module-level ``learningAlpha`` and ``discountGamma``. ``Q_max`` is the
    largest ``QMat_Old[target, s]`` over the candidate follow-up states ``s``.

    Reads the module-level names ``rewardMatrix``, ``QMat_Old`` and
    ``targetPos`` (the position being moved into, set by the caller).

    Parameters
    ----------
    currentStateReward : float
        Reward of the move being evaluated.
    Q_Old : float
        Current Q-value of the (source, target) pair.
    target : int
        State id being moved into; used as the row index of ``QMat_Old``.

    Returns
    -------
    numpy.ndarray
        One-element array of shape (1,) holding the new Q-value.
    """
    targetOfTargetList = rewardMatrix[rewardMatrix['SourceID'] == target]['TargetID'].tolist()
    nextStateValidActions = valid_actions(targetPos)
    # NOTE(review): this intersects TargetID values with action codes from
    # valid_actions(); kept as-is, but confirm that mixing the two id spaces
    # is intended.
    validTargetOfTargetList = sorted(set(targetOfTargetList) & set(nextStateValidActions))

    if validTargetOfTargetList:
        # Index columns by the candidate state ids themselves. Using the loop
        # counter here would read columns 0..k-1 regardless of which states
        # are actually candidates.
        candidateQ = QMat_Old[target, validTargetOfTargetList]
        Q_max = np.array([candidateQ.max()], dtype=float)
    else:
        # No follow-up state: there is nothing to bootstrap from.
        Q_max = np.zeros(1)

    Qnew = Q_Old + learningAlpha * ((currentStateReward + (discountGamma * Q_max)) - Q_Old)
    return Qnew
    
def build_model(env, lr=0.001):
    """Build and compile a small fully connected Keras network.

    Layout: two ``Dense`` layers of width ``env.size``, each followed by
    ``PReLU``, then a ``Dense`` output layer with ``num_actions`` units
    (module-level constant). The model is compiled with mean-squared-error
    loss and the Adam optimizer.

    Parameters
    ----------
    env : numpy.ndarray
        Grid whose ``size`` sets the input width and hidden-layer width.
    lr : float, optional
        Learning rate handed to Adam (default 0.001).

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential()
    model.add(Dense(env.size, input_shape=(env.size,)))
    model.add(PReLU())
    model.add(Dense(env.size))
    model.add(PReLU())
    model.add(Dense(num_actions))
    # Pass an optimizer instance so `lr` is honoured; the string 'adam'
    # always used the library default and silently ignored the argument.
    # NOTE(review): `lr=` is the keyword of the old Keras API this notebook
    # imports; newer releases name it `learning_rate` -- adjust on upgrade.
    model.compile(optimizer=Adam(lr=lr), loss='mse')
    return model

#### Q-learning training

In [4]:
# Tabular training loop: each episode walks from (0, 0) until `endPos` is
# reached, updating one Q-value per step.
NUM_EPISODES = 5     # NOTE(review): noOfEpochs below only sizes the table; confirm which count is intended
VISIT_PENALTY = -20  # extra cost for stepping onto a position already visited

visited = set()      # positions entered so far; shared by all episodes (never cleared)
noOfEpochs = calculatenoOfEpochs()
QvalueForAllEpisodes = pd.DataFrame(index=range(noOfEpochs))

# The episode counter has its own name so the inner candidate loop cannot
# overwrite it (both loops previously used `i`, corrupting the recorded
# episode number).
for episode in range(NUM_EPISODES):
    stateCount = 0
    episodeQvalue = []
    policyPie = [env[0,0]]
    Qtrain_Reward = []
    Qtrain_Done = []
    Qtrain_Action = []
    currentPos = (0,0)
    episodeConvergence = 1
    QvalueForAllEpisodes_row = 0

    # NOTE(review): the loop only ends when the agent lands exactly on endPos.
    while(episodeConvergence):
        row, col = currentPos
        currentState = env[row, col]
        validActions = valid_actions(currentPos)

        # Candidate moves. These four lists stay module-level because the
        # select*TargetsReward helpers read them as globals.
        nextAction = []
        nextState = []
        nextStateReward = []
        nextPos = []
        for actionCode in validActions:
            actionName = actions_dict[actionCode]
            state, pos = act(currentPos, actionName)
            nextAction.append(actionName)
            nextState.append(state)
            nextPos.append(pos)
            nextStateReward.append(getReward(currentState, state))

        # Decide explore (random candidate) or exploit (best immediate reward).
        explore = exploreAndExploit(greedyEpsilon)
        if explore:
            [targetPos, targetState, targetStateReward, targetAction] = selectRandomTargetsReward(nextStateReward)
        else:
            [targetPos, targetState, targetStateReward, targetAction] = selectGreedyTargetsReward(nextStateReward)
        policyPie.append(targetState)
        Qtrain_Action.append(actionDecode_dict[targetAction])

        # Current Q-value of the (source state, target state) pair.
        Q_Old = QMat_Old[currentState,targetState]
        visitPenalty = VISIT_PENALTY if targetPos in visited else 0
        currentStateReward = targetStateReward + rewardStepChange + visitPenalty

        # bellmanEqn returns a one-element array; take the scalar out
        # explicitly instead of relying on float(array).
        Q_New = bellmanEqn(currentStateReward,Q_Old,targetState)
        qValue = float(Q_New[0])
        Qtrain_Reward.append(Q_New[0])

        QMat_New[currentState, targetState] = qValue
        # Keyed by the (episode, step) tuple -- presumably one column per step; verify against downstream use.
        QvalueForAllEpisodes[(episode, QvalueForAllEpisodes_row)] = qValue
        QvalueForAllEpisodes_row += 1
        episodeQvalue.append(qValue)
        stateCount = stateCount + 1
        visited.add(targetPos)

        # The target becomes the source of the next iteration.
        currentPos = targetPos
        if(currentPos == endPos):
            episodeConvergence = 0
            Qtrain_Done.append(1)
        else:
            Qtrain_Done.append(0)

    # Promote this episode's table so the next episode bootstraps from learned
    # values. Without this, QMat_Old stays all-zero and every update sees
    # Q_Old == 0 and Q_max == 0.
    QMat_Old[:] = QMat_New

    QMatrixaverage = (QMat_New.sum())/(rewardMatrix.shape[0])
    EpisodeSumQ = np.sum(episodeQvalue)
    QfromEpisodes.loc[len(QfromEpisodes)] = [episode,QMatrixaverage,EpisodeSumQ,policyPie,len(policyPie),Qtrain_Reward]

# Episode(s) with the highest summed Q-value.
bestReward = QfromEpisodes['EpisodeTotalQ'].max()
isBest = QfromEpisodes['EpisodeTotalQ'] == bestReward
bestPolicy = QfromEpisodes[isBest]
bestQvalues = QfromEpisodes.loc[isBest, 'QvaluesForEpisode']

# Trajectory of the last episode: states, Q-values, done flags, action codes.
Qtrain = [policyPie[1:], Qtrain_Reward, Qtrain_Done, Qtrain_Action]
[a,b,c,d] = Qtrain

QfromEpisodes.to_csv('QlearningEpisodes1.csv')
bestQvalues.to_csv('bestQ.csv')

print("Best policy ==> ", bestPolicy['Policy'])

Best policy ==>  4    [2, 1, 1, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 1, 1, ...
Name: Policy, dtype: object
