# Q-Learning

In [4]:
import numpy as np
import pandas as pd
import random

# TODO: DEFINE ACTION SPACE FOR EACH STATE

### Define States ###
states = []
for i in range(14):
    for j in range(20):
        states.append((i+1, j+1))

# Left wall states
left_wall = []
for i in range(16):
    left_wall.append((i, 0))
# Upper wall states
upper_wall = []
for j in range(22):
    upper_wall.append((15, j))
# Right wall
right_wall = []
for i in range(16):
    left_wall.append((i, 21))
# Bottom wall
bottom_wall = []
for j in range(22):
    upper_wall.append((0, j))

# Concat to walls
walls = left_wall + upper_wall + right_wall + bottom_wall

# Vertical wall + tree (right from Trainer)
for i in range(4, 14+1):
	walls.append((i, 4))

# Horizontal trees + building + mountain
for i in range(9, 16):
    for j in range(4, 20+1):
	    walls.append((i, j))
walls.remove((9,8)) # Do not add door to walls

# Horizontal lower wall 
for j in range(4, 16+1):
	walls.append((4, j))
walls.remove((4,8)) # Do not add staircase to walls

# Vertial Call right from charmander
for i in range(3, 9+1):
	walls.append((i, 19)) 
    
# Vertical Wall left from charmander
walls.append((8, 12))
walls.append((7, 12))
for j in range(12, 20):
    walls.append((6, j))
walls.remove((6, 16))


### Define Rewards ###
rewards = {}
for state in states:
    # Catch Bisasam
    #if state == (8, 10):
        #rewards[state] = 100
    # Catch Schiggy
    #elif state == (2, 13):
        #rewards[state] = 100
    # Catch Charmander
    if state == (8, 18):
        rewards[state] = 100
    # Go to building
    #elif state == (9, 8):
        #rewards[state] = 1
    # For all other States
    else:
        rewards[state] = -0.5
    
# Function that returns the valid next state, given current state and action
def getNextState(a: str, s: tuple) -> tuple:
    if a == "U":
        next_state = (s[0] - 1, s[1])
        if next_state in walls:
            next_state = (s[0], s[1])

    if a == "D":
        next_state = (s[0] + 1, s[1])
        if next_state in walls:
            next_state = (s[0], s[1])    

    if a == "L":
        next_state = (s[0], s[1] - 1)
        if next_state in walls:
            next_state = (s[0], s[1])

    if a == "R":
        next_state = (s[0], s[1] + 1)
        if next_state in walls:
            next_state = (s[0], s[1])
    
    return next_state


### Define Actions ###
actions = {}
for s in states:
    temp_action = []

    if getNextState("U", s) != s:
        temp_action.append("U")

    if getNextState("D", s) != s:
        temp_action.append("D")

    if getNextState("L", s) != s:
        temp_action.append("L")

    if getNextState("R", s) != s:
        temp_action.append("R")

    actions[s] = temp_action


################################################
### Define Q-Table and initialize it with 0  ###
################################################
n_states = len(states)      
n_action = 4

# Initialize Q_Table to all Zeros and convert to pd DataFrame
Q = np.zeros(shape=(n_states, n_action))
Q = pd.DataFrame(Q, columns=["U", "D", "L", "R"], index=states)

print("Q-Table at the beginning:")
display(Q)

### Define Hyperparams ###
GAMMA = 0.9
EPSILON = 0.95
ALPHA = 0.9
EPISODE = 1000

# Q-Learning algorithm:
for i in range(EPISODE):
    reward_history = []

    # Init (a, b) to (1, 4) so that the while loop always triggers and shuffles new
    (a, b) = (0, 0)

    # Get random start state
    while (a, b) in walls:
        a = np.random.choice(a=np.arange(15))
        b = np.random.choice(a=np.arange(21))

    state = (a, b)
    print(f"Start State at Epsiode {i} is {state}")
    
    # Simulate Episode
    for i in range(100):
        #print(f"Current state: {state}")
        
        # Get next state randomly (Exploration)
        # Epsilon decreases in relation to the current episode
        # Thus Epsilon is high at the beginning and low at the end
        if random.random() < EPSILON**i**(1/3):
            # Get random action
            a = np.random.choice([action for action in actions[state]])
            next_state = getNextState(a, state)
            #print(f"Random Action '{a}' has been taken. Resulting next state is {next_state}")
        # Get next state from Q-Table (Exploitation)
        else:
            valid_actions = [action for action in actions[state]]
            #print(f"Valid Action in state {state} is: {valid_actions}")
            # Get max_a q(state, a) from the Q-Table
            a = list(Q.loc[[state], [action for action in actions[state]]].idxmax(axis=1).values)[0]
            #print(f"Max Action in state {state} is: {a}")
            next_state = getNextState(a, state)
            #print(f"Next state is: {next_state}")

        # Update Q-Table
        Q.at[state, a] = Q.at[state, a] + ALPHA * (rewards[next_state] + GAMMA * float(Q.loc[[next_state], :].max(axis=1)) - Q.at[state, a])
        reward_history.append(rewards[next_state])
        
        # If we reach one terminal state: end the episode
        #if next_state == (9, 8) or next_state == (2, 13) or next_state == (8, 10) or next_state == (8, 18):
            #rewards[next_state] = -0.5
            #break
            
        if next_state == (8, 18):
            break

        state = next_state
        

print("Training is finished!")
print("Q-Table in the end:")
display(Q)


Q-Table at the beginning:


Unnamed: 0,U,D,L,R
"(1, 1)",0.0,0.0,0.0,0.0
"(1, 2)",0.0,0.0,0.0,0.0
"(1, 3)",0.0,0.0,0.0,0.0
"(1, 4)",0.0,0.0,0.0,0.0
"(1, 5)",0.0,0.0,0.0,0.0
...,...,...,...,...
"(14, 16)",0.0,0.0,0.0,0.0
"(14, 17)",0.0,0.0,0.0,0.0
"(14, 18)",0.0,0.0,0.0,0.0
"(14, 19)",0.0,0.0,0.0,0.0


Start State at Epsiode 0 is (1, 3)
Start State at Epsiode 1 is (2, 19)
Start State at Epsiode 2 is (7, 8)
Start State at Epsiode 3 is (3, 13)
Start State at Epsiode 4 is (7, 10)
Start State at Epsiode 5 is (5, 14)
Start State at Epsiode 6 is (4, 3)
Start State at Epsiode 7 is (2, 19)
Start State at Epsiode 8 is (8, 9)
Start State at Epsiode 9 is (6, 5)
Start State at Epsiode 10 is (5, 5)
Start State at Epsiode 11 is (8, 18)
Start State at Epsiode 12 is (7, 17)
Start State at Epsiode 13 is (7, 17)
Start State at Epsiode 14 is (3, 16)
Start State at Epsiode 15 is (1, 2)
Start State at Epsiode 16 is (1, 15)
Start State at Epsiode 17 is (3, 5)
Start State at Epsiode 18 is (2, 6)
Start State at Epsiode 19 is (6, 7)
Start State at Epsiode 20 is (5, 5)
Start State at Epsiode 21 is (8, 13)
Start State at Epsiode 22 is (3, 7)
Start State at Epsiode 23 is (11, 3)
Start State at Epsiode 24 is (1, 15)
Start State at Epsiode 25 is (7, 2)
Start State at Epsiode 26 is (12, 3)
Start State at Epsiode 2

Start State at Epsiode 223 is (8, 11)
Start State at Epsiode 224 is (2, 18)
Start State at Epsiode 225 is (5, 11)
Start State at Epsiode 226 is (12, 3)
Start State at Epsiode 227 is (8, 6)
Start State at Epsiode 228 is (2, 13)
Start State at Epsiode 229 is (2, 13)
Start State at Epsiode 230 is (6, 10)
Start State at Epsiode 231 is (6, 8)
Start State at Epsiode 232 is (3, 16)
Start State at Epsiode 233 is (6, 7)
Start State at Epsiode 234 is (14, 2)
Start State at Epsiode 235 is (5, 14)
Start State at Epsiode 236 is (1, 5)
Start State at Epsiode 237 is (1, 18)
Start State at Epsiode 238 is (7, 11)
Start State at Epsiode 239 is (5, 3)
Start State at Epsiode 240 is (13, 1)
Start State at Epsiode 241 is (10, 2)
Start State at Epsiode 242 is (6, 16)
Start State at Epsiode 243 is (3, 13)
Start State at Epsiode 244 is (3, 8)
Start State at Epsiode 245 is (1, 2)
Start State at Epsiode 246 is (5, 11)
Start State at Epsiode 247 is (1, 9)
Start State at Epsiode 248 is (8, 17)
Start State at Epsio

Start State at Epsiode 445 is (9, 3)
Start State at Epsiode 446 is (2, 6)
Start State at Epsiode 447 is (3, 2)
Start State at Epsiode 448 is (8, 18)
Start State at Epsiode 449 is (8, 20)
Start State at Epsiode 450 is (6, 5)
Start State at Epsiode 451 is (6, 2)
Start State at Epsiode 452 is (1, 12)
Start State at Epsiode 453 is (2, 6)
Start State at Epsiode 454 is (1, 7)
Start State at Epsiode 455 is (10, 2)
Start State at Epsiode 456 is (6, 5)
Start State at Epsiode 457 is (2, 8)
Start State at Epsiode 458 is (6, 11)
Start State at Epsiode 459 is (7, 8)
Start State at Epsiode 460 is (2, 4)
Start State at Epsiode 461 is (8, 3)
Start State at Epsiode 462 is (3, 10)
Start State at Epsiode 463 is (5, 8)
Start State at Epsiode 464 is (3, 12)
Start State at Epsiode 465 is (2, 20)
Start State at Epsiode 466 is (5, 15)
Start State at Epsiode 467 is (3, 18)
Start State at Epsiode 468 is (2, 6)
Start State at Epsiode 469 is (5, 8)
Start State at Epsiode 470 is (12, 1)
Start State at Epsiode 471 

Start State at Epsiode 666 is (5, 6)
Start State at Epsiode 667 is (5, 13)
Start State at Epsiode 668 is (1, 11)
Start State at Epsiode 669 is (3, 17)
Start State at Epsiode 670 is (1, 5)
Start State at Epsiode 671 is (5, 1)
Start State at Epsiode 672 is (6, 16)
Start State at Epsiode 673 is (5, 6)
Start State at Epsiode 674 is (5, 11)
Start State at Epsiode 675 is (1, 11)
Start State at Epsiode 676 is (2, 18)
Start State at Epsiode 677 is (2, 15)
Start State at Epsiode 678 is (7, 20)
Start State at Epsiode 679 is (1, 12)
Start State at Epsiode 680 is (8, 11)
Start State at Epsiode 681 is (1, 3)
Start State at Epsiode 682 is (2, 4)
Start State at Epsiode 683 is (11, 1)
Start State at Epsiode 684 is (7, 14)
Start State at Epsiode 685 is (1, 11)
Start State at Epsiode 686 is (1, 1)
Start State at Epsiode 687 is (2, 20)
Start State at Epsiode 688 is (3, 7)
Start State at Epsiode 689 is (14, 2)
Start State at Epsiode 690 is (4, 2)
Start State at Epsiode 691 is (6, 11)
Start State at Epsiod

Start State at Epsiode 889 is (2, 17)
Start State at Epsiode 890 is (6, 16)
Start State at Epsiode 891 is (2, 1)
Start State at Epsiode 892 is (4, 20)
Start State at Epsiode 893 is (8, 15)
Start State at Epsiode 894 is (7, 13)
Start State at Epsiode 895 is (7, 5)
Start State at Epsiode 896 is (3, 6)
Start State at Epsiode 897 is (7, 2)
Start State at Epsiode 898 is (12, 2)
Start State at Epsiode 899 is (3, 3)
Start State at Epsiode 900 is (10, 1)
Start State at Epsiode 901 is (8, 5)
Start State at Epsiode 902 is (12, 2)
Start State at Epsiode 903 is (7, 8)
Start State at Epsiode 904 is (2, 17)
Start State at Epsiode 905 is (3, 14)
Start State at Epsiode 906 is (5, 9)
Start State at Epsiode 907 is (7, 3)
Start State at Epsiode 908 is (4, 20)
Start State at Epsiode 909 is (5, 17)
Start State at Epsiode 910 is (3, 6)
Start State at Epsiode 911 is (8, 2)
Start State at Epsiode 912 is (9, 3)
Start State at Epsiode 913 is (7, 18)
Start State at Epsiode 914 is (8, 6)
Start State at Epsiode 91

Unnamed: 0,U,D,L,R
"(1, 1)",0.0,29.664669,0.000000,29.682860
"(1, 2)",0.0,33.536512,26.214566,33.536496
"(1, 3)",0.0,37.818347,29.682827,37.818350
"(1, 4)",0.0,42.575945,33.536512,42.575946
"(1, 5)",0.0,47.862162,37.818350,47.862162
...,...,...,...,...
"(14, 16)",0.0,0.000000,0.000000,0.000000
"(14, 17)",0.0,0.000000,0.000000,0.000000
"(14, 18)",0.0,0.000000,0.000000,0.000000
"(14, 19)",0.0,0.000000,0.000000,0.000000


# Print optimal Policy 🚀


In [5]:
# Print optimal policy 
for state in states:
    # Skip terminal states and wall
    if state in walls:
        pass
    # All other non-terminal states 
    else:
        best_action = list(Q.loc[[state], :].idxmax(axis=1).values)[0]
        print(f"The best action in state {state} is {best_action}")

print("############################")

# Recursive function to draw path beginning at s (should be s = (14, 2))
def printPath(s):
    if s == (8, 18):
        print(s)
        print("Terminal state has been reached.")
        return
    #else:   
    print(s)
    best_action = str(list(Q.loc[[s], :].idxmax(axis=1).values)[0])
    next_state = getNextState(best_action, s)
    printPath(next_state)

printPath((14, 2))

print(f"Total Reward collected {sum(reward_history)}")


The best action in state (1, 1) is R
The best action in state (1, 2) is D
The best action in state (1, 3) is R
The best action in state (1, 4) is R
The best action in state (1, 5) is R
The best action in state (1, 6) is R
The best action in state (1, 7) is D
The best action in state (1, 8) is D
The best action in state (1, 9) is D
The best action in state (1, 10) is R
The best action in state (1, 11) is R
The best action in state (1, 12) is R
The best action in state (1, 13) is R
The best action in state (1, 14) is R
The best action in state (1, 15) is R
The best action in state (1, 16) is D
The best action in state (1, 17) is D
The best action in state (1, 18) is D
The best action in state (1, 19) is L
The best action in state (1, 20) is L
The best action in state (2, 1) is R
The best action in state (2, 2) is R
The best action in state (2, 3) is D
The best action in state (2, 4) is R
The best action in state (2, 5) is R
The best action in state (2, 6) is R
The best action in state (2

In [6]:

print(f"Total Reward collected {sum(reward_history)}")

Total Reward collected -50.0
