<a href="https://colab.research.google.com/github/michaelogenyi23/codesample/blob/main/QLearningTwo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random
from IPython.display import HTML, display

def printTable(data):
    """Display eleven values as a 3x4 HTML grid in the notebook.

    ``data`` is indexed from 0 to 10 and fills the grid row by row, skipping
    the second cell of the middle row, which is drawn as an empty gray cell.
    The last cell of the first row is shaded dark green and the last cell of
    the middle row is shaded red. Nothing is returned; the table is shown via
    IPython's ``display``.
    """
    cell_css = "border: 1px solid black;border-collapse: collapse; text-align:center;"
    cell_size = " width= 96 height=96"

    def cell(content, background=None):
        # Shaded cells append a background rule after the shared cell style.
        extra = "" if background is None else f" background-color:{background};"
        return f"<td {cell_size} style='{cell_css}{extra}'>{content}</td>"

    rows = [
        [cell(data[0]), cell(data[1]), cell(data[2]), cell(data[3], "darkgreen")],
        [cell(data[4]), cell("", "gray"), cell(data[5]), cell(data[6], "red")],
        [cell(data[7]), cell(data[8]), cell(data[9]), cell(data[10])],
    ]
    body = "\n".join(f"      <tr>{''.join(row)}</tr>" for row in rows)
    markup = (
        "\n    <center>\n"
        "    <table style='border: 1px solid black;border-collapse: collapse;'>\n"
        f"{body}\n"
        "    </table></center>"
    )
    display(HTML(markup))


# Hyperparameters read by the training code below.
alpha = 0.5     # learning rate
epsilon = 0.2   # probability of choosing a random action instead of the current best action
discount = 0.9  # discount rate

actions = ["up", "down", "left", "right"]

# Every (row, col) cell of the 3x4 grid except (1, 1), which is not a state.
states = [
    (row, col)
    for row in range(3)
    for col in range(4)
    if not (row == 1 and col == 1)
]

# One entry per (state, action) pair, all starting at 0.
Qtable = dict.fromkeys(((s, a) for s in states for a in actions), 0)
def reward(s):
    """Return the reward for state ``s``: 1 at (0, 3), -1 at (1, 3), else 0."""
    if s == (0, 3):
        return 1
    if s == (1, 3):
        return -1
    return 0

def transitionFunction(states, s, a):
    """Sample the next state after taking action ``a`` in state ``s``.

    The two final states (0, 3) and (1, 3) always map to themselves. From any
    other state the intended direction is drawn with weight 8 out of 10 and
    each of the two perpendicular directions with weight 1 out of 10. A move
    whose target cell is not in ``states`` leaves the agent in ``s``.

    For an unrecognised action, prints "Invalid action." and returns None.
    """
    if s in ((0, 3), (1, 3)):
        return s

    r, c = s

    def target(cell):
        # Cells outside the state list cannot be entered; stay put instead.
        return cell if cell in states else s

    above = target((r - 1, c))
    below = target((r + 1, c))
    left = target((r, c - 1))
    right = target((r, c + 1))

    if a == "up":
        intended, slips = above, [left, right]
    elif a == "down":
        intended, slips = below, [left, right]
    elif a == "left":
        intended, slips = left, [above, below]
    elif a == "right":
        intended, slips = right, [above, below]
    else:
        print("Invalid action.")
        return None

    # Eight copies of the intended outcome plus one of each slip gives 80/10/10.
    return random.choice([intended] * 8 + slips)

def optimalMove(actions, s, Q):
    """Return an action with the highest Q-value in state ``s``.

    ``Q`` is indexed by ``(state, action)``. When several actions share the
    maximum value, one of them is chosen uniformly at random.
    """
    values = [Q[s, a] for a in actions]
    top = max(values)
    tied = [a for a, value in zip(actions, values) if value == top]
    return random.choice(tied)

def move(actions, s, Q, epsilon):
    """Pick the next action with the epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the choice is delegated to ``optimalMove`` for state ``s``.
    """
    explore = random.random() < epsilon
    return random.choice(actions) if explore else optimalMove(actions, s, Q)

def episode(states, actions, Q):
    """Run one Q-learning episode from the start state (2, 0), updating Q in place.

    Each step picks an action with ``move`` (epsilon-greedy), samples the next
    state with ``transitionFunction`` and applies the update

        Q[s, a] += alpha * (reward(s) + discount * max_a2 Q[s2, a2] - Q[s, a])

    The episode ends when a final state, (0, 3) or (1, 3), is reached. The
    module-level ``alpha``, ``epsilon`` and ``discount`` are read at call time.

    Args:
        states: collection of valid (row, col) states, passed on to
            ``transitionFunction``.
        actions: the available actions.
        Q: mapping indexed by ``(state, action)``; modified in place.

    Returns:
        None.
    """
    s = (2,0) # the start state
    finalStates = [(0,3),(1,3)]
    while s not in finalStates:
        a = move(actions, s, Q, epsilon) # pick an action
        s2 = transitionFunction(states, s, a)
        Q[s,a] += alpha * (reward(s) + discount*max(Q[s2,a2] for a2 in actions) - Q[s,a])
        s = s2
    # Once the loop ends, s is one of the final states, so there is no next
    # action and the update target is just reward(s). The update is applied to
    # every action of the final state, not only to the action that happened to
    # lead into it: the step above bootstraps from max(Q[s2, a2]) over all
    # actions, so an untouched entry (still 0) would hide a negative reward.
    for a in actions:
        Q[s,a] += alpha * (reward(s) - Q[s,a])

# Run a single training episode; episode() updates the shared Qtable in place.
for i in range(1):
    episode(states, actions, Qtable)

# Show each state's four action values, rounded to two decimals, on the grid.
printTable([{a:np.round(Qtable[s,a],2) for a in actions} for s in states])



0,1,2,3
"{'up': 0.0, 'down': 0, 'left': 0.0, 'right': 0.0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0.0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0.0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0.5}"
"{'up': 0.0, 'down': 0, 'left': 0.0, 'right': 0.0}",,"{'up': 0, 'down': 0, 'left': 0, 'right': 0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0}"
"{'up': 0.0, 'down': 0, 'left': 0, 'right': 0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0}","{'up': 0, 'down': 0, 'left': 0, 'right': 0}"


In [None]:
# Run 10000 episodes on the same module-level Qtable, so learning continues
# from whatever earlier cells have already written into it.
for i in range(10000):
    episode(states, actions, Qtable)
# Show each state's four action values, rounded to two decimals, on the grid.
printTable([{a:np.round(Qtable[s,a],2) for a in actions} for s in states])


0,1,2,3
"{'up': 0.64, 'down': 0.5, 'left': 0.58, 'right': 0.63}","{'up': 0.62, 'down': 0.62, 'left': 0.64, 'right': 0.6}","{'up': 0.59, 'down': 0.54, 'left': 0.6, 'right': 0.76}","{'up': 1.0, 'down': 1.0, 'left': 0, 'right': 1.0}"
"{'up': 0.62, 'down': 0.46, 'left': 0.51, 'right': 0.49}",,"{'up': 0.63, 'down': -0.36, 'left': 0.4, 'right': -0.9}","{'up': -1.0, 'down': -1.0, 'left': -1.0, 'right': -1.0}"
"{'up': 0.56, 'down': 0.44, 'left': 0.44, 'right': 0.39}","{'up': 0.25, 'down': 0.34, 'left': 0.46, 'right': 0.3}","{'up': 0.22, 'down': 0.19, 'left': 0.37, 'right': 0.19}","{'up': -0.9, 'down': 0.2, 'left': -0.46, 'right': -0.53}"


In [None]:
# Show the greedy action for every state according to the current Qtable
# (optimalMove breaks ties between equally valued actions at random).
printTable([optimalMove(actions,s,Qtable) for s in states])

0,1,2,3
up,up,right,right
up,,left,left
up,left,left,down
