# AI Reinforcement Learning Assignment 1

Author: Nicolas Arrieta Larraza

Date: 25/03/2021

## Importing

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt

## Defining parameters

In [2]:
states = 6 #Number of states
actions = 6 #Number of actions

# Reward matrix
R = np.array([[0,-0.001,0,0,0,0],
                   [-1,0,-0.001,0,0,0],
                   [0,-0.001,0,-0.001,0,0],
                   [0,0,-0.001,0,-0.001,0],
                   [0,0,0,-0.001,0,1],
                   [0,0,0,0,-0.001,0]])

#Q table initialized with zeros
Q_table = np.zeros([states,actions])
print("Q-table initialized with zeros and with dimension:",Q_table.shape)

Q-table initialized with zeros and with dimension: (6, 6)


In [None]:
epsilon = 1.0 #Max greed
epsilon_min = 0.005 #Min greed
epsilon_decay = -0.005 #Rate of epsilon decay after each episode
episodes = 10000 #Number of games
max_steps = 100 #Max steps per episode
lr = 0.65 #Learning rate 
gamma = 0.65 #Discount factor (1 for long-term rewards, 0 for immediate rewards) 

## Q-learning algorithm

In [None]:
def q_learning_algorithm(epsilon, epsilon_min, epsilon_decay, episodes, max_steps, lr, gamma):

  for episode in range(episodes):
    score = 0
    state = random.randint(0, 5)

    for _ in range(max_steps):

      invalid_move = True
      #Take best action in Q-table (exploitation)
      if np.random.uniform(0,1) > epsilon:
        action = np.argmax(Q_table[state,:])
      #Take random action (exploration)
      else:
        while invalid_move:
          rnd_action = random.randint(0,5)
          if R[state,:][rnd_action]!=0: 
            invalid_move = False
            action = rnd_action

      #Take a step
      next_state = action
      reward = R[state,action]

      #add up score
      score += reward

      #Update Q-table with new Q value
      Q_table[state,action] = Q_table[state,action] + lr*(reward + gamma*np.max(Q_table[next_state,:]) - Q_table[state,action])

      if state == 5: break

      #Update state
      state = next_state

    #Reducing epsilon each episode (exploration-exploitation trade-off)
    if epsilon >= epsilon_min: epsilon += epsilon_decay

In [None]:
q_learning_algorithm(epsilon, epsilon_min, epsilon_decay, episodes, max_steps, lr, gamma)

After the training the Q-table is updated as following:

In [None]:
Q_table

array([[ 0.        ,  0.30655369,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.8007401 ,  0.        ,  0.47315952,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.30655369,  0.        ,  0.72947619,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.47315952,  0.        ,  1.12380952,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.72947619,  0.        ,
         1.73047619],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.12380952,
         0.        ]])

## Testing implementation

In [None]:
def test_q_learning(episodes, max_steps):

  total_epochs, total_penalties = 0, 0

  for episode in range(episodes):
    #Reset the game parameters
    score = 0
    state = random.randint(0, 5)
    epochs, penalties = 0, 0

    for _ in range(max_steps):

      action = np.argmax(Q_table[state,:])

      #Take a step
      next_state = action
      reward =  R[state,action]

      #add up score
      score += reward

      epochs+=1

      if reward < 0:
        penalties+=1

      if state == 5: break

      #Update state
      state = next_state
    
    total_penalties += penalties
    total_epochs += epochs
  
  print(f"Results after {episodes} episodes:")
  print(f"Average timesteps per episode: {total_epochs / episodes}")
  print(f"Average penalties per episode: {total_penalties / episodes}")

In [None]:
test_episodes = 500
test_max_steps = 100
test_q_learning(test_episodes, test_max_steps)

Results after 500 episodes:
Average timesteps per episode: 3.424
Average penalties per episode: 2.598
