<a href="https://colab.research.google.com/github/mr-nudo/intelligent-tools/blob/master/7_Discount_factor_effect_and_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Discount factor effect and Q-Learning

## 1. Based on the probabilities on the arrows above, Model your MDP (transition probabilities and rewards) in the notebook. you can use the [s, a, s’] for each part or you can use your own way of defining the MDP. What should be seen are transition probabilities, rewards and possible actions

In [62]:
transition_probabilities = {
    ('s0', 'a0', 's0'): 0.7,
    ('s0', 'a0', 's1'): 0.3,
    ('s0', 'a1', 's0'): 1.0,
    ('s0', 'a2', 's1'): 0.2,
    ('s0', 'a2', 's0'): 0.8,
    ('s1', 'a0', 's1'): 1.0,
    ('s1', 'a2', 's2'): 1.0,
    ('s2', 'a1', 's0'): 0.8,
    ('s2', 'a1', 's1'): 0.1,
    ('s2', 'a1', 's2'): 0.1,
}

rewards = {
    ('s0', 'a0', 's0'): 10,
    ('s1', 'a2', 's2'): 50,
    ('s2', 'a1', 's0'): 40,
}

possible_actions = {
    's0': ['a0', 'a1', 'a2'],
    's1': ['a0', 'a2'],
    's2': ['a1'],
}

In [63]:
import numpy as np
import random

In [77]:
Q_values = Q_values2 = np.full((3, 3), -np.inf)  # -np.inf for impossible actions
# Q_values2 = np.full((3, 3), -np.inf)  # -np.inf for impossible actions
for state, actions in possible_actions.items():
#     print(state, actions)
#     Q_values[state, actions] = 0.0  # for all possible actions
  for action in actions:
    action_index = ord(action[1]) - ord('0') # Convert action to index
    Q_values[int(state[1]), action_index] = 0.0  # for all possible actions
    Q_values2[int(state[1]), action_index] = 0.0

In [78]:
Q_values

array([[  0.,   0.,   0.],
       [  0., -inf,   0.],
       [-inf,   0., -inf]])

In [79]:
Q_values2

array([[  0.,   0.,   0.],
       [  0., -inf,   0.],
       [-inf,   0., -inf]])

## 2. Take your discount factor to be 0.9. Perform Q-learning and report the Q-values for each (state, action) pair. Based on that, what is the optimal policy?

In [80]:
def q_learning(Q_values, gamma, alpha):
  for iteration in range(50):
    # --- fill here (perform a DP approach for filling up your Q-table (repeat the process by stting gamma to be 0.95 )
    for state in possible_actions.keys():
      for action in possible_actions[state]:
          action_index = ord(action[1]) - ord('0')
          next_states = [s for (s, a, s_) in transition_probabilities.keys() if s == state and a == action]
          probabilities = [transition_probabilities[(s, a, s_)] for (s, a, s_) in transition_probabilities.keys() if s == state and a == action]
          next_state = random.choices(next_states, probabilities)[0]
          reward = rewards.get((state, action, next_state), 0)
          max_q_next_state = np.max(Q_values[int(next_state[1]), :])
          Q_values[int(state[1]), action_index] = (1 - alpha) * Q_values[int(state[1]), action_index] + alpha * (reward + gamma * max_q_next_state)

gamma = 0.9  # the discount factor
alpha = 0.1  # the learning rate
q_learning(Q_values, gamma, alpha)

In [81]:
Q_values

array([[39.49939329, 30.15078333, 30.15078333],
       [ 0.        ,        -inf,  0.        ],
       [       -inf,  0.        ,        -inf]])

In [82]:
Q_values.argmax(axis=1)  # optimal action for each state

array([0, 0, 1])

In [83]:
optimal_policy = {}
for state in possible_actions.keys():
    action_index = np.argmax(Q_values[int(state[1]), :])
    # action_index = ord(action[1]) - ord('0')
    action_mapping = {
        'a0': 0,
        'a1': 1,
        'a2': 2,
    }
    optimal_action = [a for a, i in action_mapping.items() if i == action_index][0]
    optimal_policy[state] = optimal_action

print("Optimal policy:")
for state, action in optimal_policy.items():
    print(f"State {state}: {action}")

Optimal policy:
State s0: a0
State s1: a0
State s2: a1


## 3. Perform the same procedure but this time with a discount factor of 0.95. Did your optimal policy change? Explain your results.



In [70]:
gamma2 = 0.95  # the discount factor
alpha2 = 0.1  # the learning rate
q_learning(Q_values2, gamma2, alpha)

In [71]:
Q_values2

array([[44.33748859, 35.16218512, 35.16218512],
       [ 0.        ,        -inf,  0.        ],
       [       -inf,  0.        ,        -inf]])

In [72]:
Q_values2.argmax(axis=1)

array([0, 0, 1])

In [84]:
optimal_policy = {}
for state in possible_actions.keys():
    action_index = np.argmax(Q_values2[int(state[1]), :])
    # action_index = ord(action[1]) - ord('0')
    action_mapping = {
        'a0': 0,
        'a1': 1,
        'a2': 2,
    }
    optimal_action = [a for a, i in action_mapping.items() if i == action_index][0]
    optimal_policy[state] = optimal_action

print("Optimal policy:")
for state, action in optimal_policy.items():
    print(f"State {state}: {action}")

Optimal policy:
State s0: a0
State s1: a0
State s2: a1
