# Thông tin 
1. MSSV: 20520079
2. Họ và tên: Nguyễn Tư Thành Nhân
3. Bài tập: Assignment 4

# Import các thư viện cần thiết

In [None]:
import gym
import numpy as np
import random

# Một số hàm để chơi game

In [None]:
def play(env, q_table, render=False):
    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    while not done:
        action = np.argmax(q_table[state, :])
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            if not done:
                display.clear_output(wait=True)
        state = next_state

    return (total_reward, steps)

In [None]:
def play_multiple_times(env, q_table, max_episodes):
    success = 0
    list_of_steps = []
    for i in range(max_episodes):
        total_reward, steps = play(env, q_table)

        if total_reward > 0:
            success += 1
            list_of_steps.append(steps)

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {np.mean(list_of_steps)}')

# Q Learning

In [None]:
# Hyperparameters
gamma = 0.99
learning_rate = 0.1
max_epsilon = 1.0
min_epsilon = 0.01
epsilon_decay_rate = 0.005

num_episodes = 20000
num_steps_per_episode = 100

In [None]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    q_table = np.ones((env.observation_space.n, env.action_space.n))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()

        reward_episode = 0.0
        done = False
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)
        for step in range(num_steps_per_episode):
            exploration = random.uniform(0,1)
            if exploration < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, info = env.step(action)
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + gamma * np.max(q_table[next_state,:]))

            reward_episode += reward
            state = next_state

            if done:
                break
        rewards_all.append(reward_episode)
    print(f'Episode {episode} finished')
    return q_table, rewards_all

# SARSA


In [None]:
def choose_action(Q, epsilon, n_actions, s):
    if np.random.random() <= epsilon:
        return np.random.randint(n_actions)
    else:
        return np.argmax(Q[s, :])

def sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    q_table = np.ones((env.observation_space.n, env.action_space.n))
    rewards_all = []
    n_states, n_actions = env.observation_space.n, env.action_space.n
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        done = False
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)
        action = choose_action(q_table, epsilon, n_actions ,state)

        for step in range(num_steps_per_episode):
            exploration = random.uniform(0,1)
            if exploration < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, info = env.step(action)
            next_action = choose_action(q_table, epsilon, n_actions , next_state)
            
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + gamma * q_table[next_state, next_action])

            reward_episode += reward
            state = next_state
            action = next_action

            if done:
                break
        rewards_all.append(reward_episode)
    print(f'Episode {episode} finished')
    return q_table, rewards_all


# Thực nghiệm trên FrozenLake-v0

In [None]:
env = gym.make('FrozenLake-v0')

In [None]:
# Q Learning
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, q_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 67/1000
Average number of steps: 12.17910447761194
Sum of rewards: 968.0


In [None]:
# SARSA
sarsa_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, sarsa_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 63/1000
Average number of steps: 10.746031746031745
Sum of rewards: 800.0


# Thực nghiệm trên FrozenLake8x8-v0



In [None]:
env = gym.make('FrozenLake8x8-v0')

In [None]:
# Q Learning
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, q_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 2/1000
Average number of steps: 33.0
Sum of rewards: 75.0


In [None]:
# SARSA
sarsa_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, sarsa_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 4/1000
Average number of steps: 28.25
Sum of rewards: 34.0


# Thực nghiệm trên Taxi-v3


In [None]:
env = gym.make('Taxi-v3')

In [None]:
# Q Learning
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, q_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 1000/1000
Average number of steps: 12.998
Sum of rewards: 5066.0


In [None]:
# SARSA
sarsa_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
play_multiple_times(env, sarsa_table, 1000)
print('Sum of rewards:',sum(rewards_all))

Episode 19999 finished
Number of successes: 1000/1000
Average number of steps: 13.069
Sum of rewards: 3669.0


# Nhận xét:
- Vì map FrozenLake8x8-v0 sẽ không thành công một lần chơi được với q_table khởi tạo là ma trận O (ma trận toàn số 0) (khi đó q_table sẽ giữ nguyên là ma trận O đến sau 20000 vòng lặp). Nên em đã thay thế bằng khởi tạo là ma trận toàn số 1. Tuy vậy, kết quả của map FrozenLake-v0 sẽ giảm đi (số lượng chơi thành công  giảm từ khoảng 700 xuống còn khoảng 70 lần)
- Số lần chơi thành công và số bước chơi trung bình qua 3 map của 2 thuật toán Q Learning và SARSA là tương đương nhau 
- Tổng điểm thưởng của Q Learning cho kết quả tốt hơn so với SARSA