In [1]:
################################################################
# Frozen Lake v1 : Q value iteration examples
################################################################

import gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from tqdm import tqdm

# Parameters 
EPSILON_TRAIN = 0.5
EPSILON_TEST = 0.0
GAMMA = 0.999
ENV_NAME = "FrozenLake-v1"
NUM_EPISODES = 200


class GameAgent():
    def __init__(self):
        self.env = gym.make(ENV_NAME, render_mode="rgb_array")
        self.STATE_DIM = self.env.observation_space.n
        self.ACTION_DIM = self.env.action_space.n
        self.epsilon = 0.5
        self.GAMMA = GAMMA
        self.lr = 0.1
    
    def initialize_game(self):
        state, _ = self.env.reset() 
        return state
    
    def select_action(self, current_state=None):
        action = self.env.action_space.sample()
        return action
        
    def step_game(self, action):
        next_state, reward, terminated, truncated, info = self.env.step(action)
        return next_state, reward, terminated
        
    def run_episode(self, training=True):
        current_state = self.initialize_game()
        terminated = False
        episode_reward = 0.0
    
        while not terminated:
            # choose action
            action = self.select_action(current_state)
            
            # take action
            next_state, reward, terminated = self.step_game(action)

            # prepare next step
            current_state = next_state
            episode_reward += reward

        return episode_reward
    
    def run_epoch(self, training=True):
        rewards = np.zeros(NUM_EPISODES)
        for i in range(NUM_EPISODES):
            rewards[i] = self.run_episode(training)
        return rewards.mean()


In [2]:
class ModelFreeAgent(GameAgent):
    def __init__(self):
        super(ModelFreeAgent, self).__init__()
        self.q_table = np.zeros([self.STATE_DIM, self.ACTION_DIM])
        self.values = np.zeros(self.STATE_DIM)
        self.epsilon = 1.0
        self.GAMMA = GAMMA
        self.lr = 0.1
        
    def epsilon_greedy(self, state):
        if np.random.random() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            _, action = np.unravel_index(np.argmax(self.q_table[state, :]), self.q_table.shape)
        return action
    
    def select_action(self, state):
        return self.epsilon_greedy(state)

    def update_q_table(self, current_state, action, next_state, reward, terminated):
        q_max = self.q_table[next_state, :].max() 
        if not terminated:
            self.q_table[current_state, action] += self.lr * (reward + self.GAMMA * q_max - self.q_table[current_state, action]) 
        else:
            self.q_table[current_state, action] += self.lr * (reward - self.q_table[current_state, action]) 
        
    def run_episode(self, training=True):
        current_state = self.initialize_game()
        terminated = False
        episode_reward = 0.0
        self.epsilon = 0.5 if training else 0.01
        
        while not terminated:
            # choose action
            action = self.select_action(current_state)
            
            # take action
            next_state, reward, terminated = self.step_game(action)
            
            # update q_table
            if training:
                self.update_q_table(current_state, action, next_state, reward, terminated)

            # prepare next step
            current_state = next_state
            episode_reward += reward

        return episode_reward
    
    def show_policy(self):
        policy = np.argmax(self.q_table, axis=1)
        return policy



    
def model_free_example():
    agent = ModelFreeAgent()
    for i in tqdm(range(10)):
        agent.run_epoch()
    print("")
    print("optimal policy:")
    print(agent.show_policy().reshape(4,4))
    
    rewards = np.zeros(10)
    for i in tqdm(range(10)):
        rewards[i] = agent.run_epoch(training=False)
    print("")
    print(rewards.mean())
    

model_free_example()


100%|██████████| 10/10 [00:01<00:00,  7.77it/s]



optimal policy:
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 3 0]]


100%|██████████| 10/10 [00:04<00:00,  2.37it/s]


0.6425000000000001





In [3]:


class ModelBasedAgent(GameAgent):
    def __init__(self):
        super(ModelBasedAgent, self).__init__()
        self.q_table = np.zeros([self.STATE_DIM, self.ACTION_DIM])
        self.transtions_model = np.zeros([self.STATE_DIM, self.ACTION_DIM, self.STATE_DIM])
        self.rewards_model = np.zeros([self.STATE_DIM, self.ACTION_DIM, self.STATE_DIM])
        self.lr = 0.1
    
    def select_action(self, state, training=True):
        if training:
            return self.env.action_space.sample()
        else:
            _, action = np.unravel_index(np.argmax(self.q_table[state, :]), self.q_table.shape)
            return action

    def q_values_iteration(self):
        for current_state in range(self.STATE_DIM):
            for action in range(self.ACTION_DIM):
                total_bb = 0.0
                total_transition = self.transtions_model[current_state, action, :].sum()
                if total_transition == 0.0:
                   continue      
                probs = self.transtions_model[current_state, action, :] / total_transition
                rewards = self.rewards_model[current_state, action, :] / total_transition
                for next_state in range(self.STATE_DIM):
                    reward = rewards[next_state]
                    transition_prob = probs[next_state] 
                    q_max = self.q_table[next_state, :].max()
                    bellman_backup = reward + self.GAMMA * q_max
                    total_bb += transition_prob * bellman_backup
                self.q_table[current_state, action] = total_bb
    
    def count_transition(self, current_state, action, next_state):
        self.transtions_model[current_state, action, next_state] += 1
    
    def count_reward(self, current_state, action, next_state, reward):
        self.rewards_model[current_state, action, next_state] += reward
        
    def run_episode(self, training=True):
        current_state = self.initialize_game()
        terminated = False
        episode_reward = 0.0
        
        while not terminated:
            # choose action
            action = self.select_action(current_state, training)
            
            # take action
            next_state, reward, terminated = self.step_game(action)
            
            # learn the model
            self.count_transition(current_state, action, next_state)
            self.count_reward(current_state, action, next_state, reward)
            
            # prepare next step
            current_state = next_state
            episode_reward += reward

        return episode_reward
        
    def run_epoch(self, training=True):
        rewards = np.zeros(NUM_EPISODES)
        for i in range(NUM_EPISODES):
            rewards[i] = self.run_episode(training)
        return rewards.mean()
    
    def show_policy(self):
        policy = np.argmax(self.q_table, axis=1)
        return policy
        
    def reset():
        pass


def model_based_example():
    agent = ModelBasedAgent()
    iter_n = 0
    while iter_n < 30:
        # training
        for _ in tqdm(range(200)):
            agent.run_episode(training=True)
        agent.q_values_iteration()
    
        # testing
        rewards = np.zeros(200)
        for i in tqdm(range(200)):
            rewards[i] = agent.run_episode(training=False)
        mean = rewards.mean()
        print(f"average reward: {mean}")
        print("")
        iter_n += 1
    
    print(agent.show_policy().reshape(4,4))

model_based_example()


100%|██████████| 200/200 [00:00<00:00, 2842.97it/s]
100%|██████████| 200/200 [00:00<00:00, 1315.53it/s]


average reward: 0.0



100%|██████████| 200/200 [00:00<00:00, 2982.81it/s]
100%|██████████| 200/200 [00:00<00:00, 1233.45it/s]


average reward: 0.0



100%|██████████| 200/200 [00:00<00:00, 2370.16it/s]
100%|██████████| 200/200 [00:00<00:00, 1102.05it/s]


average reward: 0.0



100%|██████████| 200/200 [00:00<00:00, 2982.73it/s]
100%|██████████| 200/200 [00:00<00:00, 531.13it/s]


average reward: 0.6



100%|██████████| 200/200 [00:00<00:00, 3106.63it/s]
100%|██████████| 200/200 [00:00<00:00, 2562.46it/s]


average reward: 0.19



100%|██████████| 200/200 [00:00<00:00, 2751.34it/s]
100%|██████████| 200/200 [00:00<00:00, 884.85it/s]


average reward: 0.47



100%|██████████| 200/200 [00:00<00:00, 3252.95it/s]
100%|██████████| 200/200 [00:00<00:00, 858.36it/s]


average reward: 0.48



100%|██████████| 200/200 [00:00<00:00, 3044.00it/s]
100%|██████████| 200/200 [00:00<00:00, 809.28it/s]


average reward: 0.48



100%|██████████| 200/200 [00:00<00:00, 2822.05it/s]
100%|██████████| 200/200 [00:00<00:00, 718.37it/s]


average reward: 0.545



100%|██████████| 200/200 [00:00<00:00, 2951.63it/s]
100%|██████████| 200/200 [00:00<00:00, 729.99it/s]


average reward: 0.49



100%|██████████| 200/200 [00:00<00:00, 3238.96it/s]
100%|██████████| 200/200 [00:00<00:00, 791.99it/s]


average reward: 0.5



100%|██████████| 200/200 [00:00<00:00, 2845.93it/s]
100%|██████████| 200/200 [00:00<00:00, 531.70it/s]


average reward: 0.79



100%|██████████| 200/200 [00:00<00:00, 2882.74it/s]
100%|██████████| 200/200 [00:00<00:00, 495.52it/s]


average reward: 0.785



100%|██████████| 200/200 [00:00<00:00, 1754.81it/s]
100%|██████████| 200/200 [00:00<00:00, 518.66it/s]


average reward: 0.735



100%|██████████| 200/200 [00:00<00:00, 3018.33it/s]
100%|██████████| 200/200 [00:00<00:00, 592.19it/s]


average reward: 0.78



100%|██████████| 200/200 [00:00<00:00, 2941.28it/s]
100%|██████████| 200/200 [00:00<00:00, 502.88it/s]


average reward: 0.76



100%|██████████| 200/200 [00:00<00:00, 2785.51it/s]
100%|██████████| 200/200 [00:00<00:00, 460.70it/s]


average reward: 0.86



100%|██████████| 200/200 [00:00<00:00, 3332.61it/s]
100%|██████████| 200/200 [00:00<00:00, 561.17it/s]


average reward: 0.8



100%|██████████| 200/200 [00:00<00:00, 3234.02it/s]
100%|██████████| 200/200 [00:00<00:00, 492.05it/s]


average reward: 0.82



100%|██████████| 200/200 [00:00<00:00, 2807.38it/s]
100%|██████████| 200/200 [00:00<00:00, 447.69it/s]


average reward: 0.81



100%|██████████| 200/200 [00:00<00:00, 2661.69it/s]
100%|██████████| 200/200 [00:00<00:00, 520.28it/s]


average reward: 0.815



100%|██████████| 200/200 [00:00<00:00, 3224.18it/s]
100%|██████████| 200/200 [00:00<00:00, 520.10it/s]


average reward: 0.81



100%|██████████| 200/200 [00:00<00:00, 3147.98it/s]
100%|██████████| 200/200 [00:00<00:00, 482.24it/s]


average reward: 0.82



100%|██████████| 200/200 [00:00<00:00, 2765.97it/s]
100%|██████████| 200/200 [00:00<00:00, 552.54it/s]


average reward: 0.83



100%|██████████| 200/200 [00:00<00:00, 2956.89it/s]
100%|██████████| 200/200 [00:00<00:00, 497.00it/s]


average reward: 0.77



100%|██████████| 200/200 [00:00<00:00, 2829.53it/s]
100%|██████████| 200/200 [00:00<00:00, 511.21it/s]


average reward: 0.835



100%|██████████| 200/200 [00:00<00:00, 2939.43it/s]
100%|██████████| 200/200 [00:00<00:00, 517.21it/s]


average reward: 0.795



100%|██████████| 200/200 [00:00<00:00, 3018.64it/s]
100%|██████████| 200/200 [00:00<00:00, 527.93it/s]


average reward: 0.785



100%|██████████| 200/200 [00:00<00:00, 3110.65it/s]
100%|██████████| 200/200 [00:00<00:00, 477.37it/s]


average reward: 0.815



100%|██████████| 200/200 [00:00<00:00, 3058.78it/s]
100%|██████████| 200/200 [00:00<00:00, 525.64it/s]

average reward: 0.855

[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]



