In [1]:
import gym
import numpy as np
import time
import matplotlib.pyplot as plt

In [2]:
# Build the 8x8 FrozenLake environment and read the sizes of its state and
# action sets; these two globals are used to size the Q-tables further down.
env = gym.make('FrozenLake8x8-v0')

n_state, n_action = env.nS, env.nA

# One value per line: number of states first, then number of actions.
print(n_state, n_action, sep='\n')

64
4


In [3]:
def plot_graph(param, curves, param_name="", curve_name=""):
    """Draw `curves` against `param` in a new figure and show it.

    Args:
        param: x-axis values.
        curves: y-axis values, passed straight to ``plt.plot``.
        param_name: label for the x-axis.
        curve_name: label for the y-axis.
    """
    plt.figure()
    # The returned line handles are not needed: there is no legend here.
    plt.plot(param, curves, ls='-', marker='+', color='deepskyblue')
    plt.xlabel(param_name)
    plt.ylabel(curve_name)

    plt.show()

In [4]:
def plot_graphs(param, curves, curve_labels, param_name="", curve_name=""):
    """Draw several curves against a shared x-axis in one figure, with a legend.

    Args:
        param: x-axis values shared by every curve.
        curves: indexable collection of curves; ``curves[i]`` is the i-th
            curve (a 2-D array indexed by row, or a list of sequences).
        curve_labels: one legend label per curve; its length decides how
            many curves are drawn.
        param_name: label for the x-axis.
        curve_name: label for the y-axis.
    """
    colors = ['firebrick', 'cadetblue', 'darksalmon', 'mediumseagreen', 'darkmagenta', 'skyblue',
              'gold', 'palevioletred', 'olive', 'darkorange', 'mediumpurple', 'slategray', 'darkseagreen']
    plt.figure()
    lines = []
    for i in range(len(curve_labels)):
        # Wrap around the palette so plotting more curves than there are
        # colors reuses colors instead of raising IndexError.
        l, = plt.plot(param, curves[i], ls='-', marker='', color=colors[i % len(colors)])
        lines.append(l)
    plt.xlabel(param_name)
    plt.ylabel(curve_name)

    plt.legend(handles=lines, labels=curve_labels, loc='best')
    plt.show()

Q-Learning

In [5]:
def run_episodes(Q, episodes, to_print=False):
    """Play `episodes` games choosing actions with ``greedy(Q, state)``.

    Uses the module-level ``env`` and a module-level ``greedy`` function;
    ``greedy`` is not defined in this part of the notebook and must be
    provided elsewhere (presumably it returns the arg-max action of ``Q`` for
    ``state`` - TODO confirm).

    An episode that ends with reward 1 counts as a win and contributes its
    step count; any other finished episode counts as a miss.

    Args:
        Q: value table handed unchanged to ``greedy``.
        episodes: number of episodes to play.
        to_print: when True, print the summary and return None; otherwise
            return the mean number of steps over won episodes.

    Returns:
        Mean steps per won episode (NaN when no episode was won), or None
        when `to_print` is True.
    """
    misses = 0
    steps_list = []

    for _ in range(episodes):
        state = env.reset()
        steps = 0

        while True:
            state, rew, done, _info = env.step(greedy(Q, state))
            steps += 1
            if done:
                # Always leave the loop once the episode is over, so an
                # unexpected terminal reward cannot cause an endless loop.
                if rew == 1:
                    steps_list.append(steps)
                else:
                    misses += 1
                break

    # np.mean([]) would emit a RuntimeWarning; report NaN directly instead.
    mean_steps = np.mean(steps_list) if steps_list else float('nan')
    per_misses = (misses / episodes) * 100 if episodes else 0.0
    if to_print:
        print('Mean steps: %.3f . Percentage %.2f of games lost!' % (mean_steps, per_misses))
        return None
    return mean_steps

In [6]:
class Agent:
    """Tabular Q-learning agent with three behaviour policies.

    Each training method learns into its own Q-table, and each evaluation
    method reads the matching table:

    * ``q_val``          - ``learn`` (uniformly random actions) / ``test``
    * ``q_val_rand_grd`` - ``learn_rg`` (epsilon-greedy) / ``test_random_greedy``
    * ``q_val_greedy``   - ``learn_greedy`` (purely greedy) / ``test_greedy``

    The environment is used through the 4-tuple gym API:
    ``reset() -> state`` and ``step(a) -> (state, reward, done, info)``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9):
        """Store the environment and hyper-parameters and zero the Q-tables.

        Args:
            env: environment with discrete ``observation_space`` and
                ``action_space`` (both exposing ``.n``).
            alpha: default learning rate for the ``learn*`` methods.
            gamma: default discount factor for the ``learn*`` methods.
        """
        self.env = env
        self.episode_reward = 0.0
        self.turn_limit = 100
        self.alpha = alpha
        self.gamma = gamma
        # Size the tables from the environment itself rather than from
        # notebook-level globals, so the class works with any discrete env.
        table_shape = (env.observation_space.n, env.action_space.n)
        self.q_val = np.zeros(table_shape, dtype=np.float32)
        self.q_val_rand_grd = np.zeros(table_shape, dtype=np.float32)
        self.q_val_greedy = np.zeros(table_shape, dtype=np.float32)
        self.eps = 0.33
        self.eps_decay = 0.00005
        self.start_time = 0

    def _train_episode(self, q_table, choose_action, alpha, gamma, after_step=None):
        """Run one training episode, updating `q_table` in place after every step.

        `choose_action(state)` picks the action; `after_step()`, if given, is
        called after each update. Returns the reward of the final step.
        """
        alpha = self.alpha if alpha is None else alpha
        gamma = self.gamma if gamma is None else gamma

        state = self.env.reset()
        self.start_time = time.time()

        # No step cap here: the loop ends only when the environment reports done.
        while True:
            act = choose_action(state)
            next_state, reward, done, _info = self.env.step(act)
            q_next_max = np.max(q_table[next_state])

            q_table[state, act] = (1 - alpha) * q_table[state, act] + alpha * (reward + gamma * q_next_max)

            if after_step is not None:
                after_step()

            if done:
                return reward
            state = next_state

    def _evaluate(self, q_table):
        """Follow the arg-max policy of `q_table` for at most `turn_limit` steps.

        Returns the final step's reward, or 0.0 if the episode did not finish.
        """
        state = self.env.reset()

        for _ in range(self.turn_limit):
            act = np.argmax(q_table[state])
            state, reward, done, _info = self.env.step(act)

            if done:
                return reward
        return 0.0

    def learn(self, alpha=None, gamma=None):
        """Train ``q_val`` for one episode using uniformly random actions.

        Args:
            alpha: learning rate; defaults to ``self.alpha``.
            gamma: discount factor; defaults to ``self.gamma``.

        Returns:
            Reward of the episode's final step.
        """
        return self._train_episode(
            self.q_val,
            lambda state: self.env.action_space.sample(),
            alpha,
            gamma,
        )

    def learn_rg(self, alpha=None, gamma=None):
        """Train ``q_val_rand_grd`` for one episode with epsilon-greedy actions.

        With probability ``self.eps`` a random action is sampled, otherwise
        the arg-max action of ``q_val_rand_grd`` is taken. Action selection
        and the update both use ``q_val_rand_grd``, the table that
        ``test_random_greedy`` evaluates.

        Args:
            alpha: learning rate; defaults to ``self.alpha``.
            gamma: discount factor; defaults to ``self.gamma``.

        Returns:
            Reward of the episode's final step.
        """
        # NOTE(review): epsilon is reset at the start of every episode, so the
        # per-step decay below only acts within a single episode and the 0.01
        # floor is unlikely to be reached. Confirm whether decay across
        # episodes was intended.
        self.eps = 0.33

        def choose_action(state):
            if np.random.uniform(0, 1) < self.eps:
                return self.env.action_space.sample()
            return np.argmax(self.q_val_rand_grd[state])

        def decay_eps():
            if self.eps > 0.01:
                self.eps -= self.eps_decay

        return self._train_episode(
            self.q_val_rand_grd, choose_action, alpha, gamma, after_step=decay_eps
        )

    def learn_greedy(self, alpha=None, gamma=None):
        """Train ``q_val_greedy`` for one episode taking only arg-max actions.

        Args:
            alpha: learning rate; defaults to ``self.alpha``.
            gamma: discount factor; defaults to ``self.gamma``.

        Returns:
            Reward of the episode's final step.
        """
        return self._train_episode(
            self.q_val_greedy,
            lambda state: np.argmax(self.q_val_greedy[state]),
            alpha,
            gamma,
        )

    def test(self):
        """Evaluate the arg-max policy of ``q_val`` on one episode."""
        return self._evaluate(self.q_val)

    def test_random_greedy(self):
        """Evaluate the arg-max policy of ``q_val_rand_grd`` on one episode."""
        return self._evaluate(self.q_val_rand_grd)

    def test_greedy(self):
        """Evaluate the arg-max policy of ``q_val_greedy`` on one episode."""
        return self._evaluate(self.q_val_greedy)

In [7]:
# One agent instance is shared by every training and evaluation cell below.
agent = Agent(env=env)

In [8]:
# Train with agent.learn for one million episodes, recording the cumulative
# reward and the running mean reward (in percent) after every episode.
epi_train = list(range(1000000))
tot_rew_list, avg_rew_list = [], []
reward_tot = 0.0
for i in epi_train:
    reward_tot = reward_tot + agent.learn(alpha=0.1, gamma=0.9)
    tot_rew_list.append(reward_tot)
    avg_rew_list.append(reward_tot/(i+1) * 100)

KeyboardInterrupt: 

In [None]:
# Evaluate with agent.test over 1000 episodes, recording the cumulative
# reward and the running mean reward (in percent) after every episode.
epi_test = list(range(1000))
tot_test_rew_list, avg_test_rew_list = [], []
reward_test = 0.0
for i in epi_test:
    reward_test = reward_test + agent.test()
    tot_test_rew_list.append(reward_test)
    avg_test_rew_list.append(reward_test/(i+1) * 100)

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: later
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(tot_rew_list))), tot_rew_list,
           param_name="Q-Learning episodes", curve_name="total reward")

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: later
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(avg_rew_list))), avg_rew_list,
           param_name="Q-Learning episodes", curve_name="average reward %")

In [None]:
# Cumulative reward over the evaluation episodes run with agent.test.
plot_graph(
    epi_test,
    tot_test_rew_list,
    param_name="Q-Learning test episodes",
    curve_name="total reward",
)

In [None]:
# Running mean reward (percent) over the evaluation episodes run with agent.test.
plot_graph(
    epi_test,
    avg_test_rew_list,
    param_name="Q-Learning test episodes",
    curve_name="average reward %",
)

In [None]:
# Train with agent.learn_rg (epsilon-greedy action choice) for 100000
# episodes, recording the cumulative reward and the running mean reward
# (in percent) after every episode.
epi_train = list(range(100000))
tot_rew_list_rand_grd, avg_rew_list_rand_grd = [], []
reward_tot_rand_grd = 0.0
for i in epi_train:
    reward_tot_rand_grd = reward_tot_rand_grd + agent.learn_rg(alpha=0.5, gamma=0.9)
    tot_rew_list_rand_grd.append(reward_tot_rand_grd)
    avg_rew_list_rand_grd.append(reward_tot_rand_grd/(i+1) * 100)

In [None]:
# Evaluate with agent.test_random_greedy over 1000 episodes, recording the
# cumulative reward and the running mean reward (in percent) per episode.
epi_test = list(range(1000))
tot_test_rew_list_rand_grd, avg_test_rew_list_rand_grd = [], []
reward_test_rand_grd = 0.0
for i in epi_test:
    reward_test_rand_grd = reward_test_rand_grd + agent.test_random_greedy()
    tot_test_rew_list_rand_grd.append(reward_test_rand_grd)
    avg_test_rew_list_rand_grd.append(reward_test_rand_grd/(i+1) * 100)

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: other
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(tot_rew_list_rand_grd))), tot_rew_list_rand_grd,
           param_name="Q-Learning episodes", curve_name="total reward")

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: other
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(avg_rew_list_rand_grd))), avg_rew_list_rand_grd,
           param_name="Q-Learning episodes", curve_name="average reward %")

In [None]:
# Cumulative reward over the evaluation episodes run with agent.test_random_greedy.
plot_graph(
    epi_test,
    tot_test_rew_list_rand_grd,
    param_name="Q-Learning test episodes",
    curve_name="total reward",
)

In [None]:
# Running mean reward (percent) over the evaluation episodes run with
# agent.test_random_greedy.
plot_graph(
    epi_test,
    avg_test_rew_list_rand_grd,
    param_name="Q-Learning test episodes",
    curve_name="average reward %",
)

In [None]:
# Train with agent.learn_greedy for one million episodes, recording the
# cumulative reward and the running mean reward (in percent) per episode.
epi_train = list(range(1000000))
tot_rew_list_greedy, avg_rew_list_greedy = [], []
reward_tot_greedy = 0.0
for i in epi_train:
    reward_tot_greedy = reward_tot_greedy + agent.learn_greedy(alpha=0.1, gamma=0.9)
    tot_rew_list_greedy.append(reward_tot_greedy)
    avg_rew_list_greedy.append(reward_tot_greedy/(i+1) * 100)

In [None]:
# Evaluate with agent.test_greedy over 1000 episodes, recording the
# cumulative reward and the running mean reward (in percent) per episode.
epi_test = list(range(1000))
tot_test_rew_list_greedy, avg_test_rew_list_greedy = [], []
reward_test_greedy = 0.0
for i in epi_test:
    reward_test_greedy = reward_test_greedy + agent.test_greedy()
    tot_test_rew_list_greedy.append(reward_test_greedy)
    avg_test_rew_list_greedy.append(reward_test_greedy/(i+1) * 100)

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: other
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(tot_rew_list_greedy))), tot_rew_list_greedy,
           param_name="Q-Learning episodes", curve_name="total reward")

In [None]:
# Take the x-axis from the recorded data rather than from `epi_train`: other
# cells rebind `epi_train` to a different length, and it is longer than the
# data if the training loop was interrupted.
plot_graph(list(range(len(avg_rew_list_greedy))), avg_rew_list_greedy,
           param_name="Q-Learning episodes", curve_name="average reward %")

In [None]:
# Cumulative reward over the evaluation episodes run with agent.test_greedy.
plot_graph(
    epi_test,
    tot_test_rew_list_greedy,
    param_name="Q-Learning test episodes",
    curve_name="total reward",
)

In [None]:
# Running mean reward (percent) over the evaluation episodes run with
# agent.test_greedy.
plot_graph(
    epi_test,
    avg_test_rew_list_greedy,
    param_name="Q-Learning test episodes",
    curve_name="average reward %",
)