In [1]:
import gymnasium as gym
import asyncio
import time
import scipy
import numpy as np
from matplotlib import pyplot as plt
from src.utils import *
from src.SARSA_lander import *

In [2]:
# LunarLander-v2 environment instance; it is the `env` handed to every SARSA agent built in the training cell.
env = gym.make("LunarLander-v2")

## SARSA

#### Data collection and hyperparameter tuning
Outputs are kept to have an example of the training process.

Checkpoints of the training are printed due to the long time it takes to train the agent.

In [None]:
# Hyperparameter sweep: for every combination of the values listed below, a fresh
# SARSA agent is created, trained, plotted and analysed.
observation_space = (7, 5, 5, 5, 5, 5, 2, 2)  # forwarded to SARSA as `space_size`
n_episodes = 8000  # training episodes per combination
lam = [0, 0.5, 0.8]
epsilon0 = [0.2, 0.5]
k_e = [0.0000001, 0.0001]
k_l = [0.0000001, 0.0001]

# Enumerate the combinations up front, in nested-loop order:
# epsilon0 varies slowest, then k_l, then k_e, and lam varies fastest.
configurations = [
    (eps_start, lr_coeff, eps_coeff, lam_value)
    for eps_start in epsilon0
    for lr_coeff in k_l
    for eps_coeff in k_e
    for lam_value in lam
]

for eps_start, lr_coeff, eps_coeff, lam_value in configurations:
    # `sarsa` is rebound on every pass, so once the sweep finishes it refers to
    # the agent trained on the last combination.
    sarsa = SARSA(env, space_size=observation_space, action_size=4, gamma=1)
    sarsa.train(
        n_episodes=n_episodes,
        lambda_=lam_value,
        epsilon_0=eps_start,
        k_epsilon=eps_coeff,
        k_lr=lr_coeff,
    )
    sarsa.plot_traj(cumulative=True, local=True, save_img=True)
    sarsa.analyse(n_episodes=2000)


#### GIF generation
Generate an episode with the trained agent and save it as a GIF (there's *a bit* of initial selection bias: the agent is first run on 100 seeds and the best-scoring seed is the one replayed, to have a nicer GIF).

In [None]:
# Run the agent currently bound to `sarsa` for one episode on each of the seeds
# 0 .. n_episodes-1, accumulate the total reward of every episode, and report the
# seed with the highest total.
observation_space = (7, 5, 5, 5, 5, 5, 2, 2)
# test
env_test = gym.make('LunarLander-v2')
n_episodes = 100
performance_traj = np.zeros(n_episodes)  # performance_traj[i] = summed reward of the episode seeded with i
try:
    for i in range(n_episodes):
        # reset() returns (observation, info); only the observation is needed.
        state = env_test.reset(seed=i)[0]
        state = discretize_state(state, observation_space)
        done = False
        while not done:
            # NOTE(review): the third argument (0) is presumably epsilon, i.e. no
            # exploration, and 4 the number of actions - confirm against src.utils.
            action = get_action_epsilon_greedy(sarsa.Qvalues, state, 0, 4)
            # Gymnasium's step() returns (obs, reward, terminated, truncated, info),
            # in that order; the episode is over as soon as either flag is set.
            next_state, reward, terminated, truncated, _ = env_test.step(action)
            done = terminated or truncated
            performance_traj[i] += reward
            next_state = discretize_state(next_state, observation_space)
            state = next_state
finally:
    # Release the environment even if an episode raises.
    env_test.close()
best = np.argmax(performance_traj)
print("best episode: ", best)
print("best performance: ", performance_traj[best])


In [None]:
from PIL import Image

# Replay a single episode of the trained SARSA agent with RGB rendering enabled,
# collect one frame per environment step, and write the frames out as an animated GIF.
observation_space = (7, 5, 5, 5, 5, 5, 2, 2)
# test
env_test = gym.make('LunarLander-v2', render_mode="rgb_array")
frames = []
try:
    state = env_test.reset(seed=91)[0] #best episode found in the first 100 seeds
    state = discretize_state(state, observation_space)
    done = False
    while not done:
        # Use the SARSA agent's Q-values: `sarsa` is the agent this notebook trains and
        # the one the seed-selection cell evaluated (no `q_learning` object is created here).
        action = get_action_epsilon_greedy(sarsa.Qvalues, state, 0, 4) #change 0 to 1 to see random agent
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info), in that order.
        next_state, reward, terminated, truncated, _ = env_test.step(action)
        done = terminated or truncated
        next_state = discretize_state(next_state, observation_space)
        state = next_state
        # With render_mode="rgb_array", render() returns the current frame as an array.
        frames.append(env_test.render())
finally:
    # Release the environment even if the rollout raises.
    env_test.close()

imgs = [Image.fromarray(img) for img in frames]
# duration is the display time of each frame in milliseconds; 50 ms per frame is 20 frames per second
imgs[0].save("gifs/ESARSA.gif", save_all=True, append_images=imgs[1:],duration = 50, loop=0)
