In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import agent
# from q_learning_agent import QLearningAgent
from gridworld_with_door import MazeEnvironment
from tqdm import tqdm
# from lin_agent_tiles import LinearAgent
from nn_agent import LinearAgent as NNAgent
# from rnn_agent_n_step import RNNAgent
from rnn_agent import RNNAgent as RNNAgent
from sarsa_agent import SarsaAgent
from q_learning_agent import QLearningAgent
from IPython.display import HTML, Image
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import animation
# enable if ffmpeg codec is installed 
# plt.rcParams['animation.ffmpeg_path'] = '/anaconda3/envs/pytorch/bin/ffmpeg'

In [None]:
# Notebook-wide matplotlib defaults: larger text and a wide 10x5 canvas.
plt.rcParams.update({
    'font.size': 15,
    'figure.figsize': [10, 5],
})

### Long dependency

In [None]:
#### Limitation RNN GRU

### Buffer Training

In [None]:
### State distribution investigation

### Helpers

In [None]:
def run_episode(env, agent, state_visits=None, keep_history=False, max_steps=500):
    """Run a single episode of `agent` in `env` and return its score.

    The score is the negated number of environment steps taken: every step
    contributes -1, independent of the reward value returned by
    ``env.env_step`` (that reward is still forwarded to the agent).

    Args:
        env: environment exposing ``env_start(keep_history=...)`` and
            ``env_step(action)`` returning ``(reward, obs, is_terminal)``.
            When ``keep_history`` is true, ``env.history`` and
            ``env.env_cleanup()`` are used as well.
        agent: agent exposing ``agent_start(obs)``, ``agent_step(reward, obs)``,
            ``agent_end(reward, obs, append_buffer=...)`` and a ``steps``
            attribute, which is printed as a progress indicator.
        state_visits: optional mutable counter indexed by ``obs[0]``. It is
            incremented in place for the start observation and for every
            observation returned by ``env.env_step``.
        keep_history: forwarded to ``env.env_start``; when true the function
            also returns ``env.history`` and then calls ``env.env_cleanup()``.
        max_steps: maximum number of environment steps before the episode is
            cut off. Defaults to 500.

    Returns:
        ``sum_of_rewards`` when ``keep_history`` is false, otherwise the tuple
        ``(sum_of_rewards, history)``.
    """
    sum_of_rewards = 0
    step_count = 0

    obs = env.env_start(keep_history=keep_history)
    action = agent.agent_start(obs)
    if state_visits is not None:
        state_visits[obs[0]] += 1

    is_terminal = False
    while not is_terminal:
        reward, obs, is_terminal = env.env_step(action)
        print(agent.steps, end='\r')
        sum_of_rewards -= 1
        step_count += 1

        # Count the visit before branching so a truncated episode records its
        # last observed state too.
        if state_visits is not None:
            state_visits[obs[0]] += 1

        if is_terminal:
            # A real terminal transition takes precedence over the step cap,
            # even when both happen on the same step.
            agent.agent_end(reward, obs, append_buffer=True)
        elif step_count >= max_steps:
            # Cut-off episode: end it with append_buffer=False.
            # NOTE(review): presumably this keeps the artificial ending out of
            # the agent's buffer -- confirm against the agent implementation.
            agent.agent_end(reward, obs, append_buffer=False)
            break
        else:
            action = agent.agent_step(reward, obs)

    if keep_history:
        # Read the history before cleanup is invoked on the environment.
        history = env.history
        env.env_cleanup()
        return sum_of_rewards, history
    return sum_of_rewards

In [None]:
def animate(history, name='history.gif'):
    """Render a sequence of grids as an animated GIF and display it inline.

    Each entry of `history` is drawn with ``Axes.matshow`` using a fixed colour
    range of [-1, 1] and the 'jet' colormap. The animation is written to
    `name` with the Pillow writer and then shown through IPython's
    ``display(Image(...))``.

    Args:
        history: sequence of grids, one per frame, each accepted by ``matshow``.
        name: path of the GIF file to write. Defaults to 'history.gif'.

    Raises:
        ValueError: if `history` contains no frames.
    """
    frames = len(history)
    if frames == 0:
        # Without frames the GIF writer fails late with an obscure error.
        raise ValueError("history is empty; nothing to animate")
    print(f"Rendering {frames} frames...")
    fig = plt.figure(figsize=(6, 2))
    fig_grid = fig.add_subplot(121)

    def render_frame(i):
        # Drop the previous frame first; otherwise every call stacks another
        # image artist on the same axes.
        fig_grid.clear()
        fig_grid.matshow(history[i], vmin=-1, vmax=1, cmap='jet')

    anim = animation.FuncAnimation(fig, render_frame, frames=frames, interval=100)
    # Close the figure so the notebook does not also render a static copy;
    # the animation keeps its own reference and can still be saved.
    plt.close(fig)
    # Option a) if ffmpeg codec is installed, display animation with ffmpeg
    # display(HTML(anim.to_html5_video()))
    # Option b) save as gif and display
    anim.save(name, dpi=80, writer=animation.PillowWriter(fps=20))
    with open(name, 'rb') as file:
        display(Image(file.read()))

### Experiment Setup

In [None]:
def reload(obj):
    """Reload the module that defines `obj` and return the refreshed object.

    The defining module is looked up with ``inspect.getmodule``, re-executed
    with ``importlib.reload`` and the attribute named ``obj.__name__`` is read
    back from the reloaded module.

    Args:
        obj: a class or function (anything with ``__name__``) whose defining
            module can be resolved.

    Returns:
        The attribute of the reloaded module with the same name as `obj`.

    Raises:
        TypeError: if the module that defines `obj` cannot be determined.
    """
    # `imp` is deprecated and was removed in Python 3.12; importlib.reload is
    # its supported replacement.
    import importlib
    import inspect

    module = inspect.getmodule(obj)
    if module is None:
        raise TypeError(f"cannot determine the module that defines {obj!r}")
    # importlib.reload returns the reloaded module object.
    module = importlib.reload(module)
    return getattr(module, obj.__name__)

In [None]:
# Re-import the agent classes so that edits to their modules are picked up
# without restarting the kernel.
RNNAgent = reload(RNNAgent)
# NOTE(review): `RNNAgentM` is not imported or defined in the cells shown
# above, so this line raises NameError on a fresh kernel -- confirm which
# module provides it and import it in the first cell.
RNNAgentM = reload(RNNAgentM)
NNAgent = reload(NNAgent)

# Registry: experiment label -> agent class. The training loop instantiates
# one agent per label via `agents[algorithm]()`.
agents = {
    "NN": NNAgent,
    "RNN": RNNAgentM,
    # Registered with the same class as "RNN".
    'TBPTT_Multi': RNNAgentM,
    # NOTE(review): `RNNAgentGRU` is likewise not imported or defined in the
    # visible cells -- confirm its source module.
    "GRU": RNNAgentGRU,
}

In [None]:
# Environment label -> environment class.
envs = {
    'Grid-World': MazeEnvironment,
}

# Per-algorithm hyperparameter overrides, keyed by experiment label. The
# training loop merges the matching entry over its default `agent_info`.
agent_infos = {
    "Q-learning": {"step_size": .5},
    "Sarsa": {"step_size": 1e-2, 'num_tilings': 4, 'num_tiles': 4, 'iht_size': 300},
    "Linear": {"step_size": 1e-3},
    "NN": {"step_size": 1e-3},
    "RNN": {"step_size": 1e-3},
    # 'TBPTT_Multi' is registered in `agents` with the same class as "RNN" but
    # had no entry here, so `agent_infos[algorithm]` raised KeyError for it.
    # It mirrors the "RNN" override -- adjust if it needs different settings.
    "TBPTT_Multi": {"step_size": 1e-3},
    "GRU": {"step_size": 1e-3},
}

# Settings passed to `env.env_init`: a 7x7 maze with a column of obstacles.
env_info = {
    "maze_dim": [7, 7],
    "start_state": [6, 0],
    "end_state": [6, 6],
    "obstacles": [[3, 3], [3, 5], [3, 6], [4, 3], [5, 3], [6, 3]],
    # Maps a cell (here the start cell) to the cell [3, 4].
    # NOTE(review): presumably trigger cell -> door cell; confirm in MazeEnvironment.
    "doors": {(6, 0): [3, 4]},
}

### Train

In [None]:
# Result containers filled by the training cell, each keyed by algorithm label.
all_reward_sums = dict()   # per-run lists holding the sum of rewards of every episode
all_state_visits = dict()  # per-run state visit counts from the last 10 episodes
all_history = dict()       # histories of the episodes that were run with keep_history=True

In [None]:
# all_reward_sums['NN'] = []
# all_reward_sums_sarsa_tile_4 = all_reward_sums['Sarsa']
# all_reward_sums['Sarsa'] = []

In [None]:
num_runs = 1
num_episodes = 500
# Number of final episodes per run that record state visits and history.
num_tracked_episodes = 10
Environment = envs['Grid-World']

for algorithm in tqdm(list(agents.keys())):
    all_reward_sums[algorithm] = []
    all_state_visits[algorithm] = []
    # Reset like the two containers above, so re-running this cell does not
    # pile new histories on top of those from an earlier execution.
    all_history[algorithm] = []

    for run in tqdm(range(num_runs)):
        # NOTE: binding `agent` here shadows the `agent` module imported in the
        # first cell.
        agent = agents[algorithm]()
        env = Environment()

        env.env_init(env_info)
        # Shared defaults; the per-algorithm entry of `agent_infos` overrides
        # them (a label without an entry raises KeyError).
        agent_info = {"num_actions": 4, "num_states": env.cols * env.rows, "epsilon": .1, "step_size": 0.5, "discount": 1}
        agent_info["seed"] = run
        agent_info.update(agent_infos[algorithm])
        np.random.seed(run)
        agent.agent_init(agent_info)

        reward_sums = []
        state_visits = np.zeros(env.cols * env.rows)
        # Exploration schedule: starts at 1 and is multiplied by 0.99 after
        # every episode; assigned directly on the agent before each episode.
        epsilon = 1
        for episode in range(num_episodes):
            agent.epsilon = epsilon
            if episode < num_episodes - num_tracked_episodes:
                sum_of_rewards = run_episode(env, agent)
            else:
                # Runs an episode while keeping track of visited states and history
                sum_of_rewards, history = run_episode(env, agent, state_visits, keep_history=True)
                all_history[algorithm].append(history)
            epsilon *= 0.99
            reward_sums.append(sum_of_rewards)
        all_reward_sums[algorithm].append(reward_sums)
        all_state_visits[algorithm].append(state_visits)