In [1]:
from lib.algorithms import Q_learn_freetime, Q_learn

In [2]:
import hydra
hydra.initialize(config_path='configs')
cfg = hydra.compose(config_name='config')


In [3]:
from lib.gym_windy_gridworld import WindyGridworld
env = WindyGridworld(
        height=cfg.env.height, 
        width=cfg.env.width, 
        rewards=list(cfg.env.rewards), 
        wind=cfg.env.wind, 
        start=cfg.env.start, 
        allowed_actions=list(cfg.env.allowed_actions), 
        reward_terminates_episode=cfg.env.reward_terminates_episode
    )
    

In [4]:
cfg

{'initializations': ['pessimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 0], [10, 0, 11]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 10000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': False, 'show_trajectory': True}, 'freetime': {'num_steps': 10000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0.0, 'show_rewards': True, 'show_q': False, 'show_f': True, 'show_f_actions': ['min'], 'show_trajectory': True}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0.8, 'vmax': 1}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [5]:
cfg.freetime.num_steps = 100000
cfg.baseline.num_steps = 100000
cfg

{'initializations': ['pessimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 0], [10, 0, 11]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': False, 'show_trajectory': True}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0.0, 'show_rewards': True, 'show_q': False, 'show_f': True, 'show_f_actions': ['min'], 'show_trajectory': True}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0.8, 'vmax': 1}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [6]:
from lib.algorithms import build_q_table

def grid_search(initialization):
    
    results = {}
    for tolerance in [-0.01, -0.001, 0.0, 0.001, 0.01, 0.1, 0.5]:
        
        results[tolerance] = []
        
        for exp in range(10):
            
            Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
            
            Q, F, rewards, _ = Q_learn_freetime(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                cfg.freetime.tolerance
            )
            
            results[tolerance].append(rewards[-1])
    
    results['baseline'] = []
    for exp in range(10):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, rewards = Q_learn(
                env, 
                Q, 
                cfg.baseline.num_steps, 
                cfg.baseline.epsilon, 
                cfg.baseline.discount, 
                cfg.baseline.alpha
            )
        
        results['baseline'].append(rewards[-1])
        
    results = {k: sum(v)/len(v) for k, v in results.items()} 
    return results


In [7]:
results = grid_search('pessimistic')
results

100%|██████████| 100000/100000 [00:02<00:00, 33725.56it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33118.88it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33167.38it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33301.50it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33729.29it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34933.93it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34313.52it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34532.05it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34160.07it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34337.37it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34328.87it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35362.20it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33836.68it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33515.65it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33297.07it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33981.91it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34940.34it/

{-0.01: 23871.5,
 -0.001: 23750.2,
 0.0: 20382.5,
 0.001: 21955.1,
 0.01: 24422.6,
 0.1: 19739.3,
 0.5: 21550.5,
 'baseline': 7358.6}

In [8]:
results = grid_search('random')
results

100%|██████████| 100000/100000 [00:03<00:00, 32864.35it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33547.53it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33281.84it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33160.02it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33636.81it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33575.86it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34720.25it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33733.65it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34351.54it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33894.97it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34271.68it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34216.23it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33968.17it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33913.99it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33916.54it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33880.97it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33769.98it/

{-0.01: 17718.9,
 -0.001: 13085.5,
 0.0: 18547.2,
 0.001: 8481.6,
 0.01: 13765.7,
 0.1: 13707.3,
 0.5: 9931.6,
 'baseline': 9698.4}

In [9]:
results = grid_search('optimistic')
results

100%|██████████| 100000/100000 [00:02<00:00, 33724.33it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33889.21it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34440.80it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34870.74it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34982.29it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34793.63it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34719.52it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34911.25it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34917.25it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35013.05it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35016.47it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34985.15it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35008.13it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35118.18it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35165.92it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34785.98it/s]
100%|██████████| 100000/100000 [00:02<00:00, 34957.54it/

{-0.01: 37473.2,
 -0.001: 38243.5,
 0.0: 38123.0,
 0.001: 38170.4,
 0.01: 38460.6,
 0.1: 37514.3,
 0.5: 37915.3,
 'baseline': 38829.1}