# Training Reinforcement Learning Agent
This notebook contains all the necessary code to train different policies and compare them to analyze the performance of the agent.

In [1]:
from map import MapLoader
from monte_carlo import MonteCarlo, Policy

In [3]:
def print_policy(agent, start_pos = (0, 16)):
    # Display the resulting policy for a subset of states for clarity
    for state in agent.policy.policy.keys():
        # Only show policy for zero velocity states for brevity
        if state[2:] == (0, 0):  
            print(f"Policy at {state}: {agent.policy[state]}")

    # Display the number of states in the policy
    print(f"There are {len(agent.policy)} states in policy")
    
    # Verify the policy by running an episodes
    episode = agent.generate_episode(start=start_pos)
    print("Episode:")
    for s, a, r in episode:
        print(f"State: {s}, Action: {a}, Reward: {r}")    

## Map 1

In [7]:
map_1 = MapLoader.load_map("./maps/map1.txt")

### Epsilon 0.9

In [8]:
mc_map = MonteCarlo(map_1, num_episodes=1_000, train_epsilon=0.9, policy_filename="policies/map_09.txt.policy")
mc_map.monte_carlo_control()

100%|██████████| 1000/1000 [04:24<00:00,  3.78it/s]


In [9]:
print_policy(mc_map)

Policy at (6, 16, 0, 0): decrease_vy
Policy at (4, 16, 0, 0): decrease_vy
Policy at (8, 16, 0, 0): increase_vx
Policy at (1, 16, 0, 0): decrease_vx_vy
Policy at (5, 16, 0, 0): decrease_vx_vy
Policy at (7, 16, 0, 0): decrease_vy
Policy at (3, 16, 0, 0): decrease_vy
Policy at (2, 16, 0, 0): increase_vx_decrease_vy
Policy at (9, 16, 0, 0): decrease_vx_vy
Policy at (0, 16, 0, 0): increase_vx_decrease_vy
There are 2397 states in policy
Episode:
State: (0, 16, 0, 0), Action: increase_vx_decrease_vy, Reward: -1
State: (1, 15, 1, -1), Action: decrease_vx_vy, Reward: -1
State: (1, 13, 0, -2), Action: increase_vx, Reward: -1
State: (2, 11, 1, -2), Action: decrease_vx_increase_vy, Reward: -1
State: (2, 10, 0, -1), Action: decrease_vx_vy, Reward: -1
State: (1, 8, -1, -2), Action: increase_vx, Reward: -1
State: (1, 6, 0, -2), Action: increase_vx_vy, Reward: -1
State: (2, 5, 1, -1), Action: increase_vx, Reward: -1
State: (4, 4, 2, -1), Action: increase_vy, Reward: 1


### Epsilon 0.7

In [10]:
mc_map = MonteCarlo(map_1, num_episodes=1_000, train_epsilon=0.7, policy_filename="policies/map_07.txt.policy")
mc_map.monte_carlo_control()

100%|██████████| 1000/1000 [00:34<00:00, 28.90it/s]


In [11]:
print_policy(mc_map)

Policy at (7, 16, 0, 0): decrease_vx_vy
Policy at (2, 16, 0, 0): increase_vx_decrease_vy
Policy at (6, 16, 0, 0): decrease_vy
Policy at (4, 16, 0, 0): decrease_vx_vy
Policy at (5, 16, 0, 0): decrease_vy
Policy at (9, 16, 0, 0): increase_vx_decrease_vy
Policy at (0, 16, 0, 0): increase_vx_decrease_vy
Policy at (1, 16, 0, 0): decrease_vy
Policy at (3, 16, 0, 0): increase_vx_decrease_vy
Policy at (8, 16, 0, 0): decrease_vx_vy
There are 2292 states in policy
Episode:
State: (0, 16, 0, 0), Action: increase_vx_decrease_vy, Reward: -1
State: (1, 15, 1, -1), Action: decrease_vy, Reward: -1
State: (2, 13, 1, -2), Action: decrease_vx, Reward: -1
State: (2, 11, 0, -2), Action: no_change, Reward: -1
State: (2, 9, 0, -2), Action: increase_vx, Reward: -1
State: (3, 7, 1, -2), Action: decrease_vx, Reward: -1
State: (3, 5, 0, -2), Action: increase_vx_vy, Reward: -1
State: (4, 4, 1, -1), Action: increase_vx_vy, Reward: 1


### Epsilon 0.5

In [12]:
mc_map = MonteCarlo(map_1, num_episodes=1_000, train_epsilon=0.5, policy_filename="policies/map_05.txt.policy")
mc_map.monte_carlo_control()

100%|██████████| 1000/1000 [00:15<00:00, 65.99it/s]


In [13]:
print_policy(mc_map)

Policy at (9, 16, 0, 0): decrease_vx_vy
Policy at (2, 16, 0, 0): increase_vx_decrease_vy
Policy at (3, 16, 0, 0): decrease_vy
Policy at (7, 16, 0, 0): decrease_vy
Policy at (6, 16, 0, 0): decrease_vy
Policy at (0, 16, 0, 0): increase_vx_decrease_vy
Policy at (4, 16, 0, 0): increase_vy
Policy at (5, 16, 0, 0): increase_vx_decrease_vy
Policy at (1, 16, 0, 0): decrease_vx_vy
Policy at (8, 16, 0, 0): decrease_vx_vy
There are 2178 states in policy
Episode:
State: (0, 16, 0, 0), Action: increase_vx_decrease_vy, Reward: -1
State: (1, 15, 1, -1), Action: decrease_vx_vy, Reward: -1
State: (1, 13, 0, -2), Action: no_change, Reward: -1
State: (1, 11, 0, -2), Action: decrease_vx, Reward: -1
State: (0, 9, -1, -2), Action: increase_vx, Reward: -1
State: (0, 7, 0, -2), Action: increase_vx, Reward: -1
State: (1, 5, 1, -2), Action: increase_vx_vy, Reward: -1
State: (3, 4, 2, -1), Action: decrease_vx_increase_vy, Reward: -1
State: (4, 4, 1, 0), Action: increase_vx, Reward: 1


## Map 2

## Map 3