In [1]:
import gym
#create a single game instance
env = gym.make("Taxi-v2")

#start new game
env.reset();

[2017-07-23 04:57:31,431] Making new env: Taxi-v2


In [2]:
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+



In [3]:
print("observations:", env.observation_space, 'n=', env.observation_space.n)
print("actions:", env.action_space, 'n=', env.action_space.n)

observations: Discrete(500) n= 500
actions: Discrete(6) n= 6


In [4]:
new_obs, reward, is_done, _ = env.step(3)
print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)
print("printing new state:")
env.render()

new observation code: 321
reward: -1
is game over?: False
printing new state:
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+
  (West)


In [5]:
import numpy as np
n_states = env.observation_space.n
n_actions = env.action_space.n
def get_random_policy():
    """
    Build a numpy array representing agent policy.
    This array must have one element per each of 16 environment states.
    Element must be an integer from 0 to 3, representing action
    to take from that state.
    """
    return np.random.choice(n_actions, size=n_states, replace=True)  # <your code>

In [6]:
def sample_reward(env, policy, t_max=11000):
    """
    Interact with an environment, return sum of all rewards.
    If game doesn't end on t_max (e.g. agent walks into a wall), 
    force end the game and return whatever reward you got so far.
    Tip: see signature of env.step(...) method above.
    """
    s = env.reset()
    total_reward = 0
    
    # <play & get reward>
    for t in range(t_max):
        s, reward, is_done, _ = env.step(policy[s])
        total_reward += reward
        # print(total_reward)
        if reward == 20:
            break
    return total_reward

In [20]:
p = get_random_policy()
print("a sample reward:", sample_reward(env, p))

a sample reward: -110000


In [129]:
def evaluate(policy, n_times=50):
    """Run several evaluations and average the score the policy gets."""
    rewards = [sample_reward(env, policy) for n in range(n_times)]
    # print(rewards)
    return float(np.mean(rewards))

In [130]:
print("evaluate a policy:", evaluate(p))

evaluate a sample reward: -60497.48


In [131]:
def crossover(policy1, policy2, p=0.5):
    """
    for each state, with probability p take action from policy1, else policy2
    """
    # <your code>
    mask = np.random.choice(2, len(policy1), replace=True, p=[p, 1-p])
    out = [policy1[i] if mask[i]==0 else policy2[i] for i in range(len(mask))]
    return out

In [132]:
def mutation(policy, p=0.3):
    """
    for each state, with probability p replace action with random action
    Tip: mutation can be written as crossover with random policy
    """
    # <your code>
    policy2 = np.random.choice(n_actions, len(policy), replace=True)
    return crossover(policy, policy2, p=(1-p))

In [133]:
n_epochs = 250 #how many cycles to make
pool_size = 100 #how many policies to maintain
n_crossovers = 50 #how many crossovers to make on each step
n_mutations = 50 #how many mutations to make on each tick

In [134]:
print("initializing...")
# <spawn a list of pool_size random policies>
pool = [get_random_policy() for i in range(pool_size)]
# <evaluate every policy in the pool, return list of scores>
pool_scores = [evaluate(p) for p in pool]

initializing...


In [135]:
#main loop
for epoch in range(n_epochs):
    print("Epoch %s:"%epoch)
    # <crossover random guys from pool, n_crossovers total>
    ids = [np.random.choice(pool_size, 2, replace=False) for i in range(n_crossovers)]
    crossovered = [crossover(pool[i], pool[j]) for i, j in ids]
    # <add several new policies at random, n_mutations total>
    ids = np.random.choice(pool_size, n_mutations, replace=False)
    mutated = [mutation(pool[i]) for i in ids]
    
    # assert type(crossovered) == type(mutated) == list
    
    #add new policies to the pool
    # <add up old population with crossovers/mutations>
    pool = pool + crossovered + mutated
    # <evaluate all policies again>
    pool_scores = [evaluate(p) for p in pool]
    
    #select pool_size best policies
    selected_indices = np.argsort(pool_scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    pool_scores = [pool_scores[i] for i in selected_indices]

    #print the best policy so far (last in ascending score order)
    print("best score:", pool_scores[-1])
    # print_policy(pool[-1])

Epoch 0:
best score: -44656.76
Epoch 1:
best score: -42678.38
Epoch 2:
best score: -42678.56
Epoch 3:
best score: -32778.74
Epoch 4:
best score: -32778.74
Epoch 5:
best score: -44657.84
Epoch 6:
best score: -42678.56
Epoch 7:
best score: -38718.74
Epoch 8:
best score: -38717.12
Epoch 9:
best score: -38718.74
Epoch 10:
best score: -36738.02
Epoch 11:
best score: -32777.84
Epoch 12:
best score: -38717.12
Epoch 13:
best score: -36738.02
Epoch 14:
best score: -32779.1
Epoch 15:
best score: -28819.28
Epoch 16:
best score: -30799.28
Epoch 17:
best score: -30799.64
Epoch 18:
best score: -32779.28
Epoch 19:
best score: -30800.0
Epoch 20:
best score: -32779.1
Epoch 21:
best score: -30799.1
Epoch 22:
best score: -32777.48
Epoch 23:
best score: -32778.38
Epoch 24:
best score: -30799.64
Epoch 25:


KeyboardInterrupt: 

In [None]:
print(pool_scores)

In [None]:
print(pool[-5])

In [None]:
print(sample_reward(env, pool[-2], t_max=250))

In [None]:
pool = [get_random_policy() for i in range(pool_size)] + pool[-20:]