In [53]:
from mlagents_envs.environment import ActionTuple, UnityEnvironment, BaseEnv
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
import numpy as np
import torch
import random
from collections import deque

In [71]:
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name='./Wave', seed=1, side_channels=[channel])
channel.set_configuration_parameters(time_scale = 20)
print("WAVE environment created.")

WAVE environment created.


In [72]:
l1 = 34 + 2
l2 = 150
l3 = 150
l4 = 2

model = torch.nn.Sequential(
  torch.nn.Linear(l1, l2),
  torch.nn.ReLU(),
  torch.nn.Linear(l2, l3),
  torch.nn.ReLU(),
  torch.nn.Linear(l3, l4),
)

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [73]:
def loss_fn(pred, target):
    return torch.mean(0.5 * (pred - target) ** 2)

def preprocess_input(inp):
    return np.append(inp.obs[1], inp.obs[0], axis=1)

In [74]:
EPOCH = 500
mem_size = 10000
EPSILON = 1
MIN_EPSILON = .1
MAX_STEP = 5000

MIN_BATCH = 256
GAMMA = 0.9

losses = []
important_experiences = []
experiences = deque(maxlen=mem_size)

for i in range(EPOCH):
    env.reset()
    behavior_name = list(env.behavior_specs)[0]

    j = 0
    epoch_rewards = []
    while j < MAX_STEP:
        j += 1
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if (len(terminal_steps) > 0):
            break

        state = torch.Tensor(preprocess_input(decision_steps))

        qval = model(state)
        if np.random.rand() > EPSILON:
            # exploit
            action = np.argmax(qval.detach().numpy())
        else:
            # explore
            action = np.random.randint(0, 2)

        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.array([[action]]))
        env.set_actions(behavior_name, action_tuple)
        env.step()

        new_decision_steps, new_terminal_steps = env.get_steps(behavior_name)
        current_step = new_decision_steps if len(new_terminal_steps) == 0 else new_terminal_steps
        reward = current_step.reward[0]

        if reward == 0:
            continue

        epoch_rewards.append(reward)
    
        state2 = torch.Tensor(preprocess_input(current_step))
        done = len(new_terminal_steps) > 0

        current_exp = (state, action, reward, state2, done)

        if done:
            important_experiences.append(current_exp)
        else:
            experiences.append(current_exp)

        all_exp = experiences.copy()
        all_exp.extend(important_experiences)

        if len(all_exp) >= MIN_BATCH:
            batch = random.sample(all_exp, MIN_BATCH)

            states = torch.cat([s for (s, a, r, s2, done) in batch])
            actions = torch.Tensor([a for (s, a, r, s2, done) in batch])
            states2 = torch.cat([s2 for (s, a, r, s2, done) in batch])
            done_data = torch.Tensor([done for (s, a, r, s2, done) in batch])
            rewards = torch.Tensor([r for (s, a, r, s2, done) in batch])

            qvals = model(states)

            with torch.no_grad():
                qvals_2 = model(states2)

            target = rewards + GAMMA * ((1 - done_data) * torch.max(qvals_2, dim=1)[0])
            action_qval_pred = qvals.gather(dim=1, index=actions.long().unsqueeze(dim=1)).squeeze()
            err = loss_fn(action_qval_pred, target.detach())
            losses.append(err.item())

            optimizer.zero_grad()
            err.backward()
            optimizer.step()

    print(f'EPOCH: {i}, total reward: {np.sum(epoch_rewards)}, timestep: {j}, epsilon: {EPSILON}')
    if EPSILON > MIN_EPSILON:
        EPSILON -= 1 / EPOCH

env.close()

EPOCH: 0, total reward: 36.1673583984375, timestep: 89, epsilon: 1
EPOCH: 1, total reward: 0.0, timestep: 2, epsilon: 0.998
EPOCH: 2, total reward: 2.881955146789551, timestep: 28, epsilon: 0.996
EPOCH: 3, total reward: 28.30318832397461, timestep: 86, epsilon: 0.994
EPOCH: 4, total reward: 2.8724164962768555, timestep: 27, epsilon: 0.992
EPOCH: 5, total reward: 11.162006378173828, timestep: 39, epsilon: 0.99
EPOCH: 6, total reward: 2.8469905853271484, timestep: 26, epsilon: 0.988
EPOCH: 7, total reward: 2.9827675819396973, timestep: 26, epsilon: 0.986
EPOCH: 8, total reward: 52.84823226928711, timestep: 116, epsilon: 0.984
EPOCH: 9, total reward: 2.824700355529785, timestep: 26, epsilon: 0.982
EPOCH: 10, total reward: 2.9046945571899414, timestep: 29, epsilon: 0.98
EPOCH: 11, total reward: 11.159300804138184, timestep: 49, epsilon: 0.978
EPOCH: 12, total reward: 36.16903305053711, timestep: 90, epsilon: 0.976
EPOCH: 13, total reward: 11.18298053741455, timestep: 51, epsilon: 0.974
EPO

In [75]:
env.close()

UnityEnvironmentException: No Unity environment is loaded.

In [76]:
env = UnityEnvironment(file_name='./Wave', seed=1)
print("WAVE environment created.")

env.reset()
i = 0
while True:
    i += 1
    behavior_name = list(env.behavior_specs)[0]

    decision_steps, terminal_steps = env.get_steps(behavior_name)
    if (len(terminal_steps) > 0):
        break

    preds = model(torch.Tensor(preprocess_input(decision_steps))).detach().numpy()

    action = np.argmax(preds)
    action_tuple = ActionTuple()
    action_tuple.add_discrete(np.array([[action]]))
    env.set_actions(behavior_name, action_tuple)
    env.step()

    new_decision_steps, new_terminal_steps = env.get_steps(behavior_name)
    current_step = new_decision_steps if len(new_terminal_steps) == 0 else new_terminal_steps
    try:
        reward = current_step.reward[0]
    except Exception as e:
        print(e)

    qvals_2 = model(torch.Tensor(preprocess_input(current_step)))
    target = reward + GAMMA * ((1 - int(len(new_terminal_steps) > 0)) * torch.max(qvals_2, dim=1)[0])
    print(i, 'Decision Steps', preprocess_input(decision_steps), 'Preds', preds, 'Selected Preds', np.argmax(preds), 'Done?', int(len(new_terminal_steps) > 0), 'Next Preds', ((1 - int(len(new_terminal_steps) > 0)) * torch.max(qvals_2, dim=1)[0]), 'Target', target, 'Reward', reward, sep='\n')
    print('==================================')
env.close()

WAVE environment created.
1
Decision Steps
[[0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
Preds
[[5.0858493 5.257706 ]]
Selected Preds
1
Done?
0
Next Preds
tensor([4.2514], grad_fn=<MulBackward0>)
Target
tensor([3.8715], grad_fn=<AddBackward0>)
Reward
0.045248657
2
Decision Steps
[[5.72632   0.8788889 1.        1.        1.        1.        1.
  1.        1.        1.        1.        1.        1.        1.
  1.        1.        1.        1.        1.        1.        1.
  1.        1.        1.        1.        1.        1.        1.
  1.        1.        1.        1.        1.        1.        1.
  1.       ]]
Preds
[[3.8740757 4.251408 ]]
Selected Preds
1
Done?
0
Next Preds
tensor([4.2797], grad_fn=<MulBackward0>)
Target
tensor([4.0629], grad_fn=<AddBackward0>)
Reward
0.21112537
3
Decision Steps
[[5.2240553 1.8152071 1.        1.        1.        1.        1.
  1.        1.        1.        1.        1.        1.    

In [70]:
env.close()

UnityEnvironmentException: No Unity environment is loaded.