In [37]:
from mlagents_envs.environment import ActionTuple, UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
import numpy as np
import torch

In [53]:
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name='./Wave', seed=1, side_channels=[channel])
channel.set_configuration_parameters(time_scale = 3.0)
print("WAVE environment created.")

WAVE environment created.


In [49]:
l1 = 38
l2 = 150
l3 = 150
l4 = 2

model = torch.nn.Sequential(
  torch.nn.Linear(l1, l2),
  torch.nn.ReLU(),
  torch.nn.Linear(l2,l3),
  torch.nn.ReLU(),
  torch.nn.Linear(l3,l4),
  torch.nn.Softmax(dim=1)
)
  
learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [54]:
def discount_rewards(rewards: np.ndarray, gamma):
    reversed = np.copy(rewards)[::-1]
    discounted_rewards = []
    for i, reward in enumerate(reversed):
        discounted_rewards.append(reward + (0 if i == 0 else reversed[i - 1]))
        reversed[i] = reward * gamma
        if i > 0:
            reversed[i] += reversed[i - 1] * gamma
    return np.array(discounted_rewards[::-1])

def loss_fn(predictions, rewards):
    return -1 * torch.sum(rewards * torch.log(predictions))

def preprocess_input(inp):
    return np.append(inp.obs[1], inp.obs[0], axis=1).reshape(-1)

def get_trajectories(model):
    states = []
    actions = []
    action_sets = []
    rewards = []

    env.reset()
    behavior_name = list(env.behavior_specs)[0]
    while True:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if (len(terminal_steps) > 0):
            break

        state = preprocess_input(decision_steps)

        states.append(state)
        pred = model(torch.Tensor(np.array([state])))
        action = np.random.choice(np.array([0, 1]), p=pred.detach().numpy().flatten())
        actions.append(action)
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.array([[action]]))
        env.set_actions(behavior_name, action_tuple)
        env.step()
        
        new_decision_steps, new_terminal_steps = env.get_steps(behavior_name)
        current_step = new_decision_steps if len(new_terminal_steps) == 0 else new_terminal_steps
        try:
            reward = current_step.reward[0]
        except:
            reward = 0
        rewards.append(float(reward))

    return torch.from_numpy(np.array(states)).float(), np.array(actions), np.array(rewards), np.array(action_sets)

In [56]:
EPOCH = 1000
GAMMA = 0.35

for i in range(EPOCH):
    states, actions, rewards, action_sets = get_trajectories(model)
    if states.shape[0] == 0:
        continue
    print(states.shape)
    predictions = model(states)
    cumulative_reward = np.sum(rewards)
    print(f'EPOCH {i}: cumulative reward {cumulative_reward}')
    discounted_rewards = torch.tensor(discount_rewards(rewards, GAMMA))
    print(discounted_rewards)

    actions = torch.tensor(actions.reshape(-1, 1)).long()
    prob_batch = predictions.gather(dim=1,index=actions).squeeze()
    loss = loss_fn(prob_batch, discounted_rewards)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

torch.Size([121, 38])
EPOCH 0: cumulative reward -28.731335401535034
tensor([  0.0565,   0.0587,   0.0685,   0.0780,   0.0872,   0.0960,   0.1045,
          0.1127,   0.1207,   0.1283,   0.1357,   0.1428,   0.1497,   0.1563,
          0.1627,   0.1689,   0.1749,   0.1806,   0.1862,   0.1916,   0.1967,
          0.2017,   0.2065,   0.2112,   0.2157,   0.2200,   0.2242,   0.2282,
          0.2321,   0.2359,   0.2395,   0.2430,   0.2463,   0.2496,   0.2527,
          0.2558,   0.2587,   0.2615,   0.2642,   0.2669,   0.2694,   0.2719,
          0.2742,   0.2765,   0.2787,   0.2808,   0.2829,   0.2848,   0.2868,
          0.2886,   0.2904,   0.2921,   0.2937,   0.2953,   0.2969,   0.2984,
          0.2998,   0.3012,   0.3025,   0.3038,   0.3050,   0.3062,   0.3074,
          0.3085,   0.3096,   0.3106,   0.3116,   0.3126,   0.3136,   0.3145,
          0.3153,   0.3162,   0.3170,   0.3178,   0.3185,   0.3192,   0.3199,
          0.3206,   0.3213,   0.3219,   0.3225,   0.3231,   0.3237,   0.3

KeyboardInterrupt: 

In [50]:
env = UnityEnvironment(file_name='./Wave', seed=1)
print("WAVE environment created.")

env.reset()
while True:
    behavior_name = list(env.behavior_specs)[0]

    decision_steps, terminal_steps = env.get_steps(behavior_name)
    print(decision_steps.obs)
    if (len(terminal_steps) > 0):
        break

    preds = model(torch.Tensor(preprocess_input(decision_steps))).detach().numpy()
    print(preds)
    action = np.argmax(preds)
    action_tuple = ActionTuple()
    action_tuple.add_discrete(np.array([[action]]))
    env.set_actions(behavior_name, action_tuple)
    env.step()

env.close()

UnityWorkerInUseException: Couldn't start socket communication because worker number 0 is still in use. You may need to manually close a previously opened environment or use a different worker number.

In [52]:
env.close()