In [85]:
import os
import numpy as np
import gym
from collections import OrderedDict
from gym.spaces import Box, MultiDiscrete, Dict, Tuple
from stable_baselines3 import PPO, A2C
from gym.wrappers import FlattenObservation

In [86]:
n = 300 # Number of VMs
m = 30 # Number of Servers

In [111]:
class CpuMem():
    def __init__(self, cpu, mem):
        self.cpu = cpu
        self.mem = mem
    def get_arr(self):
        return [self.cpu, self.mem]
    def __le__(self, other):
        return self.cpu <= other.cpu and self.mem <= other.mem
    def __isub__(self, other):
        self.cpu -= other.cpu
        self.mem -= other.mem
        return self

STOP = 2048
dtype = np.float32

class CloudEnv(gym.Env):
    def __init__(self):
        self.reset()
        self.action_space = MultiDiscrete([n, m])
        self.observation_space = Dict(
            servers=Box(low=0, high=STOP, shape=(m, 2), dtype=dtype),
            vms=Box(low=0, high=STOP, shape=(n, 2), dtype=dtype),
        )

    def get_state(self):
        vms = np.zeros((n, 2), dtype=dtype)
        for i in range(n):
            vms[i] = self.vms[i].get_arr()
        servers = np.zeros((m, 2), dtype=dtype)
        for i in range(m):
            servers[i] = self.servers[i].get_arr()
        return OrderedDict([
            ('servers', servers),
            ('vms', vms),
        ])

    def get_reward(self):
        ans = 0
        for vm in self.vms:
            if vm.cpu == STOP:
                ans += 1
        reward = ans - self.was
        self.was = ans
        return 2 * reward - 1
    
    def can_move(self, vm_index, server_index):
        vm = self.vms[vm_index]
        server = self.servers[server_index]
        if vm.cpu == STOP:
            return False
        if not vm <= server:
            return False
        return True

    def is_done(self):
        for vm in self.vms:
            if vm.cpu == STOP:
                continue
            for server in self.servers:
                if vm <= server:
                    return False
        return True

    def render(self):
        pass

    def step(self, action):
        vm_index, server_index = action[0], action[1]
        if self.can_move(vm_index, server_index):
            self.servers[server_index] -= self.vms[vm_index]
            self.vms[vm_index] = CpuMem(STOP, STOP)
        return self.get_state(), self.get_reward(), self.is_done(), {}

    def reset(self):
        self.vms = []
        self.servers = []
        for i in range(n):
            self.vms.append(CpuMem(1, 2))
        for i in range(m):
            self.servers.append(CpuMem(16, 32))
        self.was = 0
        return self.get_state()

In [112]:
env = CloudEnv()
# env = FlattenObservation(env)

In [113]:
# from stable_baselines3.common.vec_env import VecFrameStack
# env = VecFrameStack(env, n_stack=8)

In [114]:
log_path = os.path.join('Training', 'Logs')
model = PPO(policy='MultiInputPolicy', env=env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [116]:
model.learn(total_timesteps=1000000)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 631  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 425         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009999612 |
|    clip_fraction        | 0.0663      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.62       |
|    explained_variance   | 0.824       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.67        |
|    n_updates            | 500         |
|    policy_gradient_loss | -0.0181     |
|    value_loss           | 8.47        |
-----------------------------------------
---

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.67e+03    |
|    ep_rew_mean          | -9.07e+03   |
| time/                   |             |
|    fps                  | 347         |
|    iterations           | 12          |
|    time_elapsed         | 70          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.018239502 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.48       |
|    explained_variance   | 0.917       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.581       |
|    n_updates            | 600         |
|    policy_gradient_loss | -0.0283     |
|    value_loss           | 1.08        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.67e+

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.05e+04     |
|    ep_rew_mean          | -9.89e+03    |
| time/                   |              |
|    fps                  | 364          |
|    iterations           | 22           |
|    time_elapsed         | 123          |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0055570584 |
|    clip_fraction        | 0.0271       |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.36        |
|    explained_variance   | 0.377        |
|    learning_rate        | 0.0003       |
|    loss                 | 66.7         |
|    n_updates            | 700          |
|    policy_gradient_loss | -0.013       |
|    value_loss           | 51.4         |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 32          |
|    time_elapsed         | 180         |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.023910634 |
|    clip_fraction        | 0.279       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.98       |
|    explained_variance   | 1.19e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.11        |
|    n_updates            | 800         |
|    policy_gradient_loss | -0.0288     |
|    value_loss           | 0.327       |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 353         |
|    iterations           | 42          |
|    time_elapsed         | 243         |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.027202938 |
|    clip_fraction        | 0.384       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.86       |
|    explained_variance   | 9.42e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0356     |
|    n_updates            | 900         |
|    policy_gradient_loss | -0.0172     |
|    value_loss           | 0.000152    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04   |
|    ep_rew_mean          | -1.11e+04  |
| time/                   |            |
|    fps                  | 359        |
|    iterations           | 52         |
|    time_elapsed         | 296        |
|    total_timesteps      | 106496     |
| train/                  |            |
|    approx_kl            | 0.02584418 |
|    clip_fraction        | 0.353      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.92      |
|    explained_variance   | 0.000516   |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0381    |
|    n_updates            | 1000       |
|    policy_gradient_loss | -0.0211    |
|    value_loss           | 0.000217   |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04   |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 62          |
|    time_elapsed         | 349         |
|    total_timesteps      | 126976      |
| train/                  |             |
|    approx_kl            | 0.025634887 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.4        |
|    explained_variance   | 0.000668    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0168      |
|    n_updates            | 1100        |
|    policy_gradient_loss | -0.0179     |
|    value_loss           | 3.87e-05    |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04   |
|    ep_rew_mean          | -1.11e+04  |
| time/                   |            |
|    fps                  | 363        |
|    iterations           | 72         |
|    time_elapsed         | 405        |
|    total_timesteps      | 147456     |
| train/                  |            |
|    approx_kl            | 0.02098041 |
|    clip_fraction        | 0.342      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.82      |
|    explained_variance   | 0.00479    |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0266    |
|    n_updates            | 1200       |
|    policy_gradient_loss | -0.022     |
|    value_loss           | 1.43e-05   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 82          |
|    time_elapsed         | 462         |
|    total_timesteps      | 167936      |
| train/                  |             |
|    approx_kl            | 0.022855544 |
|    clip_fraction        | 0.356       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.83       |
|    explained_variance   | 0.00119     |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0288     |
|    n_updates            | 1300        |
|    policy_gradient_loss | -0.0183     |
|    value_loss           | 0.00068     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 92          |
|    time_elapsed         | 518         |
|    total_timesteps      | 188416      |
| train/                  |             |
|    approx_kl            | 0.022193126 |
|    clip_fraction        | 0.333       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.74       |
|    explained_variance   | 0.434       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0219     |
|    n_updates            | 1400        |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 0.000231    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 364         |
|    iterations           | 102         |
|    time_elapsed         | 573         |
|    total_timesteps      | 208896      |
| train/                  |             |
|    approx_kl            | 0.024750454 |
|    clip_fraction        | 0.367       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.69       |
|    explained_variance   | 2.41e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0374     |
|    n_updates            | 1500        |
|    policy_gradient_loss | -0.0208     |
|    value_loss           | 4.55e-06    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.17e+04   |
|    ep_rew_mean          | -1.11e+04  |
| time/                   |            |
|    fps                  | 364        |
|    iterations           | 112        |
|    time_elapsed         | 629        |
|    total_timesteps      | 229376     |
| train/                  |            |
|    approx_kl            | 0.02368699 |
|    clip_fraction        | 0.374      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.44      |
|    explained_variance   | 0.0944     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0289    |
|    n_updates            | 1600       |
|    policy_gradient_loss | -0.0234    |
|    value_loss           | 2.21e-06   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+04    |
|    ep_rew_mean          | -1.11e+04   |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 122         |
|    time_elapsed         | 681         |
|    total_timesteps      | 249856      |
| train/                  |             |
|    approx_kl            | 0.023712445 |
|    clip_fraction        | 0.335       |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.28       |
|    explained_variance   | 0.105       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0169     |
|    n_updates            | 1700        |
|    policy_gradient_loss | -0.0189     |
|    value_loss           | 0.000256    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.17e+

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.17e+04      |
|    ep_rew_mean          | -1.11e+04     |
| time/                   |               |
|    fps                  | 364           |
|    iterations           | 132           |
|    time_elapsed         | 741           |
|    total_timesteps      | 270336        |
| train/                  |               |
|    approx_kl            | 0.00070837315 |
|    clip_fraction        | 0.00483       |
|    clip_range           | 0.2           |
|    entropy_loss         | -7.09         |
|    explained_variance   | 1             |
|    learning_rate        | 0.0003        |
|    loss                 | 0             |
|    n_updates            | 1800          |
|    policy_gradient_loss | 0             |
|    value_loss           | 2.36e-12      |
-------------------------------------------
---------------------------------------
| rollout/                |         

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |
| time/                   |           |
|    fps                  | 362       |
|    iterations           | 142       |
|    time_elapsed         | 801       |
|    total_timesteps      | 290816    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -7.09     |
|    explained_variance   | 1         |
|    learning_rate        | 0.0003    |
|    loss                 | 0         |
|    n_updates            | 1900      |
|    policy_gradient_loss | 0         |
|    value_loss           | 0         |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |
| time/                   |           |
|    fps                  | 363       |
|    iterations           | 152       |
|    time_elapsed         | 857       |
|    total_timesteps      | 311296    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -7.09     |
|    explained_variance   | 1         |
|    learning_rate        | 0.0003    |
|    loss                 | 0         |
|    n_updates            | 2000      |
|    policy_gradient_loss | 0         |
|    value_loss           | 0         |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |
| time/                   |           |
|    fps                  | 361       |
|    iterations           | 162       |
|    time_elapsed         | 917       |
|    total_timesteps      | 331776    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -7.09     |
|    explained_variance   | 1         |
|    learning_rate        | 0.0003    |
|    loss                 | 0         |
|    n_updates            | 2100      |
|    policy_gradient_loss | 0         |
|    value_loss           | 0         |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.17e+04  |
|    ep_rew_mean          | -1.11e+04 |


KeyboardInterrupt: 

In [174]:
state = env.reset()

done = False
score = 0
steps = 0
while not done:
    steps += 1
    action = env.action_space.sample()
    state, reward, done, additional_info = env.step(action)
#     print(action)
#     print(done)
#     print(reward)
#     print(state)
    score += reward
print(steps)
print(score)

1695
-1695


  np.array(a)


array([array([[0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0