In [5]:
import os
import numpy as np
import gym
from collections import OrderedDict
from gym.spaces import Box, MultiDiscrete, Dict, Tuple
from stable_baselines3 import PPO, A2C
from gym.wrappers import FlattenObservation

In [6]:
# Problem size read by CloudEnv (action space, observation shapes, reset).
n = 300 # Number of VMs (virtual machines) to be placed
m = 30 # Number of servers available to host them

In [7]:
class CpuMem():
    """Mutable (cpu, mem) amount that also remembers its starting values.

    ``cpu`` and ``mem`` hold the current amounts and are reduced in place by
    ``-=``. The values passed to the constructor are kept separately and used
    as denominators by :meth:`get_lcc` and :meth:`get_lmm`.
    """

    def __init__(self, cpu, mem):
        # Current amounts.
        self.cpu, self.mem = cpu, mem
        # Amounts at construction time; never modified afterwards.
        self._scpu, self._smem = cpu, mem

    def get_lcc(self):
        """Return current cpu divided by the cpu given at construction."""
        cpu_fraction = self.cpu / self._scpu
        return cpu_fraction

    def get_lmm(self):
        """Return current mem divided by the mem given at construction."""
        mem_fraction = self.mem / self._smem
        return mem_fraction

    def get_arr(self):
        """Return the current amounts as the list ``[cpu, mem]``."""
        return list((self.cpu, self.mem))

    def __le__(self, other):
        """Return whether both cpu and mem are no larger than ``other``'s."""
        cpu_fits = self.cpu <= other.cpu
        if not cpu_fits:
            return cpu_fits
        return self.mem <= other.mem

    def __isub__(self, other):
        """Subtract ``other``'s cpu and mem from this object in place."""
        self.cpu -= other.cpu
        self.mem -= other.mem
        return self

# Sentinel written into a VM's cpu and mem once it has been placed
# (see CloudEnv.step); also the upper bound of both observation Boxes.
STOP = 2048
# Element type of the observation arrays and of the Box spaces.
dtype = np.float32

class CloudEnv(gym.Env):
    """Gym environment that places VMs onto servers one move at a time.

    An episode starts with ``n_vms`` VMs of size ``VM_SIZE`` and ``n_servers``
    servers of capacity ``SERVER_SIZE`` (both are ``(cpu, mem)`` pairs). An
    action ``[vm_index, server_index]`` requests that one VM be put on one
    server. A request that does not fit, or that names a VM which is already
    placed, leaves the state unchanged.

    A placed VM is overwritten with ``CpuMem(STOP, STOP)``, so ``cpu == STOP``
    is the "already placed" marker both internally and in the observation.

    The environment follows the classic gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    Args:
        n_vms: Number of VMs. ``None`` (default) uses the module-level ``n``,
            looked up when the environment is constructed.
        n_servers: Number of servers. ``None`` (default) uses the module-level
            ``m``, looked up when the environment is constructed.
    """

    # (cpu, mem) demand of every VM and capacity of every server at reset.
    VM_SIZE = (1, 2)
    SERVER_SIZE = (16, 32)

    def __init__(self, n_vms=None, n_servers=None):
        super().__init__()
        self.n_vms = n if n_vms is None else n_vms
        self.n_servers = m if n_servers is None else n_servers
        self.action_space = MultiDiscrete([self.n_vms, self.n_servers])
        self.observation_space = Dict(
            servers=Box(low=0, high=STOP, shape=(self.n_servers, 2), dtype=dtype),
            vms=Box(low=0, high=STOP, shape=(self.n_vms, 2), dtype=dtype),
        )
        self.reset()

    @staticmethod
    def _as_array(resources):
        """Stack ``get_arr()`` of every item into a ``(len(resources), 2)`` array."""
        rows = [resource.get_arr() for resource in resources]
        # reshape keeps the (0, 2) shape when the list is empty.
        return np.array(rows, dtype=dtype).reshape(len(rows), 2)

    def get_state(self):
        """Return the observation: current ``[cpu, mem]`` of servers and VMs.

        Returns:
            OrderedDict with keys ``'servers'`` (shape ``(n_servers, 2)``) and
            ``'vms'`` (shape ``(n_vms, 2)``), matching ``observation_space``.
        """
        return OrderedDict([
            ('servers', self._as_array(self.servers)),
            ('vms', self._as_array(self.vms)),
        ])

    def get_reward(self):
        """Return the change in score since the previous call.

        The score is ``balance - unplaced`` where ``unplaced`` counts VMs whose
        cpu is not ``STOP`` and ``balance`` averages ``1 / (1 + std)`` over the
        servers' remaining cpu fractions and remaining mem fractions.

        Not a pure getter: it stores the new score in ``self.was``, so calling
        it twice in a row returns 0 the second time.
        """
        unplaced = sum(1 for vm in self.vms if vm.cpu != STOP)
        cpu_left = [server.get_lcc() for server in self.servers]
        mem_left = [server.get_lmm() for server in self.servers]
        balance = (1 / (np.std(cpu_left) + 1) + 1 / (np.std(mem_left) + 1)) / 2
        score = balance - unplaced
        reward = score - self.was
        self.was = score
        return reward

    def can_move(self, vm_index, server_index):
        """Return True if the VM is still unplaced and fits on the server."""
        vm = self.vms[vm_index]
        if vm.cpu == STOP:
            return False
        return bool(vm <= self.servers[server_index])

    def is_done(self):
        """Return True when no unplaced VM fits on any server."""
        return not any(
            vm <= server
            for vm in self.vms
            if vm.cpu != STOP
            for server in self.servers
        )

    def render(self, mode="human"):
        """No-op; accepts ``mode`` so ``env.render(mode=...)`` does not raise."""
        pass

    def step(self, action):
        """Apply one placement request.

        Args:
            action: Indexable pair ``[vm_index, server_index]``.

        Returns:
            ``(observation, reward, done, info)`` with an empty ``info`` dict.
        """
        vm_index, server_index = int(action[0]), int(action[1])
        if self.can_move(vm_index, server_index):
            self.servers[server_index] -= self.vms[vm_index]
            # Mark the VM as placed.
            self.vms[vm_index] = CpuMem(STOP, STOP)
        return self.get_state(), self.get_reward(), self.is_done(), {}

    def reset(self):
        """Restore all VMs and servers to their initial sizes.

        Returns:
            The initial observation (see :meth:`get_state`).
        """
        self.vms = [CpuMem(*self.VM_SIZE) for _ in range(self.n_vms)]
        self.servers = [CpuMem(*self.SERVER_SIZE) for _ in range(self.n_servers)]
        # NOTE(review): the baseline starts at 0 rather than at the initial
        # score (balance - n_vms), so the first step's reward absorbs the whole
        # initial offset. Left unchanged so episode returns stay comparable
        # with earlier runs; confirm this is intended.
        self.was = 0
        return self.get_state()

In [8]:
# Build the environment; its size comes from the globals n and m.
env = CloudEnv()
# Flattening is disabled: the Dict observation is passed as-is to the model.
# env = FlattenObservation(env)

In [9]:
# from stable_baselines3.common.vec_env import VecFrameStack
# env = VecFrameStack(env, n_stack=8)

In [10]:
# TensorBoard event files are written below Training/Logs.
log_path = os.path.join('Training', 'Logs')
# Fixed seed so that Restart & Run All repeats the same training run.
SEED = 0
# MultiInputPolicy is the policy class for Dict observation spaces.
model = PPO(
    policy='MultiInputPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
    seed=SEED,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train for one million environment steps.
model.learn(total_timesteps=1_000_000)

Logging to Training/Logs/PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.58e+03 |
|    ep_rew_mean     | 0.869    |
| time/              |          |
|    fps             | 539      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.58e+03   |
|    ep_rew_mean          | 0.869      |
| time/                   |            |
|    fps                  | 419        |
|    iterations           | 2          |
|    time_elapsed         | 9          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.04656899 |
|    clip_fraction        | 0.293      |
|    clip_range           | 0.2        |
|    entropy_loss         | -9.06      |
|    explained_variance   | -0.0102    |
|    learning_

In [174]:
# Play one episode with uniformly sampled actions and report its
# length and accumulated reward.
state = env.reset()

done, score, steps = False, 0, 0
while not done:
    action = env.action_space.sample()
    state, reward, done, additional_info = env.step(action)
    steps += 1
    score += reward
print(steps, score, sep="\n")

1695
-1695


  np.array(a)


array([array([[0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0., 0.],
              [0

0.816496580927726
