In [175]:
import os
import numpy as np
import gym
from gym.spaces import Box, MultiDiscrete, Dict
from stable_baselines3 import PPO, A2C

In [176]:
# Problem size. CloudEnv reads both values as module-level globals when it
# builds its action/observation spaces and when it resets.
n = 300 # Number of VMs
m = 30 # Number of Servers

In [177]:
class CpuMem:
    """A (cpu, mem) resource pair: either a demand or a remaining capacity."""

    def __init__(self, cpu, mem):
        self.cpu = cpu
        self.mem = mem

    def get_arr(self):
        """Return the pair as a two-element list ``[cpu, mem]``."""
        pair = [self.cpu, self.mem]
        return pair

    def __le__(self, other):
        """Return True when both cpu and mem are no larger than ``other``'s."""
        if not (self.cpu <= other.cpu):
            return False
        return self.mem <= other.mem

    def __isub__(self, other):
        """Subtract ``other``'s cpu and mem from this pair in place."""
        self.cpu -= other.cpu
        self.mem -= other.mem
        return self

# Sentinel value: a placed VM is replaced by CpuMem(STOP, STOP), and placed VMs
# are recognised by `cpu == STOP`. Also used as the upper bound of the
# observation Box.
STOP = 2048

class CloudEnv(gym.Env):
    """Gym environment for placing ``n`` VMs onto ``m`` servers.

    Every VM and every server is a ``CpuMem``. An action is a
    ``(vm_index, server_index)`` pair. When the chosen VM is still unplaced and
    fits into the chosen server's remaining capacity, that capacity is reduced
    and the VM is replaced by the ``CpuMem(STOP, STOP)`` sentinel. Any other
    action leaves the state untouched.

    Observation: ``float32`` array of shape ``(n + m, 2)``. Rows ``0..n-1`` are
    the VMs' ``[cpu, mem]`` (``[STOP, STOP]`` once placed); rows ``n..n+m-1``
    are the servers' remaining ``[cpu, mem]``.

    Reward: ``+1`` for a step that places a VM, ``-1`` for a step that does not.

    The episode is done once no unplaced VM fits on any server.
    """

    def __init__(self):
        # Declare the spaces first so they exist before the first reset().
        self.action_space = MultiDiscrete([n, m])
        self.observation_space = Box(low=0, high=STOP, shape=(n + m, 2), dtype=np.float32)
        self.reset()

    def get_state(self):
        """Return the current observation as a ``(n + m, 2)`` float32 array."""
        # dtype must match observation_space; a float64 array is rejected by
        # ``observation_space.contains`` and by environment checkers.
        state = np.zeros((n + m, 2), dtype=np.float32)
        for i, vm in enumerate(self.vms):
            state[i] = vm.get_arr()
        for j, server in enumerate(self.servers):
            state[n + j] = server.get_arr()
        return state

    def get_reward(self):
        """Return +1 if a VM was placed since the previous call, else -1.

        Updates ``self.was`` (the number of placed VMs seen so far), so it must
        be called exactly once per step.
        """
        placed = sum(1 for vm in self.vms if vm.cpu == STOP)
        newly_placed = placed - self.was
        self.was = placed
        # Maps 0 newly placed -> -1 and 1 newly placed -> +1.
        return 2 * newly_placed - 1

    def can_move(self, vm_index, server_index):
        """Return True when the VM is unplaced and fits on the given server."""
        vm = self.vms[vm_index]
        server = self.servers[server_index]
        if vm.cpu == STOP:
            # Already placed earlier in this episode.
            return False
        return bool(vm <= server)

    def is_done(self):
        """Return True when no unplaced VM fits on any server."""
        return not any(
            vm <= server
            for vm in self.vms
            if vm.cpu != STOP
            for server in self.servers
        )

    def render(self, mode="human"):
        """No-op; ``mode`` is accepted only for gym API compatibility."""
        pass

    def step(self, action):
        """Apply ``action = (vm_index, server_index)``.

        Returns:
            ``(observation, reward, done, info)`` with an empty ``info`` dict.
        """
        vm_index, server_index = action[0], action[1]
        if self.can_move(vm_index, server_index):
            self.servers[server_index] -= self.vms[vm_index]
            # Mark the VM as placed.
            self.vms[vm_index] = CpuMem(STOP, STOP)
        return self.get_state(), self.get_reward(), self.is_done(), {}

    def reset(self):
        """Restore all VMs and servers to their initial sizes.

        Returns:
            The initial observation.
        """
        self.vms = [CpuMem(1, 2) for _ in range(n)]
        self.servers = [CpuMem(16, 32) for _ in range(m)]
        self.was = 0
        return self.get_state()

In [178]:
# Environment instance used by the PPO model and by the random-policy rollout.
env = CloudEnv()

In [184]:
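# NOTE: disabled experiment. VecFrameStack wraps a vectorized environment, so
# `env` would have to be wrapped (e.g. in a DummyVecEnv) before enabling this.
# from stable_baselines3.common.vec_env import VecFrameStack
# env = VecFrameStack(env, n_stack=8)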

In [181]:
# Fixed seed so that repeated training runs are comparable.
SEED = 0

# TensorBoard event files are written below this directory.
log_path = os.path.join('Training', 'Logs')
model = PPO(policy='MlpPolicy', env=env, verbose=1, tensorboard_log=log_path, seed=SEED)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [182]:
# Train the PPO model for 100,000 timesteps.
model.learn(total_timesteps=100000)

Logging to Training/Logs/PPO_19
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1.93e+03  |
|    ep_rew_mean     | -1.33e+03 |
| time/              |           |
|    fps             | 543       |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 2048      |
----------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.93e+03   |
|    ep_rew_mean          | -1.33e+03  |
| time/                   |            |
|    fps                  | 417        |
|    iterations           | 2          |
|    time_elapsed         | 9          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.03391386 |
|    clip_fraction        | 0.248      |
|    clip_range           | 0.2        |
|    entropy_loss         | -9.08      |
|    explained_variance   | -0.0731    |
|  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.39e+03   |
|    ep_rew_mean          | -1.79e+03  |
| time/                   |            |
|    fps                  | 376        |
|    iterations           | 11         |
|    time_elapsed         | 59         |
|    total_timesteps      | 22528      |
| train/                  |            |
|    approx_kl            | 0.03125317 |
|    clip_fraction        | 0.32       |
|    clip_range           | 0.2        |
|    entropy_loss         | -8.73      |
|    explained_variance   | 0.487      |
|    learning_rate        | 0.0003     |
|    loss                 | 3.49       |
|    n_updates            | 100        |
|    policy_gradient_loss | -0.0427    |
|    value_loss           | 29.1       |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.52e+03   |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.06e+03    |
|    ep_rew_mean          | -2.46e+03   |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 21          |
|    time_elapsed         | 116         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.016774505 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.7        |
|    explained_variance   | 0.207       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.85        |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 40.9        |
-----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.06e+03  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.01e+03    |
|    ep_rew_mean          | -3.41e+03   |
| time/                   |             |
|    fps                  | 365         |
|    iterations           | 31          |
|    time_elapsed         | 173         |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.014176988 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.45       |
|    explained_variance   | 0.282       |
|    learning_rate        | 0.0003      |
|    loss                 | 9.74        |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0143     |
|    value_loss           | 53.3        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.01e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.49e+03    |
|    ep_rew_mean          | -3.89e+03   |
| time/                   |             |
|    fps                  | 360         |
|    iterations           | 41          |
|    time_elapsed         | 233         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.026503969 |
|    clip_fraction        | 0.301       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.09       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.000561    |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0257     |
|    value_loss           | 0.0566      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.96e+

<stable_baselines3.ppo.ppo.PPO at 0x7fe6a827d670>

In [174]:
# Baseline: play one episode with uniformly random actions, then report the
# number of steps taken and the total reward collected.
state = env.reset()

done = False
score = 0
steps = 0
while True:
    action = env.action_space.sample()
    state, reward, done, additional_info = env.step(action)
    steps += 1
    score += reward
    if done:
        break
print(steps)
print(score)

1695
-1695


In [117]:
# Leftover interactive introspection (`??` displays the source of gym.Env);
# can be deleted once it is no longer needed.
gym.Env??